| OLD | NEW | 
|---|
|  | (Empty) | 
| 1 ; |  | 
| 2 ;  Copyright (c) 2013 The WebM project authors. All Rights Reserved. |  | 
| 3 ; |  | 
| 4 ;  Use of this source code is governed by a BSD-style license |  | 
| 5 ;  that can be found in the LICENSE file in the root of the source |  | 
| 6 ;  tree. An additional intellectual property rights grant can be found |  | 
| 7 ;  in the file PATENTS.  All contributing project authors may |  | 
| 8 ;  be found in the AUTHORS file in the root of the source tree. |  | 
| 9 ; |  | 
| 10 |  | 
| 11     EXPORT  |vp9_iht4x4_16_add_neon| |  | 
| 12     ARM |  | 
| 13     REQUIRE8 |  | 
| 14     PRESERVE8 |  | 
| 15 |  | 
| 16     AREA ||.text||, CODE, READONLY, ALIGN=2 |  | 
| 17 |  | 
| 18     ; Parallel 1D IDCT on all the columns of a 4x4 matrix of 16-bit data held |  | 
| 19     ; in d16-d19 (d16 = input[0] ... d19 = input[3]). d0 must contain |  | 
| 20     ; cospi_8_64, d1 must contain cospi_16_64 and d2 must contain cospi_24_64. |  | 
| 21     ; The output is written back to d16-d19. This macro clobbers q10-q15, |  | 
| 22     ; which it uses as scratch registers during the calculation. |  | 
| 23     MACRO |  | 
| 24     IDCT4x4_1D |  | 
| 25     ; stage 1 |  | 
| 26     vadd.s16    d23, d16, d18   ; (input[0] + input[2]) |  | 
| 27     vsub.s16    d24, d16, d18   ; (input[0] - input[2]) |  | 
| 28 |  | 
| 29     vmull.s16   q15, d17, d2    ; input[1] * cospi_24_64 |  | 
| 30     vmull.s16   q10, d17, d0    ; input[1] * cospi_8_64 |  | 
| 31     vmull.s16   q13, d23, d1    ; (input[0] + input[2]) * cospi_16_64 |  | 
| 32     vmull.s16   q14, d24, d1    ; (input[0] - input[2]) * cospi_16_64 |  | 
| 33     vmlsl.s16   q15, d19, d0    ; input[1] * cospi_24_64 - input[3] * cospi_8_64 |  | 
| 34     vmlal.s16   q10, d19, d2    ; input[1] * cospi_8_64 + input[3] * cospi_24_64 |  | 
| 35 |  | 
| 36     ; dct_const_round_shift (rounding, saturating narrow by 14 bits): d26 = step[0], d27 = step[1], d29 = step[2], d28 = step[3] |  | 
| 37     vqrshrn.s32 d26, q13, #14 |  | 
| 38     vqrshrn.s32 d27, q14, #14 |  | 
| 39     vqrshrn.s32 d29, q15, #14 |  | 
| 40     vqrshrn.s32 d28, q10, #14 |  | 
| 41 |  | 
| 42     ; stage 2, with q13 = {step[0], step[1]} and q14 = {step[3], step[2]}: |  | 
| 43     ; output[0] = step[0] + step[3];  (low half of q8  = d16) |  | 
| 44     ; output[1] = step[1] + step[2];  (high half of q8 = d17) |  | 
| 45     ; output[3] = step[0] - step[3];  (computed in d18, moved to d19 by the vswp) |  | 
| 46     ; output[2] = step[1] - step[2];  (computed in d19, moved to d18 by the vswp) |  | 
| 47     vadd.s16    q8,  q13, q14 |  | 
| 48     vsub.s16    q9,  q13, q14 |  | 
| 49     vswp        d18, d19 |  | 
| 50     MEND |  | 
| 51 |  | 
| 52     ; Parallel 1D IADST on all the columns of a 4x4 matrix of 16-bit data held |  | 
| 53     ; in d16-d19. d3 must contain sinpi_1_9, d4 sinpi_2_9, d5 sinpi_4_9 and d6 |  | 
| 54     ; sinpi_3_9; r0 must also hold sinpi_3_9 (it is broadcast with vdup.32). |  | 
| 55     ; The output is written back to d16-d19. This macro clobbers q10-q15 and |  | 
| 56     ; uses them, together with q8 and q9, as scratch during the calculation. |  | 
| 57     MACRO |  | 
| 58     IADST4x4_1D |  | 
| 59     vmull.s16   q10, d3, d16    ; s0 = sinpi_1_9 * x0 |  | 
| 60     vmull.s16   q11, d4, d16    ; s1 = sinpi_2_9 * x0 |  | 
| 61     vmull.s16   q12, d6, d17    ; s2 = sinpi_3_9 * x1 |  | 
| 62     vmull.s16   q13, d5, d18    ; s3 = sinpi_4_9 * x2 |  | 
| 63     vmull.s16   q14, d3, d18    ; s4 = sinpi_1_9 * x2 |  | 
| 64     vmovl.s16   q15, d16        ; expand x0 from 16 bit to 32 bit |  | 
| 65     vaddw.s16   q15, q15, d19   ; x0 + x3 |  | 
| 66     vmull.s16   q8, d4, d19     ; s5 = sinpi_2_9 * x3 (overwrites d16/d17, which are not read again) |  | 
| 67     vsubw.s16   q15, q15, d18   ; s7 = x0 + x3 - x2 |  | 
| 68     vmull.s16   q9, d5, d19     ; s6 = sinpi_4_9 * x3 (overwrites d18/d19, which are not read again) |  | 
| 69 |  | 
| 70     vadd.s32    q10, q10, q13   ; x0 = s0 + s3 + s5 |  | 
| 71     vadd.s32    q10, q10, q8 |  | 
| 72     vsub.s32    q11, q11, q14   ; x1 = s1 - s4 - s6 |  | 
| 73     vdup.32     q8, r0          ; duplicate sinpi_3_9 from r0 into 32-bit lanes (s5 in q8 was consumed above) |  | 
| 74     vsub.s32    q11, q11, q9 |  | 
| 75     vmul.s32    q15, q15, q8    ; x2 = sinpi_3_9 * s7 |  | 
| 76 |  | 
| 77     vadd.s32    q13, q10, q12   ; s0 = x0 + x3 (x3 is the s2 product held in q12) |  | 
| 78     vadd.s32    q10, q10, q11   ; x0 + x1 |  | 
| 79     vadd.s32    q14, q11, q12   ; s1 = x1 + x3 |  | 
| 80     vsub.s32    q10, q10, q12   ; s3 = x0 + x1 - x3 |  | 
| 81 |  | 
| 82     ; dct_const_round_shift (rounding, saturating narrow by 14 bits): d16 = s0, d17 = s1, d18 = x2, d19 = s3 |  | 
| 83     vqrshrn.s32 d16, q13, #14 |  | 
| 84     vqrshrn.s32 d17, q14, #14 |  | 
| 85     vqrshrn.s32 d18, q15, #14 |  | 
| 86     vqrshrn.s32 d19, q10, #14 |  | 
| 87     MEND |  | 
| 88 |  | 
| 89     ; Generate cosine constants in d0 - d2 for the IDCT. Clobbers r0, r3 and r12. |  | 
| 90     MACRO |  | 
| 91     GENERATE_COSINE_CONSTANTS |  | 
| 92     ; cospi_8_64 = 15137 = 0x3b21, built as 0x3b00 + 0x21 |  | 
| 93     mov         r0, #0x3b00 |  | 
| 94     add         r0, #0x21 |  | 
| 95     ; cospi_16_64 = 11585 = 0x2d41, built as 0x2d00 + 0x41 |  | 
| 96     mov         r3, #0x2d00 |  | 
| 97     add         r3, #0x41 |  | 
| 98     ; cospi_24_64 = 6270 = 0x187e, built as 0x1800 + 0x7e |  | 
| 99     mov         r12, #0x1800 |  | 
| 100     add         r12, #0x7e |  | 
| 101 |  | 
| 102     ; generate constant vectors (each value replicated into every 16-bit lane) |  | 
| 103     vdup.16     d0, r0          ; duplicate cospi_8_64 |  | 
| 104     vdup.16     d1, r3          ; duplicate cospi_16_64 |  | 
| 105     vdup.16     d2, r12         ; duplicate cospi_24_64 |  | 
| 106     MEND |  | 
| 107 |  | 
| 108     ; Generate sine constants in d3 - d6 for the IADST (d7 is written as well, as half of q3). Clobbers r0, r3 and r12; r0 is left holding sinpi_3_9. |  | 
| 109     MACRO |  | 
| 110     GENERATE_SINE_CONSTANTS |  | 
| 111     ; sinpi_1_9 = 5283 = 0x14A3, built as 0x1400 + 0xa3 |  | 
| 112     mov         r0, #0x1400 |  | 
| 113     add         r0, #0xa3 |  | 
| 114     ; sinpi_2_9 = 9929 = 0x26C9, built as 0x2600 + 0xc9 |  | 
| 115     mov         r3, #0x2600 |  | 
| 116     add         r3, #0xc9 |  | 
| 117     ; sinpi_4_9 = 15212 = 0x3B6C, built as 0x3b00 + 0x6c |  | 
| 118     mov         r12, #0x3b00 |  | 
| 119     add         r12, #0x6c |  | 
| 120 |  | 
| 121     ; generate constant vectors (each value replicated into every 16-bit lane) |  | 
| 122     vdup.16     d3, r0          ; duplicate sinpi_1_9 |  | 
| 123 |  | 
| 124     ; sinpi_3_9 = 13377 = 0x3441 (r0 is reused now that sinpi_1_9 has been copied to d3) |  | 
| 125     mov         r0, #0x3400 |  | 
| 126     add         r0, #0x41 |  | 
| 127 |  | 
| 128     vdup.16     d4, r3          ; duplicate sinpi_2_9 |  | 
| 129     vdup.16     d5, r12         ; duplicate sinpi_4_9 |  | 
| 130     vdup.16     q3, r0          ; duplicate sinpi_3_9 into q3 (d6 and d7) |  | 
| 131     MEND |  | 
| 132 |  | 
| 133     ; Transpose a 4x4 matrix of 16-bit data in place. The data is held in d16-d19 (q8, q9); no other registers are used. |  | 
| 134     MACRO |  | 
| 135     TRANSPOSE4X4 |  | 
| 136     vtrn.16     d16, d17 |  | 
| 137     vtrn.16     d18, d19 |  | 
| 138     vtrn.32     q8, q9 |  | 
| 139     MEND |  | 
| 140 |  | 
| 141     AREA     Block, CODE, READONLY ; start a read-only code area named Block |  | 
| 142 ;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest, |  | 
| 143 ;                               int dest_stride, int tx_type) |  | 
| 144 ; May clobber r0-r3, r12, q0-q3 and q8-q15. |  | 
| 145 ; r0  int16_t *input |  | 
| 146 ; r1  uint8_t *dest (advanced row by row while loading, walked back while storing) |  | 
| 147 ; r2  int dest_stride (negated before the stores) |  | 
| 148 ; r3  int tx_type |  | 
| 149 ; This function will only handle tx_type of 1,2,3: 2 selects idct_iadst, 3 selects iadst_iadst, and any other value falls through to iadst_idct. |  | 
| 150 |vp9_iht4x4_16_add_neon| PROC |  | 
| 151 |  | 
| 152     ; load the 16 input coefficients into d16-d19 (q8, q9) |  | 
| 153     vld1.s16    {q8,q9}, [r0]! |  | 
| 154 |  | 
| 155     ; transpose the input data |  | 
| 156     TRANSPOSE4X4 |  | 
| 157 |  | 
| 158     ; decide the type of transform from tx_type (r3); anything other than 2 or 3 falls through to iadst_idct |  | 
| 159     cmp         r3, #2 |  | 
| 160     beq         idct_iadst |  | 
| 161     cmp         r3, #3 |  | 
| 162     beq         iadst_iadst |  | 
| 163 |  | 
| 164 iadst_idct |  | 
| 165     ; generate constants (sine constants last, so that r0 still holds sinpi_3_9 when IADST4x4_1D runs) |  | 
| 166     GENERATE_COSINE_CONSTANTS |  | 
| 167     GENERATE_SINE_CONSTANTS |  | 
| 168 |  | 
| 169     ; first transform rows |  | 
| 170     IDCT4x4_1D |  | 
| 171 |  | 
| 172     ; transpose the matrix |  | 
| 173     TRANSPOSE4X4 |  | 
| 174 |  | 
| 175     ; then transform columns |  | 
| 176     IADST4x4_1D |  | 
| 177 |  | 
| 178     b end_vp9_iht4x4_16_add_neon |  | 
| 179 |  | 
| 180 idct_iadst |  | 
| 181     ; generate constants (sine constants last, so that r0 still holds sinpi_3_9 when IADST4x4_1D runs) |  | 
| 182     GENERATE_COSINE_CONSTANTS |  | 
| 183     GENERATE_SINE_CONSTANTS |  | 
| 184 |  | 
| 185     ; first transform rows |  | 
| 186     IADST4x4_1D |  | 
| 187 |  | 
| 188     ; transpose the matrix |  | 
| 189     TRANSPOSE4X4 |  | 
| 190 |  | 
| 191     ; then transform columns |  | 
| 192     IDCT4x4_1D |  | 
| 193 |  | 
| 194     b end_vp9_iht4x4_16_add_neon |  | 
| 195 |  | 
| 196 iadst_iadst |  | 
| 197     ; generate constants (only the sine constants are needed; r0 is left holding sinpi_3_9 for both passes) |  | 
| 198     GENERATE_SINE_CONSTANTS |  | 
| 199 |  | 
| 200     ; first transform rows |  | 
| 201     IADST4x4_1D |  | 
| 202 |  | 
| 203     ; transpose the matrix |  | 
| 204     TRANSPOSE4X4 |  | 
| 205 |  | 
| 206     ; then transform columns (falls through to the common epilogue below) |  | 
| 207     IADST4x4_1D |  | 
| 208 |  | 
| 209 end_vp9_iht4x4_16_add_neon |  | 
| 210     ; ROUND_POWER_OF_TWO(temp_out[j], 4): rounding shift right by 4 of all 16 results |  | 
| 211     vrshr.s16   q8, q8, #4 |  | 
| 212     vrshr.s16   q9, q9, #4 |  | 
| 213 |  | 
| 214     vld1.32     {d26[0]}, [r1], r2 |  | 
| 215     vld1.32     {d26[1]}, [r1], r2 |  | 
| 216     vld1.32     {d27[0]}, [r1], r2 |  | 
| 217     vld1.32     {d27[1]}, [r1] |  | 
| 218 |  | 
| 219     ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] (dest bytes are widened to 16 bits by the add) |  | 
| 220     vaddw.u8    q8, q8, d26 |  | 
| 221     vaddw.u8    q9, q9, d27 |  | 
| 222 |  | 
| 223     ; clip_pixel: saturate the signed 16-bit sums to unsigned 8 bits |  | 
| 224     vqmovun.s16 d26, q8 |  | 
| 225     vqmovun.s16 d27, q9 |  | 
| 226 |  | 
| 227     ; do the stores in reverse order with negative post-increment, by changing |  | 
| 228     ; the sign of the stride (r1 still points at the last row after the loads) |  | 
| 229     rsb         r2, r2, #0 |  | 
| 230     vst1.32     {d27[1]}, [r1], r2 |  | 
| 231     vst1.32     {d27[0]}, [r1], r2 |  | 
| 232     vst1.32     {d26[1]}, [r1], r2 |  | 
| 233     vst1.32     {d26[0]}, [r1]  ; no post-increment |  | 
| 234     bx          lr |  | 
| 235     ENDP  ; |vp9_iht4x4_16_add_neon| |  | 
| 236 |  | 
| 237     END |  | 
| OLD | NEW | 
|---|