OLD | NEW |
| (Empty) |
1 ; | |
2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved. | |
3 ; | |
4 ; Use of this source code is governed by a BSD-style license | |
5 ; that can be found in the LICENSE file in the root of the source | |
6 ; tree. An additional intellectual property rights grant can be found | |
7 ; in the file PATENTS. All contributing project authors may | |
8 ; be found in the AUTHORS file in the root of the source tree. | |
9 ; | |
10 | |
11 EXPORT |vp9_iht4x4_16_add_neon| | |
12 ARM | |
13 REQUIRE8 | |
14 PRESERVE8 | |
15 | |
16 AREA ||.text||, CODE, READONLY, ALIGN=2 | |
17 | |
18 ; Parallel 1D IDCT on all the columns of a 4x4 16-bit data matrix which is | |
19 ; loaded in d16-d19 (input[0]..input[3]). d0 must contain cospi_8_64. d1 must | |
20 ; contain cospi_16_64. d2 must contain cospi_24_64. The output is stored back | |
21 ; into the d16-d19 registers. This macro overwrites the q10-q15 registers and | |
22 ; uses them as scratch buffers during the calculation. | |
23 MACRO | |
24 IDCT4x4_1D | |
25 ; stage 1: 16-bit sum/difference, then widening multiplies into 32-bit lanes | |
26 vadd.s16 d23, d16, d18 ; (input[0] + input[2]) | |
27 vsub.s16 d24, d16, d18 ; (input[0] - input[2]) | |
28 | |
29 vmull.s16 q15, d17, d2 ; input[1] * cospi_24_64 | |
30 vmull.s16 q10, d17, d0 ; input[1] * cospi_8_64 | |
31 vmull.s16 q13, d23, d1 ; (input[0] + input[2]) * cospi_16_64 | |
32 vmull.s16 q14, d24, d1 ; (input[0] - input[2]) * cospi_16_64 | |
33 vmlsl.s16 q15, d19, d0 ; input[1] * cospi_24_64 - input[3] * cospi_8_64 | |
34 vmlal.s16 q10, d19, d2 ; input[1] * cospi_8_64 + input[3] * cospi_24_64 | |
35 | |
36 ; dct_const_round_shift: rounding, saturating narrow by 14 bits, giving d26 = step[0], d27 = step[1], d29 = step[2], d28 = step[3] | |
37 vqrshrn.s32 d26, q13, #14 | |
38 vqrshrn.s32 d27, q14, #14 | |
39 vqrshrn.s32 d29, q15, #14 | |
40 vqrshrn.s32 d28, q10, #14 | |
41 | |
42 ; stage 2: q13 = {step[0], step[1]} and q14 = {step[3], step[2]}, so one add and one sub produce all four outputs | |
43 ; output[0] = step[0] + step[3];  (low half of q8) | |
44 ; output[1] = step[1] + step[2];  (high half of q8) | |
45 ; output[3] = step[0] - step[3];  (low half of q9, moved into d19 by the vswp) | |
46 ; output[2] = step[1] - step[2];  (high half of q9, moved into d18 by the vswp) | |
47 vadd.s16 q8, q13, q14 | |
48 vsub.s16 q9, q13, q14 | |
49 vswp d18, d19 | |
50 MEND | |
51 | |
52 ; Parallel 1D IADST on all the columns of a 4x4 16-bit data matrix which is | |
53 ; loaded in d16-d19 (x0..x3). d3 must contain sinpi_1_9. d4 must contain sinpi_2_9. | |
54 ; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9, and r0 must also hold | |
55 ; sinpi_3_9 as a scalar (see the vdup.32 below). The output is stored back into | |
56 ; d16-d19. Uses q10-q15 as scratch; q8/q9 are also overwritten once their inputs are consumed. | |
57 MACRO | |
58 IADST4x4_1D | |
59 vmull.s16 q10, d3, d16 ; s0 = sinpi_1_9 * x0 | |
60 vmull.s16 q11, d4, d16 ; s1 = sinpi_2_9 * x0 | |
61 vmull.s16 q12, d6, d17 ; s2 = sinpi_3_9 * x1 | |
62 vmull.s16 q13, d5, d18 ; s3 = sinpi_4_9 * x2 | |
63 vmull.s16 q14, d3, d18 ; s4 = sinpi_1_9 * x2 | |
64 vmovl.s16 q15, d16 ; expand x0 from 16 bit to 32 bit | |
65 vaddw.s16 q15, q15, d19 ; x0 + x3 | |
66 vmull.s16 q8, d4, d19 ; s5 = sinpi_2_9 * x3 (overwrites x0/x1, which are no longer needed) | |
67 vsubw.s16 q15, q15, d18 ; s7 = x0 + x3 - x2 | |
68 vmull.s16 q9, d5, d19 ; s6 = sinpi_4_9 * x3 (overwrites x2/x3, which are no longer needed) | |
69 | |
70 vadd.s32 q10, q10, q13 ; x0 = s0 + s3 + s5 | |
71 vadd.s32 q10, q10, q8 | |
72 vsub.s32 q11, q11, q14 ; x1 = s1 - s4 - s6 | |
73 vdup.32 q8, r0 ; broadcast sinpi_3_9 from r0 into 32-bit lanes (s5 in q8 is already consumed) | |
74 vsub.s32 q11, q11, q9 | |
75 vmul.s32 q15, q15, q8 ; x2 = sinpi_3_9 * s7 (32-bit multiply; this is also the final s2) | |
76 | |
77 vadd.s32 q13, q10, q12 ; s0 = x0 + x3, where x3 is s2 held in q12 | |
78 vadd.s32 q10, q10, q11 ; x0 + x1 | |
79 vadd.s32 q14, q11, q12 ; s1 = x1 + x3 | |
80 vsub.s32 q10, q10, q12 ; s3 = x0 + x1 - x3 | |
81 | |
82 ; dct_const_round_shift: rounding, saturating narrow by 14 bits, giving d16 = s0, d17 = s1, d18 = x2, d19 = s3 | |
83 vqrshrn.s32 d16, q13, #14 | |
84 vqrshrn.s32 d17, q14, #14 | |
85 vqrshrn.s32 d18, q15, #14 | |
86 vqrshrn.s32 d19, q10, #14 | |
87 MEND | |
88 | |
89 ; Generate cosine constants in d0 - d2 for the IDCT (d0 = cospi_8_64, d1 = cospi_16_64, d2 = cospi_24_64). Clobbers r0, r3 and r12. | |
90 MACRO | |
91 GENERATE_COSINE_CONSTANTS | |
92 ; cospi_8_64 = 15137 = 0x3b21, assembled in r0 as 0x3b00 + 0x21 | |
93 mov r0, #0x3b00 | |
94 add r0, #0x21 | |
95 ; cospi_16_64 = 11585 = 0x2d41, assembled in r3 as 0x2d00 + 0x41 | |
96 mov r3, #0x2d00 | |
97 add r3, #0x41 | |
98 ; cospi_24_64 = 6270 = 0x187e, assembled in r12 as 0x1800 + 0x7e | |
99 mov r12, #0x1800 | |
100 add r12, #0x7e | |
101 | |
102 ; generate constant vectors: broadcast each scalar into all four 16-bit lanes | |
103 vdup.16 d0, r0 ; duplicate cospi_8_64 | |
104 vdup.16 d1, r3 ; duplicate cospi_16_64 | |
105 vdup.16 d2, r12 ; duplicate cospi_24_64 | |
106 MEND | |
107 | |
108 ; Generate sine constants in d3 - d7 for the IADST (d3 = sinpi_1_9, d4 = sinpi_2_9, d5 = sinpi_4_9, q3 = d6/d7 = sinpi_3_9). Clobbers r0, r3 and r12; r0 is left holding sinpi_3_9, which IADST4x4_1D reads. | |
109 MACRO | |
110 GENERATE_SINE_CONSTANTS | |
111 ; sinpi_1_9 = 5283 = 0x14A3, assembled in r0 as 0x1400 + 0xa3 | |
112 mov r0, #0x1400 | |
113 add r0, #0xa3 | |
114 ; sinpi_2_9 = 9929 = 0x26C9, assembled in r3 as 0x2600 + 0xc9 | |
115 mov r3, #0x2600 | |
116 add r3, #0xc9 | |
117 ; sinpi_4_9 = 15212 = 0x3B6C, assembled in r12 as 0x3b00 + 0x6c | |
118 mov r12, #0x3b00 | |
119 add r12, #0x6c | |
120 | |
121 ; generate constant vectors; d3 is written first so that r0 can be reused for sinpi_3_9 | |
122 vdup.16 d3, r0 ; duplicate sinpi_1_9 | |
123 | |
124 ; sinpi_3_9 = 13377 = 0x3441, assembled in r0 as 0x3400 + 0x41 | |
125 mov r0, #0x3400 | |
126 add r0, #0x41 | |
127 | |
128 vdup.16 d4, r3 ; duplicate sinpi_2_9 | |
129 vdup.16 d5, r12 ; duplicate sinpi_4_9 | |
130 vdup.16 q3, r0 ; duplicate sinpi_3_9 into q3 (d6 and d7); r0 keeps the scalar value | |
131 MEND | |
132 | |
133 ; Transpose a 4x4 16-bit data matrix in place. The four rows are expected in d16-d19 (q8-q9) and the transposed rows are left in the same registers. | |
134 MACRO | |
135 TRANSPOSE4X4 | |
136 vtrn.16 d16, d17 | |
137 vtrn.16 d18, d19 | |
138 vtrn.32 q8, q9 | |
139 MEND | |
140 | |
141 AREA Block, CODE, READONLY ; name this block of code | |
142 ;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest, | |
143 ; int dest_stride, int tx_type) | |
144 ; | |
145 ; r0 int16_t *input | |
146 ; r1 uint8_t *dest | |
147 ; r2 int dest_stride | |
148 ; r3 int tx_type | |
149 ; This function will only handle tx_type of 1, 2 and 3; any value other than 2 or 3 takes the iadst_idct path. | |
150 |vp9_iht4x4_16_add_neon| PROC | |
151 | |
152 ; load the 16 input coefficients into d16-d19 (q8, q9) | |
153 vld1.s16 {q8,q9}, [r0]! | |
154 | |
155 ; transpose the input data | |
156 TRANSPOSE4X4 | |
157 | |
158 ; decide the type of transform: tx_type 2 -> idct_iadst, 3 -> iadst_iadst, otherwise fall through to iadst_idct | |
159 cmp r3, #2 | |
160 beq idct_iadst | |
161 cmp r3, #3 | |
162 beq iadst_iadst | |
163 | |
164 iadst_idct | |
165 ; generate constants; the sine macro runs last so that r0 is left holding sinpi_3_9 for IADST4x4_1D | |
166 GENERATE_COSINE_CONSTANTS | |
167 GENERATE_SINE_CONSTANTS | |
168 | |
169 ; first transform rows (IDCT) | |
170 IDCT4x4_1D | |
171 | |
172 ; transpose the matrix | |
173 TRANSPOSE4X4 | |
174 | |
175 ; then transform columns (IADST) | |
176 IADST4x4_1D | |
177 | |
178 b end_vp9_iht4x4_16_add_neon | |
179 | |
180 idct_iadst | |
181 ; generate constants; the sine macro runs last so that r0 is left holding sinpi_3_9 for IADST4x4_1D | |
182 GENERATE_COSINE_CONSTANTS | |
183 GENERATE_SINE_CONSTANTS | |
184 | |
185 ; first transform rows (IADST) | |
186 IADST4x4_1D | |
187 | |
188 ; transpose the matrix | |
189 TRANSPOSE4X4 | |
190 | |
191 ; then transform columns (IDCT) | |
192 IDCT4x4_1D | |
193 | |
194 b end_vp9_iht4x4_16_add_neon | |
195 | |
196 iadst_iadst | |
197 ; generate constants (only the sine set is needed on this path) | |
198 GENERATE_SINE_CONSTANTS | |
199 | |
200 ; first transform rows (IADST) | |
201 IADST4x4_1D | |
202 | |
203 ; transpose the matrix | |
204 TRANSPOSE4X4 | |
205 | |
206 ; then transform columns (IADST again; r0 still holds sinpi_3_9) | |
207 IADST4x4_1D | |
208 | |
209 end_vp9_iht4x4_16_add_neon | |
210 ; ROUND_POWER_OF_TWO(temp_out[j], 4): rounding right shift by 4 of the results in q8/q9 | |
211 vrshr.s16 q8, q8, #4 | |
212 vrshr.s16 q9, q9, #4 | |
213 | |
214 vld1.32 {d26[0]}, [r1], r2 | |
215 vld1.32 {d26[1]}, [r1], r2 | |
216 vld1.32 {d27[0]}, [r1], r2 | |
217 vld1.32 {d27[1]}, [r1] | |
218 | |
219 ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]; d26/d27 hold the four 4-byte dest rows loaded above, widened from u8 as they are added | |
220 vaddw.u8 q8, q8, d26 | |
221 vaddw.u8 q9, q9, d27 | |
222 | |
223 ; clip_pixel: saturating narrow from signed 16-bit to unsigned 8-bit | |
224 vqmovun.s16 d26, q8 | |
225 vqmovun.s16 d27, q9 | |
226 | |
227 ; do the stores in reverse order with negative post-increment, by changing | |
228 ; the sign of the stride (r1 still points at the last row: the final load above did not post-increment) | |
229 rsb r2, r2, #0 | |
230 vst1.32 {d27[1]}, [r1], r2 | |
231 vst1.32 {d27[0]}, [r1], r2 | |
232 vst1.32 {d26[1]}, [r1], r2 | |
233 vst1.32 {d26[0]}, [r1] ; no post-increment | |
234 bx lr | |
235 ENDP ; |vp9_iht4x4_16_add_neon| | |
236 | |
237 END | |
OLD | NEW |