;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

;TODO(cd): adjust these constants to be able to use vqdmulh for faster
; dct_const_round_shift(a * b) within butterfly calculations.
cospi_1_64  EQU 16364
cospi_2_64  EQU 16305
cospi_3_64  EQU 16207
cospi_4_64  EQU 16069
cospi_5_64  EQU 15893
cospi_6_64  EQU 15679
cospi_7_64  EQU 15426
cospi_8_64  EQU 15137
cospi_9_64  EQU 14811
cospi_10_64 EQU 14449
cospi_11_64 EQU 14053
cospi_12_64 EQU 13623
cospi_13_64 EQU 13160
cospi_14_64 EQU 12665
cospi_15_64 EQU 12140
cospi_16_64 EQU 11585
cospi_17_64 EQU 11003
cospi_18_64 EQU 10394
cospi_19_64 EQU  9760
cospi_20_64 EQU  9102
cospi_21_64 EQU  8423
cospi_22_64 EQU  7723
cospi_23_64 EQU  7005
cospi_24_64 EQU  6270
cospi_25_64 EQU  5520
cospi_26_64 EQU  4756
cospi_27_64 EQU  3981
cospi_28_64 EQU  3196
cospi_29_64 EQU  2404
cospi_30_64 EQU  1606
cospi_31_64 EQU   804
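
; The constants above are 14-bit fixed-point DCT twiddle factors, i.e.
; cospi_k_64 == round(cos(k * pi / 64) * 2^14). A minimal sketch of how such
; a table could be regenerated (hypothetical helper, not part of this file):
;   #include <math.h>
;   /* cospi(16) == 11585, matching cospi_16_64 above */
;   int cospi(int k) { return (int)floor(cos(k * M_PI / 64) * 16384.0 + 0.5); }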


    EXPORT |vp9_idct32x32_1024_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

    AREA Block, CODE, READONLY

    ; --------------------------------------------------------------------------
    ; Load from transposed_buffer
    ;   q14 = transposed_buffer[first_offset]
    ;   q13 = transposed_buffer[second_offset]
    ; for proper address calculation, the last offset used when manipulating
    ; transposed_buffer must be passed in. use 0 for first use.
    MACRO
    LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
    ; address calculation with proper stride and loading
    add r0, #($first_offset - $prev_offset )*8*2
    vld1.s16 {q14}, [r0]
    add r0, #($second_offset - $first_offset)*8*2
    vld1.s16 {q13}, [r0]
    ; (used) two registers (q14, q13)
    MEND
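    ; Note on the stride arithmetic above: each row of transposed_buffer holds
    ; eight int16_t coefficients, so consecutive row indices are 8*2 = 16 bytes
    ; apart. Passing the previous offset lets the macro advance relatively,
    ; e.g. LOAD_FROM_TRANSPOSED 0, 1, 31 adds (1-0)*16 = 16 bytes, then
    ; (31-1)*16 = 480 bytes, instead of recomputing absolute addresses.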
    ; --------------------------------------------------------------------------
    ; Load from output (used as temporary storage)
    ;   reg1 = output[first_offset]
    ;   reg2 = output[second_offset]
    ; for proper address calculation, the last offset used when manipulating
    ; output (whether reading or storing) must be passed in. use 0 for first
    ; use.
    MACRO
    LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
    ; address calculation with proper stride and loading
    add r1, #($first_offset - $prev_offset )*32*2
    vld1.s16 {$reg1}, [r1]
    add r1, #($second_offset - $first_offset)*32*2
    vld1.s16 {$reg2}, [r1]
    ; (used) two registers ($reg1, $reg2)
    MEND
    ; --------------------------------------------------------------------------
    ; Store into output (sometimes as temporary storage)
    ;   output[first_offset] = reg1
    ;   output[second_offset] = reg2
    ; for proper address calculation, the last offset used when manipulating
    ; output (whether reading or storing) must be passed in. use 0 for first
    ; use.
    MACRO
    STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
    ; address calculation with proper stride and storing
    add r1, #($first_offset - $prev_offset )*32*2
    vst1.16 {$reg1}, [r1]
    add r1, #($second_offset - $first_offset)*32*2
    vst1.16 {$reg2}, [r1]
    MEND
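    ; Rows of the output buffer hold 32 int16_t coefficients, so the offsets
    ; above scale by 32*2 = 64 bytes per row, mirroring the 8*2 = 16 bytes per
    ; row of transpose_buffer in LOAD_FROM_TRANSPOSED.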
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ; q6-q9 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_CENTER_RESULTS
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d8}, [r10], r2
    vld1.s16 {d11}, [r9], r11
    vld1.s16 {d9}, [r10]
    vld1.s16 {d10}, [r9]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q7, q7, #6
    vrshr.s16 q8, q8, #6
    vrshr.s16 q9, q9, #6
    vrshr.s16 q6, q6, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q7, q7, d9
    vaddw.u8 q8, q8, d10
    vaddw.u8 q9, q9, d11
    vaddw.u8 q6, q6, d8
    ; clip pixel
    vqmovun.s16 d9, q7
    vqmovun.s16 d10, q8
    vqmovun.s16 d11, q9
    vqmovun.s16 d8, q6
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d9}, [r10], r11
    vst1.16 {d10}, [r9], r2
    vst1.16 {d8}, [r10]
    vst1.16 {d11}, [r9]
    ; update pointers (by dest_stride * 2)
    sub r9, r9, r2, lsl #1
    add r10, r10, r2, lsl #1
    MEND
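    ; In C terms, each STORE_COMBINE_* macro performs, per 8-pixel chunk, a
    ; sketch of the scalar equivalent ('out'/'dest'/'j' names follow the
    ; comments above):
    ;   // ROUND_POWER_OF_TWO(x, 6) == (x + 32) >> 6, done by vrshr.s16 #6
    ;   for (i = 0; i < 8; i++) {
    ;     int v = ((out[j * 32 + i] + 32) >> 6) + dest[j * dest_stride + i];
    ;     dest[j * dest_stride + i] = v < 0 ? 0 : (v > 255 ? 255 : v); // vqmovun
    ;   }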
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ; q6-q9 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_CENTER_RESULTS_LAST
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d8}, [r10], r2
    vld1.s16 {d11}, [r9], r11
    vld1.s16 {d9}, [r10]
    vld1.s16 {d10}, [r9]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q7, q7, #6
    vrshr.s16 q8, q8, #6
    vrshr.s16 q9, q9, #6
    vrshr.s16 q6, q6, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q7, q7, d9
    vaddw.u8 q8, q8, d10
    vaddw.u8 q9, q9, d11
    vaddw.u8 q6, q6, d8
    ; clip pixel
    vqmovun.s16 d9, q7
    vqmovun.s16 d10, q8
    vqmovun.s16 d11, q9
    vqmovun.s16 d8, q6
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d9}, [r10], r11
    vst1.16 {d10}, [r9], r2
    vst1.16 {d8}, [r10]!
    vst1.16 {d11}, [r9]!
    ; update pointers (by dest_stride * 2)
    sub r9, r9, r2, lsl #1
    add r10, r10, r2, lsl #1
    MEND
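    ; The only difference from STORE_COMBINE_CENTER_RESULTS is the writeback
    ; (!) on the final two stores, which advances r9/r10 by 8 bytes to the
    ; next band of eight columns (see the pointer-restore comments near the
    ; end of the second pass).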
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ; q4-q7 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_EXTREME_RESULTS
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d4}, [r7], r2
    vld1.s16 {d7}, [r6], r11
    vld1.s16 {d5}, [r7]
    vld1.s16 {d6}, [r6]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q5, q5, #6
    vrshr.s16 q6, q6, #6
    vrshr.s16 q7, q7, #6
    vrshr.s16 q4, q4, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q5, q5, d5
    vaddw.u8 q6, q6, d6
    vaddw.u8 q7, q7, d7
    vaddw.u8 q4, q4, d4
    ; clip pixel
    vqmovun.s16 d5, q5
    vqmovun.s16 d6, q6
    vqmovun.s16 d7, q7
    vqmovun.s16 d4, q4
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d5}, [r7], r11
    vst1.16 {d6}, [r6], r2
    vst1.16 {d7}, [r6]
    vst1.16 {d4}, [r7]
    ; update pointers (by dest_stride * 2)
    sub r6, r6, r2, lsl #1
    add r7, r7, r2, lsl #1
    MEND
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ; q4-q7 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_EXTREME_RESULTS_LAST
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d4}, [r7], r2
    vld1.s16 {d7}, [r6], r11
    vld1.s16 {d5}, [r7]
    vld1.s16 {d6}, [r6]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q5, q5, #6
    vrshr.s16 q6, q6, #6
    vrshr.s16 q7, q7, #6
    vrshr.s16 q4, q4, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q5, q5, d5
    vaddw.u8 q6, q6, d6
    vaddw.u8 q7, q7, d7
    vaddw.u8 q4, q4, d4
    ; clip pixel
    vqmovun.s16 d5, q5
    vqmovun.s16 d6, q6
    vqmovun.s16 d7, q7
    vqmovun.s16 d4, q4
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d5}, [r7], r11
    vst1.16 {d6}, [r6], r2
    vst1.16 {d7}, [r6]!
    vst1.16 {d4}, [r7]!
    ; update pointers (by dest_stride * 2)
    sub r6, r6, r2, lsl #1
    add r7, r7, r2, lsl #1
    MEND
    ; --------------------------------------------------------------------------
    ; Touches q8-q12, q15 (q13-q14 are preserved)
    ; valid output registers are anything but q8-q11
    MACRO
    DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    ; TODO(cd): have special case to re-use constants when they are similar for
    ; consecutive butterflies
    ; TODO(cd): have special case when both constants are the same, do the
    ; additions/subtractions before the multiplies.
    ; generate the constants
    ; generate scalar constants
    mov r8, #$first_constant & 0xFF00
    mov r12, #$second_constant & 0xFF00
    add r8, #$first_constant & 0x00FF
    add r12, #$second_constant & 0x00FF
    ; generate vector constants
    vdup.16 d30, r8
    vdup.16 d31, r12
    ; (used) two for inputs (regA-regD), one for constants (q15)
    ; do some multiplications (ordered for maximum latency hiding)
    vmull.s16 q8, $regC, d30
    vmull.s16 q10, $regA, d31
    vmull.s16 q9, $regD, d30
    vmull.s16 q11, $regB, d31
    vmull.s16 q12, $regC, d31
    ; (used) five for intermediate (q8-q12), one for constants (q15)
    ; do some additions/subtractions (to get back to two registers)
    vsub.s32 q8, q8, q10
    vsub.s32 q9, q9, q11
    ; do more multiplications (ordered for maximum latency hiding)
    vmull.s16 q10, $regD, d31
    vmull.s16 q11, $regA, d30
    vmull.s16 q15, $regB, d30
    ; (used) six for intermediate (q8-q12, q15)
    ; do more additions/subtractions
    vadd.s32 q11, q12, q11
    vadd.s32 q10, q10, q15
    ; (used) four for intermediate (q8-q11)
    ; dct_const_round_shift
    vqrshrn.s32 $reg1, q8, #14
    vqrshrn.s32 $reg2, q9, #14
    vqrshrn.s32 $reg3, q11, #14
    vqrshrn.s32 $reg4, q10, #14
    ; (used) two q registers for results (i.e., four d registers)
    MEND
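    ; A sketch of the butterfly in C, matching the pseudocode comments used
    ; throughout this file (in/out names are illustrative):
    ;   // dct_const_round_shift(x) == (x + (1 << 13)) >> 14, saturated to
    ;   // int16_t; vqrshrn.s32 #14 performs exactly this rounded narrowing.
    ;   temp1 = in_a * first_constant - in_b * second_constant;
    ;   temp2 = in_a * second_constant + in_b * first_constant;
    ;   out1  = dct_const_round_shift(temp1);
    ;   out2  = dct_const_round_shift(temp2);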
    ; --------------------------------------------------------------------------
    ; Touches q8-q12, q15 (q13-q14 are preserved)
    ; valid output registers are anything but q8-q11
    MACRO
    DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    MEND
    ; --------------------------------------------------------------------------

;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
;
;   r0  int16_t *input,
;   r1  uint8_t *dest,
;   r2  int dest_stride)
; loop counters
;   r4  bands loop counter
;   r5  pass loop counter
;   r8  transpose loop counter
; combine-add pointers
;   r6  dest + 31 * dest_stride, descending (30, 29, 28, ...)
;   r7  dest +  0 * dest_stride, ascending (1, 2, 3, ...)
;   r9  dest + 15 * dest_stride, descending (14, 13, 12, ...)
;   r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...)

|vp9_idct32x32_1024_add_neon| PROC
    ; This function does one pass of idct32x32 transform.
    ;
    ; This is done by transposing the input and then doing a 1d transform on
    ; columns. In the first pass, the transposed columns are the original
    ; rows. In the second pass, after the transposition, the columns are the
    ; original columns.
    ; The 1d transform is done by looping over bands of eight columns (the
    ; idct32_bands loop). For each band, the transform input transposition
    ; is done on demand, one band of four 8x8 matrices at a time. The four
    ; matrices are transposed by pairs (the idct32_transpose_pair loop).
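    ; A high-level sketch of the control flow below (C-style pseudocode, names
    ; are illustrative):
    ;   for (pass = 0; pass < 2; pass++) {          // idct32_pass_loop
    ;     for (band = 0; band < 4; band++) {        // idct32_bands_loop
    ;       // 2 iterations of idct32_transpose_pair_loop = 4 8x8 matrices
    ;       transpose the band into transpose_buffer;
    ;       run the 32-point 1d idct on the 8 transposed columns;
    ;     }
    ;     input = pass1 result; output = pass2 buffer;
    ;   }
    ;   // pass 2 combine-adds its results into dest instead of storing them.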
    push {r4-r11}
    vpush {d8-d15}
    ; stack operation
    ; internal buffer into which 8 lines are transposed before transforming them
    ; int16_t transpose_buffer[32 * 8];
    ; at sp + [4096, 4607]
    ; results of the first pass (transpose and transform rows)
    ; int16_t pass1[32 * 32];
    ; at sp + [0, 2047]
    ; results of the second pass (transpose and transform columns)
    ; int16_t pass2[32 * 32];
    ; at sp + [2048, 4095]
    sub sp, sp, #512+2048+2048

    ; r6  = dest + 31 * dest_stride
    ; r7  = dest +  0 * dest_stride
    ; r9  = dest + 15 * dest_stride
    ; r10 = dest + 16 * dest_stride
    rsb r6, r2, r2, lsl #5
    rsb r9, r2, r2, lsl #4
    add r10, r1, r2, lsl #4
    mov r7, r1
    add r6, r6, r1
    add r9, r9, r1
    ; r11 = -dest_stride
    neg r11, r2
    ; r3 = input
    mov r3, r0
    ; parameters for first pass
    ; r0 = transpose_buffer[32 * 8]
    add r0, sp, #4096
    ; r1 = pass1[32 * 32]
    mov r1, sp

    mov r5, #0 ; initialize pass loop counter
idct32_pass_loop
    mov r4, #4 ; initialize bands loop counter
idct32_bands_loop
    mov r8, #2 ; initialize transpose loop counter
idct32_transpose_pair_loop
    ; Load two horizontally consecutive 8x8 16bit data matrices. The first one
    ; into q0-q7 and the second one into q8-q15. There is a stride of 64,
    ; adjusted to 32 because of the two post-increments.
    vld1.s16 {q8}, [r3]!
    vld1.s16 {q0}, [r3]!
    add r3, #32
    vld1.s16 {q9}, [r3]!
    vld1.s16 {q1}, [r3]!
    add r3, #32
    vld1.s16 {q10}, [r3]!
    vld1.s16 {q2}, [r3]!
    add r3, #32
    vld1.s16 {q11}, [r3]!
    vld1.s16 {q3}, [r3]!
    add r3, #32
    vld1.s16 {q12}, [r3]!
    vld1.s16 {q4}, [r3]!
    add r3, #32
    vld1.s16 {q13}, [r3]!
    vld1.s16 {q5}, [r3]!
    add r3, #32
    vld1.s16 {q14}, [r3]!
    vld1.s16 {q6}, [r3]!
    add r3, #32
    vld1.s16 {q15}, [r3]!
    vld1.s16 {q7}, [r3]!

    ; Transpose the two 8x8 16bit data matrices.
    vswp d17, d24
    vswp d23, d30
    vswp d21, d28
    vswp d19, d26
    vswp d1, d8
    vswp d7, d14
    vswp d5, d12
    vswp d3, d10
    vtrn.32 q8, q10
    vtrn.32 q9, q11
    vtrn.32 q12, q14
    vtrn.32 q13, q15
    vtrn.32 q0, q2
    vtrn.32 q1, q3
    vtrn.32 q4, q6
    vtrn.32 q5, q7
    vtrn.16 q8, q9
    vtrn.16 q10, q11
    vtrn.16 q12, q13
    vtrn.16 q14, q15
    vtrn.16 q0, q1
    vtrn.16 q2, q3
    vtrn.16 q4, q5
    vtrn.16 q6, q7
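    ; The 8x8 16-bit transpose above works hierarchically: vswp exchanges
    ; 64-bit halves (4x4 quadrant swap), vtrn.32 transposes 2x2 blocks of
    ; 32-bit pairs, and vtrn.16 finishes with adjacent 16-bit elements.
    ; In C, the net effect per matrix is simply (a sketch):
    ;   for (i = 0; i < 8; i++)
    ;     for (j = 0; j < 8; j++)
    ;       out[j * 8 + i] = in[i * 8 + j];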

    ; Store both matrices after each other. There is a stride of 32, which
    ; adjusts to nothing because of the post-increments.
    vst1.16 {q8}, [r0]!
    vst1.16 {q9}, [r0]!
    vst1.16 {q10}, [r0]!
    vst1.16 {q11}, [r0]!
    vst1.16 {q12}, [r0]!
    vst1.16 {q13}, [r0]!
    vst1.16 {q14}, [r0]!
    vst1.16 {q15}, [r0]!
    vst1.16 {q0}, [r0]!
    vst1.16 {q1}, [r0]!
    vst1.16 {q2}, [r0]!
    vst1.16 {q3}, [r0]!
    vst1.16 {q4}, [r0]!
    vst1.16 {q5}, [r0]!
    vst1.16 {q6}, [r0]!
    vst1.16 {q7}, [r0]!

    ; increment pointers by adjusted stride (not necessary for r0/out)
    ; go back by 7*32 for the seven lines moved fully by read and add
    ; go back by 32 for the eighth line only read
    ; advance by 16*2 to go to the next pair
    sub r3, r3, #7*32*2 + 32 - 16*2
    ; transpose pair loop processing
    subs r8, r8, #1
    bne idct32_transpose_pair_loop

    ; restore r0/input to its original value
    sub r0, r0, #32*8*2

    ; Instead of doing the transforms stage by stage, it is done by loading
    ; some input values and doing as many stages as possible to minimize the
    ; storing/loading of intermediate results. To fit within registers, the
    ; final coefficients are cut into four blocks:
    ; BLOCK A: 16-19,28-31
    ; BLOCK B: 20-23,24-27
    ; BLOCK C: 8-10,11-15
    ; BLOCK D: 0-3,4-7
    ; Blocks A and C are straight calculation through the various stages. In
    ; block B, further calculations are performed using the results from
    ; block A. In block D, further calculations are performed using the results
    ; from block C and then the final calculations are done using results from
    ; blocks A and B which have been combined at the end of block B.

    ; --------------------------------------------------------------------------
    ; BLOCK A: 16-19,28-31
    ; --------------------------------------------------------------------------
    ; generate 16,17,30,31
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64;
    ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64;
    ;step1b[16][i] = dct_const_round_shift(temp1);
    ;step1b[31][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 0, 1, 31
    DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
    ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
    ;step1b[17][i] = dct_const_round_shift(temp1);
    ;step1b[30][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 31, 17, 15
    DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[16] = step1b[16][i] + step1b[17][i];
    ;step2[17] = step1b[16][i] - step1b[17][i];
    ;step2[30] = -step1b[30][i] + step1b[31][i];
    ;step2[31] = step1b[30][i] + step1b[31][i];
    vadd.s16 q4, q0, q1
    vsub.s16 q13, q0, q1
    vadd.s16 q6, q2, q3
    vsub.s16 q14, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
    ;temp2 = step1b[30][i] * cospi_4_64 + step1b[17][i] * cospi_28_64;
    ;step3[17] = dct_const_round_shift(temp1);
    ;step3[30] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; generate 18,19,28,29
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
    ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64;
    ;step1b[18][i] = dct_const_round_shift(temp1);
    ;step1b[29][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 15, 9, 23
    DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64;
    ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
    ;step1b[19][i] = dct_const_round_shift(temp1);
    ;step1b[28][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 23, 25, 7
    DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[18] = -step1b[18][i] + step1b[19][i];
    ;step2[19] = step1b[18][i] + step1b[19][i];
    ;step2[28] = step1b[28][i] + step1b[29][i];
    ;step2[29] = step1b[28][i] - step1b[29][i];
    vsub.s16 q13, q3, q2
    vadd.s16 q3, q3, q2
    vsub.s16 q14, q1, q0
    vadd.s16 q2, q1, q0
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64);
    ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
    ;step3[29] = dct_const_round_shift(temp1);
    ;step3[18] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
    ; --------------------------------------------------------------------------
    ; combine 16-19,28-31
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[16] = step1b[16][i] + step1b[19][i];
    ;step1[17] = step1b[17][i] + step1b[18][i];
    ;step1[18] = step1b[17][i] - step1b[18][i];
    ;step1[29] = step1b[30][i] - step1b[29][i];
    ;step1[30] = step1b[30][i] + step1b[29][i];
    ;step1[31] = step1b[31][i] + step1b[28][i];
    vadd.s16 q8, q4, q2
    vadd.s16 q9, q5, q0
    vadd.s16 q10, q7, q1
    vadd.s16 q15, q6, q3
    vsub.s16 q13, q5, q0
    vsub.s16 q14, q7, q1
    STORE_IN_OUTPUT 0, 16, 31, q8, q15
    STORE_IN_OUTPUT 31, 17, 30, q9, q10
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
    ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64;
    ;step2[18] = dct_const_round_shift(temp1);
    ;step2[29] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
    STORE_IN_OUTPUT 30, 29, 18, q1, q0
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[19] = step1b[16][i] - step1b[19][i];
    ;step1[28] = step1b[31][i] - step1b[28][i];
    vsub.s16 q13, q4, q2
    vsub.s16 q14, q6, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
    ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64;
    ;step2[19] = dct_const_round_shift(temp1);
    ;step2[28] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
    STORE_IN_OUTPUT 18, 19, 28, q4, q6
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK B: 20-23,24-27
    ; --------------------------------------------------------------------------
    ; generate 20,21,26,27
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
    ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64;
    ;step1b[20][i] = dct_const_round_shift(temp1);
    ;step1b[27][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 7, 5, 27
    DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
    ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
    ;step1b[21][i] = dct_const_round_shift(temp1);
    ;step1b[26][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 27, 21, 11
    DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[20] = step1b[20][i] + step1b[21][i];
    ;step2[21] = step1b[20][i] - step1b[21][i];
    ;step2[26] = -step1b[26][i] + step1b[27][i];
    ;step2[27] = step1b[26][i] + step1b[27][i];
    vsub.s16 q13, q0, q1
    vadd.s16 q0, q0, q1
    vsub.s16 q14, q2, q3
    vadd.s16 q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
    ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
    ;step3[21] = dct_const_round_shift(temp1);
    ;step3[26] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 22,23,24,25
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
    ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
    ;step1b[22][i] = dct_const_round_shift(temp1);
    ;step1b[25][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 11, 13, 19
    DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64;
    ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
    ;step1b[23][i] = dct_const_round_shift(temp1);
    ;step1b[24][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 19, 29, 3
    DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[22] = -step1b[22][i] + step1b[23][i];
    ;step2[23] = step1b[22][i] + step1b[23][i];
    ;step2[24] = step1b[24][i] + step1b[25][i];
    ;step2[25] = step1b[24][i] - step1b[25][i];
    vsub.s16 q14, q4, q5
    vadd.s16 q5, q4, q5
    vsub.s16 q13, q6, q7
    vadd.s16 q6, q6, q7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
    ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
    ;step3[25] = dct_const_round_shift(temp1);
    ;step3[22] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
    ; --------------------------------------------------------------------------
    ; combine 20-23,24-27
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[22] = step1b[22][i] + step1b[21][i];
    ;step1[23] = step1b[23][i] + step1b[20][i];
    vadd.s16 q10, q7, q1
    vadd.s16 q11, q5, q0
    ;step1[24] = step1b[24][i] + step1b[27][i];
    ;step1[25] = step1b[25][i] + step1b[26][i];
    vadd.s16 q12, q6, q2
    vadd.s16 q15, q4, q3
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[16] = step1b[16][i] + step1b[23][i];
    ;step3[17] = step1b[17][i] + step1b[22][i];
    ;step3[22] = step1b[17][i] - step1b[22][i];
    ;step3[23] = step1b[16][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
    vadd.s16 q8, q14, q11
    vadd.s16 q9, q13, q10
    vsub.s16 q13, q13, q10
    vsub.s16 q11, q14, q11
    STORE_IN_OUTPUT 17, 17, 16, q9, q8
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[24] = step1b[31][i] - step1b[24][i];
    ;step3[25] = step1b[30][i] - step1b[25][i];
    ;step3[30] = step1b[30][i] + step1b[25][i];
    ;step3[31] = step1b[31][i] + step1b[24][i];
    LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
    vsub.s16 q8, q9, q12
    vadd.s16 q10, q14, q15
    vsub.s16 q14, q14, q15
    vadd.s16 q12, q9, q12
    STORE_IN_OUTPUT 31, 30, 31, q10, q12
    ; --------------------------------------------------------------------------
    ; TODO(cd) do some register allocation change to remove these push/pop
    vpush {q8} ; [24]
    vpush {q11} ; [23]
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
    ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
    ;step1[22] = dct_const_round_shift(temp1);
    ;step1[25] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 31, 25, 22, q14, q13
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
    ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
    ;step1[23] = dct_const_round_shift(temp1);
    ;step1[24] = dct_const_round_shift(temp2);
    ; TODO(cd) do some register allocation change to remove these push/pop
    vpop {q13} ; [23]
    vpop {q14} ; [24]
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 22, 24, 23, q14, q13
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[20] = step1b[23][i] - step1b[20][i];
    ;step1[27] = step1b[24][i] - step1b[27][i];
    vsub.s16 q14, q5, q0
    vsub.s16 q13, q6, q2
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64);
    ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
    ;step2[27] = dct_const_round_shift(temp1);
    ;step2[20] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[21] = step1b[22][i] - step1b[21][i];
    ;step1[26] = step1b[25][i] - step1b[26][i];
    vsub.s16 q14, q7, q1
    vsub.s16 q13, q4, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64);
    ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
    ;step2[26] = dct_const_round_shift(temp1);
    ;step2[21] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[18] = step1b[18][i] + step1b[21][i];
    ;step3[19] = step1b[19][i] + step1b[20][i];
    ;step3[20] = step1b[19][i] - step1b[20][i];
    ;step3[21] = step1b[18][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
    vadd.s16 q8, q14, q1
    vadd.s16 q9, q13, q6
    vsub.s16 q13, q13, q6
    vsub.s16 q1, q14, q1
    STORE_IN_OUTPUT 19, 18, 19, q8, q9
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[27] = step1b[28][i] - step1b[27][i];
    ;step3[28] = step1b[28][i] + step1b[27][i];
    ;step3[29] = step1b[29][i] + step1b[26][i];
    ;step3[26] = step1b[29][i] - step1b[26][i];
    LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
    vsub.s16 q14, q8, q5
    vadd.s16 q10, q8, q5
    vadd.s16 q11, q9, q0
    vsub.s16 q0, q9, q0
    STORE_IN_OUTPUT 29, 28, 29, q10, q11
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
    ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
    ;step1[20] = dct_const_round_shift(temp1);
    ;step1[27] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 29, 20, 27, q13, q14
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
    ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
    ;step1[21] = dct_const_round_shift(temp1);
    ;step1[26] = dct_const_round_shift(temp2);
    DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
    STORE_IN_OUTPUT 27, 21, 26, q1, q0
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK C: 8-10,11-15
    ; --------------------------------------------------------------------------
    ; generate 8,9,14,15
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
    ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
    ;step2[8] = dct_const_round_shift(temp1);
    ;step2[15] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 3, 2, 30
    DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
    ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
    ;step2[9] = dct_const_round_shift(temp1);
    ;step2[14] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 30, 18, 14
    DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;step3[8] = step1b[8][i] + step1b[9][i];
    ;step3[9] = step1b[8][i] - step1b[9][i];
    ;step3[14] = step1b[15][i] - step1b[14][i];
    ;step3[15] = step1b[15][i] + step1b[14][i];
    vsub.s16 q13, q0, q1
    vadd.s16 q0, q0, q1
    vsub.s16 q14, q2, q3
    vadd.s16 q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
    ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64;
    ;step1[9] = dct_const_round_shift(temp1);
    ;step1[14] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 10,11,12,13
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
    ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
    ;step2[10] = dct_const_round_shift(temp1);
    ;step2[13] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 14, 10, 22
    DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
    ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
    ;step2[11] = dct_const_round_shift(temp1);
    ;step2[12] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 22, 26, 6
    DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;step3[10] = step1b[11][i] - step1b[10][i];
    ;step3[11] = step1b[11][i] + step1b[10][i];
    ;step3[12] = step1b[12][i] + step1b[13][i];
    ;step3[13] = step1b[12][i] - step1b[13][i];
    vsub.s16 q14, q4, q5
    vadd.s16 q5, q4, q5
    vsub.s16 q13, q6, q7
    vadd.s16 q6, q6, q7
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64);
    ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
    ;step1[13] = dct_const_round_shift(temp1);
    ;step1[10] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
    ; --------------------------------------------------------------------------
    ; combine 8-10,11-15
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[8] = step1b[8][i] + step1b[11][i];
    ;step2[9] = step1b[9][i] + step1b[10][i];
    ;step2[10] = step1b[9][i] - step1b[10][i];
    vadd.s16 q8, q0, q5
    vadd.s16 q9, q1, q7
    vsub.s16 q13, q1, q7
    ;step2[13] = step1b[14][i] - step1b[13][i];
    ;step2[14] = step1b[14][i] + step1b[13][i];
    ;step2[15] = step1b[15][i] + step1b[12][i];
    vsub.s16 q14, q3, q4
    vadd.s16 q10, q3, q4
    vadd.s16 q15, q2, q6
    STORE_IN_OUTPUT 26, 8, 15, q8, q15
    STORE_IN_OUTPUT 15, 9, 14, q9, q10
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
    ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
    ;step3[10] = dct_const_round_shift(temp1);
    ;step3[13] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    STORE_IN_OUTPUT 14, 13, 10, q3, q1
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[11] = step1b[8][i] - step1b[11][i];
    ;step2[12] = step1b[15][i] - step1b[12][i];
    vsub.s16 q13, q0, q5
    vsub.s16 q14, q2, q6
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
    ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
    ;step3[11] = dct_const_round_shift(temp1);
    ;step3[12] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    STORE_IN_OUTPUT 10, 11, 12, q1, q3
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK D: 0-3,4-7
    ; --------------------------------------------------------------------------
    ; generate 4,5,6,7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
    ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
    ;step3[4] = dct_const_round_shift(temp1);
    ;step3[7] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 6, 4, 28
    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
    ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
    ;step3[5] = dct_const_round_shift(temp1);
    ;step3[6] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 28, 20, 12
    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[4] = step1b[4][i] + step1b[5][i];
    ;step1[5] = step1b[4][i] - step1b[5][i];
    ;step1[6] = step1b[7][i] - step1b[6][i];
    ;step1[7] = step1b[7][i] + step1b[6][i];
    vsub.s16 q13, q0, q1
    vadd.s16 q0, q0, q1
    vsub.s16 q14, q2, q3
    vadd.s16 q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
    ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
    ;step2[5] = dct_const_round_shift(temp1);
    ;step2[6] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 0,1,2,3
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
    ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
    ;step1[1] = dct_const_round_shift(temp1);
    ;step1[0] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 12, 0, 16
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
    ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
    ;step1[2] = dct_const_round_shift(temp1);
    ;step1[3] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 16, 8, 24
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[0] = step1b[0][i] + step1b[3][i];
    ;step2[1] = step1b[1][i] + step1b[2][i];
    ;step2[2] = step1b[1][i] - step1b[2][i];
    ;step2[3] = step1b[0][i] - step1b[3][i];
    vadd.s16 q4, q7, q6
    vsub.s16 q7, q7, q6
    vsub.s16 q6, q5, q14
    vadd.s16 q5, q5, q14
    ; --------------------------------------------------------------------------
    ; combine 0-3,4-7
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[0] = step1b[0][i] + step1b[7][i];
    ;step3[1] = step1b[1][i] + step1b[6][i];
    ;step3[2] = step1b[2][i] + step1b[5][i];
    ;step3[3] = step1b[3][i] + step1b[4][i];
    vadd.s16 q8, q4, q2
    vadd.s16 q9, q5, q3
    vadd.s16 q10, q6, q1
    vadd.s16 q11, q7, q0
    ;step3[4] = step1b[3][i] - step1b[4][i];
    ;step3[5] = step1b[2][i] - step1b[5][i];
    ;step3[6] = step1b[1][i] - step1b[6][i];
    ;step3[7] = step1b[0][i] - step1b[7][i];
    vsub.s16 q12, q7, q0
    vsub.s16 q13, q6, q1
    vsub.s16 q14, q5, q3
    vsub.s16 q15, q4, q2
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[0] = step1b[0][i] + step1b[15][i];
    ;step1[1] = step1b[1][i] + step1b[14][i];
    ;step1[14] = step1b[1][i] - step1b[14][i];
    ;step1[15] = step1b[0][i] - step1b[15][i];
    LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
    vadd.s16 q2, q8, q1
    vadd.s16 q3, q9, q0
    vsub.s16 q4, q9, q0
    vsub.s16 q5, q8, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[14 * 32] = step1b[14][i] + step1b[17][i];
    ;output[15 * 32] = step1b[15][i] + step1b[16][i];
    ;output[16 * 32] = step1b[15][i] - step1b[16][i];
    ;output[17 * 32] = step1b[14][i] - step1b[17][i];
    LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1

    cmp r5, #0
    bgt idct32_bands_end_2nd_pass

idct32_bands_end_1st_pass
    STORE_IN_OUTPUT 17, 16, 17, q6, q7
    STORE_IN_OUTPUT 17, 14, 15, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
    LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 31, 30, 31, q6, q7
    STORE_IN_OUTPUT 31, 0, 1, q4, q5
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[2] = step1b[2][i] + step1b[13][i];
    ;step1[3] = step1b[3][i] + step1b[12][i];
    ;step1[12] = step1b[3][i] - step1b[12][i];
    ;step1[13] = step1b[2][i] - step1b[13][i];
    LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
    vadd.s16 q2, q10, q1
    vadd.s16 q3, q11, q0
    vsub.s16 q4, q11, q0
    vsub.s16 q5, q10, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_IN_OUTPUT 19, 18, 19, q6, q7
    STORE_IN_OUTPUT 19, 12, 13, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
    LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 29, 28, 29, q6, q7
    STORE_IN_OUTPUT 29, 2, 3, q4, q5
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[4] = step1b[4][i] + step1b[11][i];
    ;step1[5] = step1b[5][i] + step1b[10][i];
    ;step1[10] = step1b[5][i] - step1b[10][i];
    ;step1[11] = step1b[4][i] - step1b[11][i];
    LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
    vadd.s16 q2, q12, q1
    vadd.s16 q3, q13, q0
    vsub.s16 q4, q13, q0
    vsub.s16 q5, q12, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_IN_OUTPUT 21, 20, 21, q6, q7
    STORE_IN_OUTPUT 21, 10, 11, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
    LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 27, 26, 27, q6, q7
    STORE_IN_OUTPUT 27, 4, 5, q4, q5
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[6] = step1b[6][i] + step1b[9][i];
    ;step1[7] = step1b[7][i] + step1b[8][i];
    ;step1[8] = step1b[7][i] - step1b[8][i];
    ;step1[9] = step1b[6][i] - step1b[9][i];
    LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
    vadd.s16 q2, q14, q1
    vadd.s16 q3, q15, q0
    vsub.s16 q4, q15, q0
    vsub.s16 q5, q14, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_IN_OUTPUT 23, 22, 23, q6, q7
    STORE_IN_OUTPUT 23, 8, 9, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
    LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 25, 24, 25, q6, q7
    STORE_IN_OUTPUT 25, 6, 7, q4, q5

    ; restore r0 by removing the last offset from the last
    ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
    sub r0, r0, #24*8*2
    ; restore r1 by removing the last offset from the last
    ; operation (STORE_IN_OUTPUT 25, 6, 7) => 7*32*2
    ; advance by 8 columns => 8*2
    sub r1, r1, #7*32*2 - 8*2
    ; advance by 8 lines (8*32*2)
    ; go back by the two pairs from the loop (32*2)
    add r3, r3, #8*32*2 - 32*2

    ; bands loop processing
    subs r4, r4, #1
    bne idct32_bands_loop

    ; parameters for second pass
    ; the input of pass2 is the result of pass1. we have to remove the offset
    ; of 32 columns induced by the above idct32_bands_loop
    sub r3, r1, #32*2
    ; r1 = pass2[32 * 32]
    add r1, sp, #2048

    ; pass loop processing
    add r5, r5, #1
    b idct32_pass_loop

idct32_bands_end_2nd_pass
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
    LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[2] = step1b[2][i] + step1b[13][i];
    ;step1[3] = step1b[3][i] + step1b[12][i];
    ;step1[12] = step1b[3][i] - step1b[12][i];
    ;step1[13] = step1b[2][i] - step1b[13][i];
    LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
    vadd.s16 q2, q10, q1
    vadd.s16 q3, q11, q0
    vsub.s16 q4, q11, q0
    vsub.s16 q5, q10, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
    LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[4] = step1b[4][i] + step1b[11][i];
    ;step1[5] = step1b[5][i] + step1b[10][i];
    ;step1[10] = step1b[5][i] - step1b[10][i];
    ;step1[11] = step1b[4][i] - step1b[11][i];
    LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
    vadd.s16 q2, q12, q1
    vadd.s16 q3, q13, q0
    vsub.s16 q4, q13, q0
    vsub.s16 q5, q12, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
    LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[6] = step1b[6][i] + step1b[9][i];
    ;step1[7] = step1b[7][i] + step1b[8][i];
    ;step1[8] = step1b[7][i] - step1b[8][i];
    ;step1[9] = step1b[6][i] - step1b[9][i];
    LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
    vadd.s16 q2, q14, q1
    vadd.s16 q3, q15, q0
    vsub.s16 q4, q15, q0
    vsub.s16 q5, q14, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS_LAST
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
    LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS_LAST
    ; --------------------------------------------------------------------------
    ; restore pointers to their initial indices for next band pass by
    ; removing/adding dest_stride * 8. The actual increment by eight
    ; is taken care of within the _LAST macros.
    add r6, r6, r2, lsl #3
    add r9, r9, r2, lsl #3
    sub r7, r7, r2, lsl #3
    sub r10, r10, r2, lsl #3

    ; restore r0 by removing the last offset from the last
    ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
    sub r0, r0, #24*8*2
    ; restore r1 by removing the last offset from the last
    ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
    ; advance by 8 columns => 8*2
    sub r1, r1, #25*32*2 - 8*2
    ; advance by 8 lines (8*32*2)
    ; go back by the two pairs from the loop (32*2)
    add r3, r3, #8*32*2 - 32*2

    ; bands loop processing
    subs r4, r4, #1
    bne idct32_bands_loop

    ; stack operation
    add sp, sp, #512+2048+2048
    vpop {d8-d15}
    pop {r4-r11}
    bx lr
    ENDP ; |vp9_idct32x32_1024_add_neon|
    END