Side by Side Diff: source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm

Issue 812033011: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 5 years, 11 months ago
1 ;
2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11 ;TODO(cd): adjust these constants to be able to use vqdmulh for faster
12 ; dct_const_round_shift(a * b) within butterfly calculations.
13 cospi_1_64 EQU 16364
14 cospi_2_64 EQU 16305
15 cospi_3_64 EQU 16207
16 cospi_4_64 EQU 16069
17 cospi_5_64 EQU 15893
18 cospi_6_64 EQU 15679
19 cospi_7_64 EQU 15426
20 cospi_8_64 EQU 15137
21 cospi_9_64 EQU 14811
22 cospi_10_64 EQU 14449
23 cospi_11_64 EQU 14053
24 cospi_12_64 EQU 13623
25 cospi_13_64 EQU 13160
26 cospi_14_64 EQU 12665
27 cospi_15_64 EQU 12140
28 cospi_16_64 EQU 11585
29 cospi_17_64 EQU 11003
30 cospi_18_64 EQU 10394
31 cospi_19_64 EQU 9760
32 cospi_20_64 EQU 9102
33 cospi_21_64 EQU 8423
34 cospi_22_64 EQU 7723
35 cospi_23_64 EQU 7005
36 cospi_24_64 EQU 6270
37 cospi_25_64 EQU 5520
38 cospi_26_64 EQU 4756
39 cospi_27_64 EQU 3981
40 cospi_28_64 EQU 3196
41 cospi_29_64 EQU 2404
42 cospi_30_64 EQU 1606
43 cospi_31_64 EQU 804
44
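; For reference, each cospi_k_64 above is round(cos(k * pi / 64) * 2^14),
; and dct_const_round_shift() is a rounding right shift by 14 bits. A
; minimal C sketch that reproduces the table and the rounding step
; (illustrative names, not part of this file):
;   #include <math.h>
;   #include <stdint.h>
;   #define DCT_CONST_BITS 14
;   static int16_t cospi_k_64(int k) {           /* k in [1, 31] */
;     return (int16_t)lrint(cos(k * M_PI / 64.0) * (1 << DCT_CONST_BITS));
;   }
;   static int16_t dct_const_round_shift(int32_t input) {
;     /* same rounding as the vqrshrn.s32 #14 instructions used below */
;     return (int16_t)((input + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
;   }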
45
46 EXPORT |vp9_idct32x32_1024_add_neon|
47 ARM
48 REQUIRE8
49 PRESERVE8
50
51 AREA ||.text||, CODE, READONLY, ALIGN=2
52
53 AREA Block, CODE, READONLY
54
55 ; --------------------------------------------------------------------------
56 ; Load from transposed_buffer
57 ; q13 = transposed_buffer[first_offset]
58 ; q14 = transposed_buffer[second_offset]
59 ; for proper address calculation, the last offset used when manipulating
60 ; transposed_buffer must be passed in. Use 0 for the first use.
61 MACRO
62 LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
63 ; address calculation with proper stride and loading
64 add r0, #($first_offset - $prev_offset )*8*2
65 vld1.s16 {q14}, [r0]
66 add r0, #($second_offset - $first_offset)*8*2
67 vld1.s16 {q13}, [r0]
68 ; (used) two registers (q14, q13)
69 MEND
70 ; --------------------------------------------------------------------------
71 ; Load from output (used as temporary storage)
72 ; reg1 = output[first_offset]
73 ; reg2 = output[second_offset]
74 ; for proper address calculation, the last offset used when manipulating
75 ; output (whether reading or storing) must be passed in. Use 0 for the
76 ; first use.
77 MACRO
78 LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
79 ; address calculation with proper stride and loading
80 add r1, #($first_offset - $prev_offset )*32*2
81 vld1.s16 {$reg1}, [r1]
82 add r1, #($second_offset - $first_offset)*32*2
83 vld1.s16 {$reg2}, [r1]
84 ; (used) two registers ($reg1, $reg2)
85 MEND
86 ; --------------------------------------------------------------------------
87 ; Store into output (sometimes as temporary storage)
88 ; output[first_offset] = reg1
89 ; output[second_offset] = reg2
90 ; for proper address calculation, the last offset used when manipulating
91 ; output (whether reading or storing) must be passed in. Use 0 for the
92 ; first use.
93 MACRO
94 STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
95 ; address calculation with proper stride and storing
96 add r1, #($first_offset - $prev_offset )*32*2
97 vst1.16 {$reg1}, [r1]
98 add r1, #($second_offset - $first_offset)*32*2
99 vst1.16 {$reg2}, [r1]
100 MEND
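; For reference, all three macros above keep a running pointer and add only
; the byte delta between consecutive offsets. A C sketch of the equivalent
; addressing (hypothetical names; transposed_buffer rows are 8 int16_t wide
; and output rows are 32 int16_t wide, hence the *8*2 and *32*2 byte scales):
;   int16_t *p = base;                               /* r0 or r1 */
;   p += (first_offset - prev_offset) * row_width;   /* row_width = 8 or 32 */
;   load_or_store_8_lanes(p);                        /* one q register */
;   p += (second_offset - first_offset) * row_width;
;   load_or_store_8_lanes(p);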
101 ; --------------------------------------------------------------------------
102 ; Combine-add results with current destination content
103 ; q6-q9 contain the results (out[j * 32 + 0-31])
104 MACRO
105 STORE_COMBINE_CENTER_RESULTS
106 ; load dest[j * dest_stride + 0-31]
107 vld1.s16 {d8}, [r10], r2
108 vld1.s16 {d11}, [r9], r11
109 vld1.s16 {d9}, [r10]
110 vld1.s16 {d10}, [r9]
111 ; ROUND_POWER_OF_TWO
112 vrshr.s16 q7, q7, #6
113 vrshr.s16 q8, q8, #6
114 vrshr.s16 q9, q9, #6
115 vrshr.s16 q6, q6, #6
116 ; add to dest[j * dest_stride + 0-31]
117 vaddw.u8 q7, q7, d9
118 vaddw.u8 q8, q8, d10
119 vaddw.u8 q9, q9, d11
120 vaddw.u8 q6, q6, d8
121 ; clip pixel
122 vqmovun.s16 d9, q7
123 vqmovun.s16 d10, q8
124 vqmovun.s16 d11, q9
125 vqmovun.s16 d8, q6
126 ; store back into dest[j * dest_stride + 0-31]
127 vst1.16 {d9}, [r10], r11
128 vst1.16 {d10}, [r9], r2
129 vst1.16 {d8}, [r10]
130 vst1.16 {d11}, [r9]
131 ; update pointers (by dest_stride * 2)
132 sub r9, r9, r2, lsl #1
133 add r10, r10, r2, lsl #1
134 MEND
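; For reference, the STORE_COMBINE_* macros implement the combine step of
; the C reference code (a sketch; clip_pixel() clamps to [0, 255]):
;   for (i = 0; i < 8; ++i)
;     dest[j * dest_stride + i] = clip_pixel(
;         ROUND_POWER_OF_TWO(out[j * 32 + i], 6) + dest[j * dest_stride + i]);
; vrshr.s16 #6 is the rounding shift, vaddw.u8 widens and adds the current
; destination pixels, and vqmovun.s16 saturates back to 8 bits (the clip).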
135 ; --------------------------------------------------------------------------
136 ; Combine-add results with current destination content
137 ; q6-q9 contain the results (out[j * 32 + 0-31])
138 MACRO
139 STORE_COMBINE_CENTER_RESULTS_LAST
140 ; load dest[j * dest_stride + 0-31]
141 vld1.s16 {d8}, [r10], r2
142 vld1.s16 {d11}, [r9], r11
143 vld1.s16 {d9}, [r10]
144 vld1.s16 {d10}, [r9]
145 ; ROUND_POWER_OF_TWO
146 vrshr.s16 q7, q7, #6
147 vrshr.s16 q8, q8, #6
148 vrshr.s16 q9, q9, #6
149 vrshr.s16 q6, q6, #6
150 ; add to dest[j * dest_stride + 0-31]
151 vaddw.u8 q7, q7, d9
152 vaddw.u8 q8, q8, d10
153 vaddw.u8 q9, q9, d11
154 vaddw.u8 q6, q6, d8
155 ; clip pixel
156 vqmovun.s16 d9, q7
157 vqmovun.s16 d10, q8
158 vqmovun.s16 d11, q9
159 vqmovun.s16 d8, q6
160 ; store back into dest[j * dest_stride + 0-31]
161 vst1.16 {d9}, [r10], r11
162 vst1.16 {d10}, [r9], r2
163 vst1.16 {d8}, [r10]!
164 vst1.16 {d11}, [r9]!
165 ; update pointers (by dest_stride * 2)
166 sub r9, r9, r2, lsl #1
167 add r10, r10, r2, lsl #1
168 MEND
169 ; --------------------------------------------------------------------------
170 ; Combine-add results with current destination content
171 ; q4-q7 contain the results (out[j * 32 + 0-31])
172 MACRO
173 STORE_COMBINE_EXTREME_RESULTS
174 ; load dest[j * dest_stride + 0-31]
175 vld1.s16 {d4}, [r7], r2
176 vld1.s16 {d7}, [r6], r11
177 vld1.s16 {d5}, [r7]
178 vld1.s16 {d6}, [r6]
179 ; ROUND_POWER_OF_TWO
180 vrshr.s16 q5, q5, #6
181 vrshr.s16 q6, q6, #6
182 vrshr.s16 q7, q7, #6
183 vrshr.s16 q4, q4, #6
184 ; add to dest[j * dest_stride + 0-31]
185 vaddw.u8 q5, q5, d5
186 vaddw.u8 q6, q6, d6
187 vaddw.u8 q7, q7, d7
188 vaddw.u8 q4, q4, d4
189 ; clip pixel
190 vqmovun.s16 d5, q5
191 vqmovun.s16 d6, q6
192 vqmovun.s16 d7, q7
193 vqmovun.s16 d4, q4
194 ; store back into dest[j * dest_stride + 0-31]
195 vst1.16 {d5}, [r7], r11
196 vst1.16 {d6}, [r6], r2
197 vst1.16 {d7}, [r6]
198 vst1.16 {d4}, [r7]
199 ; update pointers (by dest_stride * 2)
200 sub r6, r6, r2, lsl #1
201 add r7, r7, r2, lsl #1
202 MEND
203 ; --------------------------------------------------------------------------
204 ; Combine-add results with current destination content
205 ; q4-q7 contain the results (out[j * 32 + 0-31])
206 MACRO
207 STORE_COMBINE_EXTREME_RESULTS_LAST
208 ; load dest[j * dest_stride + 0-31]
209 vld1.s16 {d4}, [r7], r2
210 vld1.s16 {d7}, [r6], r11
211 vld1.s16 {d5}, [r7]
212 vld1.s16 {d6}, [r6]
213 ; ROUND_POWER_OF_TWO
214 vrshr.s16 q5, q5, #6
215 vrshr.s16 q6, q6, #6
216 vrshr.s16 q7, q7, #6
217 vrshr.s16 q4, q4, #6
218 ; add to dest[j * dest_stride + 0-31]
219 vaddw.u8 q5, q5, d5
220 vaddw.u8 q6, q6, d6
221 vaddw.u8 q7, q7, d7
222 vaddw.u8 q4, q4, d4
223 ; clip pixel
224 vqmovun.s16 d5, q5
225 vqmovun.s16 d6, q6
226 vqmovun.s16 d7, q7
227 vqmovun.s16 d4, q4
228 ; store back into dest[j * dest_stride + 0-31]
229 vst1.16 {d5}, [r7], r11
230 vst1.16 {d6}, [r6], r2
231 vst1.16 {d7}, [r6]!
232 vst1.16 {d4}, [r7]!
233 ; update pointers (by dest_stride * 2)
234 sub r6, r6, r2, lsl #1
235 add r7, r7, r2, lsl #1
236 MEND
237 ; --------------------------------------------------------------------------
238 ; Touches q8-q12, q15 (q13-q14 are preserved)
239 ; valid output registers are anything but q8-q11
240 MACRO
241 DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
242 ; TODO(cd): have special case to re-use constants when they are similar for
243 ; consecutive butterflies
244 ; TODO(cd): have special case when both constants are the same, do the
245 ; additions/subtractions before the multiplies.
246 ; generate the constants
247 ; generate scalar constants
248 mov r8, #$first_constant & 0xFF00
249 mov r12, #$second_constant & 0xFF00
250 add r8, #$first_constant & 0x00FF
251 add r12, #$second_constant & 0x00FF
252 ; generate vector constants
253 vdup.16 d30, r8
254 vdup.16 d31, r12
255 ; (used) two for inputs (regA-regD), one for constants (q15)
256 ; do some multiplications (ordered for maximum latency hiding)
257 vmull.s16 q8, $regC, d30
258 vmull.s16 q10, $regA, d31
259 vmull.s16 q9, $regD, d30
260 vmull.s16 q11, $regB, d31
261 vmull.s16 q12, $regC, d31
262 ; (used) five for intermediate (q8-q12), one for constants (q15)
263 ; do some additions/subtractions (to get back to two registers)
264 vsub.s32 q8, q8, q10
265 vsub.s32 q9, q9, q11
266 ; do more multiplications (ordered for maximum latency hiding)
267 vmull.s16 q10, $regD, d31
268 vmull.s16 q11, $regA, d30
269 vmull.s16 q15, $regB, d30
270 ; (used) six for intermediate (q8-q12, q15)
271 ; do more additions/subtractions
272 vadd.s32 q11, q12, q11
273 vadd.s32 q10, q10, q15
274 ; (used) four for intermediate (q8-q11)
275 ; dct_const_round_shift
276 vqrshrn.s32 $reg1, q8, #14
277 vqrshrn.s32 $reg2, q9, #14
278 vqrshrn.s32 $reg3, q11, #14
279 vqrshrn.s32 $reg4, q10, #14
280 ; (used) two q registers for results (i.e. four d registers)
281 MEND
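; For reference, DO_BUTTERFLY computes the standard rotate/round pair from
; the C reference code (a sketch; a = $regA:$regB and c = $regC:$regD, eight
; 16-bit lanes each):
;   int32_t temp1 = c * first_constant  - a * second_constant;
;   int32_t temp2 = c * second_constant + a * first_constant;
;   reg1_reg2 = dct_const_round_shift(temp1);   /* vqrshrn.s32 #14 */
;   reg3_reg4 = dct_const_round_shift(temp2);
; The mov/add pair above materializes each 16-bit constant in two steps
; because a plain ARM mov immediate is limited to an 8-bit rotated value.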
282 ; --------------------------------------------------------------------------
283 ; Touches q8-q12, q15 (q13-q14 are preserved)
284 ; valid output registers are anything but q8-q11
285 MACRO
286 DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
287 DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
288 MEND
289 ; --------------------------------------------------------------------------
290
291 ;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
292 ;
293 ; r0 int16_t *input,
294 ; r1 uint8_t *dest,
295 ; r2 int dest_stride)
296 ; loop counters
297 ; r4 bands loop counter
298 ; r5 pass loop counter
299 ; r8 transpose loop counter
300 ; combine-add pointers
301 ; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...)
302 ; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...)
303 ; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...)
304 ; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...)
305
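; For reference, a minimal (hypothetical) call site; input holds the 32x32
; dequantized coefficient block in raster order, and the reconstructed
; residual is added into the 8-bit destination in place:
;   int16_t coeffs[32 * 32];   /* filled by the dequantizer */
;   uint8_t dest[32 * 32];     /* prediction block; dest_stride == 32 here */
;   vp9_idct32x32_1024_add_neon(coeffs, dest, 32);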
306 |vp9_idct32x32_1024_add_neon| PROC
307 ; This function performs the idct32x32 transform in two passes.
308 ;
309 ; This is done by transposing the input and then doing a 1d transform on
310 ; columns. In the first pass, the transposed columns are the original
311 ; rows. In the second pass, after the transposition, the columns are the
312 ; original columns.
313 ; The 1d transform is done by looping over bands of eight columns (the
314 ; idct32_bands loop). For each band, the transform input transposition
315 ; is done on demand, one band of four 8x8 matrices at a time. The four
316 ; matrices are transposed by pairs (the idct32_transpose_pair loop).
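;
; A C sketch of that loop structure (illustrative names only):
;   for (pass = 0; pass < 2; ++pass) {        /* idct32_pass_loop           */
;     for (band = 0; band < 4; ++band) {      /* idct32_bands_loop          */
;       for (pair = 0; pair < 2; ++pair)      /* idct32_transpose_pair_loop */
;         transpose_two_8x8(band, pair);      /* fills transpose_buffer     */
;       idct32_1d_on_band(band);              /* BLOCKs A-D below           */
;     }
;   }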
317 push {r4-r11}
318 vpush {d8-d15}
319 ; stack operation
320 ; internal buffer into which 8 lines are transposed before being transformed
321 ; int16_t transpose_buffer[32 * 8];
322 ; at sp + [4096, 4607]
323 ; results of the first pass (transpose and transform rows)
324 ; int16_t pass1[32 * 32];
325 ; at sp + [0, 2047]
326 ; results of the second pass (transpose and transform columns)
327 ; int16_t pass2[32 * 32];
328 ; at sp + [2048, 4095]
329 sub sp, sp, #512+2048+2048
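; equivalently, as a C sketch of the scratch area reserved above:
;   struct {
;     int16_t pass1[32 * 32];            /* sp + [0, 2047]    */
;     int16_t pass2[32 * 32];            /* sp + [2048, 4095] */
;     int16_t transpose_buffer[32 * 8];  /* sp + [4096, 4607] */
;   } scratch;                           /* 2048+2048+512 bytes in total */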
330
331 ; r6 = dest + 31 * dest_stride
332 ; r7 = dest + 0 * dest_stride
333 ; r9 = dest + 15 * dest_stride
334 ; r10 = dest + 16 * dest_stride
335 rsb r6, r2, r2, lsl #5
336 rsb r9, r2, r2, lsl #4
337 add r10, r1, r2, lsl #4
338 mov r7, r1
339 add r6, r6, r1
340 add r9, r9, r1
341 ; r11 = -dest_stride
342 neg r11, r2
343 ; r3 = input
344 mov r3, r0
345 ; parameters for first pass
346 ; r0 = transpose_buffer[32 * 8]
347 add r0, sp, #4096
348 ; r1 = pass1[32 * 32]
349 mov r1, sp
350
351 mov r5, #0 ; initialize pass loop counter
352 idct32_pass_loop
353 mov r4, #4 ; initialize bands loop counter
354 idct32_bands_loop
355 mov r8, #2 ; initialize transpose loop counter
356 idct32_transpose_pair_loop
357 ; Load two horizontally consecutive 8x8 16-bit data matrices. The first
358 ; one into q8-q15 and the second one into q0-q7. There is a stride of 64,
359 ; adjusted to 32 because of the two post-increments.
360 vld1.s16 {q8}, [r3]!
361 vld1.s16 {q0}, [r3]!
362 add r3, #32
363 vld1.s16 {q9}, [r3]!
364 vld1.s16 {q1}, [r3]!
365 add r3, #32
366 vld1.s16 {q10}, [r3]!
367 vld1.s16 {q2}, [r3]!
368 add r3, #32
369 vld1.s16 {q11}, [r3]!
370 vld1.s16 {q3}, [r3]!
371 add r3, #32
372 vld1.s16 {q12}, [r3]!
373 vld1.s16 {q4}, [r3]!
374 add r3, #32
375 vld1.s16 {q13}, [r3]!
376 vld1.s16 {q5}, [r3]!
377 add r3, #32
378 vld1.s16 {q14}, [r3]!
379 vld1.s16 {q6}, [r3]!
380 add r3, #32
381 vld1.s16 {q15}, [r3]!
382 vld1.s16 {q7}, [r3]!
383
384 ; Transpose the two 8x8 16bit data matrices.
385 vswp d17, d24
386 vswp d23, d30
387 vswp d21, d28
388 vswp d19, d26
389 vswp d1, d8
390 vswp d7, d14
391 vswp d5, d12
392 vswp d3, d10
393 vtrn.32 q8, q10
394 vtrn.32 q9, q11
395 vtrn.32 q12, q14
396 vtrn.32 q13, q15
397 vtrn.32 q0, q2
398 vtrn.32 q1, q3
399 vtrn.32 q4, q6
400 vtrn.32 q5, q7
401 vtrn.16 q8, q9
402 vtrn.16 q10, q11
403 vtrn.16 q12, q13
404 vtrn.16 q14, q15
405 vtrn.16 q0, q1
406 vtrn.16 q2, q3
407 vtrn.16 q4, q5
408 vtrn.16 q6, q7
409
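; For reference, the vswp/vtrn sequence above is the standard three-level
; NEON 8x8 transpose, applied to each matrix: vswp exchanges the
; off-diagonal 4x4 blocks (64-bit register halves), vtrn.32 transposes the
; 2x2 sub-blocks of 32-bit pairs, and vtrn.16 finishes with the individual
; 16-bit elements. The net effect, as a C sketch for one 8x8 matrix m:
;   for (i = 0; i < 8; ++i)
;     for (j = 0; j < 8; ++j)
;       t[j][i] = m[i][j];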
410 ; Store both matrices after each other. There is a stride of 32, which
411 ; is covered entirely by the post-increments.
412 vst1.16 {q8}, [r0]!
413 vst1.16 {q9}, [r0]!
414 vst1.16 {q10}, [r0]!
415 vst1.16 {q11}, [r0]!
416 vst1.16 {q12}, [r0]!
417 vst1.16 {q13}, [r0]!
418 vst1.16 {q14}, [r0]!
419 vst1.16 {q15}, [r0]!
420 vst1.16 {q0}, [r0]!
421 vst1.16 {q1}, [r0]!
422 vst1.16 {q2}, [r0]!
423 vst1.16 {q3}, [r0]!
424 vst1.16 {q4}, [r0]!
425 vst1.16 {q5}, [r0]!
426 vst1.16 {q6}, [r0]!
427 vst1.16 {q7}, [r0]!
428
429 ; increment pointers by adjusted stride (not necessary for r0/out)
430 ; go back by 7*32 for the seven lines fully advanced by the reads and adds
431 ; go back by 32 for the eighth line, which was only read
432 ; advance by 16*2 to go to the next pair
433 sub r3, r3, #7*32*2 + 32 - 16*2
434 ; transpose pair loop processing
435 subs r8, r8, #1
436 bne idct32_transpose_pair_loop
437
438 ; restore r0/input to its original value
439 sub r0, r0, #32*8*2
440
441 ; Instead of doing the transforms stage by stage, it is done by loading
442 ; some input values and doing as many stages as possible to minimize the
443 ; storing/loading of intermediate results. To fit within registers, the
444 ; final coefficients are cut into four blocks:
445 ; BLOCK A: 16-19,28-31
446 ; BLOCK B: 20-23,24-27
445 ; BLOCK C: 8-11,12-15
448 ; BLOCK D: 0-3,4-7
449 ; Blocks A and C are straight calculation through the various stages. In
450 ; block B, further calculations are performed using the results from
451 ; block A. In block D, further calculations are performed using the results
452 ; from block C and then the final calculations are done using results from
453 ; block A and B which have been combined at the end of block B.
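;
; A pseudocode sketch of that ordering (illustrative names only):
;   block_a();                      /* coefficients for 16-19,28-31 */
;   block_b(results_a);             /* 20-27, consumes block A      */
;   block_c();                      /* 8-15                         */
;   block_d(results_c, results_ab); /* 0-7, then the final stages   */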
454
455 ; --------------------------------------------------------------------------
456 ; BLOCK A: 16-19,28-31
457 ; --------------------------------------------------------------------------
458 ; generate 16,17,30,31
459 ; --------------------------------------------------------------------------
460 ; part of stage 1
461 ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64;
462 ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64;
463 ;step1b[16][i] = dct_const_round_shift(temp1);
464 ;step1b[31][i] = dct_const_round_shift(temp2);
465 LOAD_FROM_TRANSPOSED 0, 1, 31
466 DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
467 ; --------------------------------------------------------------------------
468 ; part of stage 1
469 ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
470 ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
471 ;step1b[17][i] = dct_const_round_shift(temp1);
472 ;step1b[30][i] = dct_const_round_shift(temp2);
473 LOAD_FROM_TRANSPOSED 31, 17, 15
474 DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
475 ; --------------------------------------------------------------------------
476 ; part of stage 2
477 ;step2[16] = step1b[16][i] + step1b[17][i];
478 ;step2[17] = step1b[16][i] - step1b[17][i];
479 ;step2[30] = -step1b[30][i] + step1b[31][i];
480 ;step2[31] = step1b[30][i] + step1b[31][i];
481 vadd.s16 q4, q0, q1
482 vsub.s16 q13, q0, q1
483 vadd.s16 q6, q2, q3
484 vsub.s16 q14, q2, q3
485 ; --------------------------------------------------------------------------
486 ; part of stage 3
487 ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
488 ;temp2 = step1b[30][i] * cospi_4_64 + step1b[17][i] * cospi_28_64;
489 ;step3[17] = dct_const_round_shift(temp1);
490 ;step3[30] = dct_const_round_shift(temp2);
491 DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
492 ; --------------------------------------------------------------------------
493 ; generate 18,19,28,29
494 ; --------------------------------------------------------------------------
495 ; part of stage 1
496 ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
497 ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64;
498 ;step1b[18][i] = dct_const_round_shift(temp1);
499 ;step1b[29][i] = dct_const_round_shift(temp2);
500 LOAD_FROM_TRANSPOSED 15, 9, 23
501 DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
502 ; --------------------------------------------------------------------------
503 ; part of stage 1
504 ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64;
505 ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
506 ;step1b[19][i] = dct_const_round_shift(temp1);
507 ;step1b[28][i] = dct_const_round_shift(temp2);
508 LOAD_FROM_TRANSPOSED 23, 25, 7
509 DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
510 ; --------------------------------------------------------------------------
511 ; part of stage 2
512 ;step2[18] = -step1b[18][i] + step1b[19][i];
513 ;step2[19] = step1b[18][i] + step1b[19][i];
514 ;step2[28] = step1b[28][i] + step1b[29][i];
515 ;step2[29] = step1b[28][i] - step1b[29][i];
516 vsub.s16 q13, q3, q2
517 vadd.s16 q3, q3, q2
518 vsub.s16 q14, q1, q0
519 vadd.s16 q2, q1, q0
520 ; --------------------------------------------------------------------------
521 ; part of stage 3
522 ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64);
523 ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
524 ;step3[29] = dct_const_round_shift(temp1);
525 ;step3[18] = dct_const_round_shift(temp2);
526 DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
527 ; --------------------------------------------------------------------------
528 ; combine 16-19,28-31
529 ; --------------------------------------------------------------------------
530 ; part of stage 4
531 ;step1[16] = step1b[16][i] + step1b[19][i];
532 ;step1[17] = step1b[17][i] + step1b[18][i];
533 ;step1[18] = step1b[17][i] - step1b[18][i];
534 ;step1[29] = step1b[30][i] - step1b[29][i];
535 ;step1[30] = step1b[30][i] + step1b[29][i];
536 ;step1[31] = step1b[31][i] + step1b[28][i];
537 vadd.s16 q8, q4, q2
538 vadd.s16 q9, q5, q0
539 vadd.s16 q10, q7, q1
540 vadd.s16 q15, q6, q3
541 vsub.s16 q13, q5, q0
542 vsub.s16 q14, q7, q1
543 STORE_IN_OUTPUT 0, 16, 31, q8, q15
544 STORE_IN_OUTPUT 31, 17, 30, q9, q10
545 ; --------------------------------------------------------------------------
546 ; part of stage 5
547 ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
548 ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64;
549 ;step2[18] = dct_const_round_shift(temp1);
550 ;step2[29] = dct_const_round_shift(temp2);
551 DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
552 STORE_IN_OUTPUT 30, 29, 18, q1, q0
553 ; --------------------------------------------------------------------------
554 ; part of stage 4
555 ;step1[19] = step1b[16][i] - step1b[19][i];
556 ;step1[28] = step1b[31][i] - step1b[28][i];
557 vsub.s16 q13, q4, q2
558 vsub.s16 q14, q6, q3
559 ; --------------------------------------------------------------------------
560 ; part of stage 5
561 ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
562 ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64;
563 ;step2[19] = dct_const_round_shift(temp1);
564 ;step2[28] = dct_const_round_shift(temp2);
565 DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
566 STORE_IN_OUTPUT 18, 19, 28, q4, q6
567 ; --------------------------------------------------------------------------
568
569
570 ; --------------------------------------------------------------------------
571 ; BLOCK B: 20-23,24-27
572 ; --------------------------------------------------------------------------
573 ; generate 20,21,26,27
574 ; --------------------------------------------------------------------------
575 ; part of stage 1
576 ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
577 ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64;
578 ;step1b[20][i] = dct_const_round_shift(temp1);
579 ;step1b[27][i] = dct_const_round_shift(temp2);
580 LOAD_FROM_TRANSPOSED 7, 5, 27
581 DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
582 ; --------------------------------------------------------------------------
583 ; part of stage 1
584 ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
585 ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
586 ;step1b[21][i] = dct_const_round_shift(temp1);
587 ;step1b[26][i] = dct_const_round_shift(temp2);
588 LOAD_FROM_TRANSPOSED 27, 21, 11
589 DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
590 ; --------------------------------------------------------------------------
591 ; part of stage 2
592 ;step2[20] = step1b[20][i] + step1b[21][i];
593 ;step2[21] = step1b[20][i] - step1b[21][i];
594 ;step2[26] = -step1b[26][i] + step1b[27][i];
595 ;step2[27] = step1b[26][i] + step1b[27][i];
596 vsub.s16 q13, q0, q1
597 vadd.s16 q0, q0, q1
598 vsub.s16 q14, q2, q3
599 vadd.s16 q2, q2, q3
600 ; --------------------------------------------------------------------------
601 ; part of stage 3
602 ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
603 ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
604 ;step3[21] = dct_const_round_shift(temp1);
605 ;step3[26] = dct_const_round_shift(temp2);
606 DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
607 ; --------------------------------------------------------------------------
608 ; generate 22,23,24,25
609 ; --------------------------------------------------------------------------
610 ; part of stage 1
611 ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
612 ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
613 ;step1b[22][i] = dct_const_round_shift(temp1);
614 ;step1b[25][i] = dct_const_round_shift(temp2);
615 LOAD_FROM_TRANSPOSED 11, 13, 19
616 DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
617 ; --------------------------------------------------------------------------
618 ; part of stage 1
619 ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64;
620 ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
621 ;step1b[23][i] = dct_const_round_shift(temp1);
622 ;step1b[24][i] = dct_const_round_shift(temp2);
623 LOAD_FROM_TRANSPOSED 19, 29, 3
624 DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
625 ; --------------------------------------------------------------------------
626 ; part of stage 2
627 ;step2[22] = -step1b[22][i] + step1b[23][i];
628 ;step2[23] = step1b[22][i] + step1b[23][i];
629 ;step2[24] = step1b[24][i] + step1b[25][i];
630 ;step2[25] = step1b[24][i] - step1b[25][i];
631 vsub.s16 q14, q4, q5
632 vadd.s16 q5, q4, q5
633 vsub.s16 q13, q6, q7
634 vadd.s16 q6, q6, q7
635 ; --------------------------------------------------------------------------
636 ; part of stage 3
637 ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
638 ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
639 ;step3[25] = dct_const_round_shift(temp1);
640 ;step3[22] = dct_const_round_shift(temp2);
641 DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
642 ; --------------------------------------------------------------------------
643 ; combine 20-23,24-27
644 ; --------------------------------------------------------------------------
645 ; part of stage 4
646 ;step1[22] = step1b[22][i] + step1b[21][i];
647 ;step1[23] = step1b[23][i] + step1b[20][i];
648 vadd.s16 q10, q7, q1
649 vadd.s16 q11, q5, q0
650 ;step1[24] = step1b[24][i] + step1b[27][i];
651 ;step1[25] = step1b[25][i] + step1b[26][i];
652 vadd.s16 q12, q6, q2
653 vadd.s16 q15, q4, q3
654 ; --------------------------------------------------------------------------
655 ; part of stage 6
656 ;step3[16] = step1b[16][i] + step1b[23][i];
657 ;step3[17] = step1b[17][i] + step1b[22][i];
658 ;step3[22] = step1b[17][i] - step1b[22][i];
659 ;step3[23] = step1b[16][i] - step1b[23][i];
660 LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
661 vadd.s16 q8, q14, q11
662 vadd.s16 q9, q13, q10
663 vsub.s16 q13, q13, q10
664 vsub.s16 q11, q14, q11
665 STORE_IN_OUTPUT 17, 17, 16, q9, q8
666 ; --------------------------------------------------------------------------
667 ; part of stage 6
668 ;step3[24] = step1b[31][i] - step1b[24][i];
669 ;step3[25] = step1b[30][i] - step1b[25][i];
670 ;step3[30] = step1b[30][i] + step1b[25][i];
671 ;step3[31] = step1b[31][i] + step1b[24][i];
672 LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
673 vsub.s16 q8, q9, q12
674 vadd.s16 q10, q14, q15
675 vsub.s16 q14, q14, q15
676 vadd.s16 q12, q9, q12
677 STORE_IN_OUTPUT 31, 30, 31, q10, q12
678 ; --------------------------------------------------------------------------
679 ; TODO(cd) do some register allocation change to remove these push/pop
680 vpush {q8} ; [24]
681 vpush {q11} ; [23]
682 ; --------------------------------------------------------------------------
683 ; part of stage 7
684 ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
685 ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
686 ;step1[22] = dct_const_round_shift(temp1);
687 ;step1[25] = dct_const_round_shift(temp2);
688 DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
689 STORE_IN_OUTPUT 31, 25, 22, q14, q13
690 ; --------------------------------------------------------------------------
691 ; part of stage 7
692 ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
693 ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
694 ;step1[23] = dct_const_round_shift(temp1);
695 ;step1[24] = dct_const_round_shift(temp2);
696 ; TODO(cd) do some register allocation change to remove these push/pop
697 vpop {q13} ; [23]
698 vpop {q14} ; [24]
699 DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
700 STORE_IN_OUTPUT 22, 24, 23, q14, q13
701 ; --------------------------------------------------------------------------
702 ; part of stage 4
703 ;step1[20] = step1b[23][i] - step1b[20][i];
704 ;step1[27] = step1b[24][i] - step1b[27][i];
705 vsub.s16 q14, q5, q0
706 vsub.s16 q13, q6, q2
707 ; --------------------------------------------------------------------------
708 ; part of stage 5
709 ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64);
710 ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
711 ;step2[27] = dct_const_round_shift(temp1);
712 ;step2[20] = dct_const_round_shift(temp2);
713 DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
714 ; --------------------------------------------------------------------------
715 ; part of stage 4
716 ;step1[21] = step1b[22][i] - step1b[21][i];
717 ;step1[26] = step1b[25][i] - step1b[26][i];
718 vsub.s16 q14, q7, q1
719 vsub.s16 q13, q4, q3
720 ; --------------------------------------------------------------------------
721 ; part of stage 5
722 ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64);
723 ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
724 ;step2[26] = dct_const_round_shift(temp1);
725 ;step2[21] = dct_const_round_shift(temp2);
726 DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
727 ; --------------------------------------------------------------------------
728 ; part of stage 6
729 ;step3[18] = step1b[18][i] + step1b[21][i];
730 ;step3[19] = step1b[19][i] + step1b[20][i];
731 ;step3[20] = step1b[19][i] - step1b[20][i];
732 ;step3[21] = step1b[18][i] - step1b[21][i];
733 LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
734 vadd.s16 q8, q14, q1
735 vadd.s16 q9, q13, q6
736 vsub.s16 q13, q13, q6
737 vsub.s16 q1, q14, q1
738 STORE_IN_OUTPUT 19, 18, 19, q8, q9
739 ; --------------------------------------------------------------------------
740 ; part of stage 6
741 ;step3[27] = step1b[28][i] - step1b[27][i];
742 ;step3[28] = step1b[28][i] + step1b[27][i];
743 ;step3[29] = step1b[29][i] + step1b[26][i];
744 ;step3[26] = step1b[29][i] - step1b[26][i];
745 LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
746 vsub.s16 q14, q8, q5
747 vadd.s16 q10, q8, q5
748 vadd.s16 q11, q9, q0
749 vsub.s16 q0, q9, q0
750 STORE_IN_OUTPUT 29, 28, 29, q10, q11
751 ; --------------------------------------------------------------------------
752 ; part of stage 7
753 ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
754 ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
755 ;step1[20] = dct_const_round_shift(temp1);
756 ;step1[27] = dct_const_round_shift(temp2);
757 DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
758 STORE_IN_OUTPUT 29, 20, 27, q13, q14
759 ; --------------------------------------------------------------------------
760 ; part of stage 7
761 ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
762 ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
763 ;step1[21] = dct_const_round_shift(temp1);
764 ;step1[26] = dct_const_round_shift(temp2);
765 DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
766 STORE_IN_OUTPUT 27, 21, 26, q1, q0
767 ; --------------------------------------------------------------------------
768
769
770 ; --------------------------------------------------------------------------
771 ; BLOCK C: 8-11,12-15
772 ; --------------------------------------------------------------------------
773 ; generate 8,9,14,15
774 ; --------------------------------------------------------------------------
775 ; part of stage 2
776 ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
777 ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
778 ;step2[8] = dct_const_round_shift(temp1);
779 ;step2[15] = dct_const_round_shift(temp2);
780 LOAD_FROM_TRANSPOSED 3, 2, 30
781 DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
782 ; --------------------------------------------------------------------------
783 ; part of stage 2
784 ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
785 ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
786 ;step2[9] = dct_const_round_shift(temp1);
787 ;step2[14] = dct_const_round_shift(temp2);
788 LOAD_FROM_TRANSPOSED 30, 18, 14
789 DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
790 ; --------------------------------------------------------------------------
791 ; part of stage 3
792 ;step3[8] = step1b[8][i] + step1b[9][i];
793 ;step3[9] = step1b[8][i] - step1b[9][i];
794 ;step3[14] = step1b[15][i] - step1b[14][i];
795 ;step3[15] = step1b[15][i] + step1b[14][i];
796 vsub.s16 q13, q0, q1
797 vadd.s16 q0, q0, q1
798 vsub.s16 q14, q2, q3
799 vadd.s16 q2, q2, q3
800 ; --------------------------------------------------------------------------
801 ; part of stage 4
802 ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
803 ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64;
804 ;step1[9] = dct_const_round_shift(temp1);
805 ;step1[14] = dct_const_round_shift(temp2);
806 DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
807 ; --------------------------------------------------------------------------
808 ; generate 10,11,12,13
809 ; --------------------------------------------------------------------------
810 ; part of stage 2
811 ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
812 ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
813 ;step2[10] = dct_const_round_shift(temp1);
814 ;step2[13] = dct_const_round_shift(temp2);
815 LOAD_FROM_TRANSPOSED 14, 10, 22
816 DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
817 ; --------------------------------------------------------------------------
818 ; part of stage 2
819 ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
820 ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
821 ;step2[11] = dct_const_round_shift(temp1);
822 ;step2[12] = dct_const_round_shift(temp2);
823 LOAD_FROM_TRANSPOSED 22, 26, 6
824 DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
825 ; --------------------------------------------------------------------------
826 ; part of stage 3
827 ;step3[10] = step1b[11][i] - step1b[10][i];
828 ;step3[11] = step1b[11][i] + step1b[10][i];
829 ;step3[12] = step1b[12][i] + step1b[13][i];
830 ;step3[13] = step1b[12][i] - step1b[13][i];
831 vsub.s16 q14, q4, q5
832 vadd.s16 q5, q4, q5
833 vsub.s16 q13, q6, q7
834 vadd.s16 q6, q6, q7
835 ; --------------------------------------------------------------------------
836 ; part of stage 4
837 ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64);
838 ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
839 ;step1[13] = dct_const_round_shift(temp1);
840 ;step1[10] = dct_const_round_shift(temp2);
841 DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
842 ; --------------------------------------------------------------------------
843 ; combine 8-11,12-15
844 ; --------------------------------------------------------------------------
845 ; part of stage 5
846 ;step2[8] = step1b[8][i] + step1b[11][i];
847 ;step2[9] = step1b[9][i] + step1b[10][i];
848 ;step2[10] = step1b[9][i] - step1b[10][i];
849 vadd.s16 q8, q0, q5
850 vadd.s16 q9, q1, q7
851 vsub.s16 q13, q1, q7
852 ;step2[13] = step1b[14][i] - step1b[13][i];
853 ;step2[14] = step1b[14][i] + step1b[13][i];
854 ;step2[15] = step1b[15][i] + step1b[12][i];
855 vsub.s16 q14, q3, q4
856 vadd.s16 q10, q3, q4
857 vadd.s16 q15, q2, q6
858 STORE_IN_OUTPUT 26, 8, 15, q8, q15
859 STORE_IN_OUTPUT 15, 9, 14, q9, q10
860 ; --------------------------------------------------------------------------
861 ; part of stage 6
862 ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
863 ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
864 ;step3[10] = dct_const_round_shift(temp1);
865 ;step3[13] = dct_const_round_shift(temp2);
866 DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
867 STORE_IN_OUTPUT 14, 13, 10, q3, q1
868 ; --------------------------------------------------------------------------
869 ; part of stage 5
870 ;step2[11] = step1b[8][i] - step1b[11][i];
871 ;step2[12] = step1b[15][i] - step1b[12][i];
872 vsub.s16 q13, q0, q5
873 vsub.s16 q14, q2, q6
874 ; --------------------------------------------------------------------------
875 ; part of stage 6
876 ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
877 ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
878 ;step3[11] = dct_const_round_shift(temp1);
879 ;step3[12] = dct_const_round_shift(temp2);
880 DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
881 STORE_IN_OUTPUT 10, 11, 12, q1, q3
882 ; --------------------------------------------------------------------------
883
884
885 ; --------------------------------------------------------------------------
886 ; BLOCK D: 0-3,4-7
887 ; --------------------------------------------------------------------------
888 ; generate 4,5,6,7
889 ; --------------------------------------------------------------------------
890 ; part of stage 3
891 ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
892 ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
893 ;step3[4] = dct_const_round_shift(temp1);
894 ;step3[7] = dct_const_round_shift(temp2);
895 LOAD_FROM_TRANSPOSED 6, 4, 28
896 DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
897 ; --------------------------------------------------------------------------
898 ; part of stage 3
899 ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
900 ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
901 ;step3[5] = dct_const_round_shift(temp1);
902 ;step3[6] = dct_const_round_shift(temp2);
903 LOAD_FROM_TRANSPOSED 28, 20, 12
904 DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
905 ; --------------------------------------------------------------------------
906 ; part of stage 4
907 ;step1[4] = step1b[4][i] + step1b[5][i];
908 ;step1[5] = step1b[4][i] - step1b[5][i];
909 ;step1[6] = step1b[7][i] - step1b[6][i];
910 ;step1[7] = step1b[7][i] + step1b[6][i];
911 vsub.s16 q13, q0, q1
912 vadd.s16 q0, q0, q1
913 vsub.s16 q14, q2, q3
914 vadd.s16 q2, q2, q3
915 ; --------------------------------------------------------------------------
916 ; part of stage 5
917 ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
918 ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
919 ;step2[5] = dct_const_round_shift(temp1);
920 ;step2[6] = dct_const_round_shift(temp2);
921 DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
922 ; --------------------------------------------------------------------------
923 ; generate 0,1,2,3
924 ; --------------------------------------------------------------------------
925 ; part of stage 4
926 ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
927 ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
928 ;step1[1] = dct_const_round_shift(temp1);
929 ;step1[0] = dct_const_round_shift(temp2);
930 LOAD_FROM_TRANSPOSED 12, 0, 16
931 DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
932 ; --------------------------------------------------------------------------
933 ; part of stage 4
934 ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
935 ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
936 ;step1[2] = dct_const_round_shift(temp1);
937 ;step1[3] = dct_const_round_shift(temp2);
938 LOAD_FROM_TRANSPOSED 16, 8, 24
939 DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
940 ; --------------------------------------------------------------------------
941 ; part of stage 5
942 ;step2[0] = step1b[0][i] + step1b[3][i];
943 ;step2[1] = step1b[1][i] + step1b[2][i];
944 ;step2[2] = step1b[1][i] - step1b[2][i];
945 ;step2[3] = step1b[0][i] - step1b[3][i];
946 vadd.s16 q4, q7, q6
947 vsub.s16 q7, q7, q6
948 vsub.s16 q6, q5, q14
949 vadd.s16 q5, q5, q14
950 ; --------------------------------------------------------------------------
951 ; combine 0-3,4-7
952 ; --------------------------------------------------------------------------
953 ; part of stage 6
954 ;step3[0] = step1b[0][i] + step1b[7][i];
955 ;step3[1] = step1b[1][i] + step1b[6][i];
956 ;step3[2] = step1b[2][i] + step1b[5][i];
957 ;step3[3] = step1b[3][i] + step1b[4][i];
958 vadd.s16 q8, q4, q2
959 vadd.s16 q9, q5, q3
960 vadd.s16 q10, q6, q1
961 vadd.s16 q11, q7, q0
962 ;step3[4] = step1b[3][i] - step1b[4][i];
963 ;step3[5] = step1b[2][i] - step1b[5][i];
964 ;step3[6] = step1b[1][i] - step1b[6][i];
965 ;step3[7] = step1b[0][i] - step1b[7][i];
966 vsub.s16 q12, q7, q0
967 vsub.s16 q13, q6, q1
968 vsub.s16 q14, q5, q3
969 vsub.s16 q15, q4, q2
970 ; --------------------------------------------------------------------------
971 ; part of stage 7
972 ;step1[0] = step1b[0][i] + step1b[15][i];
973 ;step1[1] = step1b[1][i] + step1b[14][i];
974 ;step1[14] = step1b[1][i] - step1b[14][i];
975 ;step1[15] = step1b[0][i] - step1b[15][i];
976 LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
977 vadd.s16 q2, q8, q1
978 vadd.s16 q3, q9, q0
979 vsub.s16 q4, q9, q0
980 vsub.s16 q5, q8, q1
981 ; --------------------------------------------------------------------------
982 ; part of final stage
983 ;output[14 * 32] = step1b[14][i] + step1b[17][i];
984 ;output[15 * 32] = step1b[15][i] + step1b[16][i];
985 ;output[16 * 32] = step1b[15][i] - step1b[16][i];
986 ;output[17 * 32] = step1b[14][i] - step1b[17][i];
987 LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
988 vadd.s16 q8, q4, q1
989 vadd.s16 q9, q5, q0
990 vsub.s16 q6, q5, q0
991 vsub.s16 q7, q4, q1
992
993 cmp r5, #0
994 bgt idct32_bands_end_2nd_pass
995
996 idct32_bands_end_1st_pass
997 STORE_IN_OUTPUT 17, 16, 17, q6, q7
998 STORE_IN_OUTPUT 17, 14, 15, q8, q9
999 ; --------------------------------------------------------------------------
1000 ; part of final stage
1001 ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
1002 ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
1003 ;output[30 * 32] = step1b[1][i] - step1b[30][i];
1004 ;output[31 * 32] = step1b[0][i] - step1b[31][i];
1005 LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
1006 vadd.s16 q4, q2, q1
1007 vadd.s16 q5, q3, q0
1008 vsub.s16 q6, q3, q0
1009 vsub.s16 q7, q2, q1
1010 STORE_IN_OUTPUT 31, 30, 31, q6, q7
1011 STORE_IN_OUTPUT 31, 0, 1, q4, q5
1012 ; --------------------------------------------------------------------------
1013 ; part of stage 7
1014 ;step1[2] = step1b[2][i] + step1b[13][i];
1015 ;step1[3] = step1b[3][i] + step1b[12][i];
1016 ;step1[12] = step1b[3][i] - step1b[12][i];
1017 ;step1[13] = step1b[2][i] - step1b[13][i];
1018 LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
1019 vadd.s16 q2, q10, q1
1020 vadd.s16 q3, q11, q0
1021 vsub.s16 q4, q11, q0
1022 vsub.s16 q5, q10, q1
1023 ; --------------------------------------------------------------------------
1024 ; part of final stage
1025 ;output[12 * 32] = step1b[12][i] + step1b[19][i];
1026 ;output[13 * 32] = step1b[13][i] + step1b[18][i];
1027 ;output[18 * 32] = step1b[13][i] - step1b[18][i];
1028 ;output[19 * 32] = step1b[12][i] - step1b[19][i];
1029 LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
1030 vadd.s16 q8, q4, q1
1031 vadd.s16 q9, q5, q0
1032 vsub.s16 q6, q5, q0
1033 vsub.s16 q7, q4, q1
1034 STORE_IN_OUTPUT 19, 18, 19, q6, q7
1035 STORE_IN_OUTPUT 19, 12, 13, q8, q9
1036 ; --------------------------------------------------------------------------
1037 ; part of final stage
1038 ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
1039 ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
1040 ;output[28 * 32] = step1b[3][i] - step1b[28][i];
1041 ;output[29 * 32] = step1b[2][i] - step1b[29][i];
1042 LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
1043 vadd.s16 q4, q2, q1
1044 vadd.s16 q5, q3, q0
1045 vsub.s16 q6, q3, q0
1046 vsub.s16 q7, q2, q1
1047 STORE_IN_OUTPUT 29, 28, 29, q6, q7
1048 STORE_IN_OUTPUT 29, 2, 3, q4, q5
1049 ; --------------------------------------------------------------------------
1050 ; part of stage 7
1051 ;step1[4] = step1b[4][i] + step1b[11][i];
1052 ;step1[5] = step1b[5][i] + step1b[10][i];
1053 ;step1[10] = step1b[5][i] - step1b[10][i];
1054 ;step1[11] = step1b[4][i] - step1b[11][i];
1055 LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
1056 vadd.s16 q2, q12, q1
1057 vadd.s16 q3, q13, q0
1058 vsub.s16 q4, q13, q0
1059 vsub.s16 q5, q12, q1
1060 ; --------------------------------------------------------------------------
1061 ; part of final stage
1062 ;output[10 * 32] = step1b[10][i] + step1b[21][i];
1063 ;output[11 * 32] = step1b[11][i] + step1b[20][i];
1064 ;output[20 * 32] = step1b[11][i] - step1b[20][i];
1065 ;output[21 * 32] = step1b[10][i] - step1b[21][i];
1066 LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
1067 vadd.s16 q8, q4, q1
1068 vadd.s16 q9, q5, q0
1069 vsub.s16 q6, q5, q0
1070 vsub.s16 q7, q4, q1
1071 STORE_IN_OUTPUT 21, 20, 21, q6, q7
1072 STORE_IN_OUTPUT 21, 10, 11, q8, q9
1073 ; --------------------------------------------------------------------------
1074 ; part of final stage
1075 ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
1076 ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
1077 ;output[26 * 32] = step1b[5][i] - step1b[26][i];
1078 ;output[27 * 32] = step1b[4][i] - step1b[27][i];
1079 LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
1080 vadd.s16 q4, q2, q1
1081 vadd.s16 q5, q3, q0
1082 vsub.s16 q6, q3, q0
1083 vsub.s16 q7, q2, q1
1084 STORE_IN_OUTPUT 27, 26, 27, q6, q7
1085 STORE_IN_OUTPUT 27, 4, 5, q4, q5
1086 ; --------------------------------------------------------------------------
1087 ; part of stage 7
1088 ;step1[6] = step1b[6][i] + step1b[9][i];
1089 ;step1[7] = step1b[7][i] + step1b[8][i];
1090 ;step1[8] = step1b[7][i] - step1b[8][i];
1091 ;step1[9] = step1b[6][i] - step1b[9][i];
1092 LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
1093 vadd.s16 q2, q14, q1
1094 vadd.s16 q3, q15, q0
1095 vsub.s16 q4, q15, q0
1096 vsub.s16 q5, q14, q1
1097 ; --------------------------------------------------------------------------
1098 ; part of final stage
1099 ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
1100 ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
1101 ;output[22 * 32] = step1b[9][i] - step1b[22][i];
1102 ;output[23 * 32] = step1b[8][i] - step1b[23][i];
1103 LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
1104 vadd.s16 q8, q4, q1
1105 vadd.s16 q9, q5, q0
1106 vsub.s16 q6, q5, q0
1107 vsub.s16 q7, q4, q1
1108 STORE_IN_OUTPUT 23, 22, 23, q6, q7
1109 STORE_IN_OUTPUT 23, 8, 9, q8, q9
1110 ; --------------------------------------------------------------------------
1111 ; part of final stage
1112 ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
1113 ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
1114 ;output[24 * 32] = step1b[7][i] - step1b[24][i];
1115 ;output[25 * 32] = step1b[6][i] - step1b[25][i];
1116 LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
1117 vadd.s16 q4, q2, q1
1118 vadd.s16 q5, q3, q0
1119 vsub.s16 q6, q3, q0
1120 vsub.s16 q7, q2, q1
1121 STORE_IN_OUTPUT 25, 24, 25, q6, q7
1122 STORE_IN_OUTPUT 25, 6, 7, q4, q5
1123
1124 ; restore r0 by removing the last offset from the last
1125 ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
1126 sub r0, r0, #24*8*2
1127 ; restore r1 by removing the last offset from the last
1128 ; operation (STORE_IN_OUTPUT 25, 6, 7) => 7*32*2
1129 ; advance by 8 columns => 8*2
1130 sub r1, r1, #7*32*2 - 8*2
1131 ; advance by 8 lines (8*32*2)
1132 ; go back by the two pairs from the loop (32*2)
1133 add r3, r3, #8*32*2 - 32*2
1134
1135 ; bands loop processing
1136 subs r4, r4, #1
1137 bne idct32_bands_loop
1138
1139 ; parameters for second pass
1140 ; the input of pass2 is the result of pass1. we have to remove the offset
1141 ; of 32 columns induced by the above idct32_bands_loop
1142 sub r3, r1, #32*2
1143 ; r1 = pass2[32 * 32]
1144 add r1, sp, #2048
1145
1146 ; pass loop processing
1147 add r5, r5, #1
1148 b idct32_pass_loop
1149
1150 idct32_bands_end_2nd_pass
1151 STORE_COMBINE_CENTER_RESULTS
1152 ; --------------------------------------------------------------------------
1153 ; part of final stage
1154 ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
1155 ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
1156 ;output[30 * 32] = step1b[1][i] - step1b[30][i];
1157 ;output[31 * 32] = step1b[0][i] - step1b[31][i];
1158 LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
1159 vadd.s16 q4, q2, q1
1160 vadd.s16 q5, q3, q0
1161 vsub.s16 q6, q3, q0
1162 vsub.s16 q7, q2, q1
1163 STORE_COMBINE_EXTREME_RESULTS
1164 ; --------------------------------------------------------------------------
1165 ; part of stage 7
1166 ;step1[2] = step1b[2][i] + step1b[13][i];
1167 ;step1[3] = step1b[3][i] + step1b[12][i];
1168 ;step1[12] = step1b[3][i] - step1b[12][i];
1169 ;step1[13] = step1b[2][i] - step1b[13][i];
1170 LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
1171 vadd.s16 q2, q10, q1
1172 vadd.s16 q3, q11, q0
1173 vsub.s16 q4, q11, q0
1174 vsub.s16 q5, q10, q1
1175 ; --------------------------------------------------------------------------
1176 ; part of final stage
1177 ;output[12 * 32] = step1b[12][i] + step1b[19][i];
1178 ;output[13 * 32] = step1b[13][i] + step1b[18][i];
1179 ;output[18 * 32] = step1b[13][i] - step1b[18][i];
1180 ;output[19 * 32] = step1b[12][i] - step1b[19][i];
1181 LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
1182 vadd.s16 q8, q4, q1
1183 vadd.s16 q9, q5, q0
1184 vsub.s16 q6, q5, q0
1185 vsub.s16 q7, q4, q1
1186 STORE_COMBINE_CENTER_RESULTS
1187 ; --------------------------------------------------------------------------
1188 ; part of final stage
1189 ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
1190 ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
1191 ;output[28 * 32] = step1b[3][i] - step1b[28][i];
1192 ;output[29 * 32] = step1b[2][i] - step1b[29][i];
1193 LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
1194 vadd.s16 q4, q2, q1
1195 vadd.s16 q5, q3, q0
1196 vsub.s16 q6, q3, q0
1197 vsub.s16 q7, q2, q1
1198 STORE_COMBINE_EXTREME_RESULTS
1199 ; --------------------------------------------------------------------------
1200 ; part of stage 7
1201 ;step1[4] = step1b[4][i] + step1b[11][i];
1202 ;step1[5] = step1b[5][i] + step1b[10][i];
1203 ;step1[10] = step1b[5][i] - step1b[10][i];
1204 ;step1[11] = step1b[4][i] - step1b[11][i];
1205 LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
1206 vadd.s16 q2, q12, q1
1207 vadd.s16 q3, q13, q0
1208 vsub.s16 q4, q13, q0
1209 vsub.s16 q5, q12, q1
1210 ; --------------------------------------------------------------------------
1211 ; part of final stage
1212 ;output[10 * 32] = step1b[10][i] + step1b[21][i];
1213 ;output[11 * 32] = step1b[11][i] + step1b[20][i];
1214 ;output[20 * 32] = step1b[11][i] - step1b[20][i];
1215 ;output[21 * 32] = step1b[10][i] - step1b[21][i];
1216 LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
1217 vadd.s16 q8, q4, q1
1218 vadd.s16 q9, q5, q0
1219 vsub.s16 q6, q5, q0
1220 vsub.s16 q7, q4, q1
1221 STORE_COMBINE_CENTER_RESULTS
1222 ; --------------------------------------------------------------------------
1223 ; part of final stage
1224 ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
1225 ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
1226 ;output[26 * 32] = step1b[5][i] - step1b[26][i];
1227 ;output[27 * 32] = step1b[4][i] - step1b[27][i];
1228 LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
1229 vadd.s16 q4, q2, q1
1230 vadd.s16 q5, q3, q0
1231 vsub.s16 q6, q3, q0
1232 vsub.s16 q7, q2, q1
1233 STORE_COMBINE_EXTREME_RESULTS
1234 ; --------------------------------------------------------------------------
1235 ; part of stage 7
1236 ;step1[6] = step1b[6][i] + step1b[9][i];
1237 ;step1[7] = step1b[7][i] + step1b[8][i];
1238 ;step1[8] = step1b[7][i] - step1b[8][i];
1239 ;step1[9] = step1b[6][i] - step1b[9][i];
1240 LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
1241 vadd.s16 q2, q14, q1
1242 vadd.s16 q3, q15, q0
1243 vsub.s16 q4, q15, q0
1244 vsub.s16 q5, q14, q1
1245 ; --------------------------------------------------------------------------
1246 ; part of final stage
1247 ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
1248 ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
1249 ;output[22 * 32] = step1b[9][i] - step1b[22][i];
1250 ;output[23 * 32] = step1b[8][i] - step1b[23][i];
1251 LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
1252 vadd.s16 q8, q4, q1
1253 vadd.s16 q9, q5, q0
1254 vsub.s16 q6, q5, q0
1255 vsub.s16 q7, q4, q1
1256 STORE_COMBINE_CENTER_RESULTS_LAST
1257 ; --------------------------------------------------------------------------
1258 ; part of final stage
1259 ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
1260 ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
1261 ;output[24 * 32] = step1b[7][i] - step1b[24][i];
1262 ;output[25 * 32] = step1b[6][i] - step1b[25][i];
1263 LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
1264 vadd.s16 q4, q2, q1
1265 vadd.s16 q5, q3, q0
1266 vsub.s16 q6, q3, q0
1267 vsub.s16 q7, q2, q1
1268 STORE_COMBINE_EXTREME_RESULTS_LAST
1269 ; --------------------------------------------------------------------------
1270 ; restore pointers to their initial indices for the next band pass by
1271 ; removing/adding dest_stride * 8. The actual increment by eight pixels
1272 ; is taken care of within the _LAST macros.
1273 add r6, r6, r2, lsl #3
1274 add r9, r9, r2, lsl #3
1275 sub r7, r7, r2, lsl #3
1276 sub r10, r10, r2, lsl #3
1277
1278 ; restore r0 by removing the last offset from the last
1279 ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
1280 sub r0, r0, #24*8*2
1281 ; restore r1 by removing the last offset from the last
1282 ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
1283 ; advance by 8 columns => 8*2
1284 sub r1, r1, #25*32*2 - 8*2
1285 ; advance by 8 lines (8*32*2)
1286 ; go back by the two pairs from the loop (32*2)
1287 add r3, r3, #8*32*2 - 32*2
1288
1289 ; bands loop processing
1290 subs r4, r4, #1
1291 bne idct32_bands_loop
1292
1293 ; stack operation
1294 add sp, sp, #512+2048+2048
1295 vpop {d8-d15}
1296 pop {r4-r11}
1297 bx lr
1298 ENDP ; |vp9_idct32x32_1024_add_neon|
1299 END