OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
| 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; |
| 10 |
;TODO(cd): adjust these constants to be able to use vqdmulh for faster
; dct_const_round_shift(a * b) within butterfly calculations.
; 14-bit fixed-point trig constants used by the inverse DCT butterflies:
;   cospi_N_64 = round(16384 * cos(N * pi / 64))
cospi_1_64  EQU 16364
cospi_2_64  EQU 16305
cospi_3_64  EQU 16207
cospi_4_64  EQU 16069
cospi_5_64  EQU 15893
cospi_6_64  EQU 15679
cospi_7_64  EQU 15426
cospi_8_64  EQU 15137
cospi_9_64  EQU 14811
cospi_10_64 EQU 14449
cospi_11_64 EQU 14053
cospi_12_64 EQU 13623
cospi_13_64 EQU 13160
cospi_14_64 EQU 12665
cospi_15_64 EQU 12140
cospi_16_64 EQU 11585
cospi_17_64 EQU 11003
cospi_18_64 EQU 10394
cospi_19_64 EQU 9760
cospi_20_64 EQU 9102
cospi_21_64 EQU 8423
cospi_22_64 EQU 7723
cospi_23_64 EQU 7005
cospi_24_64 EQU 6270
cospi_25_64 EQU 5520
cospi_26_64 EQU 4756
cospi_27_64 EQU 3981
cospi_28_64 EQU 3196
cospi_29_64 EQU 2404
cospi_30_64 EQU 1606
cospi_31_64 EQU 804
| 44 |
| 45 |
    ; Export the transform entry point and declare the code environment:
    ; ARM (not Thumb) instruction set; AAPCS 8-byte stack alignment is both
    ; required from callers (REQUIRE8) and preserved for callees (PRESERVE8).
    EXPORT  |vp9_idct32x32_1024_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

    AREA Block, CODE, READONLY  ; name this block of code
| 54 |
    ; --------------------------------------------------------------------------
    ; Load from transposed_buffer
    ;   q14 = transposed_buffer[first_offset]
    ;   q13 = transposed_buffer[second_offset]
    ; (NOTE(review): the original banner listed q13/q14 swapped; the loads
    ; below put first_offset into q14 and second_offset into q13.)
    ; For proper address calculation, the last offset used when manipulating
    ; transposed_buffer must be passed in. Use 0 for first use.
    MACRO
    LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
    ; address calculation with proper stride and loading
    ; each row is 8 int16 (8*2 bytes); offsets are relative to the previous
    ; position, so r0 is carried between invocations of this macro
    add r0, #($first_offset - $prev_offset )*8*2
    vld1.s16 {q14}, [r0]
    add r0, #($second_offset - $first_offset)*8*2
    vld1.s16 {q13}, [r0]
    ; (used) two registers (q14, q13)
    MEND
    ; --------------------------------------------------------------------------
    ; Load from output (used as temporary storage)
    ;   $reg1 = output[first_offset]
    ;   $reg2 = output[second_offset]
    ; For proper address calculation, the last offset used when manipulating
    ; output (whether reading or storing) must be passed in. Use 0 for first
    ; use.
    MACRO
    LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
    ; address calculation with proper stride and loading
    ; each output row is 32 int16 (32*2 bytes); r1 is carried across calls
    add r1, #($first_offset - $prev_offset )*32*2
    vld1.s16 {$reg1}, [r1]
    add r1, #($second_offset - $first_offset)*32*2
    vld1.s16 {$reg2}, [r1]
    ; (used) two registers ($reg1, $reg2)
    MEND
    ; --------------------------------------------------------------------------
    ; Store into output (sometimes as temporary storage)
    ;   output[first_offset]  = $reg1
    ;   output[second_offset] = $reg2
    ; For proper address calculation, the last offset used when manipulating
    ; output (whether reading or storing) must be passed in. Use 0 for first
    ; use.
    MACRO
    STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
    ; address calculation with proper stride and storing
    ; each output row is 32 int16 (32*2 bytes); r1 is carried across calls
    add r1, #($first_offset - $prev_offset )*32*2
    vst1.16 {$reg1}, [r1]
    add r1, #($second_offset - $first_offset)*32*2
    vst1.16 {$reg2}, [r1]
    MEND
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ; q6-q9 contain the results (out[j * 32 + 0-31])
    ; r10 ascends through the lower-middle rows (stride r2 = dest_stride);
    ; r9 descends through the upper-middle rows (r11 = -dest_stride).
    ; Two rows per pointer are combined, then both pointers advance two rows.
    MACRO
    STORE_COMBINE_CENTER_RESULTS
    ; load dest[j * dest_stride + 0-31] (8 dest pixels per d register)
    vld1.s16 {d8}, [r10], r2
    vld1.s16 {d11}, [r9], r11
    vld1.s16 {d9}, [r10]
    vld1.s16 {d10}, [r9]
    ; ROUND_POWER_OF_TWO(x, 6): final scaling of the 2-d transform result
    vrshr.s16 q7, q7, #6
    vrshr.s16 q8, q8, #6
    vrshr.s16 q9, q9, #6
    vrshr.s16 q6, q6, #6
    ; add to dest[j * dest_stride + 0-31], widening u8 pixels to s16
    vaddw.u8 q7, q7, d9
    vaddw.u8 q8, q8, d10
    vaddw.u8 q9, q9, d11
    vaddw.u8 q6, q6, d8
    ; clip pixel: saturating narrow back to u8
    vqmovun.s16 d9, q7
    vqmovun.s16 d10, q8
    vqmovun.s16 d11, q9
    vqmovun.s16 d8, q6
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d9}, [r10], r11
    vst1.16 {d10}, [r9], r2
    vst1.16 {d8}, [r10]
    vst1.16 {d11}, [r9]
    ; update pointers (by dest_stride * 2)
    sub r9, r9, r2, lsl #1
    add r10, r10, r2, lsl #1
    MEND
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ; q6-q9 contain the results (out[j * 32 + 0-31])
    ; Same as STORE_COMBINE_CENTER_RESULTS, but the final two stores
    ; post-increment (!) so r9/r10 step 8 bytes right to the next
    ; 8-pixel column band for the following pass over the rows.
    MACRO
    STORE_COMBINE_CENTER_RESULTS_LAST
    ; load dest[j * dest_stride + 0-31] (8 dest pixels per d register)
    vld1.s16 {d8}, [r10], r2
    vld1.s16 {d11}, [r9], r11
    vld1.s16 {d9}, [r10]
    vld1.s16 {d10}, [r9]
    ; ROUND_POWER_OF_TWO(x, 6): final scaling of the 2-d transform result
    vrshr.s16 q7, q7, #6
    vrshr.s16 q8, q8, #6
    vrshr.s16 q9, q9, #6
    vrshr.s16 q6, q6, #6
    ; add to dest[j * dest_stride + 0-31], widening u8 pixels to s16
    vaddw.u8 q7, q7, d9
    vaddw.u8 q8, q8, d10
    vaddw.u8 q9, q9, d11
    vaddw.u8 q6, q6, d8
    ; clip pixel: saturating narrow back to u8
    vqmovun.s16 d9, q7
    vqmovun.s16 d10, q8
    vqmovun.s16 d11, q9
    vqmovun.s16 d8, q6
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d9}, [r10], r11
    vst1.16 {d10}, [r9], r2
    vst1.16 {d8}, [r10]!            ; advance to next 8-pixel column band
    vst1.16 {d11}, [r9]!
    ; update pointers (by dest_stride * 2)
    sub r9, r9, r2, lsl #1
    add r10, r10, r2, lsl #1
    MEND
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ; q4-q7 contain the results (out[j * 32 + 0-31])
    ; r7 ascends through the top rows (stride r2 = dest_stride);
    ; r6 descends through the bottom rows (r11 = -dest_stride).
    ; Two rows per pointer are combined, then both pointers advance two rows.
    MACRO
    STORE_COMBINE_EXTREME_RESULTS
    ; load dest[j * dest_stride + 0-31] (8 dest pixels per d register)
    vld1.s16 {d4}, [r7], r2
    vld1.s16 {d7}, [r6], r11
    vld1.s16 {d5}, [r7]
    vld1.s16 {d6}, [r6]
    ; ROUND_POWER_OF_TWO(x, 6): final scaling of the 2-d transform result
    vrshr.s16 q5, q5, #6
    vrshr.s16 q6, q6, #6
    vrshr.s16 q7, q7, #6
    vrshr.s16 q4, q4, #6
    ; add to dest[j * dest_stride + 0-31], widening u8 pixels to s16
    vaddw.u8 q5, q5, d5
    vaddw.u8 q6, q6, d6
    vaddw.u8 q7, q7, d7
    vaddw.u8 q4, q4, d4
    ; clip pixel: saturating narrow back to u8
    vqmovun.s16 d5, q5
    vqmovun.s16 d6, q6
    vqmovun.s16 d7, q7
    vqmovun.s16 d4, q4
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d5}, [r7], r11
    vst1.16 {d6}, [r6], r2
    vst1.16 {d7}, [r6]
    vst1.16 {d4}, [r7]
    ; update pointers (by dest_stride * 2)
    sub r6, r6, r2, lsl #1
    add r7, r7, r2, lsl #1
    MEND
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ; q4-q7 contain the results (out[j * 32 + 0-31])
    ; Same as STORE_COMBINE_EXTREME_RESULTS, but the final two stores
    ; post-increment (!) so r6/r7 step 8 bytes right to the next
    ; 8-pixel column band for the following pass over the rows.
    MACRO
    STORE_COMBINE_EXTREME_RESULTS_LAST
    ; load dest[j * dest_stride + 0-31] (8 dest pixels per d register)
    vld1.s16 {d4}, [r7], r2
    vld1.s16 {d7}, [r6], r11
    vld1.s16 {d5}, [r7]
    vld1.s16 {d6}, [r6]
    ; ROUND_POWER_OF_TWO(x, 6): final scaling of the 2-d transform result
    vrshr.s16 q5, q5, #6
    vrshr.s16 q6, q6, #6
    vrshr.s16 q7, q7, #6
    vrshr.s16 q4, q4, #6
    ; add to dest[j * dest_stride + 0-31], widening u8 pixels to s16
    vaddw.u8 q5, q5, d5
    vaddw.u8 q6, q6, d6
    vaddw.u8 q7, q7, d7
    vaddw.u8 q4, q4, d4
    ; clip pixel: saturating narrow back to u8
    vqmovun.s16 d5, q5
    vqmovun.s16 d6, q6
    vqmovun.s16 d7, q7
    vqmovun.s16 d4, q4
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d5}, [r7], r11
    vst1.16 {d6}, [r6], r2
    vst1.16 {d7}, [r6]!             ; advance to next 8-pixel column band
    vst1.16 {d4}, [r7]!
    ; update pointers (by dest_stride * 2)
    sub r6, r6, r2, lsl #1
    add r7, r7, r2, lsl #1
    MEND
    ; --------------------------------------------------------------------------
    ; Touches q8-q12, q15 (q13-q14 are preserved)
    ; valid output registers are anything but q8-q11
    ;
    ; Computes a DCT butterfly. With in1 = {$regC,$regD} and
    ; in2 = {$regA,$regB} (each pair forming 8 int16 lanes), and
    ; c1 = $first_constant, c2 = $second_constant:
    ;   {$reg1,$reg2} = dct_const_round_shift(in1 * c1 - in2 * c2)
    ;   {$reg3,$reg4} = dct_const_round_shift(in1 * c2 + in2 * c1)
    MACRO
    DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    ; TODO(cd): have special case to re-use constants when they are similar for
    ; consecutive butterflies
    ; TODO(cd): have special case when both constants are the same, do the
    ; additions/subtractions before the multiplies.
    ; generate the constants
    ; generate scalar constants
    ; built from two 8-bit pieces so each immediate is ARM-encodable; this
    ; also works for negative constants because only the low 16 bits of
    ; r8/r12 are replicated by vdup.16 below
    mov r8, #$first_constant & 0xFF00
    mov r12, #$second_constant & 0xFF00
    add r8, #$first_constant & 0x00FF
    add r12, #$second_constant & 0x00FF
    ; generate vector constants (d30 = c1, d31 = c2 in every lane)
    vdup.16 d30, r8
    vdup.16 d31, r12
    ; (used) two for inputs (regA-regD), one for constants (q15)
    ; do some multiplications (ordered for maximum latency hiding)
    vmull.s16 q8, $regC, d30
    vmull.s16 q10, $regA, d31
    vmull.s16 q9, $regD, d30
    vmull.s16 q11, $regB, d31
    vmull.s16 q12, $regC, d31
    ; (used) five for intermediate (q8-q12), one for constants (q15)
    ; do some addition/subtractions (to get back two register)
    vsub.s32 q8, q8, q10            ; in1_lo*c1 - in2_lo*c2
    vsub.s32 q9, q9, q11            ; in1_hi*c1 - in2_hi*c2
    ; do more multiplications (ordered for maximum latency hiding)
    vmull.s16 q10, $regD, d31
    vmull.s16 q11, $regA, d30
    vmull.s16 q15, $regB, d30
    ; (used) six for intermediate (q8-q12, q15)
    ; do more addition/subtractions
    vadd.s32 q11, q12, q11          ; in1_lo*c2 + in2_lo*c1
    vadd.s32 q10, q10, q15          ; in1_hi*c2 + in2_hi*c1
    ; (used) four for intermediate (q8-q11)
    ; dct_const_round_shift: rounding narrow by 2^14 with saturation
    vqrshrn.s32 $reg1, q8, #14
    vqrshrn.s32 $reg2, q9, #14
    vqrshrn.s32 $reg3, q11, #14
    vqrshrn.s32 $reg4, q10, #14
    ; (used) two for results, well four d registers
    MEND
    ; --------------------------------------------------------------------------
    ; Touches q8-q12, q15 (q13-q14 are preserved)
    ; valid output registers are anything but q8-q11
    ; Convenience wrapper around DO_BUTTERFLY with the standard input binding
    ; used throughout this file: in1 = q14 (d28,d29), in2 = q13 (d26,d27),
    ; as produced by the LOAD_FROM_* macros and the vadd/vsub stages above.
    MACRO
    DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    MEND
| 289 ; -------------------------------------------------------------------------- |
| 290 |
| 291 ;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride
); |
| 292 ; |
| 293 ; r0 int16_t *input, |
| 294 ; r1 uint8_t *dest, |
| 295 ; r2 int dest_stride) |
| 296 ; loop counters |
| 297 ; r4 bands loop counter |
| 298 ; r5 pass loop counter |
| 299 ; r8 transpose loop counter |
| 300 ; combine-add pointers |
| 301 ; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...) |
| 302 ; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...) |
| 303 ; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...) |
| 304 ; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...) |
| 305 |
| 306 |vp9_idct32x32_1024_add_neon| PROC |
| 307 ; This function does one pass of idct32x32 transform. |
| 308 ; |
| 309 ; This is done by transposing the input and then doing a 1d transform on |
| 310 ; columns. In the first pass, the transposed columns are the original |
| 311 ; rows. In the second pass, after the transposition, the columns are the |
| 312 ; original columns. |
| 313 ; The 1d transform is done by looping over bands of eight columns (the |
| 314 ; idct32_bands loop). For each band, the transform input transposition |
| 315 ; is done on demand, one band of four 8x8 matrices at a time. The four |
| 316 ; matrices are transposed by pairs (the idct32_transpose_pair loop). |
| 317 push {r4-r11} |
| 318 vpush {d8-d15} |
| 319 ; stack operation |
| 320 ; internal buffer used to transpose 8 lines into before transforming them |
| 321 ; int16_t transpose_buffer[32 * 8]; |
| 322 ; at sp + [4096, 4607] |
| 323 ; results of the first pass (transpose and transform rows) |
| 324 ; int16_t pass1[32 * 32]; |
| 325 ; at sp + [0, 2047] |
| 326 ; results of the second pass (transpose and transform columns) |
| 327 ; int16_t pass2[32 * 32]; |
| 328 ; at sp + [2048, 4095] |
| 329 sub sp, sp, #512+2048+2048 |
| 330 |
| 331 ; r6 = dest + 31 * dest_stride |
| 332 ; r7 = dest + 0 * dest_stride |
| 333 ; r9 = dest + 15 * dest_stride |
| 334 ; r10 = dest + 16 * dest_stride |
| 335 rsb r6, r2, r2, lsl #5 |
| 336 rsb r9, r2, r2, lsl #4 |
| 337 add r10, r1, r2, lsl #4 |
| 338 mov r7, r1 |
| 339 add r6, r6, r1 |
| 340 add r9, r9, r1 |
| 341 ; r11 = -dest_stride |
| 342 neg r11, r2 |
| 343 ; r3 = input |
| 344 mov r3, r0 |
| 345 ; parameters for first pass |
| 346 ; r0 = transpose_buffer[32 * 8] |
| 347 add r0, sp, #4096 |
| 348 ; r1 = pass1[32 * 32] |
| 349 mov r1, sp |
| 350 |
| 351 mov r5, #0 ; initialize pass loop counter |
| 352 idct32_pass_loop |
| 353 mov r4, #4 ; initialize bands loop counter |
| 354 idct32_bands_loop |
| 355 mov r8, #2 ; initialize transpose loop counter |
| 356 idct32_transpose_pair_loop |
| 357 ; Load two horizontally consecutive 8x8 16bit data matrices. The first one |
| 358 ; into q0-q7 and the second one into q8-q15. There is a stride of 64, |
| 359 ; adjusted to 32 because of the two post-increments. |
| 360 vld1.s16 {q8}, [r3]! |
| 361 vld1.s16 {q0}, [r3]! |
| 362 add r3, #32 |
| 363 vld1.s16 {q9}, [r3]! |
| 364 vld1.s16 {q1}, [r3]! |
| 365 add r3, #32 |
| 366 vld1.s16 {q10}, [r3]! |
| 367 vld1.s16 {q2}, [r3]! |
| 368 add r3, #32 |
| 369 vld1.s16 {q11}, [r3]! |
| 370 vld1.s16 {q3}, [r3]! |
| 371 add r3, #32 |
| 372 vld1.s16 {q12}, [r3]! |
| 373 vld1.s16 {q4}, [r3]! |
| 374 add r3, #32 |
| 375 vld1.s16 {q13}, [r3]! |
| 376 vld1.s16 {q5}, [r3]! |
| 377 add r3, #32 |
| 378 vld1.s16 {q14}, [r3]! |
| 379 vld1.s16 {q6}, [r3]! |
| 380 add r3, #32 |
| 381 vld1.s16 {q15}, [r3]! |
| 382 vld1.s16 {q7}, [r3]! |
| 383 |
| 384 ; Transpose the two 8x8 16bit data matrices. |
| 385 vswp d17, d24 |
| 386 vswp d23, d30 |
| 387 vswp d21, d28 |
| 388 vswp d19, d26 |
| 389 vswp d1, d8 |
| 390 vswp d7, d14 |
| 391 vswp d5, d12 |
| 392 vswp d3, d10 |
| 393 vtrn.32 q8, q10 |
| 394 vtrn.32 q9, q11 |
| 395 vtrn.32 q12, q14 |
| 396 vtrn.32 q13, q15 |
| 397 vtrn.32 q0, q2 |
| 398 vtrn.32 q1, q3 |
| 399 vtrn.32 q4, q6 |
| 400 vtrn.32 q5, q7 |
| 401 vtrn.16 q8, q9 |
| 402 vtrn.16 q10, q11 |
| 403 vtrn.16 q12, q13 |
| 404 vtrn.16 q14, q15 |
| 405 vtrn.16 q0, q1 |
| 406 vtrn.16 q2, q3 |
| 407 vtrn.16 q4, q5 |
| 408 vtrn.16 q6, q7 |
| 409 |
| 410 ; Store both matrices after each other. There is a stride of 32, which |
| 411 ; adjusts to nothing because of the post-increments. |
| 412 vst1.16 {q8}, [r0]! |
| 413 vst1.16 {q9}, [r0]! |
| 414 vst1.16 {q10}, [r0]! |
| 415 vst1.16 {q11}, [r0]! |
| 416 vst1.16 {q12}, [r0]! |
| 417 vst1.16 {q13}, [r0]! |
| 418 vst1.16 {q14}, [r0]! |
| 419 vst1.16 {q15}, [r0]! |
| 420 vst1.16 {q0}, [r0]! |
| 421 vst1.16 {q1}, [r0]! |
| 422 vst1.16 {q2}, [r0]! |
| 423 vst1.16 {q3}, [r0]! |
| 424 vst1.16 {q4}, [r0]! |
| 425 vst1.16 {q5}, [r0]! |
| 426 vst1.16 {q6}, [r0]! |
| 427 vst1.16 {q7}, [r0]! |
| 428 |
| 429 ; increment pointers by adjusted stride (not necessary for r0/out) |
| 430 ; go back by 7*32 for the seven lines moved fully by read and add |
| 431 ; go back by 32 for the eighth line only read |
| 432 ; advance by 16*2 to go the next pair |
| 433 sub r3, r3, #7*32*2 + 32 - 16*2 |
| 434 ; transpose pair loop processing |
| 435 subs r8, r8, #1 |
| 436 bne idct32_transpose_pair_loop |
| 437 |
| 438 ; restore r0/input to its original value |
| 439 sub r0, r0, #32*8*2 |
| 440 |
| 441 ; Instead of doing the transforms stage by stage, it is done by loading |
| 442 ; some input values and doing as many stages as possible to minimize the |
| 443 ; storing/loading of intermediate results. To fit within registers, the |
| 444 ; final coefficients are cut into four blocks: |
| 445 ; BLOCK A: 16-19,28-31 |
| 446 ; BLOCK B: 20-23,24-27 |
| 447 ; BLOCK C: 8-10,11-15 |
| 448 ; BLOCK D: 0-3,4-7 |
| 449 ; Blocks A and C are straight calculation through the various stages. In |
| 450 ; block B, further calculations are performed using the results from |
| 451 ; block A. In block D, further calculations are performed using the results |
| 452 ; from block C and then the final calculations are done using results from |
| 453 ; block A and B which have been combined at the end of block B. |
| 454 |
| 455 ; -------------------------------------------------------------------------- |
| 456 ; BLOCK A: 16-19,28-31 |
| 457 ; -------------------------------------------------------------------------- |
| 458 ; generate 16,17,30,31 |
| 459 ; -------------------------------------------------------------------------- |
| 460 ; part of stage 1 |
| 461 ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64; |
| 462 ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64; |
| 463 ;step1b[16][i] = dct_const_round_shift(temp1); |
| 464 ;step1b[31][i] = dct_const_round_shift(temp2); |
| 465 LOAD_FROM_TRANSPOSED 0, 1, 31 |
| 466 DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5 |
| 467 ; -------------------------------------------------------------------------- |
| 468 ; part of stage 1 |
| 469 ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64; |
| 470 ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64; |
| 471 ;step1b[17][i] = dct_const_round_shift(temp1); |
| 472 ;step1b[30][i] = dct_const_round_shift(temp2); |
| 473 LOAD_FROM_TRANSPOSED 31, 17, 15 |
| 474 DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7 |
| 475 ; -------------------------------------------------------------------------- |
| 476 ; part of stage 2 |
| 477 ;step2[16] = step1b[16][i] + step1b[17][i]; |
| 478 ;step2[17] = step1b[16][i] - step1b[17][i]; |
| 479 ;step2[30] = -step1b[30][i] + step1b[31][i]; |
| 480 ;step2[31] = step1b[30][i] + step1b[31][i]; |
| 481 vadd.s16 q4, q0, q1 |
| 482 vsub.s16 q13, q0, q1 |
| 483 vadd.s16 q6, q2, q3 |
| 484 vsub.s16 q14, q2, q3 |
| 485 ; -------------------------------------------------------------------------- |
| 486 ; part of stage 3 |
| 487 ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64; |
| 488 ;temp2 = step1b[30][i] * cospi_4_64 - step1b[17][i] * cospi_28_64; |
| 489 ;step3[17] = dct_const_round_shift(temp1); |
| 490 ;step3[30] = dct_const_round_shift(temp2); |
| 491 DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15 |
| 492 ; -------------------------------------------------------------------------- |
| 493 ; generate 18,19,28,29 |
| 494 ; -------------------------------------------------------------------------- |
| 495 ; part of stage 1 |
| 496 ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64; |
| 497 ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64; |
| 498 ;step1b[18][i] = dct_const_round_shift(temp1); |
| 499 ;step1b[29][i] = dct_const_round_shift(temp2); |
| 500 LOAD_FROM_TRANSPOSED 15, 9, 23 |
| 501 DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5 |
| 502 ; -------------------------------------------------------------------------- |
| 503 ; part of stage 1 |
| 504 ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64; |
| 505 ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64; |
| 506 ;step1b[19][i] = dct_const_round_shift(temp1); |
| 507 ;step1b[28][i] = dct_const_round_shift(temp2); |
| 508 LOAD_FROM_TRANSPOSED 23, 25, 7 |
| 509 DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7 |
| 510 ; -------------------------------------------------------------------------- |
| 511 ; part of stage 2 |
| 512 ;step2[18] = -step1b[18][i] + step1b[19][i]; |
| 513 ;step2[19] = step1b[18][i] + step1b[19][i]; |
| 514 ;step2[28] = step1b[28][i] + step1b[29][i]; |
| 515 ;step2[29] = step1b[28][i] - step1b[29][i]; |
| 516 vsub.s16 q13, q3, q2 |
| 517 vadd.s16 q3, q3, q2 |
| 518 vsub.s16 q14, q1, q0 |
| 519 vadd.s16 q2, q1, q0 |
| 520 ; -------------------------------------------------------------------------- |
| 521 ; part of stage 3 |
| 522 ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64); |
| 523 ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64); |
| 524 ;step3[29] = dct_const_round_shift(temp1); |
| 525 ;step3[18] = dct_const_round_shift(temp2); |
| 526 DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1 |
| 527 ; -------------------------------------------------------------------------- |
| 528 ; combine 16-19,28-31 |
| 529 ; -------------------------------------------------------------------------- |
| 530 ; part of stage 4 |
| 531 ;step1[16] = step1b[16][i] + step1b[19][i]; |
| 532 ;step1[17] = step1b[17][i] + step1b[18][i]; |
| 533 ;step1[18] = step1b[17][i] - step1b[18][i]; |
| 534 ;step1[29] = step1b[30][i] - step1b[29][i]; |
| 535 ;step1[30] = step1b[30][i] + step1b[29][i]; |
| 536 ;step1[31] = step1b[31][i] + step1b[28][i]; |
| 537 vadd.s16 q8, q4, q2 |
| 538 vadd.s16 q9, q5, q0 |
| 539 vadd.s16 q10, q7, q1 |
| 540 vadd.s16 q15, q6, q3 |
| 541 vsub.s16 q13, q5, q0 |
| 542 vsub.s16 q14, q7, q1 |
| 543 STORE_IN_OUTPUT 0, 16, 31, q8, q15 |
| 544 STORE_IN_OUTPUT 31, 17, 30, q9, q10 |
| 545 ; -------------------------------------------------------------------------- |
| 546 ; part of stage 5 |
| 547 ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64; |
| 548 ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64; |
| 549 ;step2[18] = dct_const_round_shift(temp1); |
| 550 ;step2[29] = dct_const_round_shift(temp2); |
| 551 DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3 |
| 552 STORE_IN_OUTPUT 30, 29, 18, q1, q0 |
| 553 ; -------------------------------------------------------------------------- |
| 554 ; part of stage 4 |
| 555 ;step1[19] = step1b[16][i] - step1b[19][i]; |
| 556 ;step1[28] = step1b[31][i] - step1b[28][i]; |
| 557 vsub.s16 q13, q4, q2 |
| 558 vsub.s16 q14, q6, q3 |
| 559 ; -------------------------------------------------------------------------- |
| 560 ; part of stage 5 |
| 561 ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64; |
| 562 ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64; |
| 563 ;step2[19] = dct_const_round_shift(temp1); |
| 564 ;step2[28] = dct_const_round_shift(temp2); |
| 565 DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13 |
| 566 STORE_IN_OUTPUT 18, 19, 28, q4, q6 |
| 567 ; -------------------------------------------------------------------------- |
| 568 |
| 569 |
| 570 ; -------------------------------------------------------------------------- |
| 571 ; BLOCK B: 20-23,24-27 |
| 572 ; -------------------------------------------------------------------------- |
| 573 ; generate 20,21,26,27 |
| 574 ; -------------------------------------------------------------------------- |
| 575 ; part of stage 1 |
| 576 ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64; |
| 577 ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64; |
| 578 ;step1b[20][i] = dct_const_round_shift(temp1); |
| 579 ;step1b[27][i] = dct_const_round_shift(temp2); |
| 580 LOAD_FROM_TRANSPOSED 7, 5, 27 |
| 581 DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5 |
| 582 ; -------------------------------------------------------------------------- |
| 583 ; part of stage 1 |
| 584 ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64; |
| 585 ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64; |
| 586 ;step1b[21][i] = dct_const_round_shift(temp1); |
| 587 ;step1b[26][i] = dct_const_round_shift(temp2); |
| 588 LOAD_FROM_TRANSPOSED 27, 21, 11 |
| 589 DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7 |
| 590 ; -------------------------------------------------------------------------- |
| 591 ; part of stage 2 |
| 592 ;step2[20] = step1b[20][i] + step1b[21][i]; |
| 593 ;step2[21] = step1b[20][i] - step1b[21][i]; |
| 594 ;step2[26] = -step1b[26][i] + step1b[27][i]; |
| 595 ;step2[27] = step1b[26][i] + step1b[27][i]; |
| 596 vsub.s16 q13, q0, q1 |
| 597 vadd.s16 q0, q0, q1 |
| 598 vsub.s16 q14, q2, q3 |
| 599 vadd.s16 q2, q2, q3 |
| 600 ; -------------------------------------------------------------------------- |
| 601 ; part of stage 3 |
| 602 ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64; |
| 603 ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64; |
| 604 ;step3[21] = dct_const_round_shift(temp1); |
| 605 ;step3[26] = dct_const_round_shift(temp2); |
| 606 DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 |
| 607 ; -------------------------------------------------------------------------- |
| 608 ; generate 22,23,24,25 |
| 609 ; -------------------------------------------------------------------------- |
| 610 ; part of stage 1 |
| 611 ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64; |
| 612 ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64; |
| 613 ;step1b[22][i] = dct_const_round_shift(temp1); |
| 614 ;step1b[25][i] = dct_const_round_shift(temp2); |
| 615 LOAD_FROM_TRANSPOSED 11, 13, 19 |
| 616 DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15 |
| 617 ; -------------------------------------------------------------------------- |
| 618 ; part of stage 1 |
| 619 ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64; |
| 620 ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64; |
| 621 ;step1b[23][i] = dct_const_round_shift(temp1); |
| 622 ;step1b[24][i] = dct_const_round_shift(temp2); |
| 623 LOAD_FROM_TRANSPOSED 19, 29, 3 |
| 624 DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13 |
| 625 ; -------------------------------------------------------------------------- |
| 626 ; part of stage 2 |
| 627 ;step2[22] = -step1b[22][i] + step1b[23][i]; |
| 628 ;step2[23] = step1b[22][i] + step1b[23][i]; |
| 629 ;step2[24] = step1b[24][i] + step1b[25][i]; |
| 630 ;step2[25] = step1b[24][i] - step1b[25][i]; |
| 631 vsub.s16 q14, q4, q5 |
| 632 vadd.s16 q5, q4, q5 |
| 633 vsub.s16 q13, q6, q7 |
| 634 vadd.s16 q6, q6, q7 |
| 635 ; -------------------------------------------------------------------------- |
| 636 ; part of stage 3 |
| 637 ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64); |
| 638 ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64); |
| 639 ;step3[25] = dct_const_round_shift(temp1); |
| 640 ;step3[22] = dct_const_round_shift(temp2); |
| 641 DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15 |
| 642 ; -------------------------------------------------------------------------- |
| 643 ; combine 20-23,24-27 |
| 644 ; -------------------------------------------------------------------------- |
| 645 ; part of stage 4 |
| 646 ;step1[22] = step1b[22][i] + step1b[21][i]; |
| 647 ;step1[23] = step1b[23][i] + step1b[20][i]; |
| 648 vadd.s16 q10, q7, q1 |
| 649 vadd.s16 q11, q5, q0 |
| 650 ;step1[24] = step1b[24][i] + step1b[27][i]; |
| 651 ;step1[25] = step1b[25][i] + step1b[26][i]; |
| 652 vadd.s16 q12, q6, q2 |
| 653 vadd.s16 q15, q4, q3 |
| 654 ; -------------------------------------------------------------------------- |
| 655 ; part of stage 6 |
| 656 ;step3[16] = step1b[16][i] + step1b[23][i]; |
| 657 ;step3[17] = step1b[17][i] + step1b[22][i]; |
| 658 ;step3[22] = step1b[17][i] - step1b[22][i]; |
| 659 ;step3[23] = step1b[16][i] - step1b[23][i]; |
| 660 LOAD_FROM_OUTPUT 28, 16, 17, q14, q13 |
| 661 vadd.s16 q8, q14, q11 |
| 662 vadd.s16 q9, q13, q10 |
| 663 vsub.s16 q13, q13, q10 |
| 664 vsub.s16 q11, q14, q11 |
| 665 STORE_IN_OUTPUT 17, 17, 16, q9, q8 |
| 666 ; -------------------------------------------------------------------------- |
| 667 ; part of stage 6 |
| 668 ;step3[24] = step1b[31][i] - step1b[24][i]; |
| 669 ;step3[25] = step1b[30][i] - step1b[25][i]; |
| 670 ;step3[30] = step1b[30][i] + step1b[25][i]; |
| 671 ;step3[31] = step1b[31][i] + step1b[24][i]; |
| 672 LOAD_FROM_OUTPUT 16, 30, 31, q14, q9 |
| 673 vsub.s16 q8, q9, q12 |
| 674 vadd.s16 q10, q14, q15 |
| 675 vsub.s16 q14, q14, q15 |
| 676 vadd.s16 q12, q9, q12 |
| 677 STORE_IN_OUTPUT 31, 30, 31, q10, q12 |
| 678 ; -------------------------------------------------------------------------- |
| 679 ; TODO(cd) do some register allocation change to remove these push/pop |
| 680 vpush {q8} ; [24] |
| 681 vpush {q11} ; [23] |
| 682 ; -------------------------------------------------------------------------- |
| 683 ; part of stage 7 |
| 684 ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64; |
| 685 ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64; |
| 686 ;step1[22] = dct_const_round_shift(temp1); |
| 687 ;step1[25] = dct_const_round_shift(temp2); |
| 688 DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 |
| 689 STORE_IN_OUTPUT 31, 25, 22, q14, q13 |
| 690 ; -------------------------------------------------------------------------- |
| 691 ; part of stage 7 |
| 692 ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64; |
| 693 ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64; |
| 694 ;step1[23] = dct_const_round_shift(temp1); |
| 695 ;step1[24] = dct_const_round_shift(temp2); |
| 696 ; TODO(cd) do some register allocation change to remove these push/pop |
| 697 vpop {q13} ; [23] |
| 698 vpop {q14} ; [24] |
    ; NOTE(review): this chunk continues the body of idct32_bands_loop,
    ; whose head (and the DO_BUTTERFLY*/LOAD_*/STORE_* macro definitions)
    ; sit above this excerpt.  q0-q7 hold step1b butterfly outputs from
    ; the preceding code; q13/q14 are the implicit input/output pair of
    ; DO_BUTTERFLY_STD.  Verify register liveness against the macros.
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 22, 24, 23, q14, q13
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[20] = step1b[23][i] - step1b[20][i];
    ;step1[27] = step1b[24][i] - step1b[27][i];
    vsub.s16 q14, q5, q0
    vsub.s16 q13, q6, q2
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64);
    ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
    ;step2[27] = dct_const_round_shift(temp1);
    ;step2[20] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[21] = step1b[22][i] - step1b[21][i];
    ;step1[26] = step1b[25][i] - step1b[26][i];
    vsub.s16 q14, q7, q1
    vsub.s16 q13, q4, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64);
    ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
    ;step2[26] = dct_const_round_shift(temp1);
    ;step2[21] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[18] = step1b[18][i] + step1b[21][i];
    ;step3[19] = step1b[19][i] + step1b[20][i];
    ;step3[20] = step1b[19][i] - step1b[20][i];
    ;step3[21] = step1b[18][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
    vadd.s16 q8, q14, q1
    vadd.s16 q9, q13, q6
    vsub.s16 q13, q13, q6
    vsub.s16 q1, q14, q1
    STORE_IN_OUTPUT 19, 18, 19, q8, q9
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[27] = step1b[28][i] - step1b[27][i];
    ;step3[28] = step1b[28][i] + step1b[27][i];
    ;step3[29] = step1b[29][i] + step1b[26][i];
    ;step3[26] = step1b[29][i] - step1b[26][i];
    LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
    vsub.s16 q14, q8, q5
    vadd.s16 q10, q8, q5
    vadd.s16 q11, q9, q0
    vsub.s16 q0, q9, q0
    STORE_IN_OUTPUT 29, 28, 29, q10, q11
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
    ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
    ;step1[20] = dct_const_round_shift(temp1);
    ;step1[27] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 29, 20, 27, q13, q14
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
    ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
    ;step1[21] = dct_const_round_shift(temp1);
    ;step1[26] = dct_const_round_shift(temp2);
    ; explicit-operand butterfly form: q0/q1 serve as inputs and
    ; (swapped) outputs, unlike the q13/q14-based _STD variant
    DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
    STORE_IN_OUTPUT 27, 21, 26, q1, q0
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK C: 8-10,11-15
    ; --------------------------------------------------------------------------
    ; generate 8,9,14,15
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
    ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
    ;step2[8] = dct_const_round_shift(temp1);
    ;step2[15] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 3, 2, 30
    DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
    ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
    ;step2[9] = dct_const_round_shift(temp1);
    ;step2[14] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 30, 18, 14
    DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;step3[8] = step1b[8][i] + step1b[9][i];
    ;step3[9] = step1b[8][i] - step1b[9][i];
    ;step3[14] = step1b[15][i] - step1b[14][i];
    ;step3[15] = step1b[15][i] + step1b[14][i];
    vsub.s16 q13, q0, q1
    vadd.s16 q0, q0, q1
    vsub.s16 q14, q2, q3
    vadd.s16 q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
    ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64;
    ;step1[9] = dct_const_round_shift(temp1);
    ;step1[14] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 10,11,12,13
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
    ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
    ;step2[10] = dct_const_round_shift(temp1);
    ;step2[13] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 14, 10, 22
    DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
    ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
    ;step2[11] = dct_const_round_shift(temp1);
    ;step2[12] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 22, 26, 6
    DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;step3[10] = step1b[11][i] - step1b[10][i];
    ;step3[11] = step1b[11][i] + step1b[10][i];
    ;step3[12] = step1b[12][i] + step1b[13][i];
    ;step3[13] = step1b[12][i] - step1b[13][i];
    vsub.s16 q14, q4, q5
    vadd.s16 q5, q4, q5
    vsub.s16 q13, q6, q7
    vadd.s16 q6, q6, q7
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64);
    ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
    ;step1[13] = dct_const_round_shift(temp1);
    ;step1[10] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
    ; --------------------------------------------------------------------------
    ; combine 8-10,11-15
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[8] = step1b[8][i] + step1b[11][i];
    ;step2[9] = step1b[9][i] + step1b[10][i];
    ;step2[10] = step1b[9][i] - step1b[10][i];
    vadd.s16 q8, q0, q5
    vadd.s16 q9, q1, q7
    vsub.s16 q13, q1, q7
    ;step2[13] = step1b[14][i] - step1b[13][i];
    ;step2[14] = step1b[14][i] + step1b[13][i];
    ;step2[15] = step1b[15][i] + step1b[12][i];
    vsub.s16 q14, q3, q4
    vadd.s16 q10, q3, q4
    vadd.s16 q15, q2, q6
    STORE_IN_OUTPUT 26, 8, 15, q8, q15
    STORE_IN_OUTPUT 15, 9, 14, q9, q10
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
    ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
    ;step3[10] = dct_const_round_shift(temp1);
    ;step3[13] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    STORE_IN_OUTPUT 14, 13, 10, q3, q1
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[11] = step1b[8][i] - step1b[11][i];
    ;step2[12] = step1b[15][i] - step1b[12][i];
    vsub.s16 q13, q0, q5
    vsub.s16 q14, q2, q6
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
    ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
    ;step3[11] = dct_const_round_shift(temp1);
    ;step3[12] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    STORE_IN_OUTPUT 10, 11, 12, q1, q3
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK D: 0-3,4-7
    ; --------------------------------------------------------------------------
    ; generate 4,5,6,7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
    ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
    ;step3[4] = dct_const_round_shift(temp1);
    ;step3[7] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 6, 4, 28
    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
    ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
    ;step3[5] = dct_const_round_shift(temp1);
    ;step3[6] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 28, 20, 12
    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[4] = step1b[4][i] + step1b[5][i];
    ;step1[5] = step1b[4][i] - step1b[5][i];
    ;step1[6] = step1b[7][i] - step1b[6][i];
    ;step1[7] = step1b[7][i] + step1b[6][i];
    vsub.s16 q13, q0, q1
    vadd.s16 q0, q0, q1
    vsub.s16 q14, q2, q3
    vadd.s16 q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
    ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
    ;step2[5] = dct_const_round_shift(temp1);
    ;step2[6] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 0,1,2,3
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
    ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
    ;step1[1] = dct_const_round_shift(temp1);
    ;step1[0] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 12, 0, 16
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
    ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
    ;step1[2] = dct_const_round_shift(temp1);
    ;step1[3] = dct_const_round_shift(temp2);
    ; NOTE: this is the last LOAD_FROM_TRANSPOSED of the band; the
    ; epilogues below undo its final 24*8*2 offset on r0
    LOAD_FROM_TRANSPOSED 16, 8, 24
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[0] = step1b[0][i] + step1b[3][i];
    ;step2[1] = step1b[1][i] + step1b[2][i];
    ;step2[2] = step1b[1][i] - step1b[2][i];
    ;step2[3] = step1b[0][i] - step1b[3][i];
    vadd.s16 q4, q7, q6
    vsub.s16 q7, q7, q6
    vsub.s16 q6, q5, q14
    vadd.s16 q5, q5, q14
    ; --------------------------------------------------------------------------
    ; combine 0-3,4-7
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[0] = step1b[0][i] + step1b[7][i];
    ;step3[1] = step1b[1][i] + step1b[6][i];
    ;step3[2] = step1b[2][i] + step1b[5][i];
    ;step3[3] = step1b[3][i] + step1b[4][i];
    vadd.s16 q8, q4, q2
    vadd.s16 q9, q5, q3
    vadd.s16 q10, q6, q1
    vadd.s16 q11, q7, q0
    ;step3[4] = step1b[3][i] - step1b[4][i];
    ;step3[5] = step1b[2][i] - step1b[5][i];
    ;step3[6] = step1b[1][i] - step1b[6][i];
    ;step3[7] = step1b[0][i] - step1b[7][i];
    ; q12-q15 stay live across the epilogue below (stage-7 combines)
    vsub.s16 q12, q7, q0
    vsub.s16 q13, q6, q1
    vsub.s16 q14, q5, q3
    vsub.s16 q15, q4, q2
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[0] = step1b[0][i] + step1b[15][i];
    ;step1[1] = step1b[1][i] + step1b[14][i];
    ;step1[14] = step1b[1][i] - step1b[14][i];
    ;step1[15] = step1b[0][i] - step1b[15][i];
    LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
    vadd.s16 q2, q8, q1
    vadd.s16 q3, q9, q0
    vsub.s16 q4, q9, q0
    vsub.s16 q5, q8, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[14 * 32] = step1b[14][i] + step1b[17][i];
    ;output[15 * 32] = step1b[15][i] + step1b[16][i];
    ;output[16 * 32] = step1b[15][i] - step1b[16][i];
    ;output[17 * 32] = step1b[14][i] - step1b[17][i];
    LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1

    ; r5 is the pass counter: 0 during the first pass, incremented to 1
    ; before the second pass (see "add r5, r5, #1" below), so this branch
    ; selects the 2nd-pass epilogue that combines results with the
    ; destination instead of storing to the intermediate buffer.
    cmp r5, #0
    bgt idct32_bands_end_2nd_pass

; First-pass band epilogue: finishes the stage-7/final-stage combines and
; writes the results back to the intermediate output buffer (addressed via
; r1 by the STORE_IN_OUTPUT macro), then advances the band/pass
; bookkeeping.  NOTE(review): macro definitions and the loop head are
; above this excerpt - offsets below are in units of
; offset * 32 (row stride) * 2 (bytes per int16).
idct32_bands_end_1st_pass
    STORE_IN_OUTPUT 17, 16, 17, q6, q7
    STORE_IN_OUTPUT 17, 14, 15, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
    LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 31, 30, 31, q6, q7
    STORE_IN_OUTPUT 31, 0, 1, q4, q5
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[2] = step1b[2][i] + step1b[13][i];
    ;step1[3] = step1b[3][i] + step1b[12][i];
    ;step1[12] = step1b[3][i] - step1b[12][i];
    ;step1[13] = step1b[2][i] - step1b[13][i];
    LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
    vadd.s16 q2, q10, q1
    vadd.s16 q3, q11, q0
    vsub.s16 q4, q11, q0
    vsub.s16 q5, q10, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_IN_OUTPUT 19, 18, 19, q6, q7
    STORE_IN_OUTPUT 19, 12, 13, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
    LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 29, 28, 29, q6, q7
    STORE_IN_OUTPUT 29, 2, 3, q4, q5
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[4] = step1b[4][i] + step1b[11][i];
    ;step1[5] = step1b[5][i] + step1b[10][i];
    ;step1[10] = step1b[5][i] - step1b[10][i];
    ;step1[11] = step1b[4][i] - step1b[11][i];
    LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
    vadd.s16 q2, q12, q1
    vadd.s16 q3, q13, q0
    vsub.s16 q4, q13, q0
    vsub.s16 q5, q12, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_IN_OUTPUT 21, 20, 21, q6, q7
    STORE_IN_OUTPUT 21, 10, 11, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
    LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 27, 26, 27, q6, q7
    STORE_IN_OUTPUT 27, 4, 5, q4, q5
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[6] = step1b[6][i] + step1b[9][i];
    ;step1[7] = step1b[7][i] + step1b[8][i];
    ;step1[8] = step1b[7][i] - step1b[8][i];
    ;step1[9] = step1b[6][i] - step1b[9][i];
    LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
    vadd.s16 q2, q14, q1
    vadd.s16 q3, q15, q0
    vsub.s16 q4, q15, q0
    vsub.s16 q5, q14, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_IN_OUTPUT 23, 22, 23, q6, q7
    STORE_IN_OUTPUT 23, 8, 9, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
    LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 25, 24, 25, q6, q7
    STORE_IN_OUTPUT 25, 6, 7, q4, q5

    ; restore r0 by removing the last offset from the last
    ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
    sub r0, r0, #24*8*2
    ; restore r1 by removing the last offset from the last
    ; operation (STORE_IN_OUTPUT 25, 6, 7) => 7*32*2
    ; advance by 8 columns => 8*2
    sub r1, r1, #7*32*2 - 8*2
    ; advance by 8 lines (8*32*2)
    ; go back by the two pairs from the loop (32*2)
    add r3, r3, #8*32*2 - 32*2

    ; bands loop processing
    ; r4 = remaining 8-column bands in this pass
    subs r4, r4, #1
    bne idct32_bands_loop

    ; parameters for second pass
    ; the input of pass2 is the result of pass1. we have to remove the offset
    ; of 32 columns induced by the above idct32_bands_loop
    sub r3, r1, #32*2
    ; r1 = pass2[32 * 32]
    ; (the pass-2 scratch buffer lives on the stack, above the first 2048
    ;  bytes reserved at function entry - see the epilogue's sp restore)
    add r1, sp, #2048

    ; pass loop processing
    ; bump the pass counter so the "cmp r5, #0 / bgt" above takes the
    ; 2nd-pass epilogue next time around
    add r5, r5, #1
    b idct32_pass_loop

; Second-pass band epilogue: same final-stage combines as the 1st-pass
; version, but results are merged into the destination image via the
; STORE_COMBINE_{CENTER,EXTREME}_RESULTS macros (defined above this
; excerpt) instead of being stored to the intermediate buffer.  After the
; last band the function tears down its stack frame and returns.
idct32_bands_end_2nd_pass
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
    LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[2] = step1b[2][i] + step1b[13][i];
    ;step1[3] = step1b[3][i] + step1b[12][i];
    ;step1[12] = step1b[3][i] - step1b[12][i];
    ;step1[13] = step1b[2][i] - step1b[13][i];
    LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
    vadd.s16 q2, q10, q1
    vadd.s16 q3, q11, q0
    vsub.s16 q4, q11, q0
    vsub.s16 q5, q10, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
    LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[4] = step1b[4][i] + step1b[11][i];
    ;step1[5] = step1b[5][i] + step1b[10][i];
    ;step1[10] = step1b[5][i] - step1b[10][i];
    ;step1[11] = step1b[4][i] - step1b[11][i];
    LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
    vadd.s16 q2, q12, q1
    vadd.s16 q3, q13, q0
    vsub.s16 q4, q13, q0
    vsub.s16 q5, q12, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
    LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[6] = step1b[6][i] + step1b[9][i];
    ;step1[7] = step1b[7][i] + step1b[8][i];
    ;step1[8] = step1b[7][i] - step1b[8][i];
    ;step1[9] = step1b[6][i] - step1b[9][i];
    LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
    vadd.s16 q2, q14, q1
    vadd.s16 q3, q15, q0
    vsub.s16 q4, q15, q0
    vsub.s16 q5, q14, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    ; _LAST variants also advance the destination pointers by 8 (see
    ; the pointer-restore comment below)
    STORE_COMBINE_CENTER_RESULTS_LAST
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
    LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS_LAST
    ; --------------------------------------------------------------------------
    ; restore pointers to their initial indices for next band pass by
    ; removing/adding dest_stride * 8. The actual increment by eight
    ; is taken care of within the _LAST macros.
    ; (r2 = dest_stride; r6/r9 walk down, r7/r10 walk up the dest rows)
    add r6, r6, r2, lsl #3
    add r9, r9, r2, lsl #3
    sub r7, r7, r2, lsl #3
    sub r10, r10, r2, lsl #3

    ; restore r0 by removing the last offset from the last
    ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
    sub r0, r0, #24*8*2
    ; restore r1 by removing the last offset from the last
    ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
    ; advance by 8 columns => 8*2
    sub r1, r1, #25*32*2 - 8*2
    ; advance by 8 lines (8*32*2)
    ; go back by the two pairs from the loop (32*2)
    add r3, r3, #8*32*2 - 32*2

    ; bands loop processing
    ; r4 = remaining 8-column bands in this pass
    subs r4, r4, #1
    bne idct32_bands_loop

    ; stack operation
    ; free the scratch buffers reserved at function entry
    ; (512 + 2048 + 2048 bytes), then restore callee-saved registers
    ; in reverse order of the entry pushes and return
    add sp, sp, #512+2048+2048
    vpop {d8-d15}
    pop {r4-r11}
    bx lr
    ENDP  ; |vp9_idct32x32_1024_add_neon|
    END
OLD | NEW |