| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
| 3 ; | 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; | 9 ; |
| 10 | 10 |
| (...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 65 vld1.s16 {q14}, [r0] | 65 vld1.s16 {q14}, [r0] |
| 66 add r0, #($second_offset - $first_offset)*8*2 | 66 add r0, #($second_offset - $first_offset)*8*2 |
| 67 vld1.s16 {q13}, [r0] | 67 vld1.s16 {q13}, [r0] |
| 68 ; (used) two registers (q14, q13) | 68 ; (used) two registers (q14, q13) |
| 69 MEND | 69 MEND |
| 70 ; -------------------------------------------------------------------------- | 70 ; -------------------------------------------------------------------------- |
| 71 ; Load from output (used as temporary storage) | 71 ; Load two registers from output (used as temporary storage), addressed through r1 |
| 72 ; reg1 = output[first_offset] | 72 ; reg1 = output[first_offset] |
| 73 ; reg2 = output[second_offset] | 73 ; reg2 = output[second_offset] |
| 74 ; for proper address calculation, the last offset used when manipulating | 74 ; r1 is updated in place, so for proper address calculation the last offset used when manipulating |
| 75 ; output, wethere reading or storing) must be passed in. use 0 for first | 75 ; output (whether reading or storing) must be passed in as prev_offset. use 0 for first |
| 76 ; use. | 76 ; use. |
| 77 MACRO | 77 MACRO |
| 78 LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 | 78 LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 |
| 79 ; address calculation with proper stride and loading | 79 ; address calculation with proper stride and loading: each offset unit moves r1 by 32*2 bytes |
| 80 add r1, #($first_offset - $prev_offset )*32*2 | 80 add r1, #($first_offset - $prev_offset )*32*2 |
| 81 vld1.s16 {$reg1}, [r1] | 81 vld1.s16 {$reg1}, [r1] |
| 82 add r1, #($second_offset - $first_offset)*32*2 | 82 add r1, #($second_offset - $first_offset)*32*2 |
| 83 vld1.s16 {$reg2}, [r1] | 83 vld1.s16 {$reg2}, [r1] |
| 84 ; (used) two registers ($reg1, $reg2) | 84 ; (used) two registers ($reg1, $reg2); r1 now corresponds to second_offset |
| 85 MEND | 85 MEND |
| 86 ; -------------------------------------------------------------------------- | 86 ; -------------------------------------------------------------------------- |
| 87 ; Store into output (sometimes as as temporary storage) | 87 ; Store two registers into output (sometimes as temporary storage), addressed through r1 |
| 88 ; output[first_offset] = reg1 | 88 ; output[first_offset] = reg1 |
| 89 ; output[second_offset] = reg2 | 89 ; output[second_offset] = reg2 |
| 90 ; for proper address calculation, the last offset used when manipulating | 90 ; r1 is updated in place, so for proper address calculation the last offset used when manipulating |
| 91 ; output, wethere reading or storing) must be passed in. use 0 for first | 91 ; output (whether reading or storing) must be passed in as prev_offset. use 0 for first |
| 92 ; use. | 92 ; use. |
| 93 MACRO | 93 MACRO |
| 94 STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 | 94 STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 |
| 95 ; address calculation with proper stride and storing | 95 ; address calculation with proper stride and storing: each offset unit moves r1 by 32*2 bytes; r1 ends at second_offset |
| 96 add r1, #($first_offset - $prev_offset )*32*2 | 96 add r1, #($first_offset - $prev_offset )*32*2 |
| 97 vst1.16 {$reg1}, [r1] | 97 vst1.16 {$reg1}, [r1] |
| 98 add r1, #($second_offset - $first_offset)*32*2 | 98 add r1, #($second_offset - $first_offset)*32*2 |
| 99 vst1.16 {$reg2}, [r1] | 99 vst1.16 {$reg2}, [r1] |
| 100 MEND | 100 MEND |
| 101 ; -------------------------------------------------------------------------- | 101 ; -------------------------------------------------------------------------- |
| (...skipping 133 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 235 add r7, r7, r2, lsl #1 | 235 add r7, r7, r2, lsl #1 |
| 236 MEND | 236 MEND |
| 237 ; -------------------------------------------------------------------------- | 237 ; -------------------------------------------------------------------------- |
| 238 ; Touches q8-q12, q15 (q13-q14 are preserved) | 238 ; Touches q8-q12, q15 and the scratch registers r8, r12 (q13-q14 are preserved) |
| 239 ; valid output registers are anything but q8-q11 | 239 ; valid output registers are anything but q8-q11 (they hold the 32-bit results while the outputs are written) |
| 240 MACRO | 240 MACRO |
| 241 DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant,
$reg1, $reg2, $reg3, $reg4 | 241 DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant,
$reg1, $reg2, $reg3, $reg4 |
| 242 ; TODO(cd): have special case to re-use constants when they are similar for | 242 ; TODO(cd): have special case to re-use constants when they are similar for |
| 243 ; consecutive butterflies | 243 ; consecutive butterflies |
| 244 ; TODO(cd): have special case when both constants are the same, do the | 244 ; TODO(cd): have special case when both constants are the same, do the |
| 245 ; additions/substractions before the multiplies. | 245 ; additions/subtractions before the multiplies. |
| 246 ; generate the constants | 246 ; generate the constants |
| 247 ; generate scalar constants | 247 ; generate scalar constants: mov the high byte, then add the low byte (presumably so each part fits an immediate - confirm) |
| 248 mov r8, #$first_constant & 0xFF00 | 248 mov r8, #$first_constant & 0xFF00 |
| 249 mov r12, #$second_constant & 0xFF00 | 249 mov r12, #$second_constant & 0xFF00 |
| 250 add r8, #$first_constant & 0x00FF | 250 add r8, #$first_constant & 0x00FF |
| 251 add r12, #$second_constant & 0x00FF | 251 add r12, #$second_constant & 0x00FF |
| 252 ; generate vector constants | 252 ; generate vector constants: d30 = first_constant in every lane, d31 = second_constant in every lane |
| 253 vdup.16 d30, r8 | 253 vdup.16 d30, r8 |
| 254 vdup.16 d31, r12 | 254 vdup.16 d31, r12 |
| 255 ; (used) two for inputs (regA-regD), one for constants (q15) | 255 ; (used) two for inputs (regA-regD), one for constants (q15) |
| 256 ; do some multiplications (ordered for maximum latency hiding) | 256 ; do some multiplications (ordered for maximum latency hiding) |
| 257 vmull.s16 q8, $regC, d30 | 257 vmull.s16 q8, $regC, d30 |
| 258 vmull.s16 q10, $regA, d31 | 258 vmull.s16 q10, $regA, d31 |
| 259 vmull.s16 q9, $regD, d30 | 259 vmull.s16 q9, $regD, d30 |
| 260 vmull.s16 q11, $regB, d31 | 260 vmull.s16 q11, $regB, d31 |
| 261 vmull.s16 q12, $regC, d31 | 261 vmull.s16 q12, $regC, d31 |
| 262 ; (used) five for intermediate (q8-q12), one for constants (q15) | 262 ; (used) five for intermediate (q8-q12), one for constants (q15) |
| 263 ; do some addition/substractions (to get back two register) | 263 ; do some subtractions (to get back two registers, q10 and q11, for reuse below) |
| 264 vsub.s32 q8, q8, q10 | 264 vsub.s32 q8, q8, q10 |
| 265 vsub.s32 q9, q9, q11 | 265 vsub.s32 q9, q9, q11 |
| 266 ; do more multiplications (ordered for maximum latency hiding) | 266 ; do more multiplications (ordered for maximum latency hiding) |
| 267 vmull.s16 q10, $regD, d31 | 267 vmull.s16 q10, $regD, d31 |
| 268 vmull.s16 q11, $regA, d30 | 268 vmull.s16 q11, $regA, d30 |
| 269 vmull.s16 q15, $regB, d30 | 269 vmull.s16 q15, $regB, d30 |
| 270 ; (used) six for intermediate (q8-q12, q15) | 270 ; (used) six for intermediate (q8-q12, q15); q15 no longer holds the constants |
| 271 ; do more addition/substractions | 271 ; do more addition/subtractions |
| 272 vadd.s32 q11, q12, q11 | 272 vadd.s32 q11, q12, q11 |
| 273 vadd.s32 q10, q10, q15 | 273 vadd.s32 q10, q10, q15 |
| 274 ; (used) four for intermediate (q8-q11) | 274 ; (used) four for intermediate: q8 = C*first - A*second, q9 = D*first - B*second, q11 = C*second + A*first, q10 = D*second + B*first |
| 275 ; dct_const_round_shift | 275 ; dct_const_round_shift: rounding right shift by 14 with saturating narrow (vqrshrn) |
| 276 vqrshrn.s32 $reg1, q8, #14 | 276 vqrshrn.s32 $reg1, q8, #14 |
| 277 vqrshrn.s32 $reg2, q9, #14 | 277 vqrshrn.s32 $reg2, q9, #14 |
| 278 vqrshrn.s32 $reg3, q11, #14 | 278 vqrshrn.s32 $reg3, q11, #14 |
| 279 vqrshrn.s32 $reg4, q10, #14 | 279 vqrshrn.s32 $reg4, q10, #14 |
| 280 ; (used) two for results, well four d registers | 280 ; (used) four d registers for results ($reg1-$reg4), i.e. the equivalent of two q registers |
| 281 MEND | 281 MEND |
| (...skipping 1008 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1290 subs r4, r4, #1 | 1290 subs r4, r4, #1 |
| 1291 bne idct32_bands_loop | 1291 bne idct32_bands_loop |
| 1292 | 1292 |
| 1293 ; stack operation | 1293 ; stack operation |
| 1294 add sp, sp, #512+2048+2048 | 1294 add sp, sp, #512+2048+2048 |
| 1295 vpop {d8-d15} | 1295 vpop {d8-d15} |
| 1296 pop {r4-r11} | 1296 pop {r4-r11} |
| 1297 bx lr | 1297 bx lr |
| 1298 ENDP ; |vp9_idct32x32_1024_add_neon| | 1298 ENDP ; |vp9_idct32x32_1024_add_neon| |
| 1299 END | 1299 END |
| OLD | NEW |