| OLD | NEW |
| --- | --- |
| 1 ; | 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; | 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; | 9 ; |
| 10 EXPORT |vp8_short_inv_walsh4x4_neon| | 10 EXPORT |vp8_short_inv_walsh4x4_neon| |
| 11 EXPORT |vp8_short_inv_walsh4x4_1_neon| | 11 EXPORT |vp8_short_inv_walsh4x4_1_neon| |
| 12 | 12 |
| 13 ARM | 13 ARM |
| 14 REQUIRE8 | 14 REQUIRE8 |
| 15 PRESERVE8 | 15 PRESERVE8 |
| 16 | 16 |
| 17 AREA |.text|, CODE, READONLY ; name this block of code | 17 AREA |.text|, CODE, READONLY ; name this block of code |
| 18 | 18 |
| 19 ;short vp8_short_inv_walsh4x4_neon(short *input, short *output) | 19 ;short vp8_short_inv_walsh4x4_neon(short *input, short *output) |
| 20 |vp8_short_inv_walsh4x4_neon| PROC | 20 |vp8_short_inv_walsh4x4_neon| PROC |
| 21 | 21 |
| 22 ; read in all four lines of values: d0->d3 | 22 ; read in all four lines of values: d0->d3 |
| 23 vldm.64 r0, {q0, q1} | 23 vld1.i16 {q0-q1}, [r0@128] |
| 24 | 24 |
| 25 ; first for loop | 25 ; first for loop |
| | 26 vadd.s16 d4, d0, d3 ;a = [0] + [12] |
| | 27 vadd.s16 d6, d1, d2 ;b = [4] + [8] |
| | 28 vsub.s16 d5, d0, d3 ;d = [0] - [12] |
| | 29 vsub.s16 d7, d1, d2 ;c = [4] - [8] |
| 26 | 30 |
| 27 vadd.s16 d4, d0, d3 ;a = [0] + [12] | 31 vadd.s16 q0, q2, q3 ; a+b d+c |
| 28 vadd.s16 d5, d1, d2 ;b = [4] + [8] | 32 vsub.s16 q1, q2, q3 ; a-b d-c |
| 29 vsub.s16 d6, d1, d2 ;c = [4] - [8] | |
| 30 vsub.s16 d7, d0, d3 ;d = [0] - [12] | |
| 31 | |
| 32 vadd.s16 d0, d4, d5 ;a + b | |
| 33 vadd.s16 d1, d6, d7 ;c + d | |
| 34 vsub.s16 d2, d4, d5 ;a - b | |
| 35 vsub.s16 d3, d7, d6 ;d - c | |
| 36 | 33 |
| 37 vtrn.32 d0, d2 ;d0: 0 1 8 9 | 34 vtrn.32 d0, d2 ;d0: 0 1 8 9 |
| 38 ;d2: 2 3 10 11 | 35 ;d2: 2 3 10 11 |
| 39 vtrn.32 d1, d3 ;d1: 4 5 12 13 | 36 vtrn.32 d1, d3 ;d1: 4 5 12 13 |
| 40 ;d3: 6 7 14 15 | 37 ;d3: 6 7 14 15 |
| 41 | 38 |
| 42 vtrn.16 d0, d1 ;d0: 0 4 8 12 | 39 vtrn.16 d0, d1 ;d0: 0 4 8 12 |
| 43 ;d1: 1 5 9 13 | 40 ;d1: 1 5 9 13 |
| 44 vtrn.16 d2, d3 ;d2: 2 6 10 14 | 41 vtrn.16 d2, d3 ;d2: 2 6 10 14 |
| 45 ;d3: 3 7 11 15 | 42 ;d3: 3 7 11 15 |
| 46 | 43 |
| 47 ; second for loop | 44 ; second for loop |
| 48 | 45 |
| 49 vadd.s16 d4, d0, d3 ;a = [0] + [3] | 46 vadd.s16 d4, d0, d3 ;a = [0] + [3] |
| 50 vadd.s16 d5, d1, d2 ;b = [1] + [2] | 47 vadd.s16 d6, d1, d2 ;b = [1] + [2] |
| 51 vsub.s16 d6, d1, d2 ;c = [1] - [2] | 48 vsub.s16 d5, d0, d3 ;d = [0] - [3] |
| 52 vsub.s16 d7, d0, d3 ;d = [0] - [3] | 49 vsub.s16 d7, d1, d2 ;c = [1] - [2] |
| 53 | 50 |
| 54 vadd.s16 d0, d4, d5 ;e = a + b | 51 vmov.i16 q8, #3 |
| 55 vadd.s16 d1, d6, d7 ;f = c + d | |
| 56 vsub.s16 d2, d4, d5 ;g = a - b | |
| 57 vsub.s16 d3, d7, d6 ;h = d - c | |
| 58 | 52 |
| 59 vmov.i16 q2, #3 | 53 vadd.s16 q0, q2, q3 ; a+b d+c |
| 60 vadd.i16 q0, q0, q2 ;e/f += 3 | 54 vsub.s16 q1, q2, q3 ; a-b d-c |
| 61 vadd.i16 q1, q1, q2 ;g/h += 3 | 55 |
| | 56 vadd.i16 q0, q0, q8 ;e/f += 3 |
| | 57 vadd.i16 q1, q1, q8 ;g/h += 3 |
| 62 | 58 |
| 63 vshr.s16 q0, q0, #3 ;e/f >> 3 | 59 vshr.s16 q0, q0, #3 ;e/f >> 3 |
| 64 vshr.s16 q1, q1, #3 ;g/h >> 3 | 60 vshr.s16 q1, q1, #3 ;g/h >> 3 |
| 65 | 61 |
| 66 vtrn.32 d0, d2 | 62 vst4.i16 {d0,d1,d2,d3}, [r1@128] |
| 67 vtrn.32 d1, d3 | |
| 68 vtrn.16 d0, d1 | |
| 69 vtrn.16 d2, d3 | |
| 70 | |
| 71 vstmia.16 r1!, {q0} | |
| 72 vstmia.16 r1!, {q1} | |
| 73 | 63 |
| 74 bx lr | 64 bx lr |
| 75 ENDP ; |vp8_short_inv_walsh4x4_neon| | 65 ENDP ; |vp8_short_inv_walsh4x4_neon| |
| 76 | 66 |
| 77 | 67 |
| 78 ;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output) | 68 ;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output) |
| 79 |vp8_short_inv_walsh4x4_1_neon| PROC | 69 |vp8_short_inv_walsh4x4_1_neon| PROC |
| 80 ; load a full line into a neon register | 70 ldrsh r2, [r0] ; load input[0] |
| 81 vld1.16 {q0}, [r0] | 71 add r3, r2, #3 ; add 3 |
| 82 ; extract first element and replicate | 72 add r2, r1, #16 ; base for last 8 output |
| 83 vdup.16 q1, d0[0] | 73 asr r0, r3, #3 ; right shift 3 |
| 84 ; add 3 to all values | 74 vdup.16 q0, r0 ; load and duplicate |
| 85 vmov.i16 q2, #3 | 75 vst1.16 {q0}, [r1@128] ; write back 8 |
| 86 vadd.i16 q3, q1, q2 | 76 vst1.16 {q0}, [r2@128] ; write back last 8 |
| 87 ; right shift | |
| 88 vshr.s16 q3, q3, #3 | |
| 89 ; write it back | |
| 90 vstmia.16 r1!, {q3} | |
| 91 vstmia.16 r1!, {q3} | |
| 92 | |
| 93 bx lr | 77 bx lr |
| 94 ENDP ; |vp8_short_inv_walsh4x4_1_neon| | 78 ENDP ; |vp8_short_inv_walsh4x4_1_neon| |
| 95 | 79 |
| 96 END | 80 END |
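
For readers who want the transform in scalar form, the following C sketch restates what both the old and new NEON versions compute, reconstructed from the assembly comments above. It is modeled on the scalar path in libvpx but the names and structure here are illustrative, not the project's exact reference code:

```c
/* Plain-C sketch of the 4x4 inverse Walsh-Hadamard transform that both
 * NEON versions implement, reconstructed from the assembly comments. */
void inv_walsh4x4_sketch(const short *input, short *output)
{
    short tmp[16];

    /* First pass: columns of the 4x4 block (stride 4). */
    for (int i = 0; i < 4; i++) {
        int a = input[i + 0] + input[i + 12]; /* a = [0] + [12] */
        int b = input[i + 4] + input[i + 8];  /* b = [4] + [8]  */
        int c = input[i + 4] - input[i + 8];  /* c = [4] - [8]  */
        int d = input[i + 0] - input[i + 12]; /* d = [0] - [12] */

        tmp[i + 0]  = (short)(a + b);
        tmp[i + 4]  = (short)(c + d);
        tmp[i + 8]  = (short)(a - b);
        tmp[i + 12] = (short)(d - c);
    }

    /* Second pass: rows, with rounding (+= 3, >> 3). */
    for (int i = 0; i < 4; i++) {
        int a = tmp[4 * i + 0] + tmp[4 * i + 3]; /* a = [0] + [3] */
        int b = tmp[4 * i + 1] + tmp[4 * i + 2]; /* b = [1] + [2] */
        int c = tmp[4 * i + 1] - tmp[4 * i + 2]; /* c = [1] - [2] */
        int d = tmp[4 * i + 0] - tmp[4 * i + 3]; /* d = [0] - [3] */

        output[4 * i + 0] = (short)((a + b + 3) >> 3); /* e */
        output[4 * i + 1] = (short)((c + d + 3) >> 3); /* f */
        output[4 * i + 2] = (short)((a - b + 3) >> 3); /* g */
        output[4 * i + 3] = (short)((d - c + 3) >> 3); /* h */
    }
}

/* The _1 variant is the DC-only case: every output element is
 * (input[0] + 3) >> 3, which the new code computes once in a core
 * register and then broadcasts with vdup/vst1. */
void inv_walsh4x4_1_sketch(const short *input, short *output)
{
    short dc = (short)((input[0] + 3) >> 3);
    for (int i = 0; i < 16; i++)
        output[i] = dc;
}
```

The main changes visible in the diff: the butterfly results are now written to d4/d6 and d5/d7 so that the pairs (a, d) and (b, c) land in q2 and q3, letting each half of the combine step run as a single q-register `vadd`/`vsub` instead of four d-register operations; the rounding constant moves from q2 to q8 because q2/q3 are now occupied; and the final `vst4.i16` interleaves lanes on store, absorbing the second transpose that previously needed four explicit `vtrn` instructions before the `vstmia` writes.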