| OLD | NEW | 
|---|---|
| 1 ; | 1 ; | 
| 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 
| 3 ; | 3 ; | 
| 4 ;  Use of this source code is governed by a BSD-style license | 4 ;  Use of this source code is governed by a BSD-style license | 
| 5 ;  that can be found in the LICENSE file in the root of the source | 5 ;  that can be found in the LICENSE file in the root of the source | 
| 6 ;  tree. An additional intellectual property rights grant can be found | 6 ;  tree. An additional intellectual property rights grant can be found | 
| 7 ;  in the file PATENTS.  All contributing project authors may | 7 ;  in the file PATENTS.  All contributing project authors may | 
| 8 ;  be found in the AUTHORS file in the root of the source tree. | 8 ;  be found in the AUTHORS file in the root of the source tree. | 
| 9 ; | 9 ; | 
| 10     EXPORT  |vp8_short_inv_walsh4x4_neon| | 10     EXPORT  |vp8_short_inv_walsh4x4_neon| | 
| 11     EXPORT  |vp8_short_inv_walsh4x4_1_neon| | 11     EXPORT  |vp8_short_inv_walsh4x4_1_neon| | 
| 12 | 12 | 
| 13     ARM | 13     ARM | 
| 14     REQUIRE8 | 14     REQUIRE8 | 
| 15     PRESERVE8 | 15     PRESERVE8 | 
| 16 | 16 | 
| 17     AREA    |.text|, CODE, READONLY  ; name this block of code | 17     AREA    |.text|, CODE, READONLY  ; name this block of code | 
| 18 | 18 | 
| 19 ;short vp8_short_inv_walsh4x4_neon(short *input, short *output) | 19 ;short vp8_short_inv_walsh4x4_neon(short *input, short *output) | 
| 20 |vp8_short_inv_walsh4x4_neon| PROC | 20 |vp8_short_inv_walsh4x4_neon| PROC | 
| 21 | 21 | 
| 22     ; read in all four lines of values: d0->d3 | 22     ; read in all four lines of values: d0->d3 | 
| 23     vldm.64 r0, {q0, q1} | 23     vld1.i16 {q0-q1}, [r0@128] | 
| 24 | 24 | 
| 25     ; first for loop | 25     ; first for loop | 
|  | 26     vadd.s16 d4, d0, d3 ;a = [0] + [12] | 
|  | 27     vadd.s16 d6, d1, d2 ;b = [4] + [8] | 
|  | 28     vsub.s16 d5, d0, d3 ;d = [0] - [12] | 
|  | 29     vsub.s16 d7, d1, d2 ;c = [4] - [8] | 
| 26 | 30 | 
| 27     vadd.s16 d4, d0, d3 ;a = [0] + [12] | 31     vadd.s16 q0, q2, q3 ; a+b d+c | 
| 28     vadd.s16 d5, d1, d2 ;b = [4] + [8] | 32     vsub.s16 q1, q2, q3 ; a-b d-c | 
| 29     vsub.s16 d6, d1, d2 ;c = [4] - [8] |  | 
| 30     vsub.s16 d7, d0, d3 ;d = [0] - [12] |  | 
| 31 |  | 
| 32     vadd.s16 d0, d4, d5 ;a + b |  | 
| 33     vadd.s16 d1, d6, d7 ;c + d |  | 
| 34     vsub.s16 d2, d4, d5 ;a - b |  | 
| 35     vsub.s16 d3, d7, d6 ;d - c |  | 
| 36 | 33 | 
| 37     vtrn.32 d0, d2 ;d0:  0  1  8  9 | 34     vtrn.32 d0, d2 ;d0:  0  1  8  9 | 
| 38                    ;d2:  2  3 10 11 | 35                    ;d2:  2  3 10 11 | 
| 39     vtrn.32 d1, d3 ;d1:  4  5 12 13 | 36     vtrn.32 d1, d3 ;d1:  4  5 12 13 | 
| 40                    ;d3:  6  7 14 15 | 37                    ;d3:  6  7 14 15 | 
| 41 | 38 | 
| 42     vtrn.16 d0, d1 ;d0:  0  4  8 12 | 39     vtrn.16 d0, d1 ;d0:  0  4  8 12 | 
| 43                    ;d1:  1  5  9 13 | 40                    ;d1:  1  5  9 13 | 
| 44     vtrn.16 d2, d3 ;d2:  2  6 10 14 | 41     vtrn.16 d2, d3 ;d2:  2  6 10 14 | 
| 45                    ;d3:  3  7 11 15 | 42                    ;d3:  3  7 11 15 | 
| 46 | 43 | 
| 47     ; second for loop | 44     ; second for loop | 
| 48 | 45 | 
| 49     vadd.s16 d4, d0, d3 ;a = [0] + [3] | 46     vadd.s16 d4, d0, d3 ;a = [0] + [3] | 
| 50     vadd.s16 d5, d1, d2 ;b = [1] + [2] | 47     vadd.s16 d6, d1, d2 ;b = [1] + [2] | 
| 51     vsub.s16 d6, d1, d2 ;c = [1] - [2] | 48     vsub.s16 d5, d0, d3 ;d = [0] - [3] | 
| 52     vsub.s16 d7, d0, d3 ;d = [0] - [3] | 49     vsub.s16 d7, d1, d2 ;c = [1] - [2] | 
| 53 | 50 | 
| 54     vadd.s16 d0, d4, d5 ;e = a + b | 51     vmov.i16 q8, #3 | 
| 55     vadd.s16 d1, d6, d7 ;f = c + d |  | 
| 56     vsub.s16 d2, d4, d5 ;g = a - b |  | 
| 57     vsub.s16 d3, d7, d6 ;h = d - c |  | 
| 58 | 52 | 
| 59     vmov.i16 q2, #3 | 53     vadd.s16 q0, q2, q3 ; a+b d+c | 
| 60     vadd.i16 q0, q0, q2 ;e/f += 3 | 54     vsub.s16 q1, q2, q3 ; a-b d-c | 
| 61     vadd.i16 q1, q1, q2 ;g/h += 3 | 55 | 
|  | 56     vadd.i16 q0, q0, q8 ;e/f += 3 | 
|  | 57     vadd.i16 q1, q1, q8 ;g/h += 3 | 
| 62 | 58 | 
| 63     vshr.s16 q0, q0, #3 ;e/f >> 3 | 59     vshr.s16 q0, q0, #3 ;e/f >> 3 | 
| 64     vshr.s16 q1, q1, #3 ;g/h >> 3 | 60     vshr.s16 q1, q1, #3 ;g/h >> 3 | 
| 65 | 61 | 
| 66     vtrn.32 d0, d2 | 62     vst4.i16 {d0,d1,d2,d3}, [r1@128] | 
| 67     vtrn.32 d1, d3 |  | 
| 68     vtrn.16 d0, d1 |  | 
| 69     vtrn.16 d2, d3 |  | 
| 70 |  | 
| 71     vstmia.16 r1!, {q0} |  | 
| 72     vstmia.16 r1!, {q1} |  | 
| 73 | 63 | 
| 74     bx lr | 64     bx lr | 
| 75     ENDP    ; |vp8_short_inv_walsh4x4_neon| | 65     ENDP    ; |vp8_short_inv_walsh4x4_neon| | 
| 76 | 66 | 
| 77 | 67 | 
| 78 ;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output) | 68 ;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output) | 
| 79 |vp8_short_inv_walsh4x4_1_neon| PROC | 69 |vp8_short_inv_walsh4x4_1_neon| PROC | 
| 80     ; load a full line into a neon register | 70     ldrsh r2, [r0]          ; load input[0] | 
| 81     vld1.16  {q0}, [r0] | 71     add r3, r2, #3          ; add 3 | 
| 82     ; extract first element and replicate | 72     add r2, r1, #16         ; base for last 8 output | 
| 83     vdup.16 q1, d0[0] | 73     asr r0, r3, #3          ; right shift 3 | 
| 84     ; add 3 to all values | 74     vdup.16 q0, r0          ; load and duplicate | 
| 85     vmov.i16 q2, #3 | 75     vst1.16 {q0}, [r1@128]  ; write back 8 | 
| 86     vadd.i16 q3, q1, q2 | 76     vst1.16 {q0}, [r2@128]  ; write back last 8 | 
| 87     ; right shift |  | 
| 88     vshr.s16 q3, q3, #3 |  | 
| 89     ; write it back |  | 
| 90     vstmia.16 r1!, {q3} |  | 
| 91     vstmia.16 r1!, {q3} |  | 
| 92 |  | 
| 93     bx lr | 77     bx lr | 
| 94     ENDP    ; |vp8_short_inv_walsh4x4_1_neon| | 78     ENDP    ; |vp8_short_inv_walsh4x4_1_neon| | 
| 95 | 79 | 
| 96     END | 80     END | 
| OLD | NEW | 
|---|---|