| Index: source/libvpx/vp8/common/arm/neon/iwalsh_neon.asm | 
| =================================================================== | 
| --- source/libvpx/vp8/common/arm/neon/iwalsh_neon.asm	(revision 96967) | 
| +++ source/libvpx/vp8/common/arm/neon/iwalsh_neon.asm	(working copy) | 
| @@ -20,19 +20,16 @@ | 
| |vp8_short_inv_walsh4x4_neon| PROC | 
|  | 
| ; read in all four lines of values: d0->d3 | 
| -    vldm.64 r0, {q0, q1} | 
| +    vld1.i16 {q0-q1}, [r0@128] ; d0-d3 = all 16 coeffs; @128 = r0 is 16-byte aligned | 
|  | 
| ; first for loop | 
| - | 
| vadd.s16 d4, d0, d3 ;a = [0] + [12] | 
| -    vadd.s16 d5, d1, d2 ;b = [4] + [8] | 
| -    vsub.s16 d6, d1, d2 ;c = [4] - [8] | 
| -    vsub.s16 d7, d0, d3 ;d = [0] - [12] | 
| +    vadd.s16 d6, d1, d2 ;b = [4] + [8]  -> d6, low half of q3 | 
| +    vsub.s16 d5, d0, d3 ;d = [0] - [12] -> d5, high half of q2 | 
| +    vsub.s16 d7, d1, d2 ;c = [4] - [8]  -> d7, high half of q3 | 
|  | 
| -    vadd.s16 d0, d4, d5 ;a + b | 
| -    vadd.s16 d1, d6, d7 ;c + d | 
| -    vsub.s16 d2, d4, d5 ;a - b | 
| -    vsub.s16 d3, d7, d6 ;d - c | 
| +    vadd.s16 q0, q2, q3 ; {a,d} + {b,c}: d0 = a+b, d1 = d+c | 
| +    vsub.s16 q1, q2, q3 ; {a,d} - {b,c}: d2 = a-b, d3 = d-c | 
|  | 
| vtrn.32 d0, d2 ;d0:  0  1  8  9 | 
| ;d2:  2  3 10 11 | 
| @@ -47,49 +44,36 @@ | 
| ; second for loop | 
|  | 
| vadd.s16 d4, d0, d3 ;a = [0] + [3] | 
| -    vadd.s16 d5, d1, d2 ;b = [1] + [2] | 
| -    vsub.s16 d6, d1, d2 ;c = [1] - [2] | 
| -    vsub.s16 d7, d0, d3 ;d = [0] - [3] | 
| +    vadd.s16 d6, d1, d2 ;b = [1] + [2] -> d6, low half of q3 | 
| +    vsub.s16 d5, d0, d3 ;d = [0] - [3] -> d5, high half of q2 | 
| +    vsub.s16 d7, d1, d2 ;c = [1] - [2] -> d7, high half of q3 | 
|  | 
| -    vadd.s16 d0, d4, d5 ;e = a + b | 
| -    vadd.s16 d1, d6, d7 ;f = c + d | 
| -    vsub.s16 d2, d4, d5 ;g = a - b | 
| -    vsub.s16 d3, d7, d6 ;h = d - c | 
| +    vmov.i16 q8, #3 ; rounding bias for the >>3 below; q2/q3 still hold a,d / b,c | 
|  | 
| -    vmov.i16 q2, #3 | 
| -    vadd.i16 q0, q0, q2 ;e/f += 3 | 
| -    vadd.i16 q1, q1, q2 ;g/h += 3 | 
| +    vadd.s16 q0, q2, q3 ; d0 = e = a+b, d1 = f = d+c | 
| +    vsub.s16 q1, q2, q3 ; d2 = g = a-b, d3 = h = d-c | 
|  | 
| +    vadd.i16 q0, q0, q8 ;e/f += 3 | 
| +    vadd.i16 q1, q1, q8 ;g/h += 3 | 
| + | 
| vshr.s16 q0, q0, #3 ;e/f >> 3 | 
| vshr.s16 q1, q1, #3 ;g/h >> 3 | 
|  | 
| -    vtrn.32 d0, d2 | 
| -    vtrn.32 d1, d3 | 
| -    vtrn.16 d0, d1 | 
| -    vtrn.16 d2, d3 | 
| +    vst4.i16 {d0,d1,d2,d3}, [r1@128] ; interleaved store does the 4x4 transpose; r1 16-byte aligned | 
|  | 
| -    vstmia.16 r1!, {q0} | 
| -    vstmia.16 r1!, {q1} | 
| - | 
| bx lr | 
| ENDP    ; |vp8_short_inv_walsh4x4_neon| | 
|  | 
|  | 
| ;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output) | 
| |vp8_short_inv_walsh4x4_1_neon| PROC | 
| -    ; load a full line into a neon register | 
| -    vld1.16  {q0}, [r0] | 
| -    ; extract first element and replicate | 
| -    vdup.16 q1, d0[0] | 
| -    ; add 3 to all values | 
| -    vmov.i16 q2, #3 | 
| -    vadd.i16 q3, q1, q2 | 
| -    ; right shift | 
| -    vshr.s16 q3, q3, #3 | 
| -    ; write it back | 
| -    vstmia.16 r1!, {q3} | 
| -    vstmia.16 r1!, {q3} | 
| - | 
| +    ldrsh r2, [r0]          ; r2 = input[0], sign-extended halfword | 
| +    add r3, r2, #3          ; r3 = input[0] + 3 (rounding bias) | 
| +    add r2, r1, #16         ; r2 = output + 16 bytes, i.e. &output[8] | 
| +    asr r0, r3, #3          ; r0 = (input[0] + 3) >> 3, arithmetic shift | 
| +    vdup.16 q0, r0          ; broadcast low 16 bits of r0 to all 8 lanes | 
| +    vst1.16 {q0}, [r1@128]  ; store output[0..7]; r1 16-byte aligned | 
| +    vst1.16 {q0}, [r2@128]  ; store output[8..15] | 
| bx lr | 
| ENDP    ; |vp8_short_inv_walsh4x4_1_neon| | 
|  | 
|  |