| OLD | NEW | 
|---|---|
| 1 ; | 1 ; | 
| 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 
| 3 ; | 3 ; | 
| 4 ;  Use of this source code is governed by a BSD-style license | 4 ;  Use of this source code is governed by a BSD-style license | 
| 5 ;  that can be found in the LICENSE file in the root of the source | 5 ;  that can be found in the LICENSE file in the root of the source | 
| 6 ;  tree. An additional intellectual property rights grant can be found | 6 ;  tree. An additional intellectual property rights grant can be found | 
| 7 ;  in the file PATENTS.  All contributing project authors may | 7 ;  in the file PATENTS.  All contributing project authors may | 
| 8 ;  be found in the AUTHORS file in the root of the source tree. | 8 ;  be found in the AUTHORS file in the root of the source tree. | 
| 9 ; | 9 ; | 
| 10 | 10 | 
| 11 | 11 | 
| 12 %include "vpx_ports/x86_abi_support.asm" | 12 %include "vpx_ports/x86_abi_support.asm" | 
| 13 | 13 | 
| 14 ;void vp8_short_inv_walsh4x4_sse2(short *input, short *output) | 14 ;void vp8_short_inv_walsh4x4_sse2(short *input, short *output) | 
| 15 global sym(vp8_short_inv_walsh4x4_sse2) | 15 global sym(vp8_short_inv_walsh4x4_sse2) | 
| 16 sym(vp8_short_inv_walsh4x4_sse2): | 16 sym(vp8_short_inv_walsh4x4_sse2): | 
| 17     push        rbp | 17     push        rbp | 
| 18     mov         rbp, rsp | 18     mov         rbp, rsp | 
| 19     SHADOW_ARGS_TO_STACK 2 | 19     SHADOW_ARGS_TO_STACK 2 | 
| 20     SAVE_XMM | 20     SAVE_XMM 6 | 
| 21     push        rsi | 21     push        rsi | 
| 22     push        rdi | 22     push        rdi | 
| 23     ; end prolog | 23     ; end prolog | 
| 24 | 24 | 
| 25     mov     rsi, arg(0) | 25     mov     rsi, arg(0) | 
| 26     mov     rdi, arg(1) | 26     mov     rdi, arg(1) | 
| 27     mov     rax, 3 | 27     mov     rax, 3 | 
| 28 | 28 | 
| 29     movdqa    xmm0, [rsi + 0]       ;ip[4] ip[0] | 29     movdqa    xmm0, [rsi + 0]       ;ip[4] ip[0] | 
| 30     movdqa    xmm1, [rsi + 16]      ;ip[12] ip[8] | 30     movdqa    xmm1, [rsi + 16]      ;ip[12] ip[8] | 
| 31 | 31 | 
| 32     shl     rax, 16 | 32     shl     rax, 16 | 
| 33     or      rax, 3            ;00030003h | 33     or      rax, 3            ;00030003h | 
| 34 | 34 | 
| 35     pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12] | 35     pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12] | 
| 36     movdqa    xmm3, xmm0          ;ip[4] ip[0] | 36     movdqa    xmm3, xmm0          ;ip[4] ip[0] | 
| 37 | 37 | 
| 38     paddw   xmm0, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 | 38     paddw   xmm0, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 | 
| 39     psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 | 39     psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 | 
| 40 | 40 | 
| 41     movdqa    xmm4, xmm0 | 41     movdqa    xmm4, xmm0 | 
| 42     punpcklqdq  xmm0, xmm3          ;d1 a1 | 42     punpcklqdq  xmm0, xmm3          ;d1 a1 | 
| 43     punpckhqdq  xmm4, xmm3          ;c1 b1 | 43     punpckhqdq  xmm4, xmm3          ;c1 b1 | 
| 44     movd    xmm7, eax | 44     movd    xmm6, eax | 
| 45 | 45 | 
| 46     movdqa    xmm1, xmm4          ;c1 b1 | 46     movdqa    xmm1, xmm4          ;c1 b1 | 
| 47     paddw   xmm4, xmm0          ;d1+c1 a1+b1 aka op[4] op[0] | 47     paddw   xmm4, xmm0          ;d1+c1 a1+b1 aka op[4] op[0] | 
| 48     psubw   xmm0, xmm1          ;d1-c1 a1-b1 aka op[12] op[8] | 48     psubw   xmm0, xmm1          ;d1-c1 a1-b1 aka op[12] op[8] | 
| 49 | 49 | 
| 50 ;;;temp output | 50 ;;;temp output | 
| 51 ;;  movdqu  [rdi + 0], xmm4 | 51 ;;  movdqu  [rdi + 0], xmm4 | 
| 52 ;;  movdqu  [rdi + 16], xmm3 | 52 ;;  movdqu  [rdi + 16], xmm3 | 
| 53 | 53 | 
| 54 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 54 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 
| 55     ; 13 12 11 10 03 02 01 00 | 55     ; 13 12 11 10 03 02 01 00 | 
| 56     ; | 56     ; | 
| 57     ; 33 32 31 30 23 22 21 20 | 57     ; 33 32 31 30 23 22 21 20 | 
| 58     ; | 58     ; | 
| 59     movdqa    xmm3, xmm4          ; 13 12 11 10 03 02 01 00 | 59     movdqa    xmm3, xmm4          ; 13 12 11 10 03 02 01 00 | 
| 60     punpcklwd xmm4, xmm0          ; 23 03 22 02 21 01 20 00 | 60     punpcklwd xmm4, xmm0          ; 23 03 22 02 21 01 20 00 | 
| 61     punpckhwd xmm3, xmm0          ; 33 13 32 12 31 11 30 10 | 61     punpckhwd xmm3, xmm0          ; 33 13 32 12 31 11 30 10 | 
| 62     movdqa    xmm1, xmm4          ; 23 03 22 02 21 01 20 00 | 62     movdqa    xmm1, xmm4          ; 23 03 22 02 21 01 20 00 | 
| 63     punpcklwd xmm4, xmm3          ; 31 21 11 01 30 20 10 00 | 63     punpcklwd xmm4, xmm3          ; 31 21 11 01 30 20 10 00 | 
| 64     punpckhwd xmm1, xmm3          ; 33 23 13 03 32 22 12 02 | 64     punpckhwd xmm1, xmm3          ; 33 23 13 03 32 22 12 02 | 
| 65     ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 65     ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 
| 66     pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12] | 66     pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12] | 
| 67     movdqa    xmm3, xmm4          ;ip[4] ip[0] | 67     movdqa    xmm3, xmm4          ;ip[4] ip[0] | 
| 68 | 68 | 
| 69     pshufd    xmm7, xmm7, 0       ;03 03 03 03 03 03 03 03 | 69     pshufd    xmm6, xmm6, 0       ;03 03 03 03 03 03 03 03 | 
| 70 | 70 | 
| 71     paddw   xmm4, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 | 71     paddw   xmm4, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 | 
| 72     psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 | 72     psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 | 
| 73 | 73 | 
| 74     movdqa    xmm5, xmm4 | 74     movdqa    xmm5, xmm4 | 
| 75     punpcklqdq  xmm4, xmm3          ;d1 a1 | 75     punpcklqdq  xmm4, xmm3          ;d1 a1 | 
| 76     punpckhqdq  xmm5, xmm3          ;c1 b1 | 76     punpckhqdq  xmm5, xmm3          ;c1 b1 | 
| 77 | 77 | 
| 78     movdqa    xmm1, xmm5          ;c1 b1 | 78     movdqa    xmm1, xmm5          ;c1 b1 | 
| 79     paddw   xmm5, xmm4          ;d1+c1 a1+b1 aka op[4] op[0] | 79     paddw   xmm5, xmm4          ;d1+c1 a1+b1 aka op[4] op[0] | 
| 80     psubw   xmm4, xmm1          ;d1-c1 a1-b1 aka op[12] op[8] | 80     psubw   xmm4, xmm1          ;d1-c1 a1-b1 aka op[12] op[8] | 
| 81 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 81 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 
| 82     ; 13 12 11 10 03 02 01 00 | 82     ; 13 12 11 10 03 02 01 00 | 
| 83     ; | 83     ; | 
| 84     ; 33 32 31 30 23 22 21 20 | 84     ; 33 32 31 30 23 22 21 20 | 
| 85     ; | 85     ; | 
| 86     movdqa    xmm0, xmm5          ; 13 12 11 10 03 02 01 00 | 86     movdqa    xmm0, xmm5          ; 13 12 11 10 03 02 01 00 | 
| 87     punpcklwd xmm5, xmm4          ; 23 03 22 02 21 01 20 00 | 87     punpcklwd xmm5, xmm4          ; 23 03 22 02 21 01 20 00 | 
| 88     punpckhwd xmm0, xmm4          ; 33 13 32 12 31 11 30 10 | 88     punpckhwd xmm0, xmm4          ; 33 13 32 12 31 11 30 10 | 
| 89     movdqa    xmm1, xmm5          ; 23 03 22 02 21 01 20 00 | 89     movdqa    xmm1, xmm5          ; 23 03 22 02 21 01 20 00 | 
| 90     punpcklwd xmm5, xmm0          ; 31 21 11 01 30 20 10 00 | 90     punpcklwd xmm5, xmm0          ; 31 21 11 01 30 20 10 00 | 
| 91     punpckhwd xmm1, xmm0          ; 33 23 13 03 32 22 12 02 | 91     punpckhwd xmm1, xmm0          ; 33 23 13 03 32 22 12 02 | 
| 92 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 92 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 
| 93     paddw   xmm5, xmm7 | 93     paddw   xmm5, xmm6 | 
| 94     paddw   xmm1, xmm7 | 94     paddw   xmm1, xmm6 | 
| 95 | 95 | 
| 96     psraw   xmm5, 3 | 96     psraw   xmm5, 3 | 
| 97     psraw   xmm1, 3 | 97     psraw   xmm1, 3 | 
| 98 | 98 | 
| 99     movdqa  [rdi + 0], xmm5 | 99     movdqa  [rdi + 0], xmm5 | 
| 100     movdqa  [rdi + 16], xmm1 | 100     movdqa  [rdi + 16], xmm1 | 
| 101 | 101 | 
| 102     ; begin epilog | 102     ; begin epilog | 
| 103     pop rdi | 103     pop rdi | 
| 104     pop rsi | 104     pop rsi | 
| 105     RESTORE_XMM | 105     RESTORE_XMM | 
| 106     UNSHADOW_ARGS | 106     UNSHADOW_ARGS | 
| 107     pop         rbp | 107     pop         rbp | 
| 108     ret | 108     ret | 
| 109 | 109 | 
| 110 SECTION_RODATA | 110 SECTION_RODATA | 
| 111 align 16 | 111 align 16 | 
| 112 x_s1sqr2: | 112 x_s1sqr2: | 
| 113     times 4 dw 0x8A8C | 113     times 4 dw 0x8A8C | 
| 114 align 16 | 114 align 16 | 
| 115 x_c1sqr2less1: | 115 x_c1sqr2less1: | 
| 116     times 4 dw 0x4E7B | 116     times 4 dw 0x4E7B | 
| 117 align 16 | 117 align 16 | 
| 118 fours: | 118 fours: | 
| 119     times 4 dw 0x0004 | 119     times 4 dw 0x0004 | 
| OLD | NEW | 
|---|---|