| Index: source/libvpx/vp9/common/x86/vp9_subpel_variance_impl_sse2.asm
 | 
| ===================================================================
 | 
| --- source/libvpx/vp9/common/x86/vp9_subpel_variance_impl_sse2.asm	(revision 0)
 | 
| +++ source/libvpx/vp9/common/x86/vp9_subpel_variance_impl_sse2.asm	(revision 0)
 | 
| @@ -0,0 +1,645 @@
 | 
| +;
 | 
| +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 | 
| +;
 | 
| +;  Use of this source code is governed by a BSD-style license
 | 
| +;  that can be found in the LICENSE file in the root of the source
 | 
| +;  tree. An additional intellectual property rights grant can be found
 | 
| +;  in the file PATENTS.  All contributing project authors may
 | 
| +;  be found in the AUTHORS file in the root of the source tree.
 | 
| +;
 | 
| +
 | 
| +
 | 
| +%include "vpx_ports/x86_abi_support.asm"
 | 
| +
 | 
| +%define xmm_filter_shift            7
 | 
| +
 | 
| +;void vp9_filter_block2d_bil_var_sse2
 | 
| +;(
 | 
| +;    unsigned char *ref_ptr,
 | 
| +;    int ref_pixels_per_line,
 | 
| +;    unsigned char *src_ptr,
 | 
| +;    int src_pixels_per_line,
 | 
| +;    unsigned int Height,
 | 
| +;    int  xoffset,
 | 
| +;    int  yoffset,
 | 
| +;    int *sum,
 | 
| +;    unsigned int *sumsquared;;
 | 
| +;
 | 
| +;)
 | 
| +global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE
 | 
| +sym(vp9_filter_block2d_bil_var_sse2):
 | 
| +    push        rbp
 | 
| +    mov         rbp, rsp
 | 
| +    SHADOW_ARGS_TO_STACK 9
 | 
| +    SAVE_XMM 7
 | 
| +    GET_GOT     rbx
 | 
| +    push rsi
 | 
| +    push rdi
 | 
| +    push rbx
 | 
| +    ; end prolog
 | 
| +
 | 
| +        pxor            xmm6,           xmm6                 ;
 | 
| +        pxor            xmm7,           xmm7                 ;
 | 
| +
 | 
| +        lea             rsi,            [GLOBAL(xmm_bi_rd)]  ; rounding
 | 
| +        movdqa          xmm4,           XMMWORD PTR [rsi]
 | 
| +
 | 
| +        lea             rcx,            [GLOBAL(bilinear_filters_sse2)]
 | 
| +        movsxd          rax,            dword ptr arg(5)     ; xoffset
 | 
| +
 | 
| +        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
 | 
| +        je              filter_block2d_bil_var_sse2_sp_only
 | 
| +
 | 
| +        shl             rax,            5                    ; point to filter coeff with xoffset
 | 
| +        lea             rax,            [rax + rcx]          ; HFilter
 | 
| +
 | 
| +        movsxd          rdx,            dword ptr arg(6)     ; yoffset
 | 
| +
 | 
| +        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
 | 
| +        je              filter_block2d_bil_var_sse2_fp_only
 | 
| +
 | 
| +        shl             rdx,            5
 | 
| +        lea             rdx,            [rdx + rcx]          ; VFilter
 | 
| +
 | 
| +        mov             rsi,            arg(0)               ;ref_ptr
 | 
| +        mov             rdi,            arg(2)               ;src_ptr
 | 
| +        movsxd          rcx,            dword ptr arg(4)     ;Height
 | 
| +
 | 
| +        pxor            xmm0,           xmm0                 ;
 | 
| +        movq            xmm1,           QWORD PTR [rsi]      ;
 | 
| +        movq            xmm3,           QWORD PTR [rsi+1]    ;
 | 
| +
 | 
| +        punpcklbw       xmm1,           xmm0                 ;
 | 
| +        pmullw          xmm1,           [rax]                ;
 | 
| +        punpcklbw       xmm3,           xmm0
 | 
| +        pmullw          xmm3,           [rax+16]             ;
 | 
| +
 | 
| +        paddw           xmm1,           xmm3                 ;
 | 
| +        paddw           xmm1,           xmm4                 ;
 | 
| +        psraw           xmm1,           xmm_filter_shift     ;
 | 
| +        movdqa          xmm5,           xmm1
 | 
| +
 | 
| +        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
 | 
| +        lea             rsi,            [rsi + rbx]
 | 
| +%if ABI_IS_32BIT=0
 | 
| +        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
 | 
| +%endif
 | 
| +
 | 
| +filter_block2d_bil_var_sse2_loop:
 | 
| +        movq            xmm1,           QWORD PTR [rsi]               ;
 | 
| +        movq            xmm3,           QWORD PTR [rsi+1]             ;
 | 
| +
 | 
| +        punpcklbw       xmm1,           xmm0                 ;
 | 
| +        pmullw          xmm1,           [rax]               ;
 | 
| +        punpcklbw       xmm3,           xmm0                 ;
 | 
| +        pmullw          xmm3,           [rax+16]             ;
 | 
| +
 | 
| +        paddw           xmm1,           xmm3                 ;
 | 
| +        paddw           xmm1,           xmm4               ;
 | 
| +        psraw           xmm1,           xmm_filter_shift    ;
 | 
| +
 | 
| +        movdqa          xmm3,           xmm5                 ;
 | 
| +        movdqa          xmm5,           xmm1                 ;
 | 
| +
 | 
| +        pmullw          xmm3,           [rdx]               ;
 | 
| +        pmullw          xmm1,           [rdx+16]             ;
 | 
| +        paddw           xmm1,           xmm3                 ;
 | 
| +        paddw           xmm1,           xmm4                 ;
 | 
| +        psraw           xmm1,           xmm_filter_shift    ;
 | 
| +
 | 
| +        movq            xmm3,           QWORD PTR [rdi]               ;
 | 
| +        punpcklbw       xmm3,           xmm0                 ;
 | 
| +
 | 
| +        psubw           xmm1,           xmm3                 ;
 | 
| +        paddw           xmm6,           xmm1                 ;
 | 
| +
 | 
| +        pmaddwd         xmm1,           xmm1                 ;
 | 
| +        paddd           xmm7,           xmm1                 ;
 | 
| +
 | 
| +        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
 | 
| +%if ABI_IS_32BIT
 | 
| +        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
 | 
| +%else
 | 
| +        lea             rdi,            [rdi + r9]
 | 
| +%endif
 | 
| +
 | 
| +        sub             rcx,            1                   ;
 | 
| +        jnz             filter_block2d_bil_var_sse2_loop       ;
 | 
| +
 | 
| +        jmp             filter_block2d_bil_variance
 | 
| +
 | 
| +filter_block2d_bil_var_sse2_sp_only:
 | 
| +        movsxd          rdx,            dword ptr arg(6)     ; yoffset
 | 
| +
 | 
| +        cmp             rdx,            0                    ; skip all if both xoffset=0 and yoffset=0
 | 
| +        je              filter_block2d_bil_var_sse2_full_pixel
 | 
| +
 | 
| +        shl             rdx,            5
 | 
| +        lea             rdx,            [rdx + rcx]          ; VFilter
 | 
| +
 | 
| +        mov             rsi,            arg(0)               ;ref_ptr
 | 
| +        mov             rdi,            arg(2)               ;src_ptr
 | 
| +        movsxd          rcx,            dword ptr arg(4)     ;Height
 | 
| +        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
 | 
| +
 | 
| +        pxor            xmm0,           xmm0                 ;
 | 
| +        movq            xmm1,           QWORD PTR [rsi]      ;
 | 
| +        punpcklbw       xmm1,           xmm0                 ;
 | 
| +
 | 
| +        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
 | 
| +        lea             rsi,            [rsi + rax]
 | 
| +
 | 
| +filter_block2d_bil_sp_only_loop:
 | 
| +        movq            xmm3,           QWORD PTR [rsi]             ;
 | 
| +        punpcklbw       xmm3,           xmm0                 ;
 | 
| +        movdqa          xmm5,           xmm3
 | 
| +
 | 
| +        pmullw          xmm1,           [rdx]               ;
 | 
| +        pmullw          xmm3,           [rdx+16]             ;
 | 
| +        paddw           xmm1,           xmm3                 ;
 | 
| +        paddw           xmm1,           xmm4                 ;
 | 
| +        psraw           xmm1,           xmm_filter_shift    ;
 | 
| +
 | 
| +        movq            xmm3,           QWORD PTR [rdi]               ;
 | 
| +        punpcklbw       xmm3,           xmm0                 ;
 | 
| +
 | 
| +        psubw           xmm1,           xmm3                 ;
 | 
| +        paddw           xmm6,           xmm1                 ;
 | 
| +
 | 
| +        pmaddwd         xmm1,           xmm1                 ;
 | 
| +        paddd           xmm7,           xmm1                 ;
 | 
| +
 | 
| +        movdqa          xmm1,           xmm5                 ;
 | 
| +        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
 | 
| +        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
 | 
| +
 | 
| +        sub             rcx,            1                   ;
 | 
| +        jnz             filter_block2d_bil_sp_only_loop       ;
 | 
| +
 | 
| +        jmp             filter_block2d_bil_variance
 | 
| +
 | 
| +filter_block2d_bil_var_sse2_full_pixel:
 | 
| +        mov             rsi,            arg(0)               ;ref_ptr
 | 
| +        mov             rdi,            arg(2)               ;src_ptr
 | 
| +        movsxd          rcx,            dword ptr arg(4)     ;Height
 | 
| +        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
 | 
| +        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
 | 
| +        pxor            xmm0,           xmm0                 ;
 | 
| +
 | 
| +filter_block2d_bil_full_pixel_loop:
 | 
| +        movq            xmm1,           QWORD PTR [rsi]               ;
 | 
| +        punpcklbw       xmm1,           xmm0                 ;
 | 
| +
 | 
| +        movq            xmm2,           QWORD PTR [rdi]               ;
 | 
| +        punpcklbw       xmm2,           xmm0                 ;
 | 
| +
 | 
| +        psubw           xmm1,           xmm2                 ;
 | 
| +        paddw           xmm6,           xmm1                 ;
 | 
| +
 | 
| +        pmaddwd         xmm1,           xmm1                 ;
 | 
| +        paddd           xmm7,           xmm1                 ;
 | 
| +
 | 
| +        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
 | 
| +        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
 | 
| +
 | 
| +        sub             rcx,            1                   ;
 | 
| +        jnz             filter_block2d_bil_full_pixel_loop       ;
 | 
| +
 | 
| +        jmp             filter_block2d_bil_variance
 | 
| +
 | 
| +filter_block2d_bil_var_sse2_fp_only:
 | 
| +        mov             rsi,            arg(0)               ;ref_ptr
 | 
| +        mov             rdi,            arg(2)               ;src_ptr
 | 
| +        movsxd          rcx,            dword ptr arg(4)     ;Height
 | 
| +        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
 | 
| +
 | 
| +        pxor            xmm0,           xmm0                 ;
 | 
| +        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
 | 
| +
 | 
| +filter_block2d_bil_fp_only_loop:
 | 
| +        movq            xmm1,           QWORD PTR [rsi]       ;
 | 
| +        movq            xmm3,           QWORD PTR [rsi+1]     ;
 | 
| +
 | 
| +        punpcklbw       xmm1,           xmm0                 ;
 | 
| +        pmullw          xmm1,           [rax]               ;
 | 
| +        punpcklbw       xmm3,           xmm0                 ;
 | 
| +        pmullw          xmm3,           [rax+16]             ;
 | 
| +
 | 
| +        paddw           xmm1,           xmm3                 ;
 | 
| +        paddw           xmm1,           xmm4  ;
 | 
| +        psraw           xmm1,           xmm_filter_shift    ;
 | 
| +
 | 
| +        movq            xmm3,           QWORD PTR [rdi]     ;
 | 
| +        punpcklbw       xmm3,           xmm0                 ;
 | 
| +
 | 
| +        psubw           xmm1,           xmm3                 ;
 | 
| +        paddw           xmm6,           xmm1                 ;
 | 
| +
 | 
| +        pmaddwd         xmm1,           xmm1                 ;
 | 
| +        paddd           xmm7,           xmm1                 ;
 | 
| +        lea             rsi,            [rsi + rdx]
 | 
| +        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
 | 
| +
 | 
| +        sub             rcx,            1                   ;
 | 
| +        jnz             filter_block2d_bil_fp_only_loop       ;
 | 
| +
 | 
| +        jmp             filter_block2d_bil_variance
 | 
| +
 | 
| +filter_block2d_bil_variance:
 | 
| +        movdq2q         mm6,            xmm6                ;
 | 
| +        movdq2q         mm7,            xmm7                ;
 | 
| +
 | 
| +        psrldq          xmm6,           8
 | 
| +        psrldq          xmm7,           8
 | 
| +
 | 
| +        movdq2q         mm2,            xmm6
 | 
| +        movdq2q         mm3,            xmm7
 | 
| +
 | 
| +        paddw           mm6,            mm2
 | 
| +        paddd           mm7,            mm3
 | 
| +
 | 
| +        pxor            mm3,            mm3                 ;
 | 
| +        pxor            mm2,            mm2                 ;
 | 
| +
 | 
| +        punpcklwd       mm2,            mm6                 ;
 | 
| +        punpckhwd       mm3,            mm6                 ;
 | 
| +
 | 
| +        paddd           mm2,            mm3                 ;
 | 
| +        movq            mm6,            mm2                 ;
 | 
| +
 | 
| +        psrlq           mm6,            32                  ;
 | 
| +        paddd           mm2,            mm6                 ;
 | 
| +
 | 
| +        psrad           mm2,            16                  ;
 | 
| +        movq            mm4,            mm7                 ;
 | 
| +
 | 
| +        psrlq           mm4,            32                  ;
 | 
| +        paddd           mm4,            mm7                 ;
 | 
| +
 | 
| +        mov             rsi,            arg(7) ; sum
 | 
| +        mov             rdi,            arg(8) ; sumsquared
 | 
| +
 | 
| +        movd            [rsi],          mm2    ; xsum
 | 
| +        movd            [rdi],          mm4    ; xxsum
 | 
| +
 | 
| +    ; begin epilog
 | 
| +    pop rbx
 | 
| +    pop rdi
 | 
| +    pop rsi
 | 
| +    RESTORE_GOT
 | 
| +    RESTORE_XMM
 | 
| +    UNSHADOW_ARGS
 | 
| +    pop         rbp
 | 
| +    ret
 | 
| +
 | 
| +
 | 
| +
 | 
| +;void vp9_half_horiz_vert_variance16x_h_sse2
 | 
| +;(
 | 
| +;    unsigned char *ref_ptr,
 | 
| +;    int ref_pixels_per_line,
 | 
| +;    unsigned char *src_ptr,
 | 
| +;    int src_pixels_per_line,
 | 
| +;    unsigned int Height,
 | 
| +;    int *sum,
 | 
| +;    unsigned int *sumsquared
 | 
| +;)
 | 
| +global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
 | 
| +sym(vp9_half_horiz_vert_variance16x_h_sse2):
 | 
| +    push        rbp
 | 
| +    mov         rbp, rsp
 | 
| +    SHADOW_ARGS_TO_STACK 7
 | 
| +    SAVE_XMM 7
 | 
| +    GET_GOT     rbx
 | 
| +    push rsi
 | 
| +    push rdi
 | 
| +    ; end prolog
 | 
| +
 | 
| +        pxor            xmm6,           xmm6                ;  error accumulator
 | 
| +        pxor            xmm7,           xmm7                ;  sse eaccumulator
 | 
| +        mov             rsi,            arg(0) ;ref_ptr              ;
 | 
| +
 | 
| +        mov             rdi,            arg(2) ;src_ptr              ;
 | 
| +        movsxd          rcx,            dword ptr arg(4) ;Height              ;
 | 
| +        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
 | 
| +        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
 | 
| +
 | 
| +        pxor            xmm0,           xmm0                ;
 | 
| +
 | 
| +        movdqu          xmm5,           XMMWORD PTR [rsi]
 | 
| +        movdqu          xmm3,           XMMWORD PTR [rsi+1]
 | 
| +        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3) horizontal line 1
 | 
| +
 | 
| +        lea             rsi,            [rsi + rax]
 | 
| +
 | 
| +.half_horiz_vert_variance16x_h_1:
 | 
| +        movdqu          xmm1,           XMMWORD PTR [rsi]     ;
 | 
| +        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;
 | 
| +        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm3) horizontal line i+1
 | 
| +
 | 
| +        pavgb           xmm5,           xmm1                ;  xmm = vertical average of the above
 | 
| +
 | 
| +        movdqa          xmm4,           xmm5
 | 
| +        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
 | 
| +        punpckhbw       xmm4,           xmm0
 | 
| +
 | 
| +        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
 | 
| +        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
 | 
| +        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
 | 
| +
 | 
| +        movq            xmm3,           QWORD PTR [rdi+8]
 | 
| +        punpcklbw       xmm3,           xmm0
 | 
| +        psubw           xmm4,           xmm3
 | 
| +
 | 
| +        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
 | 
| +        paddw           xmm6,           xmm4
 | 
| +        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
 | 
| +        pmaddwd         xmm4,           xmm4
 | 
| +        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
 | 
| +        paddd           xmm7,           xmm4
 | 
| +
 | 
| +        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
 | 
| +
 | 
| +        lea             rsi,            [rsi + rax]
 | 
| +        lea             rdi,            [rdi + rdx]
 | 
| +
 | 
| +        sub             rcx,            1                   ;
 | 
| +        jnz             .half_horiz_vert_variance16x_h_1    ;
 | 
| +
 | 
| +        pxor        xmm1,           xmm1
 | 
| +        pxor        xmm5,           xmm5
 | 
| +
 | 
| +        punpcklwd   xmm0,           xmm6
 | 
| +        punpckhwd   xmm1,           xmm6
 | 
| +        psrad       xmm0,           16
 | 
| +        psrad       xmm1,           16
 | 
| +        paddd       xmm0,           xmm1
 | 
| +        movdqa      xmm1,           xmm0
 | 
| +
 | 
| +        movdqa      xmm6,           xmm7
 | 
| +        punpckldq   xmm6,           xmm5
 | 
| +        punpckhdq   xmm7,           xmm5
 | 
| +        paddd       xmm6,           xmm7
 | 
| +
 | 
| +        punpckldq   xmm0,           xmm5
 | 
| +        punpckhdq   xmm1,           xmm5
 | 
| +        paddd       xmm0,           xmm1
 | 
| +
 | 
| +        movdqa      xmm7,           xmm6
 | 
| +        movdqa      xmm1,           xmm0
 | 
| +
 | 
| +        psrldq      xmm7,           8
 | 
| +        psrldq      xmm1,           8
 | 
| +
 | 
| +        paddd       xmm6,           xmm7
 | 
| +        paddd       xmm0,           xmm1
 | 
| +
 | 
| +        mov         rsi,            arg(5) ;[Sum]
 | 
| +        mov         rdi,            arg(6) ;[SSE]
 | 
| +
 | 
| +        movd        [rsi],       xmm0
 | 
| +        movd        [rdi],       xmm6
 | 
| +
 | 
| +    ; begin epilog
 | 
| +    pop rdi
 | 
| +    pop rsi
 | 
| +    RESTORE_GOT
 | 
| +    RESTORE_XMM
 | 
| +    UNSHADOW_ARGS
 | 
| +    pop         rbp
 | 
| +    ret
 | 
| +
 | 
| +;void vp9_half_vert_variance16x_h_sse2
 | 
| +;(
 | 
| +;    unsigned char *ref_ptr,
 | 
| +;    int ref_pixels_per_line,
 | 
| +;    unsigned char *src_ptr,
 | 
| +;    int src_pixels_per_line,
 | 
| +;    unsigned int Height,
 | 
| +;    int *sum,
 | 
| +;    unsigned int *sumsquared
 | 
| +;)
 | 
| +global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
 | 
| +sym(vp9_half_vert_variance16x_h_sse2):
 | 
| +    push        rbp
 | 
| +    mov         rbp, rsp
 | 
| +    SHADOW_ARGS_TO_STACK 7
 | 
| +    SAVE_XMM 7
 | 
| +    GET_GOT     rbx
 | 
| +    push rsi
 | 
| +    push rdi
 | 
| +    ; end prolog
 | 
| +
 | 
| +        pxor            xmm6,           xmm6                ;  error accumulator
 | 
| +        pxor            xmm7,           xmm7                ;  sse eaccumulator
 | 
| +        mov             rsi,            arg(0)              ;ref_ptr
 | 
| +
 | 
| +        mov             rdi,            arg(2)              ;src_ptr
 | 
| +        movsxd          rcx,            dword ptr arg(4)    ;Height
 | 
| +        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
 | 
| +        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
 | 
| +
 | 
| +        movdqu          xmm5,           XMMWORD PTR [rsi]
 | 
| +        lea             rsi,            [rsi + rax          ]
 | 
| +        pxor            xmm0,           xmm0
 | 
| +
 | 
| +.half_vert_variance16x_h_1:
 | 
| +        movdqu          xmm3,           XMMWORD PTR [rsi]
 | 
| +
 | 
| +        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
 | 
| +        movdqa          xmm4,           xmm5
 | 
| +        punpcklbw       xmm5,           xmm0
 | 
| +        punpckhbw       xmm4,           xmm0
 | 
| +
 | 
| +        movq            xmm2,           QWORD PTR [rdi]
 | 
| +        punpcklbw       xmm2,           xmm0
 | 
| +        psubw           xmm5,           xmm2
 | 
| +        movq            xmm2,           QWORD PTR [rdi+8]
 | 
| +        punpcklbw       xmm2,           xmm0
 | 
| +        psubw           xmm4,           xmm2
 | 
| +
 | 
| +        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
 | 
| +        paddw           xmm6,           xmm4
 | 
| +        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
 | 
| +        pmaddwd         xmm4,           xmm4
 | 
| +        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
 | 
| +        paddd           xmm7,           xmm4
 | 
| +
 | 
| +        movdqa          xmm5,           xmm3
 | 
| +
 | 
| +        lea             rsi,            [rsi + rax]
 | 
| +        lea             rdi,            [rdi + rdx]
 | 
| +
 | 
| +        sub             rcx,            1
 | 
| +        jnz             .half_vert_variance16x_h_1
 | 
| +
 | 
| +        pxor        xmm1,           xmm1
 | 
| +        pxor        xmm5,           xmm5
 | 
| +
 | 
| +        punpcklwd   xmm0,           xmm6
 | 
| +        punpckhwd   xmm1,           xmm6
 | 
| +        psrad       xmm0,           16
 | 
| +        psrad       xmm1,           16
 | 
| +        paddd       xmm0,           xmm1
 | 
| +        movdqa      xmm1,           xmm0
 | 
| +
 | 
| +        movdqa      xmm6,           xmm7
 | 
| +        punpckldq   xmm6,           xmm5
 | 
| +        punpckhdq   xmm7,           xmm5
 | 
| +        paddd       xmm6,           xmm7
 | 
| +
 | 
| +        punpckldq   xmm0,           xmm5
 | 
| +        punpckhdq   xmm1,           xmm5
 | 
| +        paddd       xmm0,           xmm1
 | 
| +
 | 
| +        movdqa      xmm7,           xmm6
 | 
| +        movdqa      xmm1,           xmm0
 | 
| +
 | 
| +        psrldq      xmm7,           8
 | 
| +        psrldq      xmm1,           8
 | 
| +
 | 
| +        paddd       xmm6,           xmm7
 | 
| +        paddd       xmm0,           xmm1
 | 
| +
 | 
| +        mov         rsi,            arg(5) ;[Sum]
 | 
| +        mov         rdi,            arg(6) ;[SSE]
 | 
| +
 | 
| +        movd        [rsi],       xmm0
 | 
| +        movd        [rdi],       xmm6
 | 
| +
 | 
| +    ; begin epilog
 | 
| +    pop rdi
 | 
| +    pop rsi
 | 
| +    RESTORE_GOT
 | 
| +    RESTORE_XMM
 | 
| +    UNSHADOW_ARGS
 | 
| +    pop         rbp
 | 
| +    ret
 | 
| +
 | 
| +;void vp9_half_horiz_variance16x_h_sse2
 | 
| +;(
 | 
| +;    unsigned char *ref_ptr,
 | 
| +;    int ref_pixels_per_line,
 | 
| +;    unsigned char *src_ptr,
 | 
| +;    int src_pixels_per_line,
 | 
| +;    unsigned int Height,
 | 
| +;    int *sum,
 | 
| +;    unsigned int *sumsquared
 | 
| +;)
 | 
| +global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
 | 
| +sym(vp9_half_horiz_variance16x_h_sse2):
 | 
| +    push        rbp
 | 
| +    mov         rbp, rsp
 | 
| +    SHADOW_ARGS_TO_STACK 7
 | 
| +    SAVE_XMM 7
 | 
| +    GET_GOT     rbx
 | 
| +    push rsi
 | 
| +    push rdi
 | 
| +    ; end prolog
 | 
| +
 | 
| +        pxor            xmm6,           xmm6                ;  error accumulator
 | 
| +        pxor            xmm7,           xmm7                ;  sse eaccumulator
 | 
| +        mov             rsi,            arg(0) ;ref_ptr              ;
 | 
| +
 | 
| +        mov             rdi,            arg(2) ;src_ptr              ;
 | 
| +        movsxd          rcx,            dword ptr arg(4) ;Height              ;
 | 
| +        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
 | 
| +        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
 | 
| +
 | 
| +        pxor            xmm0,           xmm0                ;
 | 
| +
 | 
| +.half_horiz_variance16x_h_1:
 | 
| +        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
 | 
| +        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
 | 
| +
 | 
| +        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
 | 
| +        movdqa          xmm1,           xmm5
 | 
| +        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
 | 
| +        punpckhbw       xmm1,           xmm0
 | 
| +
 | 
| +        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
 | 
| +        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
 | 
| +        movq            xmm2,           QWORD PTR [rdi+8]
 | 
| +        punpcklbw       xmm2,           xmm0
 | 
| +
 | 
| +        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
 | 
| +        psubw           xmm1,           xmm2
 | 
| +        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
 | 
| +        paddw           xmm6,           xmm1
 | 
| +        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
 | 
| +        pmaddwd         xmm1,           xmm1
 | 
| +        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
 | 
| +        paddd           xmm7,           xmm1
 | 
| +
 | 
| +        lea             rsi,            [rsi + rax]
 | 
| +        lea             rdi,            [rdi + rdx]
 | 
| +
 | 
| +        sub             rcx,            1                   ;
 | 
| +        jnz             .half_horiz_variance16x_h_1         ;
 | 
| +
 | 
| +        pxor        xmm1,           xmm1
 | 
| +        pxor        xmm5,           xmm5
 | 
| +
 | 
| +        punpcklwd   xmm0,           xmm6
 | 
| +        punpckhwd   xmm1,           xmm6
 | 
| +        psrad       xmm0,           16
 | 
| +        psrad       xmm1,           16
 | 
| +        paddd       xmm0,           xmm1
 | 
| +        movdqa      xmm1,           xmm0
 | 
| +
 | 
| +        movdqa      xmm6,           xmm7
 | 
| +        punpckldq   xmm6,           xmm5
 | 
| +        punpckhdq   xmm7,           xmm5
 | 
| +        paddd       xmm6,           xmm7
 | 
| +
 | 
| +        punpckldq   xmm0,           xmm5
 | 
| +        punpckhdq   xmm1,           xmm5
 | 
| +        paddd       xmm0,           xmm1
 | 
| +
 | 
| +        movdqa      xmm7,           xmm6
 | 
| +        movdqa      xmm1,           xmm0
 | 
| +
 | 
| +        psrldq      xmm7,           8
 | 
| +        psrldq      xmm1,           8
 | 
| +
 | 
| +        paddd       xmm6,           xmm7
 | 
| +        paddd       xmm0,           xmm1
 | 
| +
 | 
| +        mov         rsi,            arg(5) ;[Sum]
 | 
| +        mov         rdi,            arg(6) ;[SSE]
 | 
| +
 | 
| +        movd        [rsi],       xmm0
 | 
| +        movd        [rdi],       xmm6
 | 
| +
 | 
| +    ; begin epilog
 | 
| +    pop rdi
 | 
| +    pop rsi
 | 
| +    RESTORE_GOT
 | 
| +    RESTORE_XMM
 | 
| +    UNSHADOW_ARGS
 | 
| +    pop         rbp
 | 
| +    ret
 | 
| +
 | 
| +SECTION_RODATA
 | 
| +;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
 | 
| +align 16
 | 
| +xmm_bi_rd:
 | 
| +    times 8 dw 64
 | 
| +align 16
 | 
| +bilinear_filters_sse2:
 | 
| +    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
 | 
| +    dw 120, 120, 120, 120, 120, 120, 120, 120,  8,  8,  8,  8,  8,  8,  8,  8
 | 
| +    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
 | 
| +    dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
 | 
| +    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
 | 
| +    dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
 | 
| +    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
 | 
| +    dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
 | 
| +    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
 | 
| +    dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
 | 
| +    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
 | 
| +    dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
 | 
| +    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
 | 
| +    dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
 | 
| +    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
 | 
| +    dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
 | 
| 
 |