Index: source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
===================================================================
--- source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm	(revision 240950)
+++ source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm	(working copy)
@@ -118,6 +118,14 @@
   RET
 %endmacro
 
+%macro INC_SRC_BY_SRC_STRIDE  0
+%if ARCH_X86=1 && CONFIG_PIC=1
+  add                srcq, src_stridemp
+%else
+  add                srcq, src_strideq
+%endif
+%endmacro
+
 %macro SUBPEL_VARIANCE 1-2 0 ; W
 %if cpuflag(ssse3)
 %define bilin_filter_m bilin_filter_m_ssse3
@@ -129,41 +137,85 @@
 ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
 ; 11, not 13, if the registers are ordered correctly. May make a minor speed
 ; difference on Win64
-%ifdef PIC
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
-                                              x_offset, y_offset, \
-                                              dst, dst_stride, \
-                                              sec, sec_stride, height, sse
-%define sec_str sec_strideq
+
+%ifdef PIC    ; 64bit PIC
+  %if %2 == 1 ; avg
+    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+                                      x_offset, y_offset, \
+                                      dst, dst_stride, \
+                                      sec, sec_stride, height, sse
+    %define sec_str sec_strideq
+  %else
+    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+                                  y_offset, dst, dst_stride, height, sse
+  %endif
+  %define h heightd
+  %define bilin_filter sseq
 %else
-cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \
-                                          dst, dst_stride, height, sse
+  %if ARCH_X86=1 && CONFIG_PIC=1
+    %if %2 == 1 ; avg
+      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                  x_offset, y_offset, \
+                                  dst, dst_stride, \
+                                  sec, sec_stride, \
+                                  height, sse, g_bilin_filter, g_pw_8
+      %define h dword heightm
+      %define sec_str sec_stridemp
+
+      ; Store the addresses of bilin_filter and pw_8 on the stack.
+      GET_GOT eax
+      add esp, 4                ; restore esp
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %else
+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+                                y_offset, dst, dst_stride, height, sse, \
+                                g_bilin_filter, g_pw_8
+      %define h heightd
+
+      ; Store the addresses of bilin_filter and pw_8 on the stack.
+      GET_GOT eax
+      add esp, 4                ; restore esp
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %endif
+  %else
+    %if %2 == 1 ; avg
+      cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+                                             x_offset, y_offset, \
+                                             dst, dst_stride, \
+                                             sec, sec_stride, \
+                                             height, sse
+      %if ARCH_X86_64
+      %define h heightd
+      %define sec_str sec_strideq
+      %else
+      %define h dword heightm
+      %define sec_str sec_stridemp
+      %endif
+    %else
+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+                              y_offset, dst, dst_stride, height, sse
+      %define h heightd
+    %endif
+
+    %define bilin_filter bilin_filter_m
+  %endif
+%endif
-%define h heightd
-%define bilin_filter sseq
-%else
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
-                                    7 + 2 * ARCH_X86_64, 13, src, src_stride, \
-                                                         x_offset, y_offset, \
-                                                         dst, dst_stride, \
-                                                         sec, sec_stride, \
-                                                         height, sse
-%if ARCH_X86_64
-%define h heightd
-%define sec_str sec_strideq
-%else
-%define h dword heightm
-%define sec_str sec_stridemp
-%endif
-%else
-cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
-                                          dst, dst_stride, height, sse
-%define h heightd
-%endif
-%define bilin_filter bilin_filter_m
-%endif
+
   ASSERT               %1 <= 16         ; m6 overflows if w > 16
   pxor                 m6, m6           ; sum
   pxor                 m7, m7           ; sse
@@ -329,11 +381,22 @@
 %define filter_y_b m9
 %define filter_rnd m10
 %else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+  add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 .x_zero_y_other_loop:
 %if %1 == 16
   movu                 m0, [srcq]
@@ -615,12 +678,23 @@
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
+%else  ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+  add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
%else
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m3, [srcq+1]
@@ -752,12 +826,23 @@
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 .x_other_y_zero_loop:
 %if %1 == 16
   movu                 m0, [srcq]
@@ -873,12 +958,23 @@
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m1, [srcq+1]
@@ -1057,6 +1153,21 @@
 %define filter_y_a m10
 %define filter_y_b m11
 %define filter_rnd m12
+%else   ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case, there is NO unused register. Use the src_stride register; later,
+; src_stride has to be reloaded from the stack when it is needed.
+%define tempq src_strideq
+  mov tempq, g_bilin_filterm
+  add           x_offsetq, tempq
+  add           y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
   add           x_offsetq, bilin_filter
   add           y_offsetq, bilin_filter
@@ -1066,6 +1177,8 @@
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 ; x_offset == bilin interpolation && y_offset == bilin interpolation
 %if %1 == 16
   movu                 m0, [srcq]
@@ -1093,7 +1206,9 @@
 %endif
   psraw                m0, 4
   psraw                m2, 4
-  add                srcq, src_strideq
+
+  INC_SRC_BY_SRC_STRIDE
+
   packuswb             m0, m2
 .x_other_y_other_loop:
 %if cpuflag(ssse3)
@@ -1163,7 +1278,7 @@
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
 
-  add                srcq, src_strideq
+  INC_SRC_BY_SRC_STRIDE
   add                dstq, dst_strideq
 %else ; %1 < 16
   movh                 m0, [srcq]
@@ -1184,12 +1299,17 @@
 %if cpuflag(ssse3)
   packuswb             m0, m0
 %endif
-  add                srcq, src_strideq
+
+  INC_SRC_BY_SRC_STRIDE
+
 .x_other_y_other_loop:
   movh                 m2, [srcq]
   movh                 m1, [srcq+1]
-  movh                 m4, [srcq+src_strideq]
-  movh                 m3, [srcq+src_strideq+1]
+
+  INC_SRC_BY_SRC_STRIDE
+  movh                 m4, [srcq]
+  movh                 m3, [srcq+1]
+
 %if cpuflag(ssse3)
   punpcklbw            m2, m1
   punpcklbw            m4, m3
@@ -1253,7 +1373,7 @@
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
 
-  lea                srcq, [srcq+src_strideq*2]
+  INC_SRC_BY_SRC_STRIDE
   lea                dstq, [dstq+dst_strideq*2]
 %endif
 %if %2 == 1 ; avg

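For reference, the arithmetic these SUBPEL_VARIANCE kernels implement is a two-pass bilinear interpolation followed by a sum / sum-of-squared-differences against the reference block. The sketch below is a minimal C model of that computation, assuming the 4-bit filter precision implied by the pw_8 rounding constant and the psraw-by-4 shifts visible above; the fx/fy taps stand in for the pair selected from bilin_filter_m by x_offset/y_offset, and the function name is illustrative, not a libvpx API. The x86-32 PIC changes in this patch do not alter this arithmetic at all; they only change how the bilin_filter_m and pw_8 addresses (via the GOT and stack slots) and src_stride (via INC_SRC_BY_SRC_STRIDE) are reached once the register budget runs out.

/* Illustrative model only; names and signature are not part of libvpx. */
#include <stdint.h>
#include <stdlib.h>

static uint32_t subpel_variance_model(const uint8_t *src, int src_stride,
                                      const int fx[2], const int fy[2],
                                      const uint8_t *dst, int dst_stride,
                                      int w, int h, uint32_t *sse) {
  /* fx[0]+fx[1] == fy[0]+fy[1] == 16, matching the "+ 8, >> 4" rounding. */
  uint16_t *tmp = (uint16_t *)malloc(sizeof(*tmp) * (size_t)w * (h + 1));
  int64_t sum = 0;
  uint64_t sq = 0;
  int r, c;

  /* First pass: horizontal bilinear filter over h + 1 rows
   * (the [srcq] / [srcq+1] loads in the assembly). */
  for (r = 0; r < h + 1; ++r)
    for (c = 0; c < w; ++c)
      tmp[r * w + c] = (uint16_t)((src[r * src_stride + c] * fx[0] +
                                   src[r * src_stride + c + 1] * fx[1] + 8) >> 4);

  /* Second pass: vertical bilinear filter, then accumulate the sum and the
   * sum of squared differences against dst (SUM_SSE in the assembly).  The
   * _avg variants additionally average the prediction with a second
   * predictor before taking the difference. */
  for (r = 0; r < h; ++r)
    for (c = 0; c < w; ++c) {
      const int pred = (tmp[r * w + c] * fy[0] +
                        tmp[(r + 1) * w + c] * fy[1] + 8) >> 4;
      const int diff = pred - dst[r * dst_stride + c];
      sum += diff;
      sq += (uint32_t)(diff * diff);
    }

  free(tmp);
  *sse = (uint32_t)sq;
  /* variance = SSE - (sum^2 / number of pixels) */
  return (uint32_t)(sq - (uint64_t)((sum * sum) / (w * h)));
}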