Index: source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm |
diff --git a/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm |
index 68acc03cec417665e3dfa1f85333788243ebfd45..7ea6a0e58b75743bc719f1a5f7d8ee6aac19d14c 100644 |
--- a/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm |
+++ b/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm |
@@ -1,5 +1,5 @@ |
; |
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
+; Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
; |
; Use of this source code is governed by a BSD-style license |
; that can be found in the LICENSE file in the root of the source |
@@ -8,1064 +8,662 @@ |
; be found in the AUTHORS file in the root of the source tree. |
; |
+%include "third_party/x86inc/x86inc.asm" |
-%include "vpx_ports/x86_abi_support.asm" |
- |
-%macro VERTx4 1 |
- mov rdx, arg(5) ;filter ptr |
- mov rsi, arg(0) ;src_ptr |
- mov rdi, arg(2) ;output_ptr |
- mov rcx, 0x0400040 |
- |
- movdqa xmm4, [rdx] ;load filters |
- movq xmm5, rcx |
- packsswb xmm4, xmm4 |
- pshuflw xmm0, xmm4, 0b ;k0_k1 |
- pshuflw xmm1, xmm4, 01010101b ;k2_k3 |
- pshuflw xmm2, xmm4, 10101010b ;k4_k5 |
- pshuflw xmm3, xmm4, 11111111b ;k6_k7 |
- |
- punpcklqdq xmm0, xmm0 |
- punpcklqdq xmm1, xmm1 |
- punpcklqdq xmm2, xmm2 |
- punpcklqdq xmm3, xmm3 |
- |
- movdqa k0k1, xmm0 |
- movdqa k2k3, xmm1 |
- pshufd xmm5, xmm5, 0 |
- movdqa k4k5, xmm2 |
- movdqa k6k7, xmm3 |
- movdqa krd, xmm5 |
- |
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line |
- |
-%if ABI_IS_32BIT=0 |
- movsxd r8, DWORD PTR arg(3) ;out_pitch |
-%endif |
- mov rax, rsi |
- movsxd rcx, DWORD PTR arg(4) ;output_height |
- add rax, rdx |
- |
- lea rbx, [rdx + rdx*4] |
- add rbx, rdx ;pitch * 6 |
- |
-.loop: |
- movd xmm0, [rsi] ;A |
- movd xmm1, [rsi + rdx] ;B |
- movd xmm2, [rsi + rdx * 2] ;C |
- movd xmm3, [rax + rdx * 2] ;D |
- movd xmm4, [rsi + rdx * 4] ;E |
- movd xmm5, [rax + rdx * 4] ;F |
- |
- punpcklbw xmm0, xmm1 ;A B |
- punpcklbw xmm2, xmm3 ;C D |
- punpcklbw xmm4, xmm5 ;E F |
- |
- movd xmm6, [rsi + rbx] ;G |
- movd xmm7, [rax + rbx] ;H |
- |
- pmaddubsw xmm0, k0k1 |
- pmaddubsw xmm2, k2k3 |
- punpcklbw xmm6, xmm7 ;G H |
- pmaddubsw xmm4, k4k5 |
- pmaddubsw xmm6, k6k7 |
- |
- movdqa xmm1, xmm2 |
- paddsw xmm0, xmm6 |
- pmaxsw xmm2, xmm4 |
- pminsw xmm4, xmm1 |
- paddsw xmm0, xmm4 |
- paddsw xmm0, xmm2 |
+SECTION_RODATA |
+pw_64: times 8 dw 64 |
- paddsw xmm0, krd |
- psraw xmm0, 7 |
- packuswb xmm0, xmm0 |
+; %define USE_PMULHRSW |
+; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss |
+; when using this instruction. |
- add rsi, rdx |
- add rax, rdx |
-%if %1 |
- movd xmm1, [rdi] |
- pavgb xmm0, xmm1 |
+SECTION .text |
+%if ARCH_X86_64 |
+ %define LOCAL_VARS_SIZE 16*4 |
+%else |
+ %define LOCAL_VARS_SIZE 16*6 |
%endif |
- movd [rdi], xmm0 |
-%if ABI_IS_32BIT |
- add rdi, DWORD PTR arg(3) ;out_pitch |
+%macro SETUP_LOCAL_VARS 0 |
+ ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 + |
+ ; pmaddubsw has a higher latency on some platforms, this might be eased by |
+ ; interleaving the instructions. |
+ %define k0k1 [rsp + 16*0] |
+ %define k2k3 [rsp + 16*1] |
+ %define k4k5 [rsp + 16*2] |
+ %define k6k7 [rsp + 16*3] |
+ packsswb m4, m4 |
+ ; TODO(slavarnway): multiple pshufb instructions had a higher latency on |
+ ; some platforms. |
+ pshuflw m0, m4, 0b ;k0_k1 |
+ pshuflw m1, m4, 01010101b ;k2_k3 |
+ pshuflw m2, m4, 10101010b ;k4_k5 |
+ pshuflw m3, m4, 11111111b ;k6_k7 |
+ punpcklqdq m0, m0 |
+ punpcklqdq m1, m1 |
+ punpcklqdq m2, m2 |
+ punpcklqdq m3, m3 |
+ mova k0k1, m0 |
+ mova k2k3, m1 |
+ mova k4k5, m2 |
+ mova k6k7, m3 |
+%if ARCH_X86_64 |
+ %define krd m12 |
+ %define tmp m13 |
+ mova krd, [GLOBAL(pw_64)] |
+%else |
+ %define tmp [rsp + 16*4] |
+ %define krd [rsp + 16*5] |
+%if CONFIG_PIC=0 |
+ mova m6, [GLOBAL(pw_64)] |
%else |
- add rdi, r8 |
+ ; build constants without accessing global memory |
+ pcmpeqb m6, m6 ;all ones |
+ psrlw m6, 15 |
+ psllw m6, 6 ;aka pw_64 |
%endif |
- dec rcx |
- jnz .loop |
-%endm |
- |
-%macro VERTx8 1 |
- mov rdx, arg(5) ;filter ptr |
- mov rsi, arg(0) ;src_ptr |
- mov rdi, arg(2) ;output_ptr |
- mov rcx, 0x0400040 |
- |
- movdqa xmm4, [rdx] ;load filters |
- movq xmm5, rcx |
- packsswb xmm4, xmm4 |
- pshuflw xmm0, xmm4, 0b ;k0_k1 |
- pshuflw xmm1, xmm4, 01010101b ;k2_k3 |
- pshuflw xmm2, xmm4, 10101010b ;k4_k5 |
- pshuflw xmm3, xmm4, 11111111b ;k6_k7 |
- |
- punpcklqdq xmm0, xmm0 |
- punpcklqdq xmm1, xmm1 |
- punpcklqdq xmm2, xmm2 |
- punpcklqdq xmm3, xmm3 |
- |
- movdqa k0k1, xmm0 |
- movdqa k2k3, xmm1 |
- pshufd xmm5, xmm5, 0 |
- movdqa k4k5, xmm2 |
- movdqa k6k7, xmm3 |
- movdqa krd, xmm5 |
- |
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line |
- |
-%if ABI_IS_32BIT=0 |
- movsxd r8, DWORD PTR arg(3) ;out_pitch |
+ mova krd, m6 |
%endif |
- mov rax, rsi |
- movsxd rcx, DWORD PTR arg(4) ;output_height |
- add rax, rdx |
- |
- lea rbx, [rdx + rdx*4] |
- add rbx, rdx ;pitch * 6 |
- |
-.loop: |
- movq xmm0, [rsi] ;A |
- movq xmm1, [rsi + rdx] ;B |
- movq xmm2, [rsi + rdx * 2] ;C |
- movq xmm3, [rax + rdx * 2] ;D |
- movq xmm4, [rsi + rdx * 4] ;E |
- movq xmm5, [rax + rdx * 4] ;F |
- |
- punpcklbw xmm0, xmm1 ;A B |
- punpcklbw xmm2, xmm3 ;C D |
- punpcklbw xmm4, xmm5 ;E F |
- |
- movq xmm6, [rsi + rbx] ;G |
- movq xmm7, [rax + rbx] ;H |
- |
- pmaddubsw xmm0, k0k1 |
- pmaddubsw xmm2, k2k3 |
- punpcklbw xmm6, xmm7 ;G H |
- pmaddubsw xmm4, k4k5 |
- pmaddubsw xmm6, k6k7 |
- |
- paddsw xmm0, xmm6 |
- movdqa xmm1, xmm2 |
- pmaxsw xmm2, xmm4 |
- pminsw xmm4, xmm1 |
- paddsw xmm0, xmm4 |
- paddsw xmm0, xmm2 |
- |
- paddsw xmm0, krd |
- psraw xmm0, 7 |
- packuswb xmm0, xmm0 |
+%endm |
- add rsi, rdx |
- add rax, rdx |
-%if %1 |
- movq xmm1, [rdi] |
- pavgb xmm0, xmm1 |
-%endif |
- movq [rdi], xmm0 |
+%macro HORIZx4_ROW 2 |
+ mova %2, %1 |
+ punpcklbw %1, %1 |
+ punpckhbw %2, %2 |
+ |
+ mova m3, %2 |
+ palignr %2, %1, 1 |
+ palignr m3, %1, 5 |
+ |
+ pmaddubsw %2, k0k1k4k5 |
+ pmaddubsw m3, k2k3k6k7 |
+ |
+ mova m4, %2 |
+ mova m5, m3 |
+ psrldq %2, 8 |
+ psrldq m3, 8 |
+ mova m6, m5 |
+ |
+ paddsw m4, m3 |
+ pmaxsw m5, %2 |
+ pminsw %2, m6 |
+ paddsw %2, m4 |
+ paddsw %2, m5 |
+ paddsw %2, krd |
+ psraw %2, 7 |
+ packuswb %2, %2 |
+%endm |
-%if ABI_IS_32BIT |
- add rdi, DWORD PTR arg(3) ;out_pitch |
+;------------------------------------------------------------------------------- |
+%macro SUBPIX_HFILTER4 1 |
+cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \ |
+ src, sstride, dst, dstride, height, filter |
+ mova m4, [filterq] |
+ packsswb m4, m4 |
+%if ARCH_X86_64 |
+ %define k0k1k4k5 m8 |
+ %define k2k3k6k7 m9 |
+ %define krd m10 |
+ %define orig_height r7 |
+ mova krd, [GLOBAL(pw_64)] |
+ pshuflw k0k1k4k5, m4, 0b ;k0_k1 |
+ pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 |
+ pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 |
+ pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 |
%else |
- add rdi, r8 |
+ %define k0k1k4k5 [rsp + 16*0] |
+ %define k2k3k6k7 [rsp + 16*1] |
+ %define krd [rsp + 16*2] |
+ %define orig_height [rsp + 16*3] |
+ pshuflw m6, m4, 0b ;k0_k1 |
+ pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 |
+ pshuflw m7, m4, 01010101b ;k2_k3 |
+ pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 |
+%if CONFIG_PIC=0 |
+ mova m1, [GLOBAL(pw_64)] |
+%else |
+ ; build constants without accessing global memory |
+ pcmpeqb m1, m1 ;all ones |
+ psrlw m1, 15 |
+ psllw m1, 6 ;aka pw_64 |
%endif |
- dec rcx |
- jnz .loop |
-%endm |
- |
- |
-%macro VERTx16 1 |
- mov rdx, arg(5) ;filter ptr |
- mov rsi, arg(0) ;src_ptr |
- mov rdi, arg(2) ;output_ptr |
- mov rcx, 0x0400040 |
- |
- movdqa xmm4, [rdx] ;load filters |
- movq xmm5, rcx |
- packsswb xmm4, xmm4 |
- pshuflw xmm0, xmm4, 0b ;k0_k1 |
- pshuflw xmm1, xmm4, 01010101b ;k2_k3 |
- pshuflw xmm2, xmm4, 10101010b ;k4_k5 |
- pshuflw xmm3, xmm4, 11111111b ;k6_k7 |
- |
- punpcklqdq xmm0, xmm0 |
- punpcklqdq xmm1, xmm1 |
- punpcklqdq xmm2, xmm2 |
- punpcklqdq xmm3, xmm3 |
- |
- movdqa k0k1, xmm0 |
- movdqa k2k3, xmm1 |
- pshufd xmm5, xmm5, 0 |
- movdqa k4k5, xmm2 |
- movdqa k6k7, xmm3 |
- movdqa krd, xmm5 |
- |
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line |
- |
-%if ABI_IS_32BIT=0 |
- movsxd r8, DWORD PTR arg(3) ;out_pitch |
+ mova k0k1k4k5, m6 |
+ mova k2k3k6k7, m7 |
+ mova krd, m1 |
%endif |
- mov rax, rsi |
- movsxd rcx, DWORD PTR arg(4) ;output_height |
- add rax, rdx |
- |
- lea rbx, [rdx + rdx*4] |
- add rbx, rdx ;pitch * 6 |
- |
+ mov orig_height, heightq |
+ shr heightq, 1 |
.loop: |
- movq xmm0, [rsi] ;A |
- movq xmm1, [rsi + rdx] ;B |
- movq xmm2, [rsi + rdx * 2] ;C |
- movq xmm3, [rax + rdx * 2] ;D |
- movq xmm4, [rsi + rdx * 4] ;E |
- movq xmm5, [rax + rdx * 4] ;F |
- |
- punpcklbw xmm0, xmm1 ;A B |
- punpcklbw xmm2, xmm3 ;C D |
- punpcklbw xmm4, xmm5 ;E F |
- |
- movq xmm6, [rsi + rbx] ;G |
- movq xmm7, [rax + rbx] ;H |
- |
- pmaddubsw xmm0, k0k1 |
- pmaddubsw xmm2, k2k3 |
- punpcklbw xmm6, xmm7 ;G H |
- pmaddubsw xmm4, k4k5 |
- pmaddubsw xmm6, k6k7 |
- |
- paddsw xmm0, xmm6 |
- movdqa xmm1, xmm2 |
- pmaxsw xmm2, xmm4 |
- pminsw xmm4, xmm1 |
- paddsw xmm0, xmm4 |
- paddsw xmm0, xmm2 |
- |
- paddsw xmm0, krd |
- psraw xmm0, 7 |
- packuswb xmm0, xmm0 |
-%if %1 |
- movq xmm1, [rdi] |
- pavgb xmm0, xmm1 |
+ ;Do two rows at once |
+ movh m0, [srcq - 3] |
+ movh m1, [srcq + 5] |
+ punpcklqdq m0, m1 |
+ mova m1, m0 |
+ movh m2, [srcq + sstrideq - 3] |
+ movh m3, [srcq + sstrideq + 5] |
+ punpcklqdq m2, m3 |
+ mova m3, m2 |
+ punpcklbw m0, m0 |
+ punpckhbw m1, m1 |
+ punpcklbw m2, m2 |
+ punpckhbw m3, m3 |
+ mova m4, m1 |
+ palignr m4, m0, 1 |
+ pmaddubsw m4, k0k1k4k5 |
+ palignr m1, m0, 5 |
+ pmaddubsw m1, k2k3k6k7 |
+ mova m7, m3 |
+ palignr m7, m2, 1 |
+ pmaddubsw m7, k0k1k4k5 |
+ palignr m3, m2, 5 |
+ pmaddubsw m3, k2k3k6k7 |
+ mova m0, m4 |
+ mova m5, m1 |
+ mova m2, m7 |
+ psrldq m4, 8 |
+ psrldq m1, 8 |
+ mova m6, m5 |
+ paddsw m0, m1 |
+ mova m1, m3 |
+ psrldq m7, 8 |
+ psrldq m3, 8 |
+ paddsw m2, m3 |
+ mova m3, m1 |
+ pmaxsw m5, m4 |
+ pminsw m4, m6 |
+ paddsw m4, m0 |
+ paddsw m4, m5 |
+ pmaxsw m1, m7 |
+ pminsw m7, m3 |
+ paddsw m7, m2 |
+ paddsw m7, m1 |
+ |
+ paddsw m4, krd |
+ psraw m4, 7 |
+ packuswb m4, m4 |
+ paddsw m7, krd |
+ psraw m7, 7 |
+ packuswb m7, m7 |
+ |
+%ifidn %1, h8_avg |
+ movd m0, [dstq] |
+ pavgb m4, m0 |
+ movd m2, [dstq + dstrideq] |
+ pavgb m7, m2 |
%endif |
- movq [rdi], xmm0 |
- |
- movq xmm0, [rsi + 8] ;A |
- movq xmm1, [rsi + rdx + 8] ;B |
- movq xmm2, [rsi + rdx * 2 + 8] ;C |
- movq xmm3, [rax + rdx * 2 + 8] ;D |
- movq xmm4, [rsi + rdx * 4 + 8] ;E |
- movq xmm5, [rax + rdx * 4 + 8] ;F |
- |
- punpcklbw xmm0, xmm1 ;A B |
- punpcklbw xmm2, xmm3 ;C D |
- punpcklbw xmm4, xmm5 ;E F |
+ movd [dstq], m4 |
+ movd [dstq + dstrideq], m7 |
- movq xmm6, [rsi + rbx + 8] ;G |
- movq xmm7, [rax + rbx + 8] ;H |
- punpcklbw xmm6, xmm7 ;G H |
+ lea srcq, [srcq + sstrideq ] |
+ prefetcht0 [srcq + 4 * sstrideq - 3] |
+ lea srcq, [srcq + sstrideq ] |
+ lea dstq, [dstq + 2 * dstrideq ] |
+ prefetcht0 [srcq + 2 * sstrideq - 3] |
- pmaddubsw xmm0, k0k1 |
- pmaddubsw xmm2, k2k3 |
- pmaddubsw xmm4, k4k5 |
- pmaddubsw xmm6, k6k7 |
+ dec heightq |
+ jnz .loop |
- paddsw xmm0, xmm6 |
- movdqa xmm1, xmm2 |
- pmaxsw xmm2, xmm4 |
- pminsw xmm4, xmm1 |
- paddsw xmm0, xmm4 |
- paddsw xmm0, xmm2 |
- |
- paddsw xmm0, krd |
- psraw xmm0, 7 |
- packuswb xmm0, xmm0 |
- |
- add rsi, rdx |
- add rax, rdx |
-%if %1 |
- movq xmm1, [rdi+8] |
- pavgb xmm0, xmm1 |
-%endif |
- |
- movq [rdi+8], xmm0 |
- |
-%if ABI_IS_32BIT |
- add rdi, DWORD PTR arg(3) ;out_pitch |
-%else |
- add rdi, r8 |
+ ; Do last row if output_height is odd |
+ mov heightq, orig_height |
+ and heightq, 1 |
+ je .done |
+ |
+ movh m0, [srcq - 3] ; load src |
+ movh m1, [srcq + 5] |
+ punpcklqdq m0, m1 |
+ |
+ HORIZx4_ROW m0, m1 |
+%ifidn %1, h8_avg |
+ movd m0, [dstq] |
+ pavgb m1, m0 |
%endif |
- dec rcx |
- jnz .loop |
+ movd [dstq], m1 |
.done: |
+ RET |
%endm |
-;void vpx_filter_block1d8_v8_ssse3 |
-;( |
-; unsigned char *src_ptr, |
-; unsigned int src_pitch, |
-; unsigned char *output_ptr, |
-; unsigned int out_pitch, |
-; unsigned int output_height, |
-; short *filter |
-;) |
-global sym(vpx_filter_block1d4_v8_ssse3) PRIVATE |
-sym(vpx_filter_block1d4_v8_ssse3): |
- push rbp |
- mov rbp, rsp |
- SHADOW_ARGS_TO_STACK 6 |
- SAVE_XMM 7 |
- push rsi |
- push rdi |
- push rbx |
- ; end prolog |
- |
- ALIGN_STACK 16, rax |
- sub rsp, 16*5 |
- %define k0k1 [rsp + 16*0] |
- %define k2k3 [rsp + 16*1] |
- %define k4k5 [rsp + 16*2] |
- %define k6k7 [rsp + 16*3] |
- %define krd [rsp + 16*4] |
- |
- VERTx4 0 |
- |
- add rsp, 16*5 |
- pop rsp |
- pop rbx |
- ; begin epilog |
- pop rdi |
- pop rsi |
- RESTORE_XMM |
- UNSHADOW_ARGS |
- pop rbp |
- ret |
- |
-;void vpx_filter_block1d8_v8_ssse3 |
-;( |
-; unsigned char *src_ptr, |
-; unsigned int src_pitch, |
-; unsigned char *output_ptr, |
-; unsigned int out_pitch, |
-; unsigned int output_height, |
-; short *filter |
-;) |
-global sym(vpx_filter_block1d8_v8_ssse3) PRIVATE |
-sym(vpx_filter_block1d8_v8_ssse3): |
- push rbp |
- mov rbp, rsp |
- SHADOW_ARGS_TO_STACK 6 |
- SAVE_XMM 7 |
- push rsi |
- push rdi |
- push rbx |
- ; end prolog |
- |
- ALIGN_STACK 16, rax |
- sub rsp, 16*5 |
- %define k0k1 [rsp + 16*0] |
- %define k2k3 [rsp + 16*1] |
- %define k4k5 [rsp + 16*2] |
- %define k6k7 [rsp + 16*3] |
- %define krd [rsp + 16*4] |
- |
- VERTx8 0 |
- |
- add rsp, 16*5 |
- pop rsp |
- pop rbx |
- ; begin epilog |
- pop rdi |
- pop rsi |
- RESTORE_XMM |
- UNSHADOW_ARGS |
- pop rbp |
- ret |
- |
-;void vpx_filter_block1d16_v8_ssse3 |
-;( |
-; unsigned char *src_ptr, |
-; unsigned int src_pitch, |
-; unsigned char *output_ptr, |
-; unsigned int out_pitch, |
-; unsigned int output_height, |
-; short *filter |
-;) |
-global sym(vpx_filter_block1d16_v8_ssse3) PRIVATE |
-sym(vpx_filter_block1d16_v8_ssse3): |
- push rbp |
- mov rbp, rsp |
- SHADOW_ARGS_TO_STACK 6 |
- SAVE_XMM 7 |
- push rsi |
- push rdi |
- push rbx |
- ; end prolog |
- |
- ALIGN_STACK 16, rax |
- sub rsp, 16*5 |
- %define k0k1 [rsp + 16*0] |
- %define k2k3 [rsp + 16*1] |
- %define k4k5 [rsp + 16*2] |
- %define k6k7 [rsp + 16*3] |
- %define krd [rsp + 16*4] |
- |
- VERTx16 0 |
- |
- add rsp, 16*5 |
- pop rsp |
- pop rbx |
- ; begin epilog |
- pop rdi |
- pop rsi |
- RESTORE_XMM |
- UNSHADOW_ARGS |
- pop rbp |
- ret |
- |
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
- |
- |
-global sym(vpx_filter_block1d4_v8_avg_ssse3) PRIVATE |
-sym(vpx_filter_block1d4_v8_avg_ssse3): |
- push rbp |
- mov rbp, rsp |
- SHADOW_ARGS_TO_STACK 6 |
- SAVE_XMM 7 |
- push rsi |
- push rdi |
- push rbx |
- ; end prolog |
- |
- ALIGN_STACK 16, rax |
- sub rsp, 16*5 |
- %define k0k1 [rsp + 16*0] |
- %define k2k3 [rsp + 16*1] |
- %define k4k5 [rsp + 16*2] |
- %define k6k7 [rsp + 16*3] |
- %define krd [rsp + 16*4] |
- |
- VERTx4 1 |
- |
- add rsp, 16*5 |
- pop rsp |
- pop rbx |
- ; begin epilog |
- pop rdi |
- pop rsi |
- RESTORE_XMM |
- UNSHADOW_ARGS |
- pop rbp |
- ret |
- |
-global sym(vpx_filter_block1d8_v8_avg_ssse3) PRIVATE |
-sym(vpx_filter_block1d8_v8_avg_ssse3): |
- push rbp |
- mov rbp, rsp |
- SHADOW_ARGS_TO_STACK 6 |
- SAVE_XMM 7 |
- push rsi |
- push rdi |
- push rbx |
- ; end prolog |
- |
- ALIGN_STACK 16, rax |
- sub rsp, 16*5 |
- %define k0k1 [rsp + 16*0] |
- %define k2k3 [rsp + 16*1] |
- %define k4k5 [rsp + 16*2] |
- %define k6k7 [rsp + 16*3] |
- %define krd [rsp + 16*4] |
- |
- VERTx8 1 |
- |
- add rsp, 16*5 |
- pop rsp |
- pop rbx |
- ; begin epilog |
- pop rdi |
- pop rsi |
- RESTORE_XMM |
- UNSHADOW_ARGS |
- pop rbp |
- ret |
- |
-global sym(vpx_filter_block1d16_v8_avg_ssse3) PRIVATE |
-sym(vpx_filter_block1d16_v8_avg_ssse3): |
- push rbp |
- mov rbp, rsp |
- SHADOW_ARGS_TO_STACK 6 |
- SAVE_XMM 7 |
- push rsi |
- push rdi |
- push rbx |
- ; end prolog |
- |
- ALIGN_STACK 16, rax |
- sub rsp, 16*5 |
- %define k0k1 [rsp + 16*0] |
- %define k2k3 [rsp + 16*1] |
- %define k4k5 [rsp + 16*2] |
- %define k6k7 [rsp + 16*3] |
- %define krd [rsp + 16*4] |
- |
- VERTx16 1 |
- |
- add rsp, 16*5 |
- pop rsp |
- pop rbx |
- ; begin epilog |
- pop rdi |
- pop rsi |
- RESTORE_XMM |
- UNSHADOW_ARGS |
- pop rbp |
- ret |
- |
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
-%macro HORIZx4_ROW 2 |
- movdqa %2, %1 |
- pshufb %1, [GLOBAL(shuf_t0t1)] |
- pshufb %2, [GLOBAL(shuf_t2t3)] |
- pmaddubsw %1, k0k1k4k5 |
- pmaddubsw %2, k2k3k6k7 |
- |
- movdqa xmm4, %1 |
- movdqa xmm5, %2 |
- psrldq %1, 8 |
- psrldq %2, 8 |
- movdqa xmm6, xmm5 |
- |
- paddsw xmm4, %2 |
- pmaxsw xmm5, %1 |
- pminsw %1, xmm6 |
- paddsw %1, xmm4 |
- paddsw %1, xmm5 |
- |
- paddsw %1, krd |
- psraw %1, 7 |
- packuswb %1, %1 |
+%macro HORIZx8_ROW 5 |
+ mova %2, %1 |
+ punpcklbw %1, %1 |
+ punpckhbw %2, %2 |
+ |
+ mova %3, %2 |
+ mova %4, %2 |
+ mova %5, %2 |
+ |
+ palignr %2, %1, 1 |
+ palignr %3, %1, 5 |
+ palignr %4, %1, 9 |
+ palignr %5, %1, 13 |
+ |
+ pmaddubsw %2, k0k1 |
+ pmaddubsw %3, k2k3 |
+ pmaddubsw %4, k4k5 |
+ pmaddubsw %5, k6k7 |
+ |
+ paddsw %2, %5 |
+ mova %1, %3 |
+ pminsw %3, %4 |
+ pmaxsw %1, %4 |
+ paddsw %2, %3 |
+ paddsw %1, %2 |
+ paddsw %1, krd |
+ psraw %1, 7 |
+ packuswb %1, %1 |
%endm |
-%macro HORIZx4 1 |
- mov rdx, arg(5) ;filter ptr |
- mov rsi, arg(0) ;src_ptr |
- mov rdi, arg(2) ;output_ptr |
- mov rcx, 0x0400040 |
- |
- movdqa xmm4, [rdx] ;load filters |
- movq xmm5, rcx |
- packsswb xmm4, xmm4 |
- pshuflw xmm6, xmm4, 0b ;k0_k1 |
- pshufhw xmm6, xmm6, 10101010b ;k0_k1_k4_k5 |
- pshuflw xmm7, xmm4, 01010101b ;k2_k3 |
- pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7 |
- pshufd xmm5, xmm5, 0 ;rounding |
- |
- movdqa k0k1k4k5, xmm6 |
- movdqa k2k3k6k7, xmm7 |
- movdqa krd, xmm5 |
+;------------------------------------------------------------------------------- |
+%macro SUBPIX_HFILTER8 1 |
+cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 13, LOCAL_VARS_SIZE, \ |
+ src, sstride, dst, dstride, height, filter |
+ mova m4, [filterq] |
+ SETUP_LOCAL_VARS |
+%if ARCH_X86_64 |
+ %define orig_height r7 |
+%else |
+ %define orig_height heightmp |
+%endif |
+ mov orig_height, heightq |
+ shr heightq, 1 |
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line |
- movsxd rdx, dword ptr arg(3) ;output_pitch |
- movsxd rcx, dword ptr arg(4) ;output_height |
- shr rcx, 1 |
.loop: |
- ;Do two rows once |
- movq xmm0, [rsi - 3] ;load src |
- movq xmm1, [rsi + 5] |
- movq xmm2, [rsi + rax - 3] |
- movq xmm3, [rsi + rax + 5] |
- punpcklqdq xmm0, xmm1 |
- punpcklqdq xmm2, xmm3 |
- |
- HORIZx4_ROW xmm0, xmm1 |
- HORIZx4_ROW xmm2, xmm3 |
-%if %1 |
- movd xmm1, [rdi] |
- pavgb xmm0, xmm1 |
- movd xmm3, [rdi + rdx] |
- pavgb xmm2, xmm3 |
+ movh m0, [srcq - 3] |
+ movh m3, [srcq + 5] |
+ movh m4, [srcq + sstrideq - 3] |
+ movh m7, [srcq + sstrideq + 5] |
+ punpcklqdq m0, m3 |
+ mova m1, m0 |
+ punpcklbw m0, m0 |
+ punpckhbw m1, m1 |
+ mova m5, m1 |
+ palignr m5, m0, 13 |
+ pmaddubsw m5, k6k7 |
+ mova m2, m1 |
+ mova m3, m1 |
+ palignr m1, m0, 1 |
+ pmaddubsw m1, k0k1 |
+ punpcklqdq m4, m7 |
+ mova m6, m4 |
+ punpcklbw m4, m4 |
+ palignr m2, m0, 5 |
+ punpckhbw m6, m6 |
+ palignr m3, m0, 9 |
+ mova m7, m6 |
+ pmaddubsw m2, k2k3 |
+ pmaddubsw m3, k4k5 |
+ |
+ palignr m7, m4, 13 |
+ paddsw m1, m5 |
+ mova m5, m6 |
+ mova m0, m2 |
+ palignr m5, m4, 5 |
+ pminsw m2, m3 |
+ pmaddubsw m7, k6k7 |
+ pmaxsw m3, m0 |
+ paddsw m1, m2 |
+ mova m0, m6 |
+ palignr m6, m4, 1 |
+ pmaddubsw m5, k2k3 |
+ paddsw m1, m3 |
+ pmaddubsw m6, k0k1 |
+ palignr m0, m4, 9 |
+ paddsw m1, krd |
+ pmaddubsw m0, k4k5 |
+ mova m4, m5 |
+ psraw m1, 7 |
+ pminsw m5, m0 |
+ paddsw m6, m7 |
+ packuswb m1, m1 |
+ |
+ paddsw m6, m5 |
+ pmaxsw m0, m4 |
+ paddsw m6, m0 |
+ paddsw m6, krd |
+ psraw m6, 7 |
+ packuswb m6, m6 |
+ |
+%ifidn %1, h8_avg |
+ movh m0, [dstq] |
+ movh m2, [dstq + dstrideq] |
+ pavgb m1, m0 |
+ pavgb m6, m2 |
%endif |
- movd [rdi], xmm0 |
- movd [rdi +rdx], xmm2 |
+ movh [dstq], m1 |
+ movh [dstq + dstrideq], m6 |
- lea rsi, [rsi + rax] |
- prefetcht0 [rsi + 4 * rax - 3] |
- lea rsi, [rsi + rax] |
- lea rdi, [rdi + 2 * rdx] |
- prefetcht0 [rsi + 2 * rax - 3] |
+ lea srcq, [srcq + sstrideq ] |
+ prefetcht0 [srcq + 4 * sstrideq - 3] |
+ lea srcq, [srcq + sstrideq ] |
+ lea dstq, [dstq + 2 * dstrideq ] |
+ prefetcht0 [srcq + 2 * sstrideq - 3] |
+ dec heightq |
+ jnz .loop |
- dec rcx |
- jnz .loop |
+ ;Do last row if output_height is odd |
+ mov heightq, orig_height |
+ and heightq, 1 |
+ je .done |
- ; Do last row if output_height is odd |
- movsxd rcx, dword ptr arg(4) ;output_height |
- and rcx, 1 |
- je .done |
+ movh m0, [srcq - 3] |
+ movh m3, [srcq + 5] |
+ punpcklqdq m0, m3 |
- movq xmm0, [rsi - 3] ; load src |
- movq xmm1, [rsi + 5] |
- punpcklqdq xmm0, xmm1 |
+ HORIZx8_ROW m0, m1, m2, m3, m4 |
- HORIZx4_ROW xmm0, xmm1 |
-%if %1 |
- movd xmm1, [rdi] |
- pavgb xmm0, xmm1 |
+%ifidn %1, h8_avg |
+ movh m1, [dstq] |
+ pavgb m0, m1 |
%endif |
- movd [rdi], xmm0 |
-.done |
+ movh [dstq], m0 |
+.done: |
+ RET |
%endm |
-%macro HORIZx8_ROW 4 |
- movdqa %2, %1 |
- movdqa %3, %1 |
- movdqa %4, %1 |
- |
- pshufb %1, [GLOBAL(shuf_t0t1)] |
- pshufb %2, [GLOBAL(shuf_t2t3)] |
- pshufb %3, [GLOBAL(shuf_t4t5)] |
- pshufb %4, [GLOBAL(shuf_t6t7)] |
- |
- pmaddubsw %1, k0k1 |
- pmaddubsw %2, k2k3 |
- pmaddubsw %3, k4k5 |
- pmaddubsw %4, k6k7 |
- |
- paddsw %1, %4 |
- movdqa %4, %2 |
- pmaxsw %2, %3 |
- pminsw %3, %4 |
- paddsw %1, %3 |
- paddsw %1, %2 |
- |
- paddsw %1, krd |
- psraw %1, 7 |
- packuswb %1, %1 |
+;------------------------------------------------------------------------------- |
+%macro SUBPIX_HFILTER16 1 |
+cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 13, LOCAL_VARS_SIZE, \ |
+ src, sstride, dst, dstride, height, filter |
+ mova m4, [filterq] |
+ SETUP_LOCAL_VARS |
+.loop: |
+ prefetcht0 [srcq + 2 * sstrideq -3] |
+ |
+ movh m0, [srcq - 3] |
+ movh m4, [srcq + 5] |
+ movh m6, [srcq + 13] |
+ punpcklqdq m0, m4 |
+ mova m7, m0 |
+ punpckhbw m0, m0 |
+ mova m1, m0 |
+ punpcklqdq m4, m6 |
+ mova m3, m0 |
+ punpcklbw m7, m7 |
+ |
+ palignr m3, m7, 13 |
+ mova m2, m0 |
+ pmaddubsw m3, k6k7 |
+ palignr m0, m7, 1 |
+ pmaddubsw m0, k0k1 |
+ palignr m1, m7, 5 |
+ pmaddubsw m1, k2k3 |
+ palignr m2, m7, 9 |
+ pmaddubsw m2, k4k5 |
+ paddsw m0, m3 |
+ mova m3, m4 |
+ punpckhbw m4, m4 |
+ mova m5, m4 |
+ punpcklbw m3, m3 |
+ mova m7, m4 |
+ palignr m5, m3, 5 |
+ mova m6, m4 |
+ palignr m4, m3, 1 |
+ pmaddubsw m4, k0k1 |
+ pmaddubsw m5, k2k3 |
+ palignr m6, m3, 9 |
+ pmaddubsw m6, k4k5 |
+ palignr m7, m3, 13 |
+ pmaddubsw m7, k6k7 |
+ |
+ mova m3, m1 |
+ pmaxsw m1, m2 |
+ pminsw m2, m3 |
+ paddsw m0, m2 |
+ paddsw m0, m1 |
+ paddsw m4, m7 |
+ mova m7, m5 |
+ pmaxsw m5, m6 |
+ pminsw m6, m7 |
+ paddsw m4, m6 |
+ paddsw m4, m5 |
+ paddsw m0, krd |
+ paddsw m4, krd |
+ psraw m0, 7 |
+ psraw m4, 7 |
+ packuswb m0, m4 |
+%ifidn %1, h8_avg |
+ mova m1, [dstq] |
+ pavgb m0, m1 |
+%endif |
+ lea srcq, [srcq + sstrideq] |
+ mova [dstq], m0 |
+ lea dstq, [dstq + dstrideq] |
+ dec heightq |
+ jnz .loop |
+ RET |
%endm |
-%macro HORIZx8 1 |
- mov rdx, arg(5) ;filter ptr |
- mov rsi, arg(0) ;src_ptr |
- mov rdi, arg(2) ;output_ptr |
- mov rcx, 0x0400040 |
- |
- movdqa xmm4, [rdx] ;load filters |
- movq xmm5, rcx |
- packsswb xmm4, xmm4 |
- pshuflw xmm0, xmm4, 0b ;k0_k1 |
- pshuflw xmm1, xmm4, 01010101b ;k2_k3 |
- pshuflw xmm2, xmm4, 10101010b ;k4_k5 |
- pshuflw xmm3, xmm4, 11111111b ;k6_k7 |
- |
- punpcklqdq xmm0, xmm0 |
- punpcklqdq xmm1, xmm1 |
- punpcklqdq xmm2, xmm2 |
- punpcklqdq xmm3, xmm3 |
- |
- movdqa k0k1, xmm0 |
- movdqa k2k3, xmm1 |
- pshufd xmm5, xmm5, 0 |
- movdqa k4k5, xmm2 |
- movdqa k6k7, xmm3 |
- movdqa krd, xmm5 |
- |
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line |
- movsxd rdx, dword ptr arg(3) ;output_pitch |
- movsxd rcx, dword ptr arg(4) ;output_height |
- shr rcx, 1 |
+INIT_XMM ssse3 |
+SUBPIX_HFILTER16 h8 |
+SUBPIX_HFILTER16 h8_avg |
+SUBPIX_HFILTER8 h8 |
+SUBPIX_HFILTER8 h8_avg |
+SUBPIX_HFILTER4 h8 |
+SUBPIX_HFILTER4 h8_avg |
+ |
+;------------------------------------------------------------------------------- |
+%macro SUBPIX_VFILTER 2 |
+cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \ |
+ src, sstride, dst, dstride, height, filter |
+ mova m4, [filterq] |
+ SETUP_LOCAL_VARS |
+%if ARCH_X86_64 |
+ %define src1q r7 |
+ %define sstride6q r8 |
+ %define dst_stride dstrideq |
+%else |
+ %define src1q filterq |
+ %define sstride6q dstrideq |
+ %define dst_stride dstridemp |
+%endif |
+ mov src1q, srcq |
+ add src1q, sstrideq |
+ lea sstride6q, [sstrideq + sstrideq * 4] |
+ add sstride6q, sstrideq ;pitch * 6 |
+%ifidn %2, 8 |
+ %define movx movh |
+%else |
+ %define movx movd |
+%endif |
.loop: |
- movq xmm0, [rsi - 3] ;load src |
- movq xmm3, [rsi + 5] |
- movq xmm4, [rsi + rax - 3] |
- movq xmm7, [rsi + rax + 5] |
- punpcklqdq xmm0, xmm3 |
- punpcklqdq xmm4, xmm7 |
- |
- HORIZx8_ROW xmm0, xmm1, xmm2, xmm3 |
- HORIZx8_ROW xmm4, xmm5, xmm6, xmm7 |
-%if %1 |
- movq xmm1, [rdi] |
- movq xmm2, [rdi + rdx] |
- pavgb xmm0, xmm1 |
- pavgb xmm4, xmm2 |
+ movx m0, [srcq ] ;A |
+ movx m1, [srcq + sstrideq ] ;B |
+ punpcklbw m0, m1 ;A B |
+ movx m2, [srcq + sstrideq * 2 ] ;C |
+ pmaddubsw m0, k0k1 |
+ mova m6, m2 |
+ movx m3, [src1q + sstrideq * 2] ;D |
+ punpcklbw m2, m3 ;C D |
+ pmaddubsw m2, k2k3 |
+ movx m4, [srcq + sstrideq * 4 ] ;E |
+ mova m7, m4 |
+ movx m5, [src1q + sstrideq * 4] ;F |
+ punpcklbw m4, m5 ;E F |
+ pmaddubsw m4, k4k5 |
+ punpcklbw m1, m6 ;A B next iter |
+ movx m6, [srcq + sstride6q ] ;G |
+ punpcklbw m5, m6 ;E F next iter |
+ punpcklbw m3, m7 ;C D next iter |
+ pmaddubsw m5, k4k5 |
+ movx m7, [src1q + sstride6q ] ;H |
+ punpcklbw m6, m7 ;G H |
+ pmaddubsw m6, k6k7 |
+ mova tmp, m2 |
+ pmaddubsw m3, k2k3 |
+ pmaddubsw m1, k0k1 |
+ pmaxsw m2, m4 |
+ paddsw m0, m6 |
+ movx m6, [srcq + sstrideq * 8 ] ;H next iter |
+ punpcklbw m7, m6 |
+ pmaddubsw m7, k6k7 |
+ pminsw m4, tmp |
+ paddsw m0, m4 |
+ mova m4, m3 |
+ paddsw m0, m2 |
+ pminsw m3, m5 |
+ pmaxsw m5, m4 |
+ paddsw m0, krd |
+ psraw m0, 7 |
+ paddsw m1, m7 |
+ packuswb m0, m0 |
+ |
+ paddsw m1, m3 |
+ paddsw m1, m5 |
+ paddsw m1, krd |
+ psraw m1, 7 |
+ lea srcq, [srcq + sstrideq * 2 ] |
+ lea src1q, [src1q + sstrideq * 2] |
+ packuswb m1, m1 |
+ |
+%ifidn %1, v8_avg |
+ movx m2, [dstq] |
+ pavgb m0, m2 |
%endif |
- movq [rdi], xmm0 |
- movq [rdi + rdx], xmm4 |
- |
- lea rsi, [rsi + rax] |
- prefetcht0 [rsi + 4 * rax - 3] |
- lea rsi, [rsi + rax] |
- lea rdi, [rdi + 2 * rdx] |
- prefetcht0 [rsi + 2 * rax - 3] |
- dec rcx |
- jnz .loop |
- |
- ;Do last row if output_height is odd |
- movsxd rcx, dword ptr arg(4) ;output_height |
- and rcx, 1 |
- je .done |
- |
- movq xmm0, [rsi - 3] |
- movq xmm3, [rsi + 5] |
- punpcklqdq xmm0, xmm3 |
- |
- HORIZx8_ROW xmm0, xmm1, xmm2, xmm3 |
-%if %1 |
- movq xmm1, [rdi] |
- pavgb xmm0, xmm1 |
+ movx [dstq], m0 |
+ add dstq, dst_stride |
+%ifidn %1, v8_avg |
+ movx m3, [dstq] |
+ pavgb m1, m3 |
%endif |
- movq [rdi], xmm0 |
-.done |
+ movx [dstq], m1 |
+ add dstq, dst_stride |
+ sub heightq, 2 |
+ cmp heightq, 1 |
+ jg .loop |
+ |
+ cmp heightq, 0 |
+ je .done |
+ |
+ movx m0, [srcq ] ;A |
+ movx m1, [srcq + sstrideq ] ;B |
+ movx m6, [srcq + sstride6q ] ;G |
+ punpcklbw m0, m1 ;A B |
+    movx                         m7, [src1q + sstride6q       ]     ;H |
+ pmaddubsw m0, k0k1 |
+ movx m2, [srcq + sstrideq * 2 ] ;C |
+ punpcklbw m6, m7 ;G H |
+    movx                         m3, [src1q + sstrideq * 2    ]     ;D |
+ pmaddubsw m6, k6k7 |
+ movx m4, [srcq + sstrideq * 4 ] ;E |
+ punpcklbw m2, m3 ;C D |
+ movx m5, [src1q + sstrideq * 4] ;F |
+ punpcklbw m4, m5 ;E F |
+ pmaddubsw m2, k2k3 |
+ pmaddubsw m4, k4k5 |
+ paddsw m0, m6 |
+ mova m1, m2 |
+ pmaxsw m2, m4 |
+ pminsw m4, m1 |
+ paddsw m0, m4 |
+ paddsw m0, m2 |
+ paddsw m0, krd |
+ psraw m0, 7 |
+ packuswb m0, m0 |
+%ifidn %1, v8_avg |
+ movx m1, [dstq] |
+ pavgb m0, m1 |
+%endif |
+ movx [dstq], m0 |
+.done: |
+ RET |
%endm |
-%macro HORIZx16 1 |
- mov rdx, arg(5) ;filter ptr |
- mov rsi, arg(0) ;src_ptr |
- mov rdi, arg(2) ;output_ptr |
- mov rcx, 0x0400040 |
- |
- movdqa xmm4, [rdx] ;load filters |
- movq xmm5, rcx |
- packsswb xmm4, xmm4 |
- pshuflw xmm0, xmm4, 0b ;k0_k1 |
- pshuflw xmm1, xmm4, 01010101b ;k2_k3 |
- pshuflw xmm2, xmm4, 10101010b ;k4_k5 |
- pshuflw xmm3, xmm4, 11111111b ;k6_k7 |
- |
- punpcklqdq xmm0, xmm0 |
- punpcklqdq xmm1, xmm1 |
- punpcklqdq xmm2, xmm2 |
- punpcklqdq xmm3, xmm3 |
- |
- movdqa k0k1, xmm0 |
- movdqa k2k3, xmm1 |
- pshufd xmm5, xmm5, 0 |
- movdqa k4k5, xmm2 |
- movdqa k6k7, xmm3 |
- movdqa krd, xmm5 |
- |
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line |
- movsxd rdx, dword ptr arg(3) ;output_pitch |
- movsxd rcx, dword ptr arg(4) ;output_height |
+;------------------------------------------------------------------------------- |
+%macro SUBPIX_VFILTER16 1 |
+cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 13, LOCAL_VARS_SIZE, \ |
+ src, sstride, dst, dstride, height, filter |
+ |
+ mova m4, [filterq] |
+ SETUP_LOCAL_VARS |
+%if ARCH_X86_64 |
+ %define src1q r7 |
+ %define sstride6q r8 |
+ %define dst_stride dstrideq |
+%else |
+ %define src1q filterq |
+ %define sstride6q dstrideq |
+ %define dst_stride dstridemp |
+%endif |
+ mov src1q, srcq |
+ add src1q, sstrideq |
+ lea sstride6q, [sstrideq + sstrideq * 4] |
+ add sstride6q, sstrideq ;pitch * 6 |
.loop: |
- prefetcht0 [rsi + 2 * rax -3] |
- |
- movq xmm0, [rsi - 3] ;load src data |
- movq xmm4, [rsi + 5] |
- movq xmm6, [rsi + 13] |
- punpcklqdq xmm0, xmm4 |
- punpcklqdq xmm4, xmm6 |
- |
- movdqa xmm7, xmm0 |
- |
- punpcklbw xmm7, xmm7 |
- punpckhbw xmm0, xmm0 |
- movdqa xmm1, xmm0 |
- movdqa xmm2, xmm0 |
- movdqa xmm3, xmm0 |
- |
- palignr xmm0, xmm7, 1 |
- palignr xmm1, xmm7, 5 |
- pmaddubsw xmm0, k0k1 |
- palignr xmm2, xmm7, 9 |
- pmaddubsw xmm1, k2k3 |
- palignr xmm3, xmm7, 13 |
- |
- pmaddubsw xmm2, k4k5 |
- pmaddubsw xmm3, k6k7 |
- paddsw xmm0, xmm3 |
- |
- movdqa xmm3, xmm4 |
- punpcklbw xmm3, xmm3 |
- punpckhbw xmm4, xmm4 |
- |
- movdqa xmm5, xmm4 |
- movdqa xmm6, xmm4 |
- movdqa xmm7, xmm4 |
- |
- palignr xmm4, xmm3, 1 |
- palignr xmm5, xmm3, 5 |
- palignr xmm6, xmm3, 9 |
- palignr xmm7, xmm3, 13 |
- |
- movdqa xmm3, xmm1 |
- pmaddubsw xmm4, k0k1 |
- pmaxsw xmm1, xmm2 |
- pmaddubsw xmm5, k2k3 |
- pminsw xmm2, xmm3 |
- pmaddubsw xmm6, k4k5 |
- paddsw xmm0, xmm2 |
- pmaddubsw xmm7, k6k7 |
- paddsw xmm0, xmm1 |
- |
- paddsw xmm4, xmm7 |
- movdqa xmm7, xmm5 |
- pmaxsw xmm5, xmm6 |
- pminsw xmm6, xmm7 |
- paddsw xmm4, xmm6 |
- paddsw xmm4, xmm5 |
- |
- paddsw xmm0, krd |
- paddsw xmm4, krd |
- psraw xmm0, 7 |
- psraw xmm4, 7 |
- packuswb xmm0, xmm0 |
- packuswb xmm4, xmm4 |
- punpcklqdq xmm0, xmm4 |
-%if %1 |
- movdqa xmm1, [rdi] |
- pavgb xmm0, xmm1 |
+ movh m0, [srcq ] ;A |
+ movh m1, [srcq + sstrideq ] ;B |
+ movh m2, [srcq + sstrideq * 2 ] ;C |
+ movh m3, [src1q + sstrideq * 2] ;D |
+ movh m4, [srcq + sstrideq * 4 ] ;E |
+ movh m5, [src1q + sstrideq * 4] ;F |
+ |
+ punpcklbw m0, m1 ;A B |
+ movh m6, [srcq + sstride6q] ;G |
+ punpcklbw m2, m3 ;C D |
+ movh m7, [src1q + sstride6q] ;H |
+ punpcklbw m4, m5 ;E F |
+ pmaddubsw m0, k0k1 |
+ movh m3, [srcq + 8] ;A |
+ pmaddubsw m2, k2k3 |
+ punpcklbw m6, m7 ;G H |
+ movh m5, [srcq + sstrideq + 8] ;B |
+ pmaddubsw m4, k4k5 |
+ punpcklbw m3, m5 ;A B |
+ movh m7, [srcq + sstrideq * 2 + 8] ;C |
+ pmaddubsw m6, k6k7 |
+ mova m1, m2 |
+ movh m5, [src1q + sstrideq * 2 + 8] ;D |
+ pmaxsw m2, m4 |
+ punpcklbw m7, m5 ;C D |
+ pminsw m4, m1 |
+ paddsw m0, m6 |
+ pmaddubsw m3, k0k1 |
+ movh m1, [srcq + sstrideq * 4 + 8] ;E |
+ paddsw m0, m4 |
+ pmaddubsw m7, k2k3 |
+ movh m6, [src1q + sstrideq * 4 + 8] ;F |
+ punpcklbw m1, m6 ;E F |
+ paddsw m0, m2 |
+ paddsw m0, krd |
+ movh m2, [srcq + sstride6q + 8] ;G |
+ pmaddubsw m1, k4k5 |
+ movh m5, [src1q + sstride6q + 8] ;H |
+ psraw m0, 7 |
+ punpcklbw m2, m5 ;G H |
+ packuswb m0, m0 |
+ pmaddubsw m2, k6k7 |
+%ifidn %1, v8_avg |
+ movh m4, [dstq] |
+ pavgb m0, m4 |
%endif |
- |
- lea rsi, [rsi + rax] |
- movdqa [rdi], xmm0 |
- |
- lea rdi, [rdi + rdx] |
- dec rcx |
- jnz .loop |
+ movh [dstq], m0 |
+ mova m6, m7 |
+ pmaxsw m7, m1 |
+ pminsw m1, m6 |
+ paddsw m3, m2 |
+ paddsw m3, m1 |
+ paddsw m3, m7 |
+ paddsw m3, krd |
+ psraw m3, 7 |
+ packuswb m3, m3 |
+ |
+ add srcq, sstrideq |
+ add src1q, sstrideq |
+%ifidn %1, v8_avg |
+ movh m1, [dstq + 8] |
+ pavgb m3, m1 |
+%endif |
+ movh [dstq + 8], m3 |
+ add dstq, dst_stride |
+ dec heightq |
+ jnz .loop |
+ RET |
%endm |
-;void vpx_filter_block1d4_h8_ssse3 |
-;( |
-; unsigned char *src_ptr, |
-; unsigned int src_pixels_per_line, |
-; unsigned char *output_ptr, |
-; unsigned int output_pitch, |
-; unsigned int output_height, |
-; short *filter |
-;) |
-global sym(vpx_filter_block1d4_h8_ssse3) PRIVATE |
-sym(vpx_filter_block1d4_h8_ssse3): |
- push rbp |
- mov rbp, rsp |
- SHADOW_ARGS_TO_STACK 6 |
- SAVE_XMM 7 |
- GET_GOT rbx |
- push rsi |
- push rdi |
- ; end prolog |
- |
- ALIGN_STACK 16, rax |
- sub rsp, 16 * 3 |
- %define k0k1k4k5 [rsp + 16 * 0] |
- %define k2k3k6k7 [rsp + 16 * 1] |
- %define krd [rsp + 16 * 2] |
- |
- HORIZx4 0 |
- |
- add rsp, 16 * 3 |
- pop rsp |
- ; begin epilog |
- pop rdi |
- pop rsi |
- RESTORE_GOT |
- RESTORE_XMM |
- UNSHADOW_ARGS |
- pop rbp |
- ret |
- |
-;void vpx_filter_block1d8_h8_ssse3 |
-;( |
-; unsigned char *src_ptr, |
-; unsigned int src_pixels_per_line, |
-; unsigned char *output_ptr, |
-; unsigned int output_pitch, |
-; unsigned int output_height, |
-; short *filter |
-;) |
-global sym(vpx_filter_block1d8_h8_ssse3) PRIVATE |
-sym(vpx_filter_block1d8_h8_ssse3): |
- push rbp |
- mov rbp, rsp |
- SHADOW_ARGS_TO_STACK 6 |
- SAVE_XMM 7 |
- GET_GOT rbx |
- push rsi |
- push rdi |
- ; end prolog |
- |
- ALIGN_STACK 16, rax |
- sub rsp, 16*5 |
- %define k0k1 [rsp + 16*0] |
- %define k2k3 [rsp + 16*1] |
- %define k4k5 [rsp + 16*2] |
- %define k6k7 [rsp + 16*3] |
- %define krd [rsp + 16*4] |
- |
- HORIZx8 0 |
- |
- add rsp, 16*5 |
- pop rsp |
- |
- ; begin epilog |
- pop rdi |
- pop rsi |
- RESTORE_GOT |
- RESTORE_XMM |
- UNSHADOW_ARGS |
- pop rbp |
- ret |
- |
-;void vpx_filter_block1d16_h8_ssse3 |
-;( |
-; unsigned char *src_ptr, |
-; unsigned int src_pixels_per_line, |
-; unsigned char *output_ptr, |
-; unsigned int output_pitch, |
-; unsigned int output_height, |
-; short *filter |
-;) |
-global sym(vpx_filter_block1d16_h8_ssse3) PRIVATE |
-sym(vpx_filter_block1d16_h8_ssse3): |
- push rbp |
- mov rbp, rsp |
- SHADOW_ARGS_TO_STACK 6 |
- SAVE_XMM 7 |
- GET_GOT rbx |
- push rsi |
- push rdi |
- ; end prolog |
- |
- ALIGN_STACK 16, rax |
- sub rsp, 16*5 |
- %define k0k1 [rsp + 16*0] |
- %define k2k3 [rsp + 16*1] |
- %define k4k5 [rsp + 16*2] |
- %define k6k7 [rsp + 16*3] |
- %define krd [rsp + 16*4] |
- |
- HORIZx16 0 |
- |
- add rsp, 16*5 |
- pop rsp |
- |
- ; begin epilog |
- pop rdi |
- pop rsi |
- RESTORE_GOT |
- RESTORE_XMM |
- UNSHADOW_ARGS |
- pop rbp |
- ret |
- |
-global sym(vpx_filter_block1d4_h8_avg_ssse3) PRIVATE |
-sym(vpx_filter_block1d4_h8_avg_ssse3): |
- push rbp |
- mov rbp, rsp |
- SHADOW_ARGS_TO_STACK 6 |
- SAVE_XMM 7 |
- GET_GOT rbx |
- push rsi |
- push rdi |
- ; end prolog |
- |
- ALIGN_STACK 16, rax |
- sub rsp, 16 * 3 |
- %define k0k1k4k5 [rsp + 16 * 0] |
- %define k2k3k6k7 [rsp + 16 * 1] |
- %define krd [rsp + 16 * 2] |
- |
- HORIZx4 1 |
- |
- add rsp, 16 * 3 |
- pop rsp |
- ; begin epilog |
- pop rdi |
- pop rsi |
- RESTORE_GOT |
- RESTORE_XMM |
- UNSHADOW_ARGS |
- pop rbp |
- ret |
- |
-global sym(vpx_filter_block1d8_h8_avg_ssse3) PRIVATE |
-sym(vpx_filter_block1d8_h8_avg_ssse3): |
- push rbp |
- mov rbp, rsp |
- SHADOW_ARGS_TO_STACK 6 |
- SAVE_XMM 7 |
- GET_GOT rbx |
- push rsi |
- push rdi |
- ; end prolog |
- |
- ALIGN_STACK 16, rax |
- sub rsp, 16*5 |
- %define k0k1 [rsp + 16*0] |
- %define k2k3 [rsp + 16*1] |
- %define k4k5 [rsp + 16*2] |
- %define k6k7 [rsp + 16*3] |
- %define krd [rsp + 16*4] |
- |
- HORIZx8 1 |
- |
- add rsp, 16*5 |
- pop rsp |
- |
- ; begin epilog |
- pop rdi |
- pop rsi |
- RESTORE_GOT |
- RESTORE_XMM |
- UNSHADOW_ARGS |
- pop rbp |
- ret |
- |
-global sym(vpx_filter_block1d16_h8_avg_ssse3) PRIVATE |
-sym(vpx_filter_block1d16_h8_avg_ssse3): |
- push rbp |
- mov rbp, rsp |
- SHADOW_ARGS_TO_STACK 6 |
- SAVE_XMM 7 |
- GET_GOT rbx |
- push rsi |
- push rdi |
- ; end prolog |
- |
- ALIGN_STACK 16, rax |
- sub rsp, 16*5 |
- %define k0k1 [rsp + 16*0] |
- %define k2k3 [rsp + 16*1] |
- %define k4k5 [rsp + 16*2] |
- %define k6k7 [rsp + 16*3] |
- %define krd [rsp + 16*4] |
- |
- HORIZx16 1 |
- |
- add rsp, 16*5 |
- pop rsp |
- |
- ; begin epilog |
- pop rdi |
- pop rsi |
- RESTORE_GOT |
- RESTORE_XMM |
- UNSHADOW_ARGS |
- pop rbp |
- ret |
-SECTION_RODATA |
-align 16 |
-shuf_t0t1: |
- db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
-align 16 |
-shuf_t2t3: |
- db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
-align 16 |
-shuf_t4t5: |
- db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 |
-align 16 |
-shuf_t6t7: |
- db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
+INIT_XMM ssse3 |
+SUBPIX_VFILTER16 v8 |
+SUBPIX_VFILTER16 v8_avg |
+SUBPIX_VFILTER v8, 8 |
+SUBPIX_VFILTER v8_avg, 8 |
+SUBPIX_VFILTER v8, 4 |
+SUBPIX_VFILTER v8_avg, 4 |