Index: source/libvpx/vp9/common/x86/vp9_loopfilter_sse2.asm |
=================================================================== |
--- source/libvpx/vp9/common/x86/vp9_loopfilter_sse2.asm (revision 0) |
+++ source/libvpx/vp9/common/x86/vp9_loopfilter_sse2.asm (revision 0) |
@@ -0,0 +1,1279 @@
+; |
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
+; |
+; Use of this source code is governed by a BSD-style license |
+; that can be found in the LICENSE file in the root of the source |
+; tree. An additional intellectual property rights grant can be found |
+; in the file PATENTS. All contributing project authors may |
+; be found in the AUTHORS file in the root of the source tree. |
+; |
+ |
+ |
+%include "vpx_ports/x86_abi_support.asm" |
+ |
+; Using pmaxub instead of psubusb to compute the filter mask was
+; seen in ffvp8.
+ |
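+;
+; With unsigned bytes there is no single-instruction absolute difference
+; into a register, so abs(a - b) is built from two saturating subtractions:
+;     psubusb  x, y        ; max(a - b, 0)
+;     psubusb  y, x        ; max(b - a, 0)
+;     por      x, y        ; abs(a - b)
+; Since each abs() here only feeds a worst-case test against the limit,
+; pmaxub folds the per-pair results into one running byte-wise maximum.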
+%macro LFH_FILTER_AND_HEV_MASK 1 |
+%if %1 |
+ movdqa xmm2, [rdi+2*rax] ; q3 |
+ movdqa xmm1, [rsi+2*rax] ; q2 |
+ movdqa xmm4, [rsi+rax] ; q1 |
+ movdqa xmm5, [rsi] ; q0 |
+ neg rax ; negate pitch to deal with above border |
+%else |
+ movlps xmm2, [rsi + rcx*2] ; q3 |
+ movlps xmm1, [rsi + rcx] ; q2 |
+ movlps xmm4, [rsi] ; q1 |
+ movlps xmm5, [rsi + rax] ; q0 |
+ |
+ movhps xmm2, [rdi + rcx*2] |
+ movhps xmm1, [rdi + rcx] |
+ movhps xmm4, [rdi] |
+ movhps xmm5, [rdi + rax] |
+ |
+ lea rsi, [rsi + rax*4] |
+ lea rdi, [rdi + rax*4] |
+ |
+ movdqa XMMWORD PTR [rsp], xmm1 ; store q2 |
+ movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1 |
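+    ; for the UV case the low qword of each register holds 8 U pixels (from
+    ; rsi) and the high qword 8 V pixels (from rdi), so both chroma planes
+    ; are filtered in one 16-lane pass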
+%endif |
+ |
+ movdqa xmm6, xmm1 ; q2 |
+ movdqa xmm3, xmm4 ; q1 |
+ |
+ psubusb xmm1, xmm2 ; q2-=q3 |
+ psubusb xmm2, xmm6 ; q3-=q2 |
+ |
+ psubusb xmm4, xmm6 ; q1-=q2 |
+ psubusb xmm6, xmm3 ; q2-=q1 |
+ |
+ por xmm4, xmm6 ; abs(q2-q1) |
+ por xmm1, xmm2 ; abs(q3-q2) |
+ |
+ movdqa xmm0, xmm5 ; q0 |
+ pmaxub xmm1, xmm4 |
+ |
+ psubusb xmm5, xmm3 ; q0-=q1 |
+ psubusb xmm3, xmm0 ; q1-=q0 |
+ |
+ por xmm5, xmm3 ; abs(q0-q1) |
+ movdqa t0, xmm5 ; save to t0 |
+ |
+ pmaxub xmm1, xmm5 |
+ |
+%if %1 |
+ movdqa xmm2, [rsi+4*rax] ; p3 |
+ movdqa xmm4, [rdi+4*rax] ; p2 |
+ movdqa xmm6, [rsi+2*rax] ; p1 |
+%else |
+ movlps xmm2, [rsi + rax] ; p3 |
+ movlps xmm4, [rsi] ; p2 |
+ movlps xmm6, [rsi + rcx] ; p1 |
+ |
+ movhps xmm2, [rdi + rax] |
+ movhps xmm4, [rdi] |
+ movhps xmm6, [rdi + rcx] |
+ |
+ movdqa XMMWORD PTR [rsp + 32], xmm4 ; store p2 |
+ movdqa XMMWORD PTR [rsp + 48], xmm6 ; store p1 |
+%endif |
+ |
+ movdqa xmm5, xmm4 ; p2 |
+ movdqa xmm3, xmm6 ; p1 |
+ |
+ psubusb xmm4, xmm2 ; p2-=p3 |
+ psubusb xmm2, xmm5 ; p3-=p2 |
+ |
+ psubusb xmm3, xmm5 ; p1-=p2 |
+ pmaxub xmm1, xmm4 ; abs(p3 - p2) |
+ |
+ psubusb xmm5, xmm6 ; p2-=p1 |
+ pmaxub xmm1, xmm2 ; abs(p3 - p2) |
+ |
+ pmaxub xmm1, xmm5 ; abs(p2 - p1) |
+ movdqa xmm2, xmm6 ; p1 |
+ |
+ pmaxub xmm1, xmm3 ; abs(p2 - p1) |
+%if %1 |
+ movdqa xmm4, [rsi+rax] ; p0 |
+ movdqa xmm3, [rdi] ; q1 |
+%else |
+ movlps xmm4, [rsi + rcx*2] ; p0 |
+ movhps xmm4, [rdi + rcx*2] |
+ movdqa xmm3, q1 ; q1 |
+%endif |
+ |
+ movdqa xmm5, xmm4 ; p0 |
+ psubusb xmm4, xmm6 ; p0-=p1 |
+ |
+ psubusb xmm6, xmm5 ; p1-=p0 |
+ |
+ por xmm6, xmm4 ; abs(p1 - p0) |
+ mov rdx, arg(2) ; get blimit |
+ |
+ movdqa t1, xmm6 ; save to t1 |
+ |
+ movdqa xmm4, xmm3 ; q1 |
+ pmaxub xmm1, xmm6 |
+ |
+ psubusb xmm3, xmm2 ; q1-=p1 |
+ psubusb xmm2, xmm4 ; p1-=q1 |
+ |
+ psubusb xmm1, xmm7 ; mask > limit
+ por xmm2, xmm3 ; abs(p1-q1) |
+ |
+ movdqa xmm7, XMMWORD PTR [rdx] ; blimit |
+ |
+ movdqa xmm3, xmm0 ; q0 |
+ pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero |
+ |
+ mov rdx, arg(4) ; hev get thresh |
+ |
+ movdqa xmm6, xmm5 ; p0 |
+ psrlw xmm2, 1 ; abs(p1-q1)/2 |
+ |
+ psubusb xmm5, xmm3 ; p0-=q0 |
+ |
+ psubusb xmm3, xmm6 ; q0-=p0 |
+ por xmm5, xmm3 ; abs(p0 - q0) |
+ |
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2 |
+ |
+ movdqa xmm4, t0 ; hev get abs (q1 - q0) |
+ |
+ movdqa xmm3, t1 ; get abs (p1 - p0) |
+ |
+ paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 |
+ |
+ movdqa xmm2, XMMWORD PTR [rdx] ; hev |
+ |
+ psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit |
+ psubusb xmm4, xmm2 ; hev |
+ |
+ psubusb xmm3, xmm2 ; hev |
+ por xmm1, xmm5 |
+ |
+ pxor xmm7, xmm7 |
+ paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh |
+ |
+ pcmpeqb xmm4, xmm7 ; hev (0xff where not hev)
+ pcmpeqb xmm3, xmm3 ; hev (all ones)
+ |
+ pcmpeqb xmm1, xmm7 ; mask xmm1 |
+ pxor xmm4, xmm3 ; hev |
+%endmacro |
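+; LFH_FILTER_AND_HEV_MASK leaves xmm1 = filter mask (0xff in lanes where all
+; limit/blimit tests pass and the edge gets filtered) and xmm4 = high edge
+; variance mask (0xff where abs(p1-p0) or abs(q1-q0) exceeds thresh).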
+ |
+%macro B_FILTER 1 |
+%if %1 == 0 |
+ movdqa xmm2, p1 ; p1 |
+ movdqa xmm7, q1 ; q1 |
+%elif %1 == 1 |
+ movdqa xmm2, [rsi+2*rax] ; p1 |
+ movdqa xmm7, [rdi] ; q1 |
+%elif %1 == 2 |
+ lea rdx, srct |
+ |
+ movdqa xmm2, [rdx] ; p1 |
+ movdqa xmm7, [rdx+48] ; q1 |
+ movdqa xmm6, [rdx+16] ; p0 |
+ movdqa xmm0, [rdx+32] ; q0 |
+%endif |
+ |
+ pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values |
+ pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values |
+ |
+ psubsb xmm2, xmm7 ; p1 - q1 |
+ pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values |
+ |
+ pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1) |
+ pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values |
+ |
+ movdqa xmm3, xmm0 ; q0 |
+ psubsb xmm0, xmm6 ; q0 - p0 |
+ |
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1) |
+ |
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1) |
+ |
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1) |
+ |
+ pand xmm1, xmm2 ; mask filter values we don't care about |
+ |
+ movdqa xmm2, xmm1 |
+ |
+ paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 |
+ paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 |
+ |
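+    ; SSE2 has no per-byte arithmetic shift, so each byte is placed in the
+    ; high half of a 16-bit lane (punpck*bw) where psraw by 11 gives a
+    ; sign-extended (byte >> 3); packsswb then narrows back to bytes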
+ punpckhbw xmm5, xmm2 ; axbxcxdx |
+ punpcklbw xmm2, xmm2 ; exfxgxhx |
+ |
+ punpcklbw xmm0, xmm1 ; exfxgxhx |
+ psraw xmm5, 11 ; sign extended shift right by 3 |
+ |
+ punpckhbw xmm1, xmm1 ; axbxcxdx |
+ psraw xmm2, 11 ; sign extended shift right by 3 |
+ |
+ packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; |
+ psraw xmm0, 11 ; sign extended shift right by 3 |
+ |
+ psraw xmm1, 11 ; sign extended shift right by 3 |
+ movdqa xmm5, xmm0 ; save results |
+ |
+ packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 |
+ paddsw xmm5, [GLOBAL(ones)] |
+ |
+ paddsw xmm1, [GLOBAL(ones)] |
+ psraw xmm5, 1 ; partial shifted one more time for 2nd tap |
+ |
+ psraw xmm1, 1 ; partial shifted one more time for 2nd tap |
+ |
+ paddsb xmm6, xmm2 ; p0+= p0 add |
+ packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 |
+ |
+%if %1 == 0 |
+ movdqa xmm1, p1 ; p1 |
+%elif %1 == 1 |
+ movdqa xmm1, [rsi+2*rax] ; p1 |
+%elif %1 == 2 |
+ movdqa xmm1, [rdx] ; p1 |
+%endif |
+ pandn xmm4, xmm5 ; high edge variance additive |
+ pxor xmm6, [GLOBAL(t80)] ; unoffset |
+ |
+ pxor xmm1, [GLOBAL(t80)] ; reoffset |
+ psubsb xmm3, xmm0 ; q0-= q0 add |
+ |
+ paddsb xmm1, xmm4 ; p1+= p1 add |
+ pxor xmm3, [GLOBAL(t80)] ; unoffset |
+ |
+ pxor xmm1, [GLOBAL(t80)] ; unoffset |
+ psubsb xmm7, xmm4 ; q1-= q1 add |
+ |
+ pxor xmm7, [GLOBAL(t80)] ; unoffset |
+%if %1 == 0 |
+ lea rsi, [rsi + rcx*2] |
+ lea rdi, [rdi + rcx*2] |
+ movq MMWORD PTR [rsi], xmm6 ; p0 |
+ movhps MMWORD PTR [rdi], xmm6 |
+ movq MMWORD PTR [rsi + rax], xmm1 ; p1 |
+ movhps MMWORD PTR [rdi + rax], xmm1 |
+ movq MMWORD PTR [rsi + rcx], xmm3 ; q0 |
+ movhps MMWORD PTR [rdi + rcx], xmm3 |
+ movq MMWORD PTR [rsi + rcx*2],xmm7 ; q1 |
+ movhps MMWORD PTR [rdi + rcx*2],xmm7 |
+%elif %1 == 1 |
+ movdqa [rsi+rax], xmm6 ; write back |
+ movdqa [rsi+2*rax], xmm1 ; write back |
+ movdqa [rsi], xmm3 ; write back |
+ movdqa [rdi], xmm7 ; write back |
+%endif |
+ |
+%endmacro |
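+; B_FILTER above is the common 4-tap filter core.  In C terms, approximately
+; (pixels are XORed with 0x80 first so signed saturating byte ops apply):
+;     F       = mask & clamp(3 * (q0 - p0) + hvm(p1 - q1))
+;     Filter1 = clamp(F + 4) >> 3;     q0 -= Filter1
+;     Filter2 = clamp(F + 3) >> 3;     p0 += Filter2
+;     u       = (Filter1 + 1) >> 1;    p1 += u, q1 -= u   (only where !hev)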
+ |
+ |
+;void vp9_loop_filter_horizontal_edge_sse2 |
+;( |
+; unsigned char *src_ptr, |
+; int src_pixel_step, |
+; const char *blimit, |
+; const char *limit, |
+; const char *thresh, |
+; int count |
+;) |
+global sym(vp9_loop_filter_horizontal_edge_sse2) |
+sym(vp9_loop_filter_horizontal_edge_sse2): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 6 |
+ SAVE_XMM 7 |
+ GET_GOT rbx |
+ push rsi |
+ push rdi |
+ ; end prolog |
+ |
+ ALIGN_STACK 16, rax |
+ sub rsp, 32 ; reserve 32 bytes |
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; |
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; |
+ |
+ mov rsi, arg(0) ;src_ptr |
+ movsxd rax, dword ptr arg(1) ;src_pixel_step |
+ |
+ mov rdx, arg(3) ;limit |
+ movdqa xmm7, XMMWORD PTR [rdx] |
+ |
+ lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing |
+ |
+ ; calculate breakout conditions and high edge variance |
+ LFH_FILTER_AND_HEV_MASK 1 |
+ ; filter and write back the result |
+ B_FILTER 1 |
+ |
+ add rsp, 32 |
+ pop rsp |
+ ; begin epilog |
+ pop rdi |
+ pop rsi |
+ RESTORE_GOT |
+ RESTORE_XMM |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+ |
+;void vp9_loop_filter_horizontal_edge_uv_sse2 |
+;( |
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;) |
+global sym(vp9_loop_filter_horizontal_edge_uv_sse2) |
+sym(vp9_loop_filter_horizontal_edge_uv_sse2): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 6 |
+ SAVE_XMM 7 |
+ GET_GOT rbx |
+ push rsi |
+ push rdi |
+ ; end prolog |
+ |
+ ALIGN_STACK 16, rax |
+ sub rsp, 96 ; reserve 96 bytes |
+ %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16]; |
+ %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16]; |
+ %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16]; |
+ %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16]; |
+ %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16]; |
+ %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16]; |
+ |
+ mov rsi, arg(0) ; u |
+ mov rdi, arg(5) ; v |
+ movsxd rax, dword ptr arg(1) ; src_pixel_step |
+ mov rcx, rax |
+ neg rax ; negate pitch to deal with above border |
+ |
+ mov rdx, arg(3) ;limit |
+ movdqa xmm7, XMMWORD PTR [rdx] |
+ |
+ lea rsi, [rsi + rcx] |
+ lea rdi, [rdi + rcx] |
+ |
+ ; calculate breakout conditions and high edge variance |
+ LFH_FILTER_AND_HEV_MASK 0 |
+ ; filter and write back the result |
+ B_FILTER 0 |
+ |
+ add rsp, 96 |
+ pop rsp |
+ ; begin epilog |
+ pop rdi |
+ pop rsi |
+ RESTORE_GOT |
+ RESTORE_XMM |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+ |
+%macro TRANSPOSE_16X8 2 |
+ movq xmm4, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00 |
+ movq xmm1, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10 |
+ movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20 |
+ movq xmm7, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30 |
+ movq xmm5, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40 |
+ movq xmm2, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50 |
+ |
+ punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 |
+ |
+ movq xmm1, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70 |
+ |
+ movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 |
+ punpcklbw xmm0, xmm7 ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
+ |
+ movq xmm7, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60 |
+ |
+ punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 |
+%if %1 |
+ lea rsi, [rsi+rax*8] |
+%else |
+ mov rsi, arg(5) ; v_ptr |
+%endif |
+ |
+ movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 |
+ punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60 |
+ |
+ punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 |
+ |
+ punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44 |
+%if %1 |
+ lea rdi, [rdi+rax*8] |
+%else |
+ lea rsi, [rsi - 4] |
+%endif |
+ |
+ punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 |
+%if %1 |
+ lea rdx, srct |
+%else |
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing |
+%endif |
+ |
+ movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 |
+ punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 |
+ |
+ movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 |
+ punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 |
+ |
+ punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 |
+ |
+ punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 |
+ |
+ punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 |
+ |
+ movdqa t0, xmm2 ; save to free XMM2 |
+ movq xmm2, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80 |
+ movq xmm6, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90 |
+ movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0 |
+ movq xmm5, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0 |
+ movq xmm1, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0 |
+ |
+ punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 |
+ |
+ movq xmm6, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0 |
+ |
+ punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0 |
+ |
+ movq xmm5, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0 |
+ |
+ punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
+ |
+ movq xmm6, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0 |
+ |
+ punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0 |
+ |
+ movdqa xmm6, xmm1 ; |
+ punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4 |
+ |
+ punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 |
+ movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 |
+ |
+ punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 |
+ |
+ punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 |
+ |
+ movdqa xmm0, xmm5 |
+ punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 |
+ |
+ punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 |
+ movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 |
+ |
+ punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84 |
+ |
+ punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86 |
+ movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 |
+ |
+ punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 |
+ |
+ punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 |
+%if %2 |
+ movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 |
+ punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 |
+ |
+ punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 |
+ |
+ movdqa [rdx], xmm2 ; save 2 |
+ |
+ movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 |
+ punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 |
+ |
+ movdqa [rdx+16], xmm3 ; save 3 |
+ |
+ punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 |
+ |
+ movdqa [rdx+32], xmm4 ; save 4 |
+ movdqa [rdx+48], xmm5 ; save 5 |
+ movdqa xmm1, t0 ; get rows 0-1 saved in t0
+ |
+ movdqa xmm2, xmm1 ; |
+ punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 |
+ |
+ punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 |
+%else |
+ movdqa [rdx+112], xmm7 ; save 7 |
+ |
+ movdqa [rdx+96], xmm6 ; save 6 |
+ |
+ movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 |
+ punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 |
+ |
+ punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 |
+ |
+ movdqa [rdx+32], xmm2 ; save 2 |
+ |
+ movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 |
+ punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 |
+ |
+ movdqa [rdx+48], xmm3 ; save 3 |
+ |
+ punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 |
+ |
+ movdqa [rdx+64], xmm4 ; save 4 |
+ movdqa [rdx+80], xmm5 ; save 5 |
+ movdqa xmm1, t0 ; get rows 0-1 saved in t0
+ |
+ movdqa xmm2, xmm1 |
+ punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 |
+ |
+ punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 |
+ |
+ movdqa [rdx+16], xmm1 |
+ |
+ movdqa [rdx], xmm2 |
+%endif |
+%endmacro |
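+; TRANSPOSE_16X8 turns 16 rows of 8 pixels into 8 rows of 16, one row per
+; position p3..q3.  With %2 == 1 the middle rows p1,p0,q0,q1 are stored at
+; srct+0/16/32/48 while p3,p2 remain in xmm2,xmm1 and q2,q3 in xmm6,xmm7;
+; with %2 == 0 all eight rows are stored at [rdx] .. [rdx+112].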
+ |
+%macro LFV_FILTER_MASK_HEV_MASK 1 |
+ movdqa xmm0, xmm6 ; q2 |
+ psubusb xmm0, xmm7 ; q2-q3 |
+ |
+ psubusb xmm7, xmm6 ; q3-q2 |
+ movdqa xmm4, xmm5 ; q1 |
+ |
+ por xmm7, xmm0 ; abs (q3-q2) |
+ psubusb xmm4, xmm6 ; q1-q2 |
+ |
+ movdqa xmm0, xmm1 |
+ psubusb xmm6, xmm5 ; q2-q1 |
+ |
+ por xmm6, xmm4 ; abs (q2-q1) |
+ psubusb xmm0, xmm2 ; p2 - p3; |
+ |
+ psubusb xmm2, xmm1 ; p3 - p2; |
+ por xmm0, xmm2 ; abs(p2-p3) |
+%if %1 |
+ movdqa xmm2, [rdx] ; p1 |
+%else |
+ movdqa xmm2, [rdx+32] ; p1 |
+%endif |
+ movdqa xmm5, xmm2 ; p1 |
+ pmaxub xmm0, xmm7 |
+ |
+ psubusb xmm5, xmm1 ; p1-p2 |
+ psubusb xmm1, xmm2 ; p2-p1 |
+ |
+ movdqa xmm7, xmm3 ; p0 |
+ psubusb xmm7, xmm2 ; p0-p1 |
+ |
+ por xmm1, xmm5 ; abs(p2-p1) |
+ pmaxub xmm0, xmm6 |
+ |
+ pmaxub xmm0, xmm1 |
+ movdqa xmm1, xmm2 ; p1 |
+ |
+ psubusb xmm2, xmm3 ; p1-p0 |
+ lea rdx, srct |
+ |
+ por xmm2, xmm7 ; abs(p1-p0) |
+ |
+ movdqa t0, xmm2 ; save abs(p1-p0) |
+ |
+ pmaxub xmm0, xmm2 |
+ |
+%if %1 |
+ movdqa xmm5, [rdx+32] ; q0 |
+ movdqa xmm7, [rdx+48] ; q1 |
+%else |
+ movdqa xmm5, [rdx+64] ; q0 |
+ movdqa xmm7, [rdx+80] ; q1 |
+%endif |
+ mov rdx, arg(3) ; limit |
+ |
+ movdqa xmm6, xmm5 ; q0 |
+ movdqa xmm2, xmm7 ; q1 |
+ |
+ psubusb xmm5, xmm7 ; q0-q1 |
+ psubusb xmm7, xmm6 ; q1-q0 |
+ |
+ por xmm7, xmm5 ; abs(q1-q0) |
+ |
+ movdqa t1, xmm7 ; save abs(q1-q0) |
+ |
+ movdqa xmm4, XMMWORD PTR [rdx]; limit |
+ |
+ pmaxub xmm0, xmm7 |
+ mov rdx, arg(2) ; blimit |
+ |
+ psubusb xmm0, xmm4 ; mask > limit
+ movdqa xmm5, xmm2 ; q1 |
+ |
+ psubusb xmm5, xmm1 ; q1-=p1 |
+ psubusb xmm1, xmm2 ; p1-=q1 |
+ |
+ por xmm5, xmm1 ; abs(p1-q1) |
+ movdqa xmm1, xmm3 ; p0 |
+ |
+ pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero |
+ psubusb xmm1, xmm6 ; p0-q0 |
+ |
+ psrlw xmm5, 1 ; abs(p1-q1)/2 |
+ psubusb xmm6, xmm3 ; q0-p0 |
+ |
+ movdqa xmm4, XMMWORD PTR [rdx]; blimit |
+ |
+ mov rdx, arg(4) ; get thresh |
+ |
+ por xmm1, xmm6 ; abs(q0-p0) |
+ |
+ movdqa xmm6, t0 ; get abs(p1 - p0)
+ |
+ paddusb xmm1, xmm1 ; abs(q0-p0)*2 |
+ |
+ movdqa xmm3, t1 ; get abs(q1 - q0)
+ |
+ movdqa xmm7, XMMWORD PTR [rdx] ; thresh
+ |
+ paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 |
+ psubusb xmm6, xmm7 ; abs(p1 - p0) > thresh
+ |
+ psubusb xmm3, xmm7 ; abs(q1 - q0) > thresh
+ |
+ psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit |
+ por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh |
+ |
+ por xmm1, xmm0 ; mask
+ pxor xmm0, xmm0
+
+ pcmpeqb xmm6, xmm0 ; hev (0xff where not hev)
+ pcmpeqb xmm4, xmm4 ; hev (all ones)
+
+ pcmpeqb xmm1, xmm0 ; mask
+ pxor xmm4, xmm6 ; hev
+%endmacro |
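+; as with LFH_FILTER_AND_HEV_MASK, xmm1 is left holding the filter mask and
+; xmm4 the high edge variance mask, here computed from the transposed rows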
+ |
+%macro BV_TRANSPOSE 0 |
+ ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 |
+ ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 |
+ ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 |
+ ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 |
+ movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 |
+ punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 |
+ |
+ movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 |
+ punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 |
+ |
+ punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 |
+ |
+ punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 |
+ |
+ movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 |
+ punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 |
+ |
+ punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 |
+ movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 |
+ |
+ punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 |
+ |
+ punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 |
+ ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 |
+ ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 |
+ ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 |
+ ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 |
+%endmacro |
+ |
+%macro BV_WRITEBACK 2 |
+ movd [rsi+2], %1 |
+ psrldq %1, 4 |
+ |
+ movd [rdi+2], %1 |
+ psrldq %1, 4 |
+ |
+ movd [rsi+2*rax+2], %1 |
+ psrldq %1, 4 |
+ |
+ movd [rdi+2*rax+2], %1 |
+ |
+ movd [rsi+4*rax+2], %2 |
+ psrldq %2, 4 |
+ |
+ movd [rdi+4*rax+2], %2 |
+ psrldq %2, 4 |
+ |
+ movd [rsi+2*rcx+2], %2 |
+ psrldq %2, 4 |
+ |
+ movd [rdi+2*rcx+2], %2 |
+%endmacro |
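+; BV_WRITEBACK stores the four filtered middle columns (p1 p0 q0 q1) back as
+; one 4-byte write per row, at byte offset +2 within each 8-pixel-wide row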
+ |
+ |
+;void vp9_loop_filter_vertical_edge_sse2 |
+;( |
+; unsigned char *src_ptr, |
+; int src_pixel_step, |
+; const char *blimit, |
+; const char *limit, |
+; const char *thresh, |
+; int count |
+;) |
+global sym(vp9_loop_filter_vertical_edge_sse2) |
+sym(vp9_loop_filter_vertical_edge_sse2): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 6 |
+ SAVE_XMM 7 |
+ GET_GOT rbx |
+ push rsi |
+ push rdi |
+ ; end prolog |
+ |
+ ALIGN_STACK 16, rax |
+ sub rsp, 96 ; reserve 96 bytes |
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; |
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; |
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; |
+ |
+ mov rsi, arg(0) ; src_ptr |
+ movsxd rax, dword ptr arg(1) ; src_pixel_step |
+ |
+ lea rsi, [rsi - 4] |
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing |
+ lea rcx, [rax*2+rax] |
+ |
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack. |
+ TRANSPOSE_16X8 1, 1 |
+ |
+ ; calculate filter mask and high edge variance |
+ LFV_FILTER_MASK_HEV_MASK 1 |
+ |
+ ; start work on filters |
+ B_FILTER 2 |
+ |
+ ; transpose and write back - only works on q1, q0, p0, p1
+ BV_TRANSPOSE |
+ ; store 16-line result |
+ |
+ lea rdx, [rax] |
+ neg rdx |
+ |
+ BV_WRITEBACK xmm1, xmm5 |
+ |
+ lea rsi, [rsi+rdx*8] |
+ lea rdi, [rdi+rdx*8] |
+ BV_WRITEBACK xmm2, xmm6 |
+ |
+ add rsp, 96 |
+ pop rsp |
+ ; begin epilog |
+ pop rdi |
+ pop rsi |
+ RESTORE_GOT |
+ RESTORE_XMM |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+ |
+;void vp9_loop_filter_vertical_edge_uv_sse2 |
+;( |
+; unsigned char *u, |
+; int src_pixel_step, |
+; const char *blimit, |
+; const char *limit, |
+; const char *thresh, |
+; unsigned char *v |
+;) |
+global sym(vp9_loop_filter_vertical_edge_uv_sse2) |
+sym(vp9_loop_filter_vertical_edge_uv_sse2): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 6 |
+ SAVE_XMM 7 |
+ GET_GOT rbx |
+ push rsi |
+ push rdi |
+ ; end prolog |
+ |
+ ALIGN_STACK 16, rax |
+ sub rsp, 96 ; reserve 96 bytes |
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; |
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; |
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; |
+ |
+ mov rsi, arg(0) ; u_ptr |
+ movsxd rax, dword ptr arg(1) ; src_pixel_step |
+ |
+ lea rsi, [rsi - 4] |
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing |
+ lea rcx, [rax+2*rax] |
+ |
+ lea rdx, srct |
+ |
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack. |
+ TRANSPOSE_16X8 0, 1 |
+ |
+ ; calculate filter mask and high edge variance |
+ LFV_FILTER_MASK_HEV_MASK 1 |
+ |
+ ; start work on filters |
+ B_FILTER 2 |
+ |
+ ; transpose and write back - only works on q1, q0, p0, p1
+ BV_TRANSPOSE |
+ |
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing |
+ |
+ ; store 16-line result |
+ BV_WRITEBACK xmm1, xmm5 |
+ |
+ mov rsi, arg(0) ; u_ptr |
+ lea rsi, [rsi - 4] |
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing |
+ BV_WRITEBACK xmm2, xmm6 |
+ |
+ add rsp, 96 |
+ pop rsp |
+ ; begin epilog |
+ pop rdi |
+ pop rsi |
+ RESTORE_GOT |
+ RESTORE_XMM |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+;void vp9_loop_filter_simple_horizontal_edge_sse2 |
+;( |
+; unsigned char *src_ptr, |
+; int src_pixel_step, |
+; const char *blimit
+;) |
+global sym(vp9_loop_filter_simple_horizontal_edge_sse2) |
+sym(vp9_loop_filter_simple_horizontal_edge_sse2): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 3 |
+ SAVE_XMM 7 |
+ GET_GOT rbx |
+ push rsi |
+ push rdi |
+ ; end prolog |
+ |
+ mov rsi, arg(0) ;src_ptr |
+ movsxd rax, dword ptr arg(1) ;src_pixel_step
+ mov rdx, arg(2) ;blimit |
+ movdqa xmm3, XMMWORD PTR [rdx] |
+ |
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing |
+ add rdi, rax |
+ neg rax |
+ |
+ ; calculate mask |
+ movdqa xmm1, [rsi+2*rax] ; p1 |
+ movdqa xmm0, [rdi] ; q1 |
+ movdqa xmm2, xmm1 |
+ movdqa xmm7, xmm0 |
+ movdqa xmm4, xmm0 |
+ psubusb xmm0, xmm1 ; q1-=p1 |
+ psubusb xmm1, xmm4 ; p1-=q1 |
+ por xmm1, xmm0 ; abs(p1-q1) |
+ pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero |
+ psrlw xmm1, 1 ; abs(p1-q1)/2 |
+ |
+ movdqa xmm5, [rsi+rax] ; p0 |
+ movdqa xmm4, [rsi] ; q0 |
+ movdqa xmm0, xmm4 ; q0 |
+ movdqa xmm6, xmm5 ; p0 |
+ psubusb xmm5, xmm4 ; p0-=q0 |
+ psubusb xmm4, xmm6 ; q0-=p0 |
+ por xmm5, xmm4 ; abs(p0 - q0) |
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2 |
+ paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 |
+ |
+ psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit |
+ pxor xmm3, xmm3 |
+ pcmpeqb xmm5, xmm3 |
+ |
+ ; start work on filters |
+ pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values |
+ pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values |
+ psubsb xmm2, xmm7 ; p1 - q1 |
+ |
+ pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values |
+ pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values |
+ movdqa xmm3, xmm0 ; q0 |
+ psubsb xmm0, xmm6 ; q0 - p0 |
+ paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0) |
+ paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0) |
+ paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0) |
+ pand xmm5, xmm2 ; mask filter values we don't care about |
+ |
+ ; do + 4 side |
+ paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 |
+ |
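+    ; emulate a per-byte arithmetic >> 3 in place: the low byte of each word
+    ; goes through psllw 8 / psraw 3 / psrlw 8, the high byte through
+    ; psraw 11 / psllw 8, and the two halves are or-ed back together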
+ movdqa xmm0, xmm5 ; get a copy of filters |
+ psllw xmm0, 8 ; shift left 8 |
+ psraw xmm0, 3 ; arithmetic shift right 3
+ psrlw xmm0, 8 |
+ movdqa xmm1, xmm5 ; get a copy of filters |
+ psraw xmm1, 11 ; arithmetic shift right 11 |
+ psllw xmm1, 8 ; shift left 8 to put it back |
+ |
+ por xmm0, xmm1 ; put the two together to get result |
+ |
+ psubsb xmm3, xmm0 ; q0-= q0 add |
+ pxor xmm3, [GLOBAL(t80)] ; unoffset |
+ movdqa [rsi], xmm3 ; write back |
+ |
+ ; now do +3 side |
+ psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 |
+ |
+ movdqa xmm0, xmm5 ; get a copy of filters |
+ psllw xmm0, 8 ; shift left 8 |
+ psraw xmm0, 3 ; arithmetic shift right 3
+ psrlw xmm0, 8 |
+ psraw xmm5, 11 ; arithmetic shift right 11 |
+ psllw xmm5, 8 ; shift left 8 to put it back |
+ por xmm0, xmm5 ; put the two together to get result |
+ |
+ |
+ paddsb xmm6, xmm0 ; p0+= p0 add |
+ pxor xmm6, [GLOBAL(t80)] ; unoffset |
+ movdqa [rsi+rax], xmm6 ; write back |
+ |
+ ; begin epilog |
+ pop rdi |
+ pop rsi |
+ RESTORE_GOT |
+ RESTORE_XMM |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+ |
+;void vp9_loop_filter_simple_vertical_edge_sse2 |
+;( |
+; unsigned char *src_ptr, |
+; int src_pixel_step, |
+; const char *blimit
+;) |
+global sym(vp9_loop_filter_simple_vertical_edge_sse2) |
+sym(vp9_loop_filter_simple_vertical_edge_sse2): |
+ push rbp ; save old base pointer value. |
+ mov rbp, rsp ; set new base pointer value. |
+ SHADOW_ARGS_TO_STACK 3 |
+ SAVE_XMM 7 |
+ GET_GOT rbx ; save callee-saved reg |
+ push rsi |
+ push rdi |
+ ; end prolog |
+ |
+ ALIGN_STACK 16, rax |
+ sub rsp, 32 ; reserve 32 bytes |
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; |
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; |
+ |
+ mov rsi, arg(0) ;src_ptr |
+ movsxd rax, dword ptr arg(1) ;src_pixel_step
+ |
+ lea rsi, [rsi - 2 ] |
+ lea rdi, [rsi + rax] |
+ lea rdx, [rsi + rax*4] |
+ lea rcx, [rdx + rax] |
+ |
+ movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 |
+ movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 |
+ movd xmm2, [rdi] ; 13 12 11 10 |
+ movd xmm3, [rcx] ; 53 52 51 50 |
+ punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00 |
+ punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10 |
+ |
+ movd xmm4, [rsi + rax*2] ; 23 22 21 20 |
+ movd xmm5, [rdx + rax*2] ; 63 62 61 60 |
+ movd xmm6, [rdi + rax*2] ; 33 32 31 30 |
+ movd xmm7, [rcx + rax*2] ; 73 72 71 70 |
+ punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20 |
+ punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30 |
+ |
+ punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 |
+ punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 |
+ |
+ movdqa xmm1, xmm0 |
+ punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 |
+ punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 |
+ |
+ movdqa xmm2, xmm0 |
+ punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 |
+ punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 |
+ |
+ movdqa t0, xmm0 ; save to t0 |
+ movdqa t1, xmm2 ; save to t1 |
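+    ; rows 0-7 are now transposed to doubleword granularity; park them in
+    ; t0/t1 while rows 8-15 get the same treatment, then merge the halves
+    ; with punpcklqdq/punpckhqdq below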
+ |
+ lea rsi, [rsi + rax*8] |
+ lea rdi, [rsi + rax] |
+ lea rdx, [rsi + rax*4] |
+ lea rcx, [rdx + rax] |
+ |
+ movd xmm4, [rsi] ; 83 82 81 80 |
+ movd xmm1, [rdx] ; c3 c2 c1 c0 |
+ movd xmm6, [rdi] ; 93 92 91 90 |
+ movd xmm3, [rcx] ; d3 d2 d1 d0 |
+ punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 |
+ punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 |
+ |
+ movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0 |
+ movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0 |
+ movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0 |
+ movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0 |
+ punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 |
+ punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 |
+ |
+ punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80 |
+ punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 |
+ |
+ movdqa xmm1, xmm4 |
+ punpcklwd xmm4, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 |
+ punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 |
+ |
+ movdqa xmm6, xmm4 |
+ punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 |
+ punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 |
+ |
+ movdqa xmm0, t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 |
+ movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 |
+ movdqa xmm1, xmm0 |
+ movdqa xmm3, xmm2 |
+ |
+ punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 |
+ punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 |
+ punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 |
+ punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 |
+ |
+ ; calculate mask |
+ movdqa xmm6, xmm0 ; p1 |
+ movdqa xmm7, xmm3 ; q1 |
+ psubusb xmm7, xmm0 ; q1-=p1 |
+ psubusb xmm6, xmm3 ; p1-=q1 |
+ por xmm6, xmm7 ; abs(p1-q1) |
+ pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero |
+ psrlw xmm6, 1 ; abs(p1-q1)/2 |
+ |
+ movdqa xmm5, xmm1 ; p0 |
+ movdqa xmm4, xmm2 ; q0 |
+ psubusb xmm5, xmm2 ; p0-=q0 |
+ psubusb xmm4, xmm1 ; q0-=p0 |
+ por xmm5, xmm4 ; abs(p0 - q0) |
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2 |
+ paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 |
+ |
+ mov rdx, arg(2) ;blimit |
+ movdqa xmm7, XMMWORD PTR [rdx] |
+ |
+ psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit |
+ pxor xmm7, xmm7 |
+ pcmpeqb xmm5, xmm7 ; xmm5 = mask
+ |
+ ; start work on filters |
+ movdqa t0, xmm0 |
+ movdqa t1, xmm3 |
+ |
+ pxor xmm0, [GLOBAL(t80)] ; p1 offset to convert to signed values |
+ pxor xmm3, [GLOBAL(t80)] ; q1 offset to convert to signed values |
+ |
+ psubsb xmm0, xmm3 ; p1 - q1 |
+ movdqa xmm6, xmm1 ; p0 |
+ |
+ movdqa xmm7, xmm2 ; q0 |
+ pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values |
+ |
+ pxor xmm7, [GLOBAL(t80)] ; offset to convert to signed values |
+ movdqa xmm3, xmm7 ; save offset q0 (signed)
+ |
+ psubsb xmm7, xmm6 ; q0 - p0 |
+ paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0) |
+ |
+ paddsb xmm0, xmm7 ; p1 - q1 + 2 * (q0 - p0) |
+ paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0) |
+ |
+ pand xmm5, xmm0 ; mask filter values we don't care about |
+ |
+ |
+ paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 |
+ |
+ movdqa xmm0, xmm5 ; get a copy of filters |
+ psllw xmm0, 8 ; shift left 8 |
+ |
+ psraw xmm0, 3 ; arithmetic shift right 3
+ psrlw xmm0, 8 |
+ |
+ movdqa xmm7, xmm5 ; get a copy of filters |
+ psraw xmm7, 11 ; arithmetic shift right 11 |
+ |
+ psllw xmm7, 8 ; shift left 8 to put it back |
+ por xmm0, xmm7 ; put the two together to get result |
+ |
+ psubsb xmm3, xmm0 ; q0-= q0 add
+ pxor xmm3, [GLOBAL(t80)] ; unoffset q0 |
+ |
+ ; now do +3 side |
+ psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 |
+ movdqa xmm0, xmm5 ; get a copy of filters |
+ |
+ psllw xmm0, 8 ; shift left 8 |
+ psraw xmm0, 3 ; arithmetic shift right 3
+ |
+ psrlw xmm0, 8 |
+ psraw xmm5, 11 ; arithmetic shift right 11 |
+ |
+ psllw xmm5, 8 ; shift left 8 to put it back |
+ por xmm0, xmm5 ; put the two together to get result |
+ |
+ paddsb xmm6, xmm0 ; p0+= p0 add |
+ pxor xmm6, [GLOBAL(t80)] ; unoffset p0 |
+ |
+ movdqa xmm0, t0 ; p1 |
+ movdqa xmm4, t1 ; q1 |
+ |
+ ; transpose back to write out |
+ ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 |
+ ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 |
+ ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 |
+ ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 |
+ movdqa xmm1, xmm0 |
+ punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 |
+ punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 |
+ |
+ movdqa xmm5, xmm3 |
+ punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 |
+ punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 |
+ |
+ movdqa xmm2, xmm0 |
+ punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 |
+ punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 |
+ |
+ movdqa xmm3, xmm1 |
+ punpcklwd xmm1, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 |
+ punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 |
+ |
+ ; write out order: xmm0 xmm2 xmm1 xmm3 |
+ lea rdx, [rsi + rax*4] |
+ |
+ movd [rsi], xmm1 ; write the second 8-line result |
+ psrldq xmm1, 4 |
+ movd [rdi], xmm1 |
+ psrldq xmm1, 4 |
+ movd [rsi + rax*2], xmm1 |
+ psrldq xmm1, 4 |
+ movd [rdi + rax*2], xmm1 |
+ |
+ movd [rdx], xmm3 |
+ psrldq xmm3, 4 |
+ movd [rcx], xmm3 |
+ psrldq xmm3, 4 |
+ movd [rdx + rax*2], xmm3 |
+ psrldq xmm3, 4 |
+ movd [rcx + rax*2], xmm3 |
+ |
+ neg rax |
+ lea rsi, [rsi + rax*8] |
+ neg rax |
+ lea rdi, [rsi + rax] |
+ lea rdx, [rsi + rax*4] |
+ lea rcx, [rdx + rax] |
+ |
+ movd [rsi], xmm0 ; write the first 8-line result |
+ psrldq xmm0, 4 |
+ movd [rdi], xmm0 |
+ psrldq xmm0, 4 |
+ movd [rsi + rax*2], xmm0 |
+ psrldq xmm0, 4 |
+ movd [rdi + rax*2], xmm0 |
+ |
+ movd [rdx], xmm2 |
+ psrldq xmm2, 4 |
+ movd [rcx], xmm2 |
+ psrldq xmm2, 4 |
+ movd [rdx + rax*2], xmm2 |
+ psrldq xmm2, 4 |
+ movd [rcx + rax*2], xmm2 |
+ |
+ add rsp, 32 |
+ pop rsp |
+ ; begin epilog |
+ pop rdi |
+ pop rsi |
+ RESTORE_GOT |
+ RESTORE_XMM |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+SECTION_RODATA |
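+; tfe clears each byte's lsb so the word-wide psrlw 1 acts as a per-byte
+; divide by two, t80 toggles pixels between unsigned and sign-biased form,
+; t3/t4/t1s are the filter rounding constants, and ones is the +1 used when
+; rounding the outer tap adjustment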
+align 16 |
+tfe: |
+ times 16 db 0xfe |
+align 16 |
+t80: |
+ times 16 db 0x80 |
+align 16 |
+t1s: |
+ times 16 db 0x01 |
+align 16 |
+t3: |
+ times 16 db 0x03 |
+align 16 |
+t4: |
+ times 16 db 0x04 |
+align 16 |
+ones: |
+ times 8 dw 0x0001 |
+align 16 |
+s9: |
+ times 8 dw 0x0900 |
+align 16 |
+s63: |
+ times 8 dw 0x003f |