Index: source/libvpx/vp8/common/x86/loopfilter_sse2.asm
===================================================================
--- source/libvpx/vp8/common/x86/loopfilter_sse2.asm (revision 60257)
+++ source/libvpx/vp8/common/x86/loopfilter_sse2.asm (working copy)
@@ -1,5 +1,5 @@
 ;
-; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ; Use of this source code is governed by a BSD-style license
 ; that can be found in the LICENSE file in the root of the source
@@ -11,6 +11,8 @@
 %include "vpx_ports/x86_abi_support.asm"
+; Use of pmaxub instead of psubusb to compute filter mask was seen
+; in ffvp8
 %macro LFH_FILTER_MASK 1
 %if %1
@@ -33,8 +35,6 @@
         psubusb     xmm2, xmm6              ; q3-=q2
         por         xmm1, xmm2              ; abs(q3-q2)
-        psubusb     xmm1, xmm7
-
 %if %1
         movdqa      xmm4, [rsi+rax]         ; q1
 %else
@@ -49,10 +49,8 @@
         psubusb     xmm4, xmm6              ; q1-=q2
         psubusb     xmm6, xmm3              ; q2-=q1
         por         xmm4, xmm6              ; abs(q2-q1)
-        psubusb     xmm4, xmm7
+        pmaxub      xmm1, xmm4
-        por         xmm1, xmm4
-
 %if %1
         movdqa      xmm4, [rsi]             ; q0
 %else
@@ -67,10 +65,8 @@
         psubusb     xmm3, xmm0              ; q1-=q0
         por         xmm4, xmm3              ; abs(q0-q1)
         movdqa      t0, xmm4                ; save to t0
+        pmaxub      xmm1, xmm4
-        psubusb     xmm4, xmm7
-        por         xmm1, xmm4
-
 %if %1
         neg         rax                     ; negate pitch to deal with above border
@@ -95,10 +91,8 @@
         psubusb     xmm4, xmm2              ; p2-=p3
         psubusb     xmm2, xmm5              ; p3-=p2
         por         xmm4, xmm2              ; abs(p3 - p2)
+        pmaxub      xmm1, xmm4
-        psubusb     xmm4, xmm7
-        por         xmm1, xmm4
-
 %if %1
         movdqa      xmm4, [rsi+2*rax]       ; p1
 %else
@@ -113,9 +107,8 @@
         psubusb     xmm4, xmm5              ; p1-=p2
         psubusb     xmm5, xmm3              ; p2-=p1
         por         xmm4, xmm5              ; abs(p2 - p1)
-        psubusb     xmm4, xmm7
+        pmaxub      xmm1, xmm4
-        por         xmm1, xmm4
         movdqa      xmm2, xmm3              ; p1
 %if %1
@@ -133,8 +126,8 @@
         por         xmm4, xmm3              ; abs(p1 - p0)
         movdqa      t1, xmm4                ; save to t1
-        psubusb     xmm4, xmm7
-        por         xmm1, xmm4
+        pmaxub      xmm1, xmm4
+        psubusb     xmm1, xmm7
 %if %1
         movdqa      xmm3, [rdi]             ; q1
@@ -196,12 +189,12 @@
         pxor        xmm7, [t80 GLOBAL]      ; q1 offset to convert to signed values
         psubsb      xmm2, xmm7              ; p1 - q1
-        pand        xmm2, xmm4              ; high var mask (hvm)(p1 - q1)
         pxor        xmm6, [t80 GLOBAL]      ; offset to convert to signed values
+        pand        xmm2, xmm4              ; high var mask (hvm)(p1 - q1)
         pxor        xmm0, [t80 GLOBAL]      ; offset to convert to signed values
-        movdqa      xmm3, xmm0              ; q0
+        movdqa      xmm3, xmm0              ; q0
         psubsb      xmm0, xmm6              ; q0 - p0
         paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
         paddsb      xmm2, xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
@@ -211,29 +204,28 @@
         paddsb      xmm1, [t4 GLOBAL]       ; 3* (q0 - p0) + hvm(p1 - q1) + 4
         paddsb      xmm2, [t3 GLOBAL]       ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-        pxor        xmm0, xmm0
-        pxor        xmm5, xmm5
-        punpcklbw   xmm0, xmm2
-        punpckhbw   xmm5, xmm2
-        psraw       xmm0, 11
-        psraw       xmm5, 11
-        packsswb    xmm0, xmm5
-        movdqa      xmm2, xmm0              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+        punpckhbw   xmm5, xmm2              ; axbxcxdx
+        punpcklbw   xmm2, xmm2              ; exfxgxhx
-        pxor        xmm0, xmm0              ; 0
-        movdqa      xmm5, xmm1              ; abcdefgh
-        punpcklbw   xmm0, xmm1              ; e0f0g0h0
+        psraw       xmm5, 11                ; sign extended shift right by 3
+        psraw       xmm2, 11                ; sign extended shift right by 3
+        packsswb    xmm2, xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+        punpcklbw   xmm0, xmm1              ; exfxgxhx
+        punpckhbw   xmm1, xmm1              ; axbxcxdx
+
         psraw       xmm0, 11                ; sign extended shift right by 3
-        pxor        xmm1, xmm1              ; 0
-        punpckhbw   xmm1, xmm5              ; a0b0c0d0
         psraw       xmm1, 11                ; sign extended shift right by 3
-        movdqa      xmm5, xmm0              ; save results
+        movdqa      xmm5, xmm0              ; save results
         packsswb    xmm0, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+
         paddsw      xmm5, [ones GLOBAL]
         paddsw      xmm1, [ones GLOBAL]
+
         psraw       xmm5, 1                 ; partial shifted one more time for 2nd tap
         psraw       xmm1, 1                 ; partial shifted one more time for 2nd tap
+
         packsswb    xmm5, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
         pandn       xmm4, xmm5              ; high edge variance additive
 %endmacro
@@ -433,29 +425,27 @@
         pand        xmm2, xmm4;             ; Filter2 = vp8_filter & hev
         movdqa      xmm5, xmm2
-        paddsb      xmm5, [t3 GLOBAL]
+        paddsb      xmm5, [t3 GLOBAL]       ; vp8_signed_char_clamp(Filter2 + 3)
-        pxor        xmm0, xmm0              ; 0
-        pxor        xmm7, xmm7              ; 0
-        punpcklbw   xmm0, xmm5              ; e0f0g0h0
-        psraw       xmm0, 11                ; sign extended shift right by 3
-        punpckhbw   xmm7, xmm5              ; a0b0c0d0
+        punpckhbw   xmm7, xmm5              ; axbxcxdx
+        punpcklbw   xmm5, xmm5              ; exfxgxhx
+
         psraw       xmm7, 11                ; sign extended shift right by 3
-        packsswb    xmm0, xmm7              ; Filter2 >>=3;
-        movdqa      xmm5, xmm0              ; Filter2
-        paddsb      xmm2, [t4 GLOBAL]       ; vp8_signed_char_clamp(Filter2 + 4)
+        psraw       xmm5, 11                ; sign extended shift right by 3
-        pxor        xmm0, xmm0              ; 0
-        pxor        xmm7, xmm7              ; 0
-        punpcklbw   xmm0, xmm2              ; e0f0g0h0
-        psraw       xmm0, 11                ; sign extended shift right by 3
-        punpckhbw   xmm7, xmm2              ; a0b0c0d0
+        packsswb    xmm5, xmm7              ; Filter2 >>=3;
+        paddsb      xmm2, [t4 GLOBAL]       ; vp8_signed_char_clamp(Filter2 + 4)
+
+        punpckhbw   xmm7, xmm2              ; axbxcxdx
+        punpcklbw   xmm0, xmm2              ; exfxgxhx
+
         psraw       xmm7, 11                ; sign extended shift right by 3
-        packsswb    xmm0, xmm7              ; Filter2 >>=3;
+        psraw       xmm0, 11                ; sign extended shift right by 3
-        psubsb      xmm3, xmm0              ; qs0 =qs0 - filter1
+        packsswb    xmm0, xmm7              ; Filter2 >>=3;
         paddsb      xmm6, xmm5              ; ps0 =ps0 + Fitler2
+        psubsb      xmm3, xmm0              ; qs0 =qs0 - filter1
         pandn       xmm4, xmm1              ; vp8_filter&=~hev
 %endmacro
@@ -465,7 +455,6 @@
         ; *oq0 = s^0x80;
         ; s = vp8_signed_char_clamp(ps0 + u);
         ; *op0 = s^0x80;
-        pxor        xmm0, xmm0
         pxor        xmm1, xmm1
         pxor        xmm2, xmm2
@@ -737,30 +726,31 @@
 %macro TRANSPOSE_16X8_1 0
-        movq        xmm0, QWORD PTR [rdi+rcx*2]  ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
-        movq        xmm7, QWORD PTR [rsi+rcx*2]  ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+        movq        xmm4, QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
+        movq        xmm7, QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
-        punpcklbw   xmm7, xmm0                   ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
-        movq        xmm0, QWORD PTR [rsi+rcx]
+        punpcklbw   xmm4, xmm7                   ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+        movq        xmm0, QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
-        movq        xmm5, QWORD PTR [rsi]        ;
-        punpcklbw   xmm5, xmm0                   ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+        movdqa      xmm3, xmm4                   ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
-        movdqa      xmm6, xmm5                   ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
-        punpcklwd   xmm5, xmm7                   ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+        movq        xmm7, QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+        punpcklbw   xmm0, xmm7                   ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
-        punpckhwd   xmm6, xmm7                   ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
-        movq        xmm7, QWORD PTR [rsi + rax]  ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+        movq        xmm5, QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
+        movq        xmm2, QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
-        movq        xmm0, QWORD PTR [rsi + rax*2] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
-        punpcklbw   xmm0, xmm7                   ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20
+        punpcklbw   xmm5, xmm2                   ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+        movq        xmm7, QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
-        movq        xmm4, QWORD PTR [rsi + rax*4] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
-        movq        xmm7, QWORD PTR [rdi + rax*4] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
+        movq        xmm1, QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
+        movdqa      xmm6, xmm5                   ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
-        punpcklbw   xmm4, xmm7                   ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
-        movdqa      xmm3, xmm4                   ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+        punpcklbw   xmm7, xmm1                   ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
+        punpcklwd   xmm5, xmm7                   ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+        punpckhwd   xmm6, xmm7                   ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
+
         punpcklwd   xmm3, xmm0                   ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
         punpckhwd   xmm4, xmm0                   ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
@@ -777,28 +767,28 @@
 %endmacro
 %macro TRANSPOSE_16X8_2 1
-        movq        xmm6, QWORD PTR [rdi+rcx*2]  ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
-        movq        xmm5, QWORD PTR [rsi+rcx*2]  ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+        movq        xmm2, QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
+        movq        xmm5, QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
-        punpcklbw   xmm5, xmm6                   ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
-        movq        xmm6, QWORD PTR [rsi+rcx]    ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+        punpcklbw   xmm2, xmm5                   ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+        movq        xmm0, QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
-        movq        xmm1, QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
+        movq        xmm5, QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
+        punpcklbw   xmm0, xmm5                   ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
+
+        movq        xmm1, QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
+        movq        xmm6, QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+
         punpcklbw   xmm1, xmm6                   ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0
+        movq        xmm5, QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+        movq        xmm6, QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
+        punpcklbw   xmm5, xmm6                   ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
+
         movdqa      xmm6, xmm1                   ;
         punpckhwd   xmm6, xmm5                   ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
         punpcklwd   xmm1, xmm5                   ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
-        movq        xmm5, QWORD PTR [rsi+rax]    ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
-
-        movq        xmm0, QWORD PTR [rsi+rax*2]  ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
-        punpcklbw   xmm0, xmm5                   ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
-
-        movq        xmm2, QWORD PTR [rsi+rax*4]  ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
-        movq        xmm5, QWORD PTR [rdi+rax*4]  ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
-
-        punpcklbw   xmm2, xmm5                   ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
         movdqa      xmm5, xmm2                   ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
         punpcklwd   xmm5, xmm0                   ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
@@ -875,19 +865,18 @@
         psubusb     xmm0, xmm7              ; q2-q3
         psubusb     xmm7, xmm6              ; q3-q2
-        por         xmm7, xmm0              ; abs (q3-q2)
-
         movdqa      xmm4, xmm5              ; q1
+
+        por         xmm7, xmm0              ; abs (q3-q2)
         psubusb     xmm4, xmm6              ; q1-q2
+        movdqa      xmm0, xmm1
         psubusb     xmm6, xmm5              ; q2-q1
-        por         xmm6, xmm4              ; abs (q2-q1)
-        movdqa      xmm0, xmm1
-
+        por         xmm6, xmm4              ; abs (q2-q1)
         psubusb     xmm0, xmm2              ; p2 - p3;
-        psubusb     xmm2, xmm1              ; p3 - p2;
+        psubusb     xmm2, xmm1              ; p3 - p2;
         por         xmm0, xmm2              ; abs(p2-p3)
 %if %1
         movdqa      xmm2, [rdx]             ; p1
@@ -895,39 +884,28 @@
         movdqa      xmm2, [rdx+32]          ; p1
 %endif
         movdqa      xmm5, xmm2              ; p1
+        pmaxub      xmm0, xmm7
         psubusb     xmm5, xmm1              ; p1-p2
         psubusb     xmm1, xmm2              ; p2-p1
+        movdqa      xmm7, xmm3              ; p0
+        psubusb     xmm7, xmm2              ; p0-p1
+
         por         xmm1, xmm5              ; abs(p2-p1)
+        pmaxub      xmm0, xmm6
-        mov         rdx, arg(3)             ; limit
-        movdqa      xmm4, [rdx]             ; limit
-
-        psubusb     xmm7, xmm4
-
-        psubusb     xmm0, xmm4              ; abs(p3-p2) > limit
-        psubusb     xmm1, xmm4              ; abs(p2-p1) > limit
-
-        psubusb     xmm6, xmm4              ; abs(q2-q1) > limit
-        por         xmm7, xmm6              ; or
-
-        por         xmm0, xmm1
-        por         xmm0, xmm7              ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
-
+        pmaxub      xmm0, xmm1
         movdqa      xmm1, xmm2              ; p1
-        movdqa      xmm7, xmm3              ; p0
-        psubusb     xmm7, xmm2              ; p0-p1
-
         psubusb     xmm2, xmm3              ; p1-p0
         por         xmm2, xmm7              ; abs(p1-p0)
         movdqa      t0, xmm2                ; save abs(p1-p0)
         lea         rdx, srct
-        psubusb     xmm2, xmm4              ; abs(p1-p0)>limit
-        por         xmm0, xmm2              ; mask
+        pmaxub      xmm0, xmm2
+
 %if %1
         movdqa      xmm5, [rdx+32]          ; q0
         movdqa      xmm7, [rdx+48]          ; q1
@@ -943,10 +921,13 @@
         por         xmm7, xmm5              ; abs(q1-q0)
         movdqa      t1, xmm7                ; save abs(q1-q0)
-        psubusb     xmm7, xmm4              ; abs(q1-q0)> limit
-        por         xmm0, xmm7              ; mask
+        mov         rdx, arg(3)             ; limit
+        movdqa      xmm4, [rdx]             ; limit
+        pmaxub      xmm0, xmm7
+        psubusb     xmm0, xmm4
+
         movdqa      xmm5, xmm2              ; q1
         psubusb     xmm5, xmm1              ; q1-=p1
         psubusb     xmm1, xmm2              ; p1-=q1
@@ -995,7 +976,6 @@
         lea         rdx, srct
         movdqa      xmm2, [rdx]             ; p1        lea rsi, [rsi+rcx*8]
-        lea         rdi, [rsi+rcx]
         movdqa      xmm7, [rdx+48]          ; q1
         movdqa      xmm6, [rdx+16]          ; p0
         movdqa      xmm0, [rdx+32]          ; q0
@@ -1022,28 +1002,19 @@
         paddsb      xmm1, [t4 GLOBAL]       ; 3* (q0 - p0) + hvm(p1 - q1) + 4
         paddsb      xmm2, [t3 GLOBAL]       ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-        pxor        xmm0, xmm0
-        pxor        xmm5, xmm5
-        punpcklbw   xmm0, xmm2
-
         punpckhbw   xmm5, xmm2
-        psraw       xmm0, 11
+        punpcklbw   xmm2, xmm2
         psraw       xmm5, 11
-        packsswb    xmm0, xmm5
+        psraw       xmm2, 11
-        movdqa      xmm2, xmm0              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+        packsswb    xmm2, xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+        punpcklbw   xmm0, xmm1              ; exfxgxhx
-        pxor        xmm0, xmm0              ; 0
-        movdqa      xmm5, xmm1              ; abcdefgh
-
-        punpcklbw   xmm0, xmm1              ; e0f0g0h0
+        punpckhbw   xmm1, xmm1              ; axbxcxdx
         psraw       xmm0, 11                ; sign extended shift right by 3
-        pxor        xmm1, xmm1              ; 0
-        punpckhbw   xmm1, xmm5              ; a0b0c0d0
-
         psraw       xmm1, 11                ; sign extended shift right by 3
         movdqa      xmm5, xmm0              ; save results
@@ -1103,27 +1074,27 @@
 %endmacro
 %macro BV_WRITEBACK 2
-        movd        [rsi+rax*4+2], %1
+        movd        [rsi+2], %1
         psrldq      %1, 4
-        movd        [rdi+rax*4+2], %1
+        movd        [rdi+2], %1
         psrldq      %1, 4
-        movd        [rsi+rax*2+2], %1
+        movd        [rsi+2*rax+2], %1
         psrldq      %1, 4
-        movd        [rdi+rax*2+2], %1
+        movd        [rdi+2*rax+2], %1
-        movd        [rsi+2], %2
+        movd        [rsi+4*rax+2], %2
         psrldq      %2, 4
-        movd        [rdi+2], %2
+        movd        [rdi+4*rax+2], %2
         psrldq      %2, 4
-        movd        [rdi+rcx+2], %2
+        movd        [rsi+2*rcx+2], %2
         psrldq      %2, 4
-        movd        [rdi+rcx*2+2], %2
+        movd        [rdi+2*rcx+2], %2
 %endmacro
@@ -1156,16 +1127,15 @@
         mov         rsi, arg(0)             ; src_ptr
         movsxd      rax, dword ptr arg(1)   ; src_pixel_step
-        lea         rsi, [rsi + rax*4 - 4]
+        lea         rsi, [rsi - 4]
         lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
-        mov         rcx, rax
-        neg         rax
+        lea         rcx, [rax*2+rax]
         ;transpose 16x8 to 8x16, and store the 8-line result on stack.
         TRANSPOSE_16X8_1
-        lea         rsi, [rsi+rcx*8]
-        lea         rdi, [rdi+rcx*8]
+        lea         rsi, [rsi+rax*8]
+        lea         rdi, [rdi+rax*8]
         lea         rdx, srct
         TRANSPOSE_16X8_2 1
@@ -1180,10 +1150,14 @@
         ; tranpose and write back - only work on q1, q0, p0, p1
         BV_TRANSPOSE
         ; store 16-line result
+
+        lea         rdx, [rax]
+        neg         rdx
+
         BV_WRITEBACK xmm1, xmm5
-        lea         rsi, [rsi+rax*8]
-        lea         rdi, [rsi+rcx]
+        lea         rsi, [rsi+rdx*8]
+        lea         rdi, [rdi+rdx*8]
         BV_WRITEBACK xmm2, xmm6
         add         rsp, 96
@@ -1227,17 +1201,16 @@
         mov         rsi, arg(0)             ; u_ptr
         movsxd      rax, dword ptr arg(1)   ; src_pixel_step
-        lea         rsi, [rsi + rax*4 - 4]
+        lea         rsi, [rsi - 4]
         lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
-        mov         rcx, rax
-        neg         rax
+        lea         rcx, [rax+2*rax]
         ;transpose 16x8 to 8x16, and store the 8-line result on stack.
         TRANSPOSE_16X8_1
-        mov         rsi, arg(5)             ; v_ptr
-        lea         rsi, [rsi + rcx*4 - 4]
-        lea         rdi, [rsi + rcx]        ; rdi points to row +1 for indirect addressing
+        mov         rsi, arg(5)             ; v_ptr
+        lea         rsi, [rsi - 4]
+        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
         lea         rdx, srct
         TRANSPOSE_16X8_2 1
@@ -1252,12 +1225,15 @@
         ; tranpose and write back - only work on q1, q0, p0, p1
         BV_TRANSPOSE
+
+        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
+
         ; store 16-line result
         BV_WRITEBACK xmm1, xmm5
-        mov         rsi, arg(0)             ;u_ptr
-        lea         rsi, [rsi + rcx*4 - 4]
-        lea         rdi, [rsi + rcx]
+        mov         rsi, arg(0)             ; u_ptr
+        lea         rsi, [rsi - 4]
+        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
         BV_WRITEBACK xmm2, xmm6
         add         rsp, 96
@@ -1303,28 +1279,22 @@
         movdqa      xmm5, xmm2
         paddsb      xmm5, [t3 GLOBAL]
-        pxor        xmm0, xmm0              ; 0
-        pxor        xmm7, xmm7              ; 0
+        punpckhbw   xmm7, xmm5              ; axbxcxdx
+        punpcklbw   xmm5, xmm5              ; exfxgxhx
-        punpcklbw   xmm0, xmm5              ; e0f0g0h0
-        psraw       xmm0, 11                ; sign extended shift right by 3
-
-        punpckhbw   xmm7, xmm5              ; a0b0c0d0
         psraw       xmm7, 11                ; sign extended shift right by 3
+        psraw       xmm5, 11                ; sign extended shift right by 3
-        packsswb    xmm0, xmm7              ; Filter2 >>=3;
-        movdqa      xmm5, xmm0              ; Filter2
+        packsswb    xmm5, xmm7              ; Filter2 >>=3;
         paddsb      xmm2, [t4 GLOBAL]       ; vp8_signed_char_clamp(Filter2 + 4)
-        pxor        xmm0, xmm0              ; 0
-        pxor        xmm7, xmm7              ; 0
-        punpcklbw   xmm0, xmm2              ; e0f0g0h0
+        punpcklbw   xmm0, xmm2              ; exfxgxhx
+        punpckhbw   xmm7, xmm2              ; axbxcxdx
         psraw       xmm0, 11                ; sign extended shift right by 3
-        punpckhbw   xmm7, xmm2              ; a0b0c0d0
-
         psraw       xmm7, 11                ; sign extended shift right by 3
+
         packsswb    xmm0, xmm7              ; Filter2 >>=3;
         psubsb      xmm3, xmm0              ; qs0 =qs0 - filter1
@@ -1339,7 +1309,6 @@
         ; *oq0 = s^0x80;
         ; s = vp8_signed_char_clamp(ps0 + u);
         ; *op0 = s^0x80;
-        pxor        xmm0, xmm0
         pxor        xmm1, xmm1
         pxor        xmm2, xmm2
@@ -1479,28 +1448,30 @@
 %endmacro
 %macro MBV_WRITEBACK_1 0
-        movq        QWORD PTR [rsi+rax*4], xmm0
+        movq        QWORD PTR [rsi], xmm0
         psrldq      xmm0, 8
-        movq        QWORD PTR [rsi+rax*2], xmm6
+        movq        QWORD PTR [rdi], xmm0
+
+        movq        QWORD PTR [rsi+2*rax], xmm6
         psrldq      xmm6, 8
-        movq        QWORD PTR [rdi+rax*4], xmm0
-        movq        QWORD PTR [rsi+rax], xmm6
+        movq        QWORD PTR [rdi+2*rax], xmm6
         movdqa      xmm0, xmm5              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
         punpckldq   xmm0, xmm7              ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
         punpckhdq   xmm5, xmm7              ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
-        movq        QWORD PTR [rsi], xmm0
+        movq        QWORD PTR [rsi+4*rax], xmm0
         psrldq      xmm0, 8
-        movq        QWORD PTR [rsi+rcx*2], xmm5
+        movq        QWORD PTR [rdi+4*rax], xmm0
+
+        movq        QWORD PTR [rsi+2*rcx], xmm5
         psrldq      xmm5, 8
-        movq        QWORD PTR [rsi+rcx], xmm0
-        movq        QWORD PTR [rdi+rcx*2], xmm5
+        movq        QWORD PTR [rdi+2*rcx], xmm5
         movdqa      xmm2, [rdx+64]          ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
         punpckhbw   xmm2, [rdx+80]          ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
@@ -1518,28 +1489,30 @@
 %endmacro
 %macro MBV_WRITEBACK_2 0
-        movq        QWORD PTR [rsi+rax*4], xmm1
+        movq        QWORD PTR [rsi], xmm1
         psrldq      xmm1, 8
-        movq        QWORD PTR [rsi+rax*2], xmm3
+        movq        QWORD PTR [rdi], xmm1
+
+        movq        QWORD PTR [rsi+2*rax], xmm3
         psrldq      xmm3, 8
-        movq        QWORD PTR [rdi+rax*4], xmm1
-        movq        QWORD PTR [rsi+rax], xmm3
+        movq        QWORD PTR [rdi+2*rax], xmm3
         movdqa      xmm1, xmm4              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
         punpckldq   xmm1, xmm2              ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
         punpckhdq   xmm4, xmm2              ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
-        movq        QWORD PTR [rsi], xmm1
+        movq        QWORD PTR [rsi+4*rax], xmm1
         psrldq      xmm1, 8
-        movq        QWORD PTR [rsi+rcx*2], xmm4
+        movq        QWORD PTR [rdi+4*rax], xmm1
+
+        movq        QWORD PTR [rsi+2*rcx], xmm4
         psrldq      xmm4, 8
-        movq        QWORD PTR [rsi+rcx], xmm1
-        movq        QWORD PTR [rdi+rcx*2], xmm4
+        movq        QWORD PTR [rdi+2*rcx], xmm4
 %endmacro
@@ -1569,20 +1542,19 @@
 %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];
 %define srct [rsp + 32] ;__declspec(align(16)) char srct[128];
-        mov         rsi, arg(0)             ;src_ptr
-        movsxd      rax, dword ptr arg(1)   ;src_pixel_step
+        mov         rsi, arg(0)             ; src_ptr
+        movsxd      rax, dword ptr arg(1)   ; src_pixel_step
-        lea         rsi, [rsi + rax*4 - 4]
-        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
-        mov         rcx, rax
-        neg         rax
+        lea         rsi, [rsi - 4]
+        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
+        lea         rcx, [rax*2+rax]
         ; Transpose
         TRANSPOSE_16X8_1
-        lea         rsi, [rsi+rcx*8]
-        lea         rdi, [rdi+rcx*8]
-        lea         rdx, srct
+        lea         rsi, [rsi+rax*8]
+        lea         rdi, [rdi+rax*8]
+        lea         rdx, srct
         TRANSPOSE_16X8_2 0
         ; calculate filter mask
@@ -1590,18 +1562,22 @@
         ; calculate high edge variance
         LFV_HEV_MASK
+        neg         rax
         ; start work on filters
         MBV_FILTER
+        lea         rsi, [rsi+rax*8]
+        lea         rdi, [rdi+rax*8]
+
         ; transpose and write back
         MBV_TRANSPOSE
-        lea         rsi, [rsi+rax*8]
-        lea         rdi, [rdi+rax*8]
+        neg         rax
+
         MBV_WRITEBACK_1
-        lea         rsi, [rsi+rcx*8]
-        lea         rdi, [rdi+rcx*8]
+        lea         rsi, [rsi+rax*8]
+        lea         rdi, [rdi+rax*8]
         MBV_WRITEBACK_2
         add         rsp, 160
@@ -1642,21 +1618,20 @@
 %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];
 %define srct [rsp + 32] ;__declspec(align(16)) char srct[128];
-        mov         rsi, arg(0)             ;u_ptr
-        movsxd      rax, dword ptr arg(1)   ; src_pixel_step
+        mov         rsi, arg(0)             ; u_ptr
+        movsxd      rax, dword ptr arg(1)   ; src_pixel_step
-        lea         rsi, [rsi + rax*4 - 4]
-        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
-        mov         rcx, rax
-        neg         rax
+        lea         rsi, [rsi - 4]
+        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
+        lea         rcx, [rax+2*rax]
         ; Transpose
         TRANSPOSE_16X8_1
         ; XMM3 XMM4 XMM7 in use
-        mov         rsi, arg(5)             ;v_ptr
-        lea         rsi, [rsi + rcx*4 - 4]
-        lea         rdi, [rsi + rcx]
+        mov         rsi, arg(5)             ; v_ptr
+        lea         rsi, [rsi - 4]
+        lea         rdi, [rsi + rax]
         lea         rdx, srct
         TRANSPOSE_16X8_2 0
@@ -1672,12 +1647,12 @@
         MBV_TRANSPOSE
         mov         rsi, arg(0)             ;u_ptr
-        lea         rsi, [rsi + rcx*4 - 4]
-        lea         rdi, [rsi + rcx]
+        lea         rsi, [rsi - 4]
+        lea         rdi, [rsi + rax]
         MBV_WRITEBACK_1
         mov         rsi, arg(5)             ;v_ptr
-        lea         rsi, [rsi + rcx*4 - 4]
-        lea         rdi, [rsi + rcx]
+        lea         rsi, [rsi - 4]
+        lea         rdi, [rsi + rax]
         MBV_WRITEBACK_2
         add         rsp, 160
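Note on the mask hunks above: the psubusb/por pairs are replaced by pmaxub because max(d0, d1, d2) exceeds the limit exactly when at least one of the individual differences does, so a single saturating subtract of the limit at the end flags the same pixels. A rough C sketch of the two forms, using SSE2 intrinsics as stand-ins for the assembly (the helper names are illustrative only, not part of libvpx):

#include <emmintrin.h>

/* |a - b| per byte via unsigned saturating subtracts (psubusb + por). */
static __m128i abs_diff_u8(__m128i a, __m128i b)
{
    return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}

/* Old scheme: subtract the limit from every difference and OR the
 * leftovers; a nonzero byte means some difference exceeded the limit. */
static __m128i mask_old(__m128i d0, __m128i d1, __m128i d2, __m128i limit)
{
    __m128i m = _mm_subs_epu8(d0, limit);
    m = _mm_or_si128(m, _mm_subs_epu8(d1, limit));
    return _mm_or_si128(m, _mm_subs_epu8(d2, limit));
}

/* New scheme (as in ffvp8): keep a running byte-wise maximum (pmaxub) and
 * subtract the limit once at the end; the same bytes end up nonzero, with
 * one psubusb total instead of one per difference. */
static __m128i mask_new(__m128i d0, __m128i d1, __m128i d2, __m128i limit)
{
    __m128i m = _mm_max_epu8(_mm_max_epu8(d0, d1), d2);
    return _mm_subs_epu8(m, limit);
}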
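A second idiom in the rewritten filter hunks is dropping the zeroed scratch registers before the word shifts: instead of pxor-ing a register to zero and unpacking it against the data (the old "e0f0g0h0" layout), the data is unpacked against itself or a don't-care register ("exfxgxhx"), leaving each signed byte in the high half of a 16-bit lane; psraw by 11 (8 + 3) then yields the byte arithmetically shifted right by 3 regardless of what sits in the low half. A small sketch of the same idea with intrinsics (the function name is made up for illustration):

#include <emmintrin.h>

/* Low eight bytes of v, each treated as signed and shifted right by 3,
 * widened to 16-bit lanes; mirrors the punpcklbw x,x + psraw x,11 pattern. */
static __m128i sign_extend_shift3_lo(__m128i v)
{
    __m128i w = _mm_unpacklo_epi8(v, v);   /* data byte lands in the high half */
    return _mm_srai_epi16(w, 11);          /* arithmetic >>11 == (signed byte) >> 3 */
}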