| Index: libvpx/source/libvpx/vp8/common/x86/loopfilter_sse2.asm
|
| diff --git a/libvpx/source/libvpx/vp8/common/x86/loopfilter_sse2.asm b/libvpx/source/libvpx/vp8/common/x86/loopfilter_sse2.asm
|
| index 4efff7eb584d227a16b9dabe66094ffbf0814b68..849133dc42ae1b5aee3214ff8cab46ddab9c18e3 100644
|
| --- a/libvpx/source/libvpx/vp8/common/x86/loopfilter_sse2.asm
|
| +++ b/libvpx/source/libvpx/vp8/common/x86/loopfilter_sse2.asm
|
| @@ -110,7 +110,7 @@
|
| psubusb xmm6, xmm5 ; p1-=p0
|
|
|
| por xmm6, xmm4 ; abs(p1 - p0)
|
| - mov rdx, arg(2) ; get blimit
|
| + mov rdx, arg(2) ; get flimit
|
|
|
| movdqa t1, xmm6 ; save to t1
|
|
|
| @@ -123,7 +123,7 @@
|
| psubusb xmm1, xmm7
|
| por xmm2, xmm3 ; abs(p1-q1)
|
|
|
| - movdqa xmm7, XMMWORD PTR [rdx] ; blimit
|
| + movdqa xmm4, XMMWORD PTR [rdx] ; flimit
|
|
|
| movdqa xmm3, xmm0 ; q0
|
| pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
|
| @@ -134,11 +134,13 @@
|
| psrlw xmm2, 1 ; abs(p1-q1)/2
|
|
|
| psubusb xmm5, xmm3 ; p0-=q0
|
| + paddb xmm4, xmm4 ; flimit*2 (less than 255)
|
|
|
| psubusb xmm3, xmm6 ; q0-=p0
|
| por xmm5, xmm3 ; abs(p0 - q0)
|
|
|
| paddusb xmm5, xmm5 ; abs(p0-q0)*2
|
| + paddb xmm7, xmm4 ; flimit * 2 + limit (less than 255)
|
|
|
| movdqa xmm4, t0 ; hev get abs (q1 - q0)
|
|
|
| @@ -148,7 +150,7 @@
|
|
|
| movdqa xmm2, XMMWORD PTR [rdx] ; hev
|
|
|
| - psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
|
| + psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
|
| psubusb xmm4, xmm2 ; hev
|
|
|
| psubusb xmm3, xmm2 ; hev
|
| @@ -276,7 +278,7 @@
|
| ;(
|
| ; unsigned char *src_ptr,
|
| ; int src_pixel_step,
|
| -; const char *blimit,
|
| +; const char *flimit,
|
| ; const char *limit,
|
| ; const char *thresh,
|
| ; int count
|
| @@ -286,7 +288,7 @@ sym(vp8_loop_filter_horizontal_edge_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 6
|
| - SAVE_XMM 7
|
| + SAVE_XMM
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -326,7 +328,7 @@ sym(vp8_loop_filter_horizontal_edge_sse2):
|
| ;(
|
| ; unsigned char *src_ptr,
|
| ; int src_pixel_step,
|
| -; const char *blimit,
|
| +; const char *flimit,
|
| ; const char *limit,
|
| ; const char *thresh,
|
| ; int count
|
| @@ -336,7 +338,7 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 6
|
| - SAVE_XMM 7
|
| + SAVE_XMM
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -572,7 +574,7 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
|
| ;(
|
| ; unsigned char *src_ptr,
|
| ; int src_pixel_step,
|
| -; const char *blimit,
|
| +; const char *flimit,
|
| ; const char *limit,
|
| ; const char *thresh,
|
| ; int count
|
| @@ -582,7 +584,7 @@ sym(vp8_mbloop_filter_horizontal_edge_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 6
|
| - SAVE_XMM 7
|
| + SAVE_XMM
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -622,7 +624,7 @@ sym(vp8_mbloop_filter_horizontal_edge_sse2):
|
| ;(
|
| ; unsigned char *u,
|
| ; int src_pixel_step,
|
| -; const char *blimit,
|
| +; const char *flimit,
|
| ; const char *limit,
|
| ; const char *thresh,
|
| ; unsigned char *v
|
| @@ -632,7 +634,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 6
|
| - SAVE_XMM 7
|
| + SAVE_XMM
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -902,7 +904,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
|
| movdqa xmm4, XMMWORD PTR [rdx]; limit
|
|
|
| pmaxub xmm0, xmm7
|
| - mov rdx, arg(2) ; blimit
|
| + mov rdx, arg(2) ; flimit
|
|
|
| psubusb xmm0, xmm4
|
| movdqa xmm5, xmm2 ; q1
|
| @@ -919,11 +921,12 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
|
| psrlw xmm5, 1 ; abs(p1-q1)/2
|
| psubusb xmm6, xmm3 ; q0-p0
|
|
|
| - movdqa xmm4, XMMWORD PTR [rdx]; blimit
|
| + movdqa xmm2, XMMWORD PTR [rdx]; flimit
|
|
|
| mov rdx, arg(4) ; get thresh
|
|
|
| por xmm1, xmm6 ; abs(q0-p0)
|
| + paddb xmm2, xmm2 ; flimit*2 (less than 255)
|
|
|
| movdqa xmm6, t0 ; get abs (q1 - q0)
|
|
|
| @@ -936,9 +939,10 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
|
| paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
|
| psubusb xmm6, xmm7 ; abs(q1 - q0) > thresh
|
|
|
| + paddb xmm4, xmm2 ; flimit * 2 + limit (less than 255)
|
| psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh
|
|
|
| - psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
|
| + psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
|
| por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
|
|
|
| por xmm1, xmm0 ; mask
|
| @@ -1010,7 +1014,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
|
| ;(
|
| ; unsigned char *src_ptr,
|
| ; int src_pixel_step,
|
| -; const char *blimit,
|
| +; const char *flimit,
|
| ; const char *limit,
|
| ; const char *thresh,
|
| ; int count
|
| @@ -1020,7 +1024,7 @@ sym(vp8_loop_filter_vertical_edge_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 6
|
| - SAVE_XMM 7
|
| + SAVE_XMM
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -1077,7 +1081,7 @@ sym(vp8_loop_filter_vertical_edge_sse2):
|
| ;(
|
| ; unsigned char *u,
|
| ; int src_pixel_step,
|
| -; const char *blimit,
|
| +; const char *flimit,
|
| ; const char *limit,
|
| ; const char *thresh,
|
| ; unsigned char *v
|
| @@ -1087,7 +1091,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 6
|
| - SAVE_XMM 7
|
| + SAVE_XMM
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -1235,7 +1239,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
|
| ;(
|
| ; unsigned char *src_ptr,
|
| ; int src_pixel_step,
|
| -; const char *blimit,
|
| +; const char *flimit,
|
| ; const char *limit,
|
| ; const char *thresh,
|
| ; int count
|
| @@ -1245,7 +1249,7 @@ sym(vp8_mbloop_filter_vertical_edge_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 6
|
| - SAVE_XMM 7
|
| + SAVE_XMM
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -1304,7 +1308,7 @@ sym(vp8_mbloop_filter_vertical_edge_sse2):
|
| ;(
|
| ; unsigned char *u,
|
| ; int src_pixel_step,
|
| -; const char *blimit,
|
| +; const char *flimit,
|
| ; const char *limit,
|
| ; const char *thresh,
|
| ; unsigned char *v
|
| @@ -1314,7 +1318,7 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| SHADOW_ARGS_TO_STACK 6
|
| - SAVE_XMM 7
|
| + SAVE_XMM
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -1372,14 +1376,17 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
|
| ;(
|
| ; unsigned char *src_ptr,
|
| ; int src_pixel_step,
|
| -; const char *blimit,
|
| +; const char *flimit,
|
| +; const char *limit,
|
| +; const char *thresh,
|
| +; int count
|
| ;)
|
| global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
|
| sym(vp8_loop_filter_simple_horizontal_edge_sse2):
|
| push rbp
|
| mov rbp, rsp
|
| - SHADOW_ARGS_TO_STACK 3
|
| - SAVE_XMM 7
|
| + SHADOW_ARGS_TO_STACK 6
|
| + SAVE_XMM
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -1387,8 +1394,13 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
|
|
|
| mov rsi, arg(0) ;src_ptr
|
| movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
|
| - mov rdx, arg(2) ;blimit
|
| + mov rdx, arg(2) ; get flimit
|
| movdqa xmm3, XMMWORD PTR [rdx]
|
| + mov rdx, arg(3) ;limit
|
| + movdqa xmm7, XMMWORD PTR [rdx]
|
| +
|
| + paddb xmm3, xmm3 ; flimit*2 (less than 255)
|
| + paddb xmm3, xmm7 ; flimit * 2 + limit (less than 255)
|
|
|
| mov rdi, rsi ; rdi points to row +1 for indirect addressing
|
| add rdi, rax
|
| @@ -1416,7 +1428,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
|
| paddusb xmm5, xmm5 ; abs(p0-q0)*2
|
| paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
|
|
|
| - psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
|
| + psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
|
| pxor xmm3, xmm3
|
| pcmpeqb xmm5, xmm3
|
|
|
| @@ -1481,14 +1493,17 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
|
| ;(
|
| ; unsigned char *src_ptr,
|
| ; int src_pixel_step,
|
| -; const char *blimit,
|
| +; const char *flimit,
|
| +; const char *limit,
|
| +; const char *thresh,
|
| +; int count
|
| ;)
|
| global sym(vp8_loop_filter_simple_vertical_edge_sse2)
|
| sym(vp8_loop_filter_simple_vertical_edge_sse2):
|
| push rbp ; save old base pointer value.
|
| mov rbp, rsp ; set new base pointer value.
|
| - SHADOW_ARGS_TO_STACK 3
|
| - SAVE_XMM 7
|
| + SHADOW_ARGS_TO_STACK 6
|
| + SAVE_XMM
|
| GET_GOT rbx ; save callee-saved reg
|
| push rsi
|
| push rdi
|
| @@ -1592,10 +1607,14 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
|
| paddusb xmm5, xmm5 ; abs(p0-q0)*2
|
| paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
|
|
|
| - mov rdx, arg(2) ;blimit
|
| + mov rdx, arg(2) ;flimit
|
| movdqa xmm7, XMMWORD PTR [rdx]
|
| + mov rdx, arg(3) ; get limit
|
| + movdqa xmm6, XMMWORD PTR [rdx]
|
| + paddb xmm7, xmm7 ; flimit*2 (less than 255)
|
| + paddb xmm7, xmm6 ; flimit * 2 + limit (less than 255)
|
|
|
| - psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
|
| + psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
|
| pxor xmm7, xmm7
|
| pcmpeqb xmm5, xmm7 ; mm5 = mask
|
|
|
|
|