| Index: libvpx/source/libvpx/vp8/common/x86/loopfilter_mmx.asm
|
| diff --git a/libvpx/source/libvpx/vp8/common/x86/loopfilter_mmx.asm b/libvpx/source/libvpx/vp8/common/x86/loopfilter_mmx.asm
|
| index ad47284cf9b6c202e7f9a41a7cd1809d11f0fab9..c6c215c3c6fcaef13c4328534d58fec0927aa16c 100644
|
| --- a/libvpx/source/libvpx/vp8/common/x86/loopfilter_mmx.asm
|
| +++ b/libvpx/source/libvpx/vp8/common/x86/loopfilter_mmx.asm
|
| @@ -16,7 +16,7 @@
|
| ;(
|
| ; unsigned char *src_ptr,
|
| ; int src_pixel_step,
|
| -; const char *blimit,
|
| +; const char *flimit,
|
| ; const char *limit,
|
| ; const char *thresh,
|
| ; int count
|
| @@ -122,10 +122,12 @@ next8_h:
|
| paddusb mm5, mm5 ; abs(p0-q0)*2
|
| paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
|
|
|
| - mov rdx, arg(2) ;blimit ; get blimit
|
| - movq mm7, [rdx] ; blimit
|
| + mov rdx, arg(2) ;flimit ; get flimit
|
| + movq mm2, [rdx] ; flimit mm2
|
| + paddb mm2, mm2 ; flimit*2 (less than 255)
|
| + paddb mm7, mm2 ; flimit * 2 + limit (less than 255)
|
|
|
| - psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
|
| + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
|
| por mm1, mm5
|
| pxor mm5, mm5
|
| pcmpeqb mm1, mm5 ; mask mm1
|
| @@ -228,7 +230,7 @@ next8_h:
|
| ;(
|
| ; unsigned char *src_ptr,
|
| ; int src_pixel_step,
|
| -; const char *blimit,
|
| +; const char *flimit,
|
| ; const char *limit,
|
| ; const char *thresh,
|
| ; int count
|
| @@ -404,9 +406,9 @@ next8_v:
|
| pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
|
| psrlw mm5, 1 ; abs(p1-q1)/2
|
|
|
| - mov rdx, arg(2) ;blimit ;
|
| + mov rdx, arg(2) ;flimit ;
|
|
|
| - movq mm4, [rdx] ;blimit
|
| + movq mm2, [rdx] ;flimit mm2
|
| movq mm1, mm3 ; mm1=mm3=p0
|
|
|
| movq mm7, mm6 ; mm7=mm6=q0
|
| @@ -417,7 +419,10 @@ next8_v:
|
| paddusb mm1, mm1 ; abs(q0-p0)*2
|
| paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
|
|
|
| - psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
|
| + paddb mm2, mm2 ; flimit*2 (less than 255)
|
| + paddb mm4, mm2 ; flimit * 2 + limit (less than 255)
|
| +
|
| + psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
|
| por mm1, mm0; ; mask
|
|
|
| pxor mm0, mm0
|
| @@ -598,7 +603,7 @@ next8_v:
|
| ;(
|
| ; unsigned char *src_ptr,
|
| ; int src_pixel_step,
|
| -; const char *blimit,
|
| +; const char *flimit,
|
| ; const char *limit,
|
| ; const char *thresh,
|
| ; int count
|
| @@ -714,15 +719,17 @@ next8_mbh:
|
| paddusb mm5, mm5 ; abs(p0-q0)*2
|
| paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
|
|
|
| - mov rdx, arg(2) ;blimit ; get blimit
|
| - movq mm7, [rdx] ; blimit
|
| + mov rdx, arg(2) ;flimit ; get flimit
|
| + movq mm2, [rdx] ; flimit mm2
|
| + paddb mm2, mm2 ; flimit*2 (less than 255)
|
| + paddb mm7, mm2 ; flimit * 2 + limit (less than 255)
|
|
|
| - psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
|
| + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
|
| por mm1, mm5
|
| pxor mm5, mm5
|
| pcmpeqb mm1, mm5 ; mask mm1
|
|
|
| - ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
|
| + ; mm1 = mask, mm0=q0, mm7 = flimit * 2 + limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
|
| ; mm6 = p0,
|
|
|
| ; calculate high edge variance
|
| @@ -915,7 +922,7 @@ next8_mbh:
|
| ;(
|
| ; unsigned char *src_ptr,
|
| ; int src_pixel_step,
|
| -; const char *blimit,
|
| +; const char *flimit,
|
| ; const char *limit,
|
| ; const char *thresh,
|
| ; int count
|
| @@ -1101,9 +1108,9 @@ next8_mbv:
|
| pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
|
| psrlw mm5, 1 ; abs(p1-q1)/2
|
|
|
| - mov rdx, arg(2) ;blimit ;
|
| + mov rdx, arg(2) ;flimit ;
|
|
|
| - movq mm4, [rdx] ;blimit
|
| + movq mm2, [rdx] ;flimit mm2
|
| movq mm1, mm3 ; mm1=mm3=p0
|
|
|
| movq mm7, mm6 ; mm7=mm6=q0
|
| @@ -1114,7 +1121,10 @@ next8_mbv:
|
| paddusb mm1, mm1 ; abs(q0-p0)*2
|
| paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
|
|
|
| - psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
|
| + paddb mm2, mm2 ; flimit*2 (less than 255)
|
| + paddb mm4, mm2 ; flimit * 2 + limit (less than 255)
|
| +
|
| + psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
|
| por mm1, mm0; ; mask
|
|
|
| pxor mm0, mm0
|
| @@ -1382,13 +1392,16 @@ next8_mbv:
|
| ;(
|
| ; unsigned char *src_ptr,
|
| ; int src_pixel_step,
|
| -; const char *blimit
|
| +; const char *flimit,
|
| +; const char *limit,
|
| +; const char *thresh,
|
| +; int count
|
| ;)
|
| global sym(vp8_loop_filter_simple_horizontal_edge_mmx)
|
| sym(vp8_loop_filter_simple_horizontal_edge_mmx):
|
| push rbp
|
| mov rbp, rsp
|
| - SHADOW_ARGS_TO_STACK 3
|
| + SHADOW_ARGS_TO_STACK 6
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -1397,10 +1410,14 @@ sym(vp8_loop_filter_simple_horizontal_edge_mmx):
|
| mov rsi, arg(0) ;src_ptr
|
| movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
|
|
|
| - mov rcx, 2 ; count
|
| + movsxd rcx, dword ptr arg(5) ;count
|
| nexts8_h:
|
| - mov rdx, arg(2) ;blimit ; get blimit
|
| + mov rdx, arg(3) ;limit
|
| + movq mm7, [rdx]
|
| + mov rdx, arg(2) ;flimit ; get flimit
|
| movq mm3, [rdx] ;
|
| + paddb mm3, mm3 ; flimit*2 (less than 255)
|
| + paddb mm3, mm7 ; flimit * 2 + limit (less than 255)
|
|
|
| mov rdi, rsi ; rdi points to row +1 for indirect addressing
|
| add rdi, rax
|
| @@ -1428,7 +1445,7 @@ nexts8_h:
|
| paddusb mm5, mm5 ; abs(p0-q0)*2
|
| paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
|
|
|
| - psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
|
| + psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
|
| pxor mm3, mm3
|
| pcmpeqb mm5, mm3
|
|
|
| @@ -1498,13 +1515,16 @@ nexts8_h:
|
| ;(
|
| ; unsigned char *src_ptr,
|
| ; int src_pixel_step,
|
| -; const char *blimit
|
| +; const char *flimit,
|
| +; const char *limit,
|
| +; const char *thresh,
|
| +; int count
|
| ;)
|
| global sym(vp8_loop_filter_simple_vertical_edge_mmx)
|
| sym(vp8_loop_filter_simple_vertical_edge_mmx):
|
| push rbp
|
| mov rbp, rsp
|
| - SHADOW_ARGS_TO_STACK 3
|
| + SHADOW_ARGS_TO_STACK 6
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| @@ -1519,7 +1539,7 @@ sym(vp8_loop_filter_simple_vertical_edge_mmx):
|
| movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
|
|
|
| lea rsi, [rsi + rax*4- 2]; ;
|
| - mov rcx, 2 ; count
|
| + movsxd rcx, dword ptr arg(5) ;count
|
| nexts8_v:
|
|
|
| lea rdi, [rsi + rax];
|
| @@ -1582,10 +1602,14 @@ nexts8_v:
|
| paddusb mm5, mm5 ; abs(p0-q0)*2
|
| paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
|
|
|
| - mov rdx, arg(2) ;blimit ; get blimit
|
| + mov rdx, arg(2) ;flimit ; get flimit
|
| movq mm7, [rdx]
|
| + mov rdx, arg(3) ; get limit
|
| + movq mm6, [rdx]
|
| + paddb mm7, mm7 ; flimit*2 (less than 255)
|
| + paddb mm7, mm6 ; flimit * 2 + limit (less than 255)
|
|
|
| - psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
|
| + psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
|
| pxor mm7, mm7
|
| pcmpeqb mm5, mm7 ; mm5 = mask
|
|
|
|
|