Index: source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm |
=================================================================== |
--- source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm (revision 291857) |
+++ source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm (working copy) |
@@ -1,533 +0,0 @@ |
-; |
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
-; |
-; Use of this source code is governed by a BSD-style license |
-; that can be found in the LICENSE file in the root of the source |
-; tree. An additional intellectual property rights grant can be found |
-; in the file PATENTS. All contributing project authors may |
-; be found in the AUTHORS file in the root of the source tree. |
-; |
- |
- |
-%include "vpx_ports/x86_abi_support.asm" |
- |
-%define VP9_FILTER_WEIGHT 128 |
-%define VP9_FILTER_SHIFT 7 |
- |
-; Per row: 5-tap vertical blur of src into dst, then 5-tap horizontal blur of that dst row in place. |
-;void vp9_post_proc_down_and_across_mmx( |
-; unsigned char *src_ptr, |
-; unsigned char *dst_ptr, |
-; int src_pixels_per_line, |
-; int dst_pixels_per_line, |
-; int rows, |
-; int cols, |
-; int flimit |
-;) |
-global sym(vp9_post_proc_down_and_across_mmx) PRIVATE |
-sym(vp9_post_proc_down_and_across_mmx): |
- push rbp |
- mov rbp, rsp |
- SHADOW_ARGS_TO_STACK 7 |
- GET_GOT rbx |
- push rsi |
- push rdi |
- ; end prolog |
- |
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1 |
- ; move the global rd onto the stack, since we don't have enough registers to do PIC addressing |
- ; NOTE(review): RD is [rsp], but rsp moves at the later pushes and no matching add rsp, 8 is visible - verify |
- movq mm0, [GLOBAL(rd)] |
- sub rsp, 8 |
- movq [rsp], mm0 |
-%define RD [rsp] |
-%else |
-%define RD [GLOBAL(rd)] |
-%endif |
- |
- push rbx |
- lea rbx, [GLOBAL(Blur)] |
- movd mm2, dword ptr arg(6) ;flimit (its low word is replicated into all four words of mm2 below) |
- punpcklwd mm2, mm2 |
- punpckldq mm2, mm2 |
- |
- mov rsi, arg(0) ;src_ptr |
- mov rdi, arg(1) ;dst_ptr |
- |
- movsxd rcx, DWORD PTR arg(4) ;rows |
- movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; source pitch |
- pxor mm0, mm0 ; mm0 = 00000000 |
- |
-.nextrow: |
- |
- xor rdx, rdx ; clear out rdx for use as loop counter |
-.nextcol: |
- |
- pxor mm7, mm7 ; mm7 = 00000000 |
- movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps |
- movq mm3, [rsi] ; mm3 = r0 p0..p7 |
- punpcklbw mm3, mm0 ; mm3 = p0..p3 |
- movq mm1, mm3 ; mm1 = p0..p3 |
- pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers |
- |
- movq mm6, [rbx + 48] ; mm6 = kernel 3 taps |
- movq mm5, [rsi + rax] ; mm5 = r1 p0..p7 |
- punpcklbw mm5, mm0 ; mm5 = r1 p0..p3 |
- pmullw mm6, mm5 ; mm6 = r1 p0..p3 * kernel 3 modifiers |
- paddusw mm3, mm6 ; mm3 += mm6 |
- |
- ; thresholding: mm7 = mask of pixels whose abs(r0 - r1) exceeds flimit |
- movq mm7, mm1 ; mm7 = r0 p0..p3 |
- psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3 |
- psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3 |
- paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3) |
- pcmpgtw mm7, mm2 |
- |
- movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers |
- movq mm5, [rsi + 2*rax] ; mm5 = r2 p0..p7 |
- punpcklbw mm5, mm0 ; mm5 = r2 p0..p3 |
- pmullw mm6, mm5 ; mm6 = r2 p0..p3 * kernel 4 modifiers |
- paddusw mm3, mm6 ; mm3 += mm6 |
- |
- ; thresholding |
- movq mm6, mm1 ; mm6 = r0 p0..p3 |
- psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3 |
- psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r0 p0..p3 |
- paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3) |
- pcmpgtw mm6, mm2 |
- por mm7, mm6 ; accumulate thresholds |
- |
- |
- neg rax |
- movq mm6, [rbx ] ; kernel 0 taps |
- movq mm5, [rsi+2*rax] ; mm5 = r-2 p0..p7 (rax was negated just above) |
- punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3 |
- pmullw mm6, mm5 ; mm6 = r-2 p0..p3 * kernel 0 modifiers |
- paddusw mm3, mm6 ; mm3 += mm6 |
- |
- ; thresholding |
- movq mm6, mm1 ; mm6 = r0 p0..p3 |
- psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3 |
- psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3 |
- paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3) |
- pcmpgtw mm6, mm2 |
- por mm7, mm6 ; accumulate thresholds |
- |
- movq mm6, [rbx + 16] ; kernel 1 taps |
- movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7 |
- punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3 |
- pmullw mm6, mm4 ; mm6 = r-1 p0..p3 * kernel 1 modifiers. |
- paddusw mm3, mm6 ; mm3 += mm6 |
- |
- ; thresholding |
- movq mm6, mm1 ; mm6 = r0 p0..p3 |
- psubusw mm6, mm4 ; mm6 = p0..p3 - r-1 p0..p3 |
- psubusw mm4, mm1 ; mm4 = r-1 p0..p3 - p0..p3 |
- paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3) |
- pcmpgtw mm6, mm2 |
- por mm7, mm6 ; accumulate thresholds |
- |
- |
- paddusw mm3, RD ; mm3 += round value |
- psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 |
- |
- pand mm1, mm7 ; mm1 select vals > thresh from source |
- pandn mm7, mm3 ; mm7 select vals < thresh from blurred result |
- paddusw mm1, mm7 ; combination |
- |
- packuswb mm1, mm0 ; pack to bytes |
- |
- movd [rdi], mm1 ; store the four output pixels |
- neg rax ; pitch is positive |
- |
- |
- add rsi, 4 |
- add rdi, 4 |
- add rdx, 4 |
- |
- cmp edx, dword ptr arg(5) ;cols |
- jl .nextcol |
- ; done with all the cols; rewind to the row start and filter dst across, in place |
- sub rsi, rdx |
- sub rdi, rdx |
- |
- |
- push rax |
- xor rdx, rdx |
- mov rax, [rdi-4]; eax = bytes just before the row; the first delayed store below writes them back |
- |
-.acrossnextcol: |
- pxor mm7, mm7 ; mm7 = 00000000 |
- movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps |
- movq mm4, [rdi+rdx] ; mm4 = p0..p7 |
- movq mm3, mm4 ; mm3 = p0..p7 |
- punpcklbw mm3, mm0 ; mm3 = p0..p3 |
- movq mm1, mm3 ; mm1 = p0..p3 |
- pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers |
- |
- movq mm6, [rbx + 48] |
- psrlq mm4, 8 ; mm4 = p1..p7 |
- movq mm5, mm4 ; mm5 = p1..p7 |
- punpcklbw mm5, mm0 ; mm5 = p1..p4 |
- pmullw mm6, mm5 ; mm6 = p1..p4 * kernel 3 modifiers |
- paddusw mm3, mm6 ; mm3 += mm6 |
- |
- ; thresholding |
- movq mm7, mm1 ; mm7 = p0..p3 |
- psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4 |
- psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 |
- paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4) |
- pcmpgtw mm7, mm2 |
- |
- movq mm6, [rbx + 64 ] |
- psrlq mm4, 8 ; mm4 = p2..p7 |
- movq mm5, mm4 ; mm5 = p2..p7 |
- punpcklbw mm5, mm0 ; mm5 = p2..p5 |
- pmullw mm6, mm5 ; mm6 = p2..p5 * kernel 4 modifiers |
- paddusw mm3, mm6 ; mm3 += mm6 |
- |
- ; thresholding |
- movq mm6, mm1 ; mm6 = p0..p3 |
- psubusw mm6, mm5 ; mm6 = p0..p3 - p2..p5 |
- psubusw mm5, mm1 ; mm5 = p2..p5 - p0..p3 |
- paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p2..p5) |
- pcmpgtw mm6, mm2 |
- por mm7, mm6 ; accumulate thresholds |
- |
- |
- movq mm6, [rbx ] |
- movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5 |
- movq mm5, mm4 ; mm5 = p-2..p5 |
- punpcklbw mm5, mm0 ; mm5 = p-2..p1 |
- pmullw mm6, mm5 ; mm6 = p-2..p1 * kernel 0 modifiers |
- paddusw mm3, mm6 ; mm3 += mm6 |
- |
- ; thresholding |
- movq mm6, mm1 ; mm6 = p0..p3 |
- psubusw mm6, mm5 ; mm6 = p0..p3 - p-2..p1 |
- psubusw mm5, mm1 ; mm5 = p-2..p1 - p0..p3 |
- paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p-2..p1) |
- pcmpgtw mm6, mm2 |
- por mm7, mm6 ; accumulate thresholds |
- |
- movq mm6, [rbx + 16] |
- psrlq mm4, 8 ; mm4 = p-1..p5 |
- punpcklbw mm4, mm0 ; mm4 = p-1..p2 |
- pmullw mm6, mm4 ; mm6 = p-1..p2 * kernel 1 modifiers. |
- paddusw mm3, mm6 ; mm3 += mm6 |
- |
- ; thresholding |
- movq mm6, mm1 ; mm6 = p0..p3 |
- psubusw mm6, mm4 ; mm6 = p0..p3 - p-1..p2 |
- psubusw mm4, mm1 ; mm4 = p-1..p2 - p0..p3 |
- paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p-1..p2) |
- pcmpgtw mm6, mm2 |
- por mm7, mm6 ; accumulate thresholds |
- |
- paddusw mm3, RD ; mm3 += round value |
- psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 |
- |
- pand mm1, mm7 ; mm1 select vals > thresh from source |
- pandn mm7, mm3 ; mm7 select vals < thresh from blurred result |
- paddusw mm1, mm7 ; combination |
- |
- packuswb mm1, mm0 ; pack to bytes |
- mov DWORD PTR [rdi+rdx-4], eax ; store the previous four bytes, now that this group has read them |
- movd eax, mm1 |
- |
- add rdx, 4 |
- cmp edx, dword ptr arg(5) ;cols |
- jl .acrossnextcol; |
- |
- mov DWORD PTR [rdi+rdx-4], eax |
- pop rax |
- |
- ; done with this row |
- add rsi,rax ; next line |
- movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch |
- add rdi,rax ; next destination |
- movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; source pitch, reloaded for the next row |
- |
- dec rcx ; decrement count |
- jnz .nextrow ; next row |
- pop rbx |
- |
- ; begin epilog |
- pop rdi |
- pop rsi |
- RESTORE_GOT |
- UNSHADOW_ARGS |
- pop rbp |
- ret |
-%undef RD |
- |
- |
-; Column filter on 4-pixel-wide strips: 15-row running sum and sum of squares, results written 8 rows late. |
-;void vp9_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit) |
-extern sym(vp9_rv) |
-global sym(vp9_mbpost_proc_down_mmx) PRIVATE |
-sym(vp9_mbpost_proc_down_mmx): |
- push rbp |
- mov rbp, rsp |
- SHADOW_ARGS_TO_STACK 5 |
- GET_GOT rbx |
- push rsi |
- push rdi |
- ; end prolog |
- |
- ALIGN_STACK 16, rax |
- sub rsp, 136 |
- |
- ; d: 16 slots addressed in 4-byte steps at [rsp] (the original note here read d[16][8]) |
- ; create flimit2 at [rsp+128]: flimit stored in both dwords |
- mov eax, dword ptr arg(4) ;flimit |
- mov [rsp+128], eax |
- mov [rsp+128+4], eax |
-%define flimit2 [rsp+128] |
- |
-%if ABI_IS_32BIT=0 |
- lea r8, [GLOBAL(sym(vp9_rv))] |
-%endif |
- |
- ;rows +=8; (output is written back 8 rows late, see the store to [rsi] in .loop_row) |
- add dword ptr arg(2), 8 |
- |
- ;for(c=0; c<cols; c+=4) |
-.loop_col: |
- mov rsi, arg(0) ;s |
- pxor mm0, mm0 ; mm0 = 0, used when unpacking bytes and words |
- |
- movsxd rax, dword ptr arg(1) ;pitch ; |
- neg rax ; rax = -pitch |
- |
- lea rsi, [rsi + rax*8]; ; rsi = s - pitch*8 |
- neg rax |
- |
- |
- pxor mm5, mm5 |
- pxor mm6, mm6 ; mm5 = running sum (words); mm6/mm7 = running sum of squares (dwords) |
- |
- pxor mm7, mm7 ; |
- mov rdi, rsi |
- |
- mov rcx, 15 ; seed the sums from 15 rows starting at rsi |
- |
-.loop_initvar: |
- movd mm1, DWORD PTR [rdi]; |
- punpcklbw mm1, mm0 ; four pixels as words |
- |
- paddw mm5, mm1 ; sum += row |
- pmullw mm1, mm1 ; square each pixel |
- |
- movq mm2, mm1 ; |
- punpcklwd mm1, mm0 ; squares of pixels 0..1 as dwords |
- |
- punpckhwd mm2, mm0 ; squares of pixels 2..3 as dwords |
- paddd mm6, mm1 ; |
- |
- paddd mm7, mm2 ; |
- lea rdi, [rdi+rax] ; next row |
- |
- dec rcx |
- jne .loop_initvar |
- ;row loop, r = rdx: slide the 15-row window one row; where 15*sumsq - sum*sum < flimit the pixel becomes (sum + pixel + vp9_rv entry) >> 4 |
- xor rdx, rdx |
-.loop_row: |
- movd mm1, DWORD PTR [rsi] ; row leaving the window ([s-pitch*8] on the first pass) |
- movd mm2, DWORD PTR [rdi] ; row entering the window ([s+pitch*7] on the first pass) |
- |
- punpcklbw mm1, mm0 |
- punpcklbw mm2, mm0 |
- |
- paddw mm5, mm2 |
- psubw mm5, mm1 |
- |
- pmullw mm2, mm2 |
- movq mm4, mm2 |
- |
- punpcklwd mm2, mm0 |
- punpckhwd mm4, mm0 |
- |
- paddd mm6, mm2 |
- paddd mm7, mm4 |
- |
- pmullw mm1, mm1 |
- movq mm2, mm1 |
- |
- punpcklwd mm1, mm0 |
- psubd mm6, mm1 |
- |
- punpckhwd mm2, mm0 |
- psubd mm7, mm2 |
- |
- |
- movq mm3, mm6 |
- pslld mm3, 4 |
- |
- psubd mm3, mm6 |
- movq mm1, mm5 |
- |
- movq mm4, mm5 |
- pmullw mm1, mm1 |
- |
- pmulhw mm4, mm4 |
- movq mm2, mm1 |
- |
- punpcklwd mm1, mm4 |
- punpckhwd mm2, mm4 |
- |
- movq mm4, mm7 |
- pslld mm4, 4 |
- |
- psubd mm4, mm7 |
- |
- psubd mm3, mm1 |
- psubd mm4, mm2 |
- |
- psubd mm3, flimit2 |
- psubd mm4, flimit2 |
- |
- psrad mm3, 31 |
- psrad mm4, 31 |
- |
- packssdw mm3, mm4 |
- packsswb mm3, mm0 |
- |
- movd mm1, DWORD PTR [rsi+rax*8] |
- |
- movq mm2, mm1 |
- punpcklbw mm1, mm0 |
- |
- paddw mm1, mm5 |
- mov rcx, rdx |
- |
- and rcx, 127 |
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1 |
- push rax |
- lea rax, [GLOBAL(sym(vp9_rv))] |
- movq mm4, [rax + rcx*2] ;vp9_rv[rcx*2], rcx = r & 127 |
- pop rax |
-%elif ABI_IS_32BIT=0 |
- movq mm4, [r8 + rcx*2] ;vp9_rv[rcx*2], rcx = r & 127 |
-%else |
- movq mm4, [sym(vp9_rv) + rcx*2] |
-%endif |
- paddw mm1, mm4 |
- ;paddw xmm1, eight8s |
- psraw mm1, 4 |
- |
- packuswb mm1, mm0 |
- pand mm1, mm3 |
- |
- pandn mm3, mm2 |
- por mm1, mm3 |
- |
- and rcx, 15 |
- movd DWORD PTR [rsp+rcx*4], mm1 ;d[r & 15] = result for the row at [rsi+rax*8] |
- |
- mov rcx, rdx |
- sub rcx, 8 |
- |
- and rcx, 15 |
- movd mm1, DWORD PTR [rsp+rcx*4] ;d[(r - 8) & 15]; NOTE(review): not yet written in this column pass while r < 8 |
- |
- movd [rsi], mm1 |
- lea rsi, [rsi+rax] |
- |
- lea rdi, [rdi+rax] |
- add rdx, 1 |
- |
- cmp edx, dword arg(2) ;rows (already increased by 8 above) |
- jl .loop_row |
- |
- |
- add dword arg(0), 4 ; s += 4 (NOTE(review): dword add, while s is loaded into rsi at full width - confirm for 64-bit) |
- sub dword arg(3), 4 ; cols -= 4 |
- cmp dword arg(3), 0 |
- jg .loop_col |
- |
- add rsp, 136 |
- pop rsp |
- |
- ; begin epilog |
- pop rdi |
- pop rsi |
- RESTORE_GOT |
- UNSHADOW_ARGS |
- pop rbp |
- ret |
-%undef flimit2 |
- |
- |
-; For each row: clamp the pixels, then add noise bytes read from a random offset (0..255) into noise. |
-;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise, |
-; unsigned char blackclamp[16], unsigned char whiteclamp[16], |
-; unsigned char bothclamp[16], |
-; unsigned int width, unsigned int height, int pitch) |
-global sym(vp9_plane_add_noise_mmx) PRIVATE |
-sym(vp9_plane_add_noise_mmx): |
- push rbp |
- mov rbp, rsp |
- SHADOW_ARGS_TO_STACK 8 |
- GET_GOT rbx |
- push rsi |
- push rdi |
- ; end prolog |
- |
-.addnoise_loop: |
- call sym(LIBVPX_RAND) WRT_PLT |
- mov rcx, arg(1) ;noise |
- and rax, 0xff |
- add rcx, rax |
- |
- ; we rely on the fact that the clamping vectors are stored contiguously |
- ; in black/white/both order (+0/+16/+32). Note that we have to reload this here because |
- ; rdx could be trashed by rand() |
- mov rdx, arg(2) ; blackclamp |
- |
- |
- mov rdi, rcx |
- movsxd rcx, dword arg(5) ;[Width] |
- mov rsi, arg(0) ;Pos |
- xor rax,rax |
- |
-.addnoise_nextset: |
- movq mm1,[rsi+rax] ; get 8 source pixels |
- |
- psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise |
- paddusb mm1, [rdx+32] ;bothclamp |
- psubusb mm1, [rdx+16] ;whiteclamp |
- |
- movq mm2,[rdi+rax] ; get the noise for these 8 pixels |
- paddb mm1,mm2 ; add it in (paddb wraps, hence the clamping above) |
- movq [rsi+rax],mm1 ; store the result |
- |
- add rax,8 ; move to the next 8 pixels |
- |
- cmp rax, rcx |
- jl .addnoise_nextset |
- |
- movsxd rax, dword arg(7) ; Pitch |
- add arg(0), rax ; Start += Pitch |
- sub dword arg(6), 1 ; Height -= 1 |
- jg .addnoise_loop |
- |
- ; begin epilog |
- pop rdi |
- pop rsi |
- RESTORE_GOT |
- UNSHADOW_ARGS |
- pop rbp |
- ret |
- |
- |
-SECTION_RODATA |
-align 16 |
-Blur: |
- times 16 dw 16 ; taps 0 and 1, read at Blur+0 and Blur+16 |
- times 8 dw 64 ; tap 2 (centre pixel), read at Blur+32 |
- times 16 dw 16 ; taps 3 and 4, read at Blur+48 and Blur+64 |
- times 8 dw 0 ; not read by the routines above |
- |
-rd: |
- times 4 dw 0x40 ; rounding term added before the shift by VP9_FILTER_SHIFT |