source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm - Issue 11555023: libvpx: Add VP9 decoder.

Unified Diff: source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm

===================================================================

--- source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm (revision 0)

+++ source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm (revision 0)

@@ -0,0 +1,534 @@

+; Use of this source code is governed by a BSD-style license

+; that can be found in the LICENSE file in the root of the source

+; tree. An additional intellectual property rights grant can be found

+; in the file PATENTS. All contributing project authors may

+; be found in the AUTHORS file in the root of the source tree.

+%include "vpx_ports/x86_abi_support.asm"

+%define VP9_FILTER_WEIGHT 128

+%define VP9_FILTER_SHIFT 7

+;void vp9_post_proc_down_and_across_mmx

+;(

+; unsigned char *src_ptr,

+; unsigned char *dst_ptr,

+; int src_pixels_per_line,

+; int dst_pixels_per_line,

+; int rows,

+; int cols,

+; int flimit

+;)

+global sym(vp9_post_proc_down_and_across_mmx)

+sym(vp9_post_proc_down_and_across_mmx):

+ push rbp

+ mov rbp, rsp

+ SHADOW_ARGS_TO_STACK 7

+ GET_GOT rbx

+ push rsi

+ push rdi

+ ; end prolog

+%if ABI_IS_32BIT=1 && CONFIG_PIC=1

+ ; move the global rd onto the stack, since we don't have enough registers

+ ; to do PIC addressing

+ movq mm0, [GLOBAL(rd)]

+ sub rsp, 8

+ movq [rsp], mm0

+%define RD [rsp]

+%else

+%define RD [GLOBAL(rd)]

+%endif

+ push rbx

+ lea rbx, [GLOBAL(Blur)]

+ movd mm2, dword ptr arg(6) ;flimit

+ punpcklwd mm2, mm2

+ punpckldq mm2, mm2

+ mov rsi, arg(0) ;src_ptr

+ mov rdi, arg(1) ;dst_ptr

+ movsxd rcx, DWORD PTR arg(4) ;rows

+ movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?

+ pxor mm0, mm0 ; mm0 = 00000000

+.nextrow:

+ xor rdx, rdx ; clear out rdx for use as loop counter

+.nextcol:

+ pxor mm7, mm7 ; mm7 = 00000000

+ movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps

+ movq mm3, [rsi] ; mm4 = r0 p0..p7

+ punpcklbw mm3, mm0 ; mm3 = p0..p3

+ movq mm1, mm3 ; mm1 = p0..p3

+ pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers

+ movq mm6, [rbx + 48] ; mm6 = kernel 3 taps

+ movq mm5, [rsi + rax] ; mm4 = r1 p0..p7

+ punpcklbw mm5, mm0 ; mm5 = r1 p0..p3

+ pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers

+ paddusw mm3, mm6 ; mm3 += mm6

+ ; thresholding

+ movq mm7, mm1 ; mm7 = r0 p0..p3

+ psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3

+ psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3

+ paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)

+ pcmpgtw mm7, mm2

+ movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers

+ movq mm5, [rsi + 2*rax] ; mm4 = r2 p0..p7

+ punpcklbw mm5, mm0 ; mm5 = r2 p0..p3

+ pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers

+ paddusw mm3, mm6 ; mm3 += mm5

+ ; thresholding

+ movq mm6, mm1 ; mm6 = r0 p0..p3

+ psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3

+ psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r2 p0..p3

+ paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)

+ pcmpgtw mm6, mm2

+ por mm7, mm6 ; accumulate thresholds

+ neg rax

+ movq mm6, [rbx ] ; kernel 0 taps

+ movq mm5, [rsi+2*rax] ; mm4 = r-2 p0..p7

+ punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3

+ pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers

+ paddusw mm3, mm6 ; mm3 += mm5

+ ; thresholding

+ movq mm6, mm1 ; mm6 = r0 p0..p3

+ psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3

+ psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3

+ paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)

+ pcmpgtw mm6, mm2

+ por mm7, mm6 ; accumulate thresholds

+ movq mm6, [rbx + 16] ; kernel 1 taps

+ movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7

+ punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3

+ pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers.

+ paddusw mm3, mm6 ; mm3 += mm5

+ ; thresholding

+ movq mm6, mm1 ; mm6 = r0 p0..p3

+ psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3

+ psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3

+ paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)

+ pcmpgtw mm6, mm2

+ por mm7, mm6 ; accumulate thresholds

+ paddusw mm3, RD ; mm3 += round value

+ psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128

+ pand mm1, mm7 ; mm1 select vals > thresh from source

+ pandn mm7, mm3 ; mm7 select vals < thresh from blurred result

+ paddusw mm1, mm7 ; combination

+ packuswb mm1, mm0 ; pack to bytes

+ movd [rdi], mm1 ;

+ neg rax ; pitch is positive

+ add rsi, 4

+ add rdi, 4

+ add rdx, 4

+ cmp edx, dword ptr arg(5) ;cols

+ jl .nextcol

+ ; done with the all cols, start the across filtering in place

+ sub rsi, rdx

+ sub rdi, rdx

+ push rax

+ xor rdx, rdx

+ mov rax, [rdi-4];

+.acrossnextcol:

+ pxor mm7, mm7 ; mm7 = 00000000

+ movq mm6, [rbx + 32 ] ;

+ movq mm4, [rdi+rdx] ; mm4 = p0..p7

+ movq mm3, mm4 ; mm3 = p0..p7

+ punpcklbw mm3, mm0 ; mm3 = p0..p3

+ movq mm1, mm3 ; mm1 = p0..p3

+ pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers

+ movq mm6, [rbx + 48]

+ psrlq mm4, 8 ; mm4 = p1..p7

+ movq mm5, mm4 ; mm5 = p1..p7

+ punpcklbw mm5, mm0 ; mm5 = p1..p4

+ pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers

+ paddusw mm3, mm6 ; mm3 += mm6

+ ; thresholding

+ movq mm7, mm1 ; mm7 = p0..p3

+ psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4

+ psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3

+ paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4)

+ pcmpgtw mm7, mm2

+ movq mm6, [rbx + 64 ]

+ psrlq mm4, 8 ; mm4 = p2..p7

+ movq mm5, mm4 ; mm5 = p2..p7

+ punpcklbw mm5, mm0 ; mm5 = p2..p5

+ pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers

+ paddusw mm3, mm6 ; mm3 += mm5

+ ; thresholding

+ movq mm6, mm1 ; mm6 = p0..p3

+ psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4

+ psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3

+ paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4)

+ pcmpgtw mm6, mm2

+ por mm7, mm6 ; accumulate thresholds

+ movq mm6, [rbx ]

+ movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5

+ movq mm5, mm4 ; mm5 = p-2..p5

+ punpcklbw mm5, mm0 ; mm5 = p-2..p1

+ pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers

+ paddusw mm3, mm6 ; mm3 += mm5

+ ; thresholding

+ movq mm6, mm1 ; mm6 = p0..p3

+ psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4

+ psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3

+ paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4)

+ pcmpgtw mm6, mm2

+ por mm7, mm6 ; accumulate thresholds

+ movq mm6, [rbx + 16]

+ psrlq mm4, 8 ; mm4 = p-1..p5

+ punpcklbw mm4, mm0 ; mm4 = p-1..p2

+ pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers.

+ paddusw mm3, mm6 ; mm3 += mm5

+ ; thresholding

+ movq mm6, mm1 ; mm6 = p0..p3

+ psubusw mm6, mm4 ; mm6 = p0..p3 - p1..p4

+ psubusw mm4, mm1 ; mm5 = p1..p4 - p0..p3

+ paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p1..p4)

+ pcmpgtw mm6, mm2

+ por mm7, mm6 ; accumulate thresholds

+ paddusw mm3, RD ; mm3 += round value

+ psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128

+ pand mm1, mm7 ; mm1 select vals > thresh from source

+ pandn mm7, mm3 ; mm7 select vals < thresh from blurred result

+ paddusw mm1, mm7 ; combination

+ packuswb mm1, mm0 ; pack to bytes

+ mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes

+ movd eax, mm1

+ add rdx, 4

+ cmp edx, dword ptr arg(5) ;cols

+ jl .acrossnextcol;

+ mov DWORD PTR [rdi+rdx-4], eax

+ pop rax

+ ; done with this rwo

+ add rsi,rax ; next line

+ movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch?

+ add rdi,rax ; next destination

+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch?

+ dec rcx ; decrement count

+ jnz .nextrow ; next row

+ pop rbx

+ ; begin epilog

+ pop rdi

+ pop rsi

+ RESTORE_GOT

+ UNSHADOW_ARGS

+ pop rbp

+ ret

+%undef RD

+;void vp9_mbpost_proc_down_mmx(unsigned char *dst,

+; int pitch, int rows, int cols,int flimit)

+extern sym(vp9_rv)

+global sym(vp9_mbpost_proc_down_mmx)

+sym(vp9_mbpost_proc_down_mmx):

+ push rbp

+ mov rbp, rsp

+ SHADOW_ARGS_TO_STACK 5

+ GET_GOT rbx

+ push rsi

+ push rdi

+ ; end prolog

+ ALIGN_STACK 16, rax

+ sub rsp, 136

+ ; unsigned char d[16][8] at [rsp]

+ ; create flimit2 at [rsp+128]

+ mov eax, dword ptr arg(4) ;flimit

+ mov [rsp+128], eax

+ mov [rsp+128+4], eax

+%define flimit2 [rsp+128]

+%if ABI_IS_32BIT=0

+ lea r8, [GLOBAL(sym(vp9_rv))]

+%endif

+ ;rows +=8;

+ add dword ptr arg(2), 8

+ ;for(c=0; c<cols; c+=4)

+.loop_col:

+ mov rsi, arg(0) ;s

+ pxor mm0, mm0 ;

+ movsxd rax, dword ptr arg(1) ;pitch ;

+ neg rax ; rax = -pitch

+ lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8]

+ neg rax

+ pxor mm5, mm5

+ pxor mm6, mm6 ;

+ pxor mm7, mm7 ;

+ mov rdi, rsi

+ mov rcx, 15 ;

+.loop_initvar:

+ movd mm1, DWORD PTR [rdi];

+ punpcklbw mm1, mm0 ;

+ paddw mm5, mm1 ;

+ pmullw mm1, mm1 ;

+ movq mm2, mm1 ;

+ punpcklwd mm1, mm0 ;

+ punpckhwd mm2, mm0 ;

+ paddd mm6, mm1 ;

+ paddd mm7, mm2 ;

+ lea rdi, [rdi+rax] ;

+ dec rcx

+ jne .loop_initvar

+ ;save the var and sum

+ xor rdx, rdx

+.loop_row:

+ movd mm1, DWORD PTR [rsi] ; [s-pitch*8]

+ movd mm2, DWORD PTR [rdi] ; [s+pitch*7]

+ punpcklbw mm1, mm0

+ punpcklbw mm2, mm0

+ paddw mm5, mm2

+ psubw mm5, mm1

+ pmullw mm2, mm2

+ movq mm4, mm2

+ punpcklwd mm2, mm0

+ punpckhwd mm4, mm0

+ paddd mm6, mm2

+ paddd mm7, mm4

+ pmullw mm1, mm1

+ movq mm2, mm1

+ punpcklwd mm1, mm0

+ psubd mm6, mm1

+ punpckhwd mm2, mm0

+ psubd mm7, mm2

+ movq mm3, mm6

+ pslld mm3, 4

+ psubd mm3, mm6

+ movq mm1, mm5

+ movq mm4, mm5

+ pmullw mm1, mm1

+ pmulhw mm4, mm4

+ movq mm2, mm1

+ punpcklwd mm1, mm4

+ punpckhwd mm2, mm4

+ movq mm4, mm7

+ pslld mm4, 4

+ psubd mm4, mm7

+ psubd mm3, mm1

+ psubd mm4, mm2

+ psubd mm3, flimit2

+ psubd mm4, flimit2

+ psrad mm3, 31

+ psrad mm4, 31

+ packssdw mm3, mm4

+ packsswb mm3, mm0

+ movd mm1, DWORD PTR [rsi+rax*8]

+ movq mm2, mm1

+ punpcklbw mm1, mm0

+ paddw mm1, mm5

+ mov rcx, rdx

+ and rcx, 127

+%if ABI_IS_32BIT=1 && CONFIG_PIC=1

+ push rax

+ lea rax, [GLOBAL(sym(vp9_rv))]

+ movq mm4, [rax + rcx*2] ;vp9_rv[rcx*2]

+ pop rax

+%elif ABI_IS_32BIT=0

+ movq mm4, [r8 + rcx*2] ;vp9_rv[rcx*2]

+%else

+ movq mm4, [sym(vp9_rv) + rcx*2]

+%endif

+ paddw mm1, mm4

+ ;paddw xmm1, eight8s

+ psraw mm1, 4

+ packuswb mm1, mm0

+ pand mm1, mm3

+ pandn mm3, mm2

+ por mm1, mm3

+ and rcx, 15

+ movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]

+ mov rcx, rdx

+ sub rcx, 8

+ and rcx, 15

+ movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4]

+ movd [rsi], mm1

+ lea rsi, [rsi+rax]

+ lea rdi, [rdi+rax]

+ add rdx, 1

+ cmp edx, dword arg(2) ;rows

+ jl .loop_row

+ add dword arg(0), 4 ; s += 4

+ sub dword arg(3), 4 ; cols -= 4

+ cmp dword arg(3), 0

+ jg .loop_col

+ add rsp, 136

+ pop rsp

+ ; begin epilog

+ pop rdi

+ pop rsi

+ RESTORE_GOT

+ UNSHADOW_ARGS

+ pop rbp

+ ret

+%undef flimit2

+;void vp9_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,

+; unsigned char blackclamp[16],

+; unsigned char whiteclamp[16],

+; unsigned char bothclamp[16],

+; unsigned int Width, unsigned int Height, int Pitch)

+extern sym(rand)

+global sym(vp9_plane_add_noise_mmx)

+sym(vp9_plane_add_noise_mmx):

+ push rbp

+ mov rbp, rsp

+ SHADOW_ARGS_TO_STACK 8

+ GET_GOT rbx

+ push rsi

+ push rdi

+ ; end prolog

+.addnoise_loop:

+ call sym(rand) WRT_PLT

+ mov rcx, arg(1) ;noise

+ and rax, 0xff

+ add rcx, rax

+ ; we rely on the fact that the clamping vectors are stored contiguously

+ ; in black/white/both order. Note that we have to reload this here because

+ ; rdx could be trashed by rand()

+ mov rdx, arg(2) ; blackclamp

+ mov rdi, rcx

+ movsxd rcx, dword arg(5) ;[Width]

+ mov rsi, arg(0) ;Pos

+ xor rax,rax

+.addnoise_nextset:

+ movq mm1,[rsi+rax] ; get the source

+ psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise

+ paddusb mm1, [rdx+32] ;bothclamp

+ psubusb mm1, [rdx+16] ;whiteclamp

+ movq mm2,[rdi+rax] ; get the noise for this line

+ paddb mm1,mm2 ; add it in

+ movq [rsi+rax],mm1 ; store the result

+ add rax,8 ; move to the next line

+ cmp rax, rcx

+ jl .addnoise_nextset

+ movsxd rax, dword arg(7) ; Pitch

+ add arg(0), rax ; Start += Pitch

+ sub dword arg(6), 1 ; Height -= 1

+ jg .addnoise_loop

+ ; begin epilog

+ pop rdi

+ pop rsi

+ RESTORE_GOT

+ UNSHADOW_ARGS

+ pop rbp

+ ret

+SECTION_RODATA

+align 16

+Blur:

+ times 16 dw 16

+ times 8 dw 64

+ times 16 dw 16

+ times 8 dw 0

+rd:

+ times 4 dw 0x40

« libvpx.gyp ('K') | « source/libvpx/vp9/common/x86/vp9_mask_sse3.asm ('k') | source/libvpx/vp9/common/x86/vp9_postproc_sse2.asm » ('j') | no next file with comments »