source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm - Issue 11555023: libvpx: Add VP9 decoder.

Unified Diff: source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm

===================================================================

--- source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm (revision 0)

+++ source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm (revision 0)

@@ -0,0 +1,207 @@

+; Use of this source code is governed by a BSD-style license

+; that can be found in the LICENSE file in the root of the source

+; tree. An additional intellectual property rights grant can be found

+; in the file PATENTS. All contributing project authors may

+; be found in the AUTHORS file in the root of the source tree.

+%include "vpx_ports/x86_abi_support.asm"

+; void vp9_temporal_filter_apply_sse2 | arg index - role as used by the code below

+; (unsigned char *frame1, | 0 - source pixels; rows are `stride` bytes apart; loaded with movdqa (16-byte aligned) on the 16-wide path

+; unsigned int stride, | 1 - byte step added to the frame1 pointer after every source row

+; unsigned char *frame2, | 2 - predictor pixels; read as consecutive 16-byte groups with movdqa (no stride, 16-byte aligned)

+; unsigned int block_size, | 3 - 8 or 16; other values take the 16-wide path on the first pass and the 8-wide path afterwards

+; int strength, | 4 - right-shift count applied to 3 * (src - pred)^2 after the rounding term is added

+; int filter_weight, | 5 - multiplier applied to every per-pixel modifier

+; unsigned int *accumulator, | 6 - 32-bit sums, 16 per pass; each gets modifier * pred added (movdqa, 16-byte aligned)

+; unsigned short *count) | 7 - 16-bit sums, 16 per pass; each gets modifier added (movdqa, 16-byte aligned)

+global sym(vp9_temporal_filter_apply_sse2) ; sym() is not defined in this file (presumably in the included ABI header)

+sym(vp9_temporal_filter_apply_sse2): ; per pixel: modifier = sat(16 - ((3*(src-pred)^2 + round) >> strength)) * filter_weight; count += modifier; accumulator += modifier * pred

+ push rbp

+ mov rbp, rsp ; presumably the base that arg(n) addresses - confirm in x86_abi_support.asm

+ SHADOW_ARGS_TO_STACK 8 ; macro from the include; 8 matches the argument count listed above

+ SAVE_XMM 7 ; macro from the include; paired with RESTORE_XMM in the epilog

+ GET_GOT rbx ; macro from the include; paired with RESTORE_GOT in the epilog

+ push rsi ; rsi is reused below as the frame1 pointer

+ push rdi ; rdi is reused below as the accumulator pointer

+ ALIGN_STACK 16, rax ; macro from the include; presumably aligns rsp and pushes the old value that `pop rsp` restores - confirm

+ %define block_size 0 ; byte offsets of the locals from rsp follow

+ %define strength 16 ; 16-byte slot, used directly as a psrlw memory operand

+ %define filter_weight 32 ; 16-byte slot, weight replicated into 8 words

+ %define rounding_bit 48 ; 16-byte slot, rounding term replicated into 8 words

+ %define rbp_backup 64 ; saved rbp, because rbp is reused for stride

+ %define stack_size 80 ; total bytes reserved for the locals above

+ sub rsp, stack_size ; reserve the locals

+ mov [rsp + rbp_backup], rbp ; rbp is overwritten with stride further down

+ ; end prolog

+ mov rdx, arg(3) ; block_size

+ mov [rsp + block_size], rdx ; only the low dword is compared later

+ movd xmm6, arg(4) ; strength in the low dword, remaining bits of xmm6 cleared

+ movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read

+ ; calculate the rounding bit once, outside the loop

+ ; 0x8000 >> (16 - strength)

+ mov rdx, 16

+ sub rdx, arg(4) ; 16 - strength; NOTE(review): full-width subtract from an `int` arg - confirm the upper bits of arg(4) are clean on 64-bit builds

+ movd xmm4, rdx ; psrlw takes a variable count only from an xmm/m128 operand, not from rdx

+ movdqa xmm5, [GLOBAL(_const_top_bit)] ; 0x8000 in each of the 8 words

+ psrlw xmm5, xmm4 ; every word = 0x8000 >> (16 - strength)

+ movdqa [rsp + rounding_bit], xmm5 ; reused on every pass

+ mov rsi, arg(0) ; src/frame1

+ mov rdx, arg(2) ; predictor frame

+ mov rdi, arg(6) ; accumulator

+ mov rax, arg(7) ; count

+ ; replicate the filter weight into all 8 words and store for later

+ movd xmm0, arg(5) ; filter_weight

+ pshuflw xmm0, xmm0, 0 ; copy word 0 into words 0-3

+ punpcklwd xmm0, xmm0 ; interleave the low words with themselves: all 8 words = weight

+ movdqa [rsp + filter_weight], xmm0

+ mov rbp, arg(1) ; stride; rbp is repurposed here, no arg() is read after this point

+ pxor xmm7, xmm7 ; zero for extraction

+ lea rcx, [rdx + 16*16*1] ; rcx = end of predictor for a 16x16 block (256 bytes)

+ cmp dword ptr [rsp + block_size], 8

+ jne .temporal_filter_apply_load_16 ; anything other than 8 starts on the 16-wide path

+ lea rcx, [rdx + 8*8*1] ; 8x8 block: predictor ends after 64 bytes

+.temporal_filter_apply_load_8:

+ movq xmm0, [rsi] ; first row

+ lea rsi, [rsi + rbp] ; += stride

+ punpcklbw xmm0, xmm7 ; src[ 0- 7]

+ movq xmm1, [rsi] ; second row

+ lea rsi, [rsi + rbp] ; += stride

+ punpcklbw xmm1, xmm7 ; src[ 8-15]

+ jmp .temporal_filter_apply_load_finished

+.temporal_filter_apply_load_16:

+ movdqa xmm0, [rsi] ; src (frame1)

+ lea rsi, [rsi + rbp] ; += stride

+ movdqa xmm1, xmm0

+ punpcklbw xmm0, xmm7 ; src[ 0- 7]

+ punpckhbw xmm1, xmm7 ; src[ 8-15]

+.temporal_filter_apply_load_finished:

+ movdqa xmm2, [rdx] ; predictor (frame2)

+ movdqa xmm3, xmm2

+ punpcklbw xmm2, xmm7 ; pred[ 0- 7]

+ punpckhbw xmm3, xmm7 ; pred[ 8-15]

+ ; modifier = src_byte - pixel_value

+ psubw xmm0, xmm2 ; src - pred[ 0- 7]

+ psubw xmm1, xmm3 ; src - pred[ 8-15]

+ ; modifier *= modifier

+ pmullw xmm0, xmm0 ; modifier[ 0- 7]^2

+ pmullw xmm1, xmm1 ; modifier[ 8-15]^2

+ ; modifier *= 3

+ pmullw xmm0, [GLOBAL(_const_3w)] ; NOTE(review): pmullw keeps the low 16 bits, so this wraps once |src - pred| >= 148 - confirm intended

+ pmullw xmm1, [GLOBAL(_const_3w)]

+ ; modifier += 0x8000 >> (16 - strength)

+ paddw xmm0, [rsp + rounding_bit]

+ paddw xmm1, [rsp + rounding_bit]

+ ; modifier >>= strength

+ psrlw xmm0, [rsp + strength]

+ psrlw xmm1, [rsp + strength]

+ ; modifier = 16 - modifier

+ ; unsigned saturation clamps the result to 0 when modifier > 16

+ movdqa xmm3, [GLOBAL(_const_16w)]

+ movdqa xmm2, [GLOBAL(_const_16w)]

+ psubusw xmm3, xmm1 ; pixels 8-15

+ psubusw xmm2, xmm0 ; pixels 0- 7

+ ; modifier *= filter_weight

+ pmullw xmm2, [rsp + filter_weight]

+ pmullw xmm3, [rsp + filter_weight]

+ ; count

+ movdqa xmm4, [rax]

+ movdqa xmm5, [rax+16]

+ ; += modifier

+ paddw xmm4, xmm2

+ paddw xmm5, xmm3

+ ; write back

+ movdqa [rax], xmm4

+ movdqa [rax+16], xmm5

+ lea rax, [rax + 16*2] ; count += 16*(sizeof(short))

+ ; reload the predictor and widen it to 16-bit words

+ pxor xmm7, xmm7 ; redundant: xmm7 has not been written since it was last zeroed

+ movdqa xmm0, [rdx]

+ lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char))

+ movdqa xmm1, xmm0

+ punpcklbw xmm0, xmm7 ; pred[ 0- 7]

+ punpckhbw xmm1, xmm7 ; pred[ 8-15]

+ ; modifier *= pixel_value

+ pmullw xmm0, xmm2 ; low 16 bits only; presumably the product fits for the filter_weight values callers pass - confirm

+ pmullw xmm1, xmm3

+ ; zero-extend the products to double words

+ movdqa xmm2, xmm0

+ punpcklwd xmm0, xmm7 ; [ 0- 3]

+ punpckhwd xmm2, xmm7 ; [ 4- 7]

+ movdqa xmm3, xmm1

+ punpcklwd xmm1, xmm7 ; [ 8-11]

+ punpckhwd xmm3, xmm7 ; [12-15]

+ ; accumulator

+ movdqa xmm4, [rdi]

+ movdqa xmm5, [rdi+16]

+ movdqa xmm6, [rdi+32]

+ movdqa xmm7, [rdi+48] ; overwrites the zero register; it is re-zeroed before the next pass

+ ; += modifier

+ paddd xmm4, xmm0

+ paddd xmm5, xmm2

+ paddd xmm6, xmm1

+ paddd xmm7, xmm3

+ ; write back

+ movdqa [rdi], xmm4

+ movdqa [rdi+16], xmm5

+ movdqa [rdi+32], xmm6

+ movdqa [rdi+48], xmm7

+ lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))

+ cmp rdx, rcx ; has the predictor pointer reached the end computed before the loop?

+ je .temporal_filter_apply_epilog

+ pxor xmm7, xmm7 ; zero for extraction

+ cmp dword ptr [rsp + block_size], 16

+ je .temporal_filter_apply_load_16

+ jmp .temporal_filter_apply_load_8 ; every block_size other than 16 continues on the 8-wide path

+.temporal_filter_apply_epilog:

+ ; begin epilog

+ mov rbp, [rsp + rbp_backup] ; undo the reuse of rbp as stride

+ add rsp, stack_size ; release the locals

+ pop rsp ; presumably the pre-alignment rsp saved by ALIGN_STACK - confirm

+ pop rdi

+ pop rsi

+ RESTORE_GOT

+ RESTORE_XMM

+ UNSHADOW_ARGS

+ pop rbp

+ ret

+SECTION_RODATA ; read-only constants for the routine above, each read as a full 16-byte operand

+align 16

+_const_3w: ; 8 words of 3: multiplier for the squared difference

+ times 8 dw 3

+align 16

+_const_top_bit: ; 8 words of 0x8000: shifted right to form the rounding term

+ times 8 dw 1<<15

+align 16

+_const_16w: ; 8 words of 16: the value the shifted modifier is subtracted from (colon added to match the sibling labels and avoid the orphan-label warning)

+ times 8 dw 16

« libvpx.gyp ('K') | « source/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm ('k') | source/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm » ('j') | no next file with comments »