source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.asm - Issue 11555023: libvpx: Add VP9 decoder.

Unified Diff: source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.asm

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.asm

===================================================================

--- source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.asm (revision 0)

+++ source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.asm (revision 0)

@@ -0,0 +1,380 @@

+; Use of this source code is governed by a BSD-style license and patent

+; grant that can be found in the LICENSE file in the root of the source

+; tree. All contributing project authors may be found in the AUTHORS

+; file in the root of the source tree.

+%include "vpx_ports/x86_abi_support.asm"

+%include "vp9_asm_enc_offsets.asm"

+; void vp9_regular_quantize_b_sse2 | arg

+; (BLOCK *b, | 0

+; BLOCKD *d) | 1

+global sym(vp9_regular_quantize_b_sse2)

+sym(vp9_regular_quantize_b_sse2):

+ push rbp

+ mov rbp, rsp

+ SAVE_XMM 7

+ GET_GOT rbx

+%if ABI_IS_32BIT

+ push rdi

+ push rsi

+%else

+ %ifidn __OUTPUT_FORMAT__,x64

+ push rdi

+ push rsi

+ %endif

+%endif

+ ALIGN_STACK 16, rax

+ %define zrun_zbin_boost 0 ; 8

+ %define abs_minus_zbin 8 ; 32

+ %define temp_qcoeff 40 ; 32

+ %define qcoeff 72 ; 32

+ %define stack_size 104

+ sub rsp, stack_size

+ ; end prolog

+%if ABI_IS_32BIT

+ mov rdi, arg(0) ; BLOCK *b

+ mov rsi, arg(1) ; BLOCKD *d

+%else

+ %ifidn __OUTPUT_FORMAT__,x64

+ mov rdi, rcx ; BLOCK *b

+ mov rsi, rdx ; BLOCKD *d

+ %else

+ ;mov rdi, rdi ; BLOCK *b

+ ;mov rsi, rsi ; BLOCKD *d

+ %endif

+%endif

+ mov rdx, [rdi + vp9_block_coeff] ; coeff_ptr

+ mov rcx, [rdi + vp9_block_zbin] ; zbin_ptr

+ movd xmm7, [rdi + vp9_block_zbin_extra] ; zbin_oq_value

+ ; z

+ movdqa xmm0, [rdx]

+ movdqa xmm4, [rdx + 16]

+ mov rdx, [rdi + vp9_block_round] ; round_ptr

+ pshuflw xmm7, xmm7, 0

+ punpcklwd xmm7, xmm7 ; duplicated zbin_oq_value

+ movdqa xmm1, xmm0

+ movdqa xmm5, xmm4

+ ; sz

+ psraw xmm0, 15

+ psraw xmm4, 15

+ ; (z ^ sz)

+ pxor xmm1, xmm0

+ pxor xmm5, xmm4

+ ; x = abs(z)

+ psubw xmm1, xmm0

+ psubw xmm5, xmm4

+ movdqa xmm2, [rcx]

+ movdqa xmm3, [rcx + 16]

+ mov rcx, [rdi + vp9_block_quant] ; quant_ptr

+ ; *zbin_ptr + zbin_oq_value

+ paddw xmm2, xmm7

+ paddw xmm3, xmm7

+ ; x - (*zbin_ptr + zbin_oq_value)

+ psubw xmm1, xmm2

+ psubw xmm5, xmm3

+ movdqa [rsp + abs_minus_zbin], xmm1

+ movdqa [rsp + abs_minus_zbin + 16], xmm5

+ ; add (zbin_ptr + zbin_oq_value) back

+ paddw xmm1, xmm2

+ paddw xmm5, xmm3

+ movdqa xmm2, [rdx]

+ movdqa xmm6, [rdx + 16]

+ movdqa xmm3, [rcx]

+ movdqa xmm7, [rcx + 16]

+ ; x + round

+ paddw xmm1, xmm2

+ paddw xmm5, xmm6

+ ; y = x * quant_ptr >> 16

+ pmulhw xmm3, xmm1

+ pmulhw xmm7, xmm5

+ ; y += x

+ paddw xmm1, xmm3

+ paddw xmm5, xmm7

+ movdqa [rsp + temp_qcoeff], xmm1

+ movdqa [rsp + temp_qcoeff + 16], xmm5

+ pxor xmm6, xmm6

+ ; zero qcoeff

+ movdqa [rsp + qcoeff], xmm6

+ movdqa [rsp + qcoeff + 16], xmm6

+ mov rdx, [rdi + vp9_block_zrun_zbin_boost] ; zbin_boost_ptr

+ mov rax, [rdi + vp9_block_quant_shift] ; quant_shift_ptr

+ mov [rsp + zrun_zbin_boost], rdx

+%macro ZIGZAG_LOOP 1

+ ; x

+ movsx ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]

+ ; if (x >= zbin)

+ sub cx, WORD PTR[rdx] ; x - zbin

+ lea rdx, [rdx + 2] ; zbin_boost_ptr++

+ jl .rq_zigzag_loop_%1 ; x < zbin

+ movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]

+ ; downshift by quant_shift[rc]

+ movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc]

+ sar edi, cl ; also sets Z bit

+ je .rq_zigzag_loop_%1 ; !y

+ mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]

+ mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost

+.rq_zigzag_loop_%1:

+%endmacro

+; in vp9_default_zig_zag1d order: see vp9/common/vp9_entropy.c

+ZIGZAG_LOOP 0

+ZIGZAG_LOOP 1

+ZIGZAG_LOOP 4

+ZIGZAG_LOOP 8

+ZIGZAG_LOOP 5

+ZIGZAG_LOOP 2

+ZIGZAG_LOOP 3

+ZIGZAG_LOOP 6

+ZIGZAG_LOOP 9

+ZIGZAG_LOOP 12

+ZIGZAG_LOOP 13

+ZIGZAG_LOOP 10

+ZIGZAG_LOOP 7

+ZIGZAG_LOOP 11

+ZIGZAG_LOOP 14

+ZIGZAG_LOOP 15

+ movdqa xmm2, [rsp + qcoeff]

+ movdqa xmm3, [rsp + qcoeff + 16]

+ mov rcx, [rsi + vp9_blockd_dequant] ; dequant_ptr

+ mov rdi, [rsi + vp9_blockd_dqcoeff] ; dqcoeff_ptr

+ ; y ^ sz

+ pxor xmm2, xmm0

+ pxor xmm3, xmm4

+ ; x = (y ^ sz) - sz

+ psubw xmm2, xmm0

+ psubw xmm3, xmm4

+ ; dequant

+ movdqa xmm0, [rcx]

+ movdqa xmm1, [rcx + 16]

+ mov rcx, [rsi + vp9_blockd_qcoeff] ; qcoeff_ptr

+ pmullw xmm0, xmm2

+ pmullw xmm1, xmm3

+ movdqa [rcx], xmm2 ; store qcoeff

+ movdqa [rcx + 16], xmm3

+ movdqa [rdi], xmm0 ; store dqcoeff

+ movdqa [rdi + 16], xmm1

+ ; select the last value (in zig_zag order) for EOB

+ pcmpeqw xmm2, xmm6

+ pcmpeqw xmm3, xmm6

+ ; !

+ pcmpeqw xmm6, xmm6

+ pxor xmm2, xmm6

+ pxor xmm3, xmm6

+ ; mask inv_zig_zag

+ pand xmm2, [GLOBAL(inv_zig_zag)]

+ pand xmm3, [GLOBAL(inv_zig_zag + 16)]

+ ; select the max value

+ pmaxsw xmm2, xmm3

+ pshufd xmm3, xmm2, 00001110b

+ pmaxsw xmm2, xmm3

+ pshuflw xmm3, xmm2, 00001110b

+ pmaxsw xmm2, xmm3

+ pshuflw xmm3, xmm2, 00000001b

+ pmaxsw xmm2, xmm3

+ movd eax, xmm2

+ and eax, 0xff

+ mov [rsi + vp9_blockd_eob], eax

+ ; begin epilog

+ add rsp, stack_size

+ pop rsp

+%if ABI_IS_32BIT

+ pop rsi

+ pop rdi

+%else

+ %ifidn __OUTPUT_FORMAT__,x64

+ pop rsi

+ pop rdi

+ %endif

+%endif

+ RESTORE_GOT

+ RESTORE_XMM

+ pop rbp

+ ret

+; void vp9_fast_quantize_b_sse2 | arg

+; (BLOCK *b, | 0

+; BLOCKD *d) | 1

+global sym(vp9_fast_quantize_b_sse2)

+sym(vp9_fast_quantize_b_sse2):

+ push rbp

+ mov rbp, rsp

+ GET_GOT rbx

+%if ABI_IS_32BIT

+ push rdi

+ push rsi

+%else

+ %ifidn __OUTPUT_FORMAT__,x64

+ push rdi

+ push rsi

+ %else

+ ; these registers are used for passing arguments

+ %endif

+%endif

+ ; end prolog

+%if ABI_IS_32BIT

+ mov rdi, arg(0) ; BLOCK *b

+ mov rsi, arg(1) ; BLOCKD *d

+%else

+ %ifidn __OUTPUT_FORMAT__,x64

+ mov rdi, rcx ; BLOCK *b

+ mov rsi, rdx ; BLOCKD *d

+ %else

+ ;mov rdi, rdi ; BLOCK *b

+ ;mov rsi, rsi ; BLOCKD *d

+ %endif

+%endif

+ mov rax, [rdi + vp9_block_coeff]

+ mov rcx, [rdi + vp9_block_round]

+ mov rdx, [rdi + vp9_block_quant_fast]

+ ; z = coeff

+ movdqa xmm0, [rax]

+ movdqa xmm4, [rax + 16]

+ ; dup z so we can save sz

+ movdqa xmm1, xmm0

+ movdqa xmm5, xmm4

+ ; sz = z >> 15

+ psraw xmm0, 15

+ psraw xmm4, 15

+ ; x = abs(z) = (z ^ sz) - sz

+ pxor xmm1, xmm0

+ pxor xmm5, xmm4

+ psubw xmm1, xmm0

+ psubw xmm5, xmm4

+ ; x += round

+ paddw xmm1, [rcx]

+ paddw xmm5, [rcx + 16]

+ mov rax, [rsi + vp9_blockd_qcoeff]

+ mov rcx, [rsi + vp9_blockd_dequant]

+ mov rdi, [rsi + vp9_blockd_dqcoeff]

+ ; y = x * quant >> 16

+ pmulhw xmm1, [rdx]

+ pmulhw xmm5, [rdx + 16]

+ ; x = (y ^ sz) - sz

+ pxor xmm1, xmm0

+ pxor xmm5, xmm4

+ psubw xmm1, xmm0

+ psubw xmm5, xmm4

+ ; qcoeff = x

+ movdqa [rax], xmm1

+ movdqa [rax + 16], xmm5

+ ; x * dequant

+ movdqa xmm2, xmm1

+ movdqa xmm3, xmm5

+ pmullw xmm2, [rcx]

+ pmullw xmm3, [rcx + 16]

+ ; dqcoeff = x * dequant

+ movdqa [rdi], xmm2

+ movdqa [rdi + 16], xmm3

+ pxor xmm4, xmm4 ;clear all bits

+ pcmpeqw xmm1, xmm4

+ pcmpeqw xmm5, xmm4

+ pcmpeqw xmm4, xmm4 ;set all bits

+ pxor xmm1, xmm4

+ pxor xmm5, xmm4

+ pand xmm1, [GLOBAL(inv_zig_zag)]

+ pand xmm5, [GLOBAL(inv_zig_zag + 16)]

+ pmaxsw xmm1, xmm5

+ ; now down to 8

+ pshufd xmm5, xmm1, 00001110b

+ pmaxsw xmm1, xmm5

+ ; only 4 left

+ pshuflw xmm5, xmm1, 00001110b

+ pmaxsw xmm1, xmm5

+ ; okay, just 2!

+ pshuflw xmm5, xmm1, 00000001b

+ pmaxsw xmm1, xmm5

+ movd eax, xmm1

+ and eax, 0xff

+ mov [rsi + vp9_blockd_eob], eax

+ ; begin epilog

+%if ABI_IS_32BIT

+ pop rsi

+ pop rdi

+%else

+ %ifidn __OUTPUT_FORMAT__,x64

+ pop rsi

+ pop rdi

+ %endif

+%endif

+ RESTORE_GOT

+ pop rbp

+ ret

+SECTION_RODATA

+align 16

+inv_zig_zag:

+ dw 0x0001, 0x0002, 0x0006, 0x0007

+ dw 0x0003, 0x0005, 0x0008, 0x000d

+ dw 0x0004, 0x0009, 0x000c, 0x000e

+ dw 0x000a, 0x000b, 0x000f, 0x0010

« libvpx.gyp ('K') | « source/libvpx/vp9/encoder/x86/vp9_quantize_mmx.asm ('k') | source/libvpx/vp9/encoder/x86/vp9_quantize_sse4.asm » ('j') | no next file with comments »