Index: source/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
diff --git a/source/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/source/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
deleted file mode 100644
index 6029420d11424f0913ee56428704e90fdc8b2934..0000000000000000000000000000000000000000
--- a/source/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
+++ /dev/null
@@ -1,287 +0,0 @@
-;
-; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-; This file is a duplicate of mfqe_sse2.asm in VP8.
-; TODO(jackychen): Find a way to fix the duplicate.
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_filter_by_weight16x16_sse2
-;(
-;    unsigned char *src,
-;    int            src_stride,
-;    unsigned char *dst,
-;    int            dst_stride,
-;    int            src_weight
-;)
-global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
-sym(vp9_filter_by_weight16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM    6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movd        xmm0, arg(4)            ; src_weight
-    pshuflw     xmm0, xmm0, 0x0         ; replicate to all low words
-    punpcklqdq  xmm0, xmm0              ; replicate to all hi words
-
-    movdqa      xmm1, [GLOBAL(tMFQE)]
-    psubw       xmm1, xmm0              ; dst_weight
-
-    mov         rax, arg(0)             ; src
-    mov         rsi, arg(1)             ; src_stride
-    mov         rdx, arg(2)             ; dst
-    mov         rdi, arg(3)             ; dst_stride
-
-    mov         rcx, 16                 ; loop count
-    pxor        xmm6, xmm6
-
-.combine
-    movdqa      xmm2, [rax]
-    movdqa      xmm4, [rdx]
-    add         rax, rsi
-
-    ; src * src_weight
-    movdqa      xmm3, xmm2
-    punpcklbw   xmm2, xmm6
-    punpckhbw   xmm3, xmm6
-    pmullw      xmm2, xmm0
-    pmullw      xmm3, xmm0
-
-    ; dst * dst_weight
-    movdqa      xmm5, xmm4
-    punpcklbw   xmm4, xmm6
-    punpckhbw   xmm5, xmm6
-    pmullw      xmm4, xmm1
-    pmullw      xmm5, xmm1
-
-    ; sum, round and shift
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-    paddw       xmm2, [GLOBAL(tMFQE_round)]
-    paddw       xmm3, [GLOBAL(tMFQE_round)]
-    psrlw       xmm2, 4
-    psrlw       xmm3, 4
-
-    packuswb    xmm2, xmm3
-    movdqa      [rdx], xmm2
-    add         rdx, rdi
-
-    dec         rcx
-    jnz         .combine
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-
-    ret
-
-;void vp9_filter_by_weight8x8_sse2
-;(
-;    unsigned char *src,
-;    int            src_stride,
-;    unsigned char *dst,
-;    int            dst_stride,
-;    int            src_weight
-;)
-global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
-sym(vp9_filter_by_weight8x8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movd        xmm0, arg(4)            ; src_weight
-    pshuflw     xmm0, xmm0, 0x0         ; replicate to all low words
-    punpcklqdq  xmm0, xmm0              ; replicate to all hi words
-
-    movdqa      xmm1, [GLOBAL(tMFQE)]
-    psubw       xmm1, xmm0              ; dst_weight
-
-    mov         rax, arg(0)             ; src
-    mov         rsi, arg(1)             ; src_stride
-    mov         rdx, arg(2)             ; dst
-    mov         rdi, arg(3)             ; dst_stride
-
-    mov         rcx, 8                  ; loop count
-    pxor        xmm4, xmm4
-
-.combine
-    movq        xmm2, [rax]
-    movq        xmm3, [rdx]
-    add         rax, rsi
-
-    ; src * src_weight
-    punpcklbw   xmm2, xmm4
-    pmullw      xmm2, xmm0
-
-    ; dst * dst_weight
-    punpcklbw   xmm3, xmm4
-    pmullw      xmm3, xmm1
-
-    ; sum, round and shift
-    paddw       xmm2, xmm3
-    paddw       xmm2, [GLOBAL(tMFQE_round)]
-    psrlw       xmm2, 4
-
-    packuswb    xmm2, xmm4
-    movq        [rdx], xmm2
-    add         rdx, rdi
-
-    dec         rcx
-    jnz         .combine
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-
-    ret
-
-;void vp9_variance_and_sad_16x16_sse2 | arg
-;(
-;    unsigned char *src1,     0
-;    int            stride1,  1
-;    unsigned char *src2,     2
-;    int            stride2,  3
-;    unsigned int  *variance, 4
-;    unsigned int  *sad,      5
-;)
-global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
-sym(vp9_variance_and_sad_16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov         rax, arg(0)             ; src1
-    mov         rcx, arg(1)             ; stride1
-    mov         rdx, arg(2)             ; src2
-    mov         rdi, arg(3)             ; stride2
-
-    mov         rsi, 16                 ; block height
-
-    ; Prep accumulator registers
-    pxor        xmm3, xmm3              ; SAD
-    pxor        xmm4, xmm4              ; sum of src2
-    pxor        xmm5, xmm5              ; sum of src2^2
-
-    ; Because we're working with the actual output frames
-    ; we can't depend on any kind of data alignment.
-.accumulate
-    movdqa      xmm0, [rax]             ; src1
-    movdqa      xmm1, [rdx]             ; src2
-    add         rax, rcx                ; src1 + stride1
-    add         rdx, rdi                ; src2 + stride2
-
-    ; SAD(src1, src2)
-    psadbw      xmm0, xmm1
-    paddusw     xmm3, xmm0
-
-    ; SUM(src2)
-    pxor        xmm2, xmm2
-    psadbw      xmm2, xmm1              ; sum src2 by misusing SAD against 0
-    paddusw     xmm4, xmm2
-
-    ; pmaddubsw would be ideal if it took two unsigned values. instead,
-    ; it expects a signed and an unsigned value. so instead we zero extend
-    ; and operate on words.
-    pxor        xmm2, xmm2
-    movdqa      xmm0, xmm1
-    punpcklbw   xmm0, xmm2
-    punpckhbw   xmm1, xmm2
-    pmaddwd     xmm0, xmm0
-    pmaddwd     xmm1, xmm1
-    paddd       xmm5, xmm0
-    paddd       xmm5, xmm1
-
-    sub         rsi, 1
-    jnz         .accumulate
-
-    ; phaddd only operates on adjacent double words.
-    ; Finalize SAD and store
-    movdqa      xmm0, xmm3
-    psrldq      xmm0, 8
-    paddusw     xmm0, xmm3
-    paddd       xmm0, [GLOBAL(t128)]
-    psrld       xmm0, 8
-
-    mov         rax, arg(5)
-    movd        [rax], xmm0
-
-    ; Accumulate sum of src2
-    movdqa      xmm0, xmm4
-    psrldq      xmm0, 8
-    paddusw     xmm0, xmm4
-    ; Square src2. Ignore high value
-    pmuludq     xmm0, xmm0
-    psrld       xmm0, 8
-
-    ; phaddw could be used to sum adjacent values but we want
-    ; all the values summed. promote to doubles, accumulate,
-    ; shift and sum
-    pxor        xmm2, xmm2
-    movdqa      xmm1, xmm5
-    punpckldq   xmm1, xmm2
-    punpckhdq   xmm5, xmm2
-    paddd       xmm1, xmm5
-    movdqa      xmm2, xmm1
-    psrldq      xmm1, 8
-    paddd       xmm1, xmm2
-
-    psubd       xmm1, xmm0
-
-    ; (variance + 128) >> 8
-    paddd       xmm1, [GLOBAL(t128)]
-    psrld       xmm1, 8
-    mov         rax, arg(4)
-
-    movd        [rax], xmm1
-
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-t128:
-%ifndef __NASM_VER__
-    ddq 128
-%elif CONFIG_BIG_ENDIAN
-    dq  0, 128
-%else
-    dq  128, 0
-%endif
-align 16
-tMFQE: ; 1 << MFQE_PRECISION
-    times 8 dw 0x10
-align 16
-tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
-    times 8 dw 0x08
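
The two filter routines in the deleted file implement MFQE's weighted blend: each output pixel is (src * src_weight + dst * (16 - src_weight) + 8) >> 4, where 16 is 1 << MFQE_PRECISION (the tMFQE constant), 8 is the rounding term (tMFQE_round), and src_weight lies in [0, 16]. A minimal scalar sketch in C of that computation; the function name and the block_size parameter are illustrative, not the libvpx API:

    /* Scalar sketch of the deleted vp9_filter_by_weight{16x16,8x8}_sse2
     * routines (hypothetical name; block_size is 16 or 8). */
    static void filter_by_weight_c(const unsigned char *src, int src_stride,
                                   unsigned char *dst, int dst_stride,
                                   int block_size, int src_weight) {
      const int dst_weight = (1 << 4) - src_weight;  /* tMFQE - src_weight */
      const int rounding = 1 << 3;                   /* tMFQE_round */
      int r, c;
      for (r = 0; r < block_size; ++r) {
        for (c = 0; c < block_size; ++c) {
          dst[c] = (unsigned char)((src[c] * src_weight +
                                    dst[c] * dst_weight + rounding) >> 4);
        }
        src += src_stride;
        dst += dst_stride;
      }
    }

Because the two weights always sum to 16, the result never exceeds 255, so the packuswb saturation in the assembly is only a formality.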
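The third routine folds two metrics over a 16x16 block in one pass: psadbw against src1 gives the SAD, psadbw against a zeroed register sums the src2 bytes, and pmaddwd on zero-extended words sums their squares. Both outputs are scaled down by the pixel count (256) with rounding: the stores amount to *sad = (SAD + 128) >> 8 and *variance = (sse - sum*sum/256 + 128) >> 8. A scalar sketch under that reading; the function name is illustrative:

    /* Scalar sketch of the deleted vp9_variance_and_sad_16x16_sse2
     * (hypothetical name): SAD between src1 and src2, and the variance
     * of src2, each divided by the 256 pixels with rounding. */
    static void variance_and_sad_16x16_c(const unsigned char *src1, int stride1,
                                         const unsigned char *src2, int stride2,
                                         unsigned int *variance,
                                         unsigned int *sad) {
      unsigned int sad_acc = 0;  /* sum of |src1 - src2|  (psadbw)         */
      unsigned int sum = 0;      /* sum of src2           (psadbw vs zero) */
      unsigned int sse = 0;      /* sum of src2^2         (pmaddwd)        */
      int r, c;
      for (r = 0; r < 16; ++r) {
        for (c = 0; c < 16; ++c) {
          const int diff = src1[c] - src2[c];
          sad_acc += (unsigned int)(diff < 0 ? -diff : diff);
          sum += src2[c];
          sse += (unsigned int)(src2[c] * src2[c]);
        }
        src1 += stride1;
        src2 += stride2;
      }
      *sad = (sad_acc + 128) >> 8;
      *variance = (sse - ((sum * sum) >> 8) + 128) >> 8;
    }

Since sum is at most 16 * 16 * 255 = 65280, sum * sum still fits in 32 bits, which is why the assembly can discard the high half of the pmuludq result ("Square src2. Ignore high value").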