| Index: source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
|
| diff --git a/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
|
| index 1126fdb61640e40cb820dc3928ed2e3ecbc9553a..56373e897c9d2c41bbda1dfe87a6768e96750783 100644
|
| --- a/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
|
| +++ b/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
|
| @@ -72,3 +72,49 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
|
| movd edx, m5
|
| %endif
|
| RET
|
| +
|
| +; Compute the sum of squared differences between two int16_t vectors.
|
| +; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
|
| +; intptr_t block_size)
|
| +; NOTE(review): block_size counts int16_t elements; the loop always runs once and steps by mmsize elements, so it presumably must be a positive multiple of mmsize - confirm with callers.
|
| +INIT_XMM sse2
|
| +cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
|
| + pxor m4, m4 ; sse accumulator (two 64-bit partial sums)
|
| + pxor m5, m5 ; dedicated zero register, used to zero-extend dwords to qwords
|
| + lea uqcq, [uqcq+sizeq*2] ; point uqc one past its last int16 (2 bytes per element)
|
| + lea dqcq, [dqcq+sizeq*2] ; same for dqc
|
| + neg sizeq ; negative element index: [ptr+sizeq*2] starts at the first element and counts up to 0
|
| +.loop:
|
| + mova m2, [uqcq+sizeq*2] ; first mmsize bytes of uqc; mova is presumably an aligned load - verify inputs are 16-byte aligned
|
| + mova m0, [dqcq+sizeq*2] ; first mmsize bytes of dqc
|
| + mova m3, [uqcq+sizeq*2+mmsize] ; second mmsize bytes of uqc
|
| + mova m1, [dqcq+sizeq*2+mmsize] ; second mmsize bytes of dqc
|
| + psubw m0, m2 ; per-word difference dqc - uqc (16-bit arithmetic)
|
| + psubw m1, m3 ; m3's loaded value is dead after this; it is reused as a temporary below
|
| + ; individual errors are max. 15bit+sign, so squares are 30bit, and
|
| + ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
|
| + pmaddwd m0, m0 ; square each word and add adjacent pairs into 32-bit lanes
|
| + pmaddwd m1, m1
|
| + ; accumulate in 64bit: interleaving with the zero register m5 widens each dword to a qword
|
| + punpckldq m3, m0, m5 ; low two dwords of m0 -> two 64-bit lanes
|
| + punpckhdq m0, m5 ; high two dwords of m0 -> two 64-bit lanes
|
| + paddq m4, m3
|
| + punpckldq m3, m1, m5 ; low two dwords of m1 -> two 64-bit lanes
|
| + paddq m4, m0
|
| + punpckhdq m1, m5 ; high two dwords of m1 -> two 64-bit lanes
|
| + paddq m4, m3
|
| + paddq m4, m1
|
| + add sizeq, mmsize ; 2*mmsize bytes were consumed = mmsize int16 elements
|
| + jl .loop ; continue while the index is still negative
|
| +
|
| + ; accumulate horizontally and store in return value
|
| + movhlps m5, m4 ; high qword of m4 -> low qword of m5
|
| + paddq m4, m5 ; low qword of m4 now holds the full 64-bit sum
|
| +%if ARCH_X86_64
|
| + movq rax, m4 ; 64-bit result in rax
|
| +%else
|
| + pshufd m5, m4, 0x1 ; move dword 1 (upper half of the sum) into dword 0 of m5
|
| + movd eax, m4 ; low 32 bits of the result
|
| + movd edx, m5 ; high 32 bits of the result (returned as the edx:eax pair)
|
| +%endif
|
| + RET
|
|
|