Index: source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm |
diff --git a/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm |
index 1126fdb61640e40cb820dc3928ed2e3ecbc9553a..56373e897c9d2c41bbda1dfe87a6768e96750783 100644 |
--- a/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm |
+++ b/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm |
@@ -72,3 +72,49 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz |
movd edx, m5 |
%endif |
RET |
+ |
+; Compute the sum of squared differences between two int16_t vectors. |
+; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff, |
+; intptr_t block_size) |
+; |
+; Arguments (named by cglobal below): uqc = coeff, dqc = dqcoeff, |
+; size = block_size, counted in int16_t elements (indices are scaled by 2). |
+; Every loop iteration reads two full registers (2*mmsize bytes) from each |
+; input before the count is tested, so block_size is presumably a positive |
+; multiple of mmsize -- TODO confirm against callers. |
+; mova is used for all loads; this assumes both pointers are suitably |
+; aligned for aligned XMM loads -- TODO confirm against callers. |
+; The 64-bit result is returned in rax on x86-64 and in edx:eax otherwise. |
+ |
+INIT_XMM sse2 |
+cglobal block_error_fp, 3, 3, 6, uqc, dqc, size |
+ pxor m4, m4 ; sse accumulator |
+ pxor m5, m5 ; dedicated zero register |
+ ; point both pointers one past the last element and negate the count, so |
+ ; that [ptr+sizeq*2] walks the arrays front to back while sizeq counts up |
+ ; towards zero and the loop can terminate on the flags of the final add |
+ lea uqcq, [uqcq+sizeq*2] |
+ lea dqcq, [dqcq+sizeq*2] |
+ neg sizeq |
+.loop: |
+ ; load two registers of coefficients from each input: |
+ ; m2/m3 <- uqc, m0/m1 <- dqc |
+ mova m2, [uqcq+sizeq*2] |
+ mova m0, [dqcq+sizeq*2] |
+ mova m3, [uqcq+sizeq*2+mmsize] |
+ mova m1, [dqcq+sizeq*2+mmsize] |
+ ; per-lane 16-bit difference dqc - uqc (the sign is irrelevant once squared) |
+ psubw m0, m2 |
+ psubw m1, m3 |
+ ; individual errors are assumed to be at most 15bit+sign, so squares are |
+ ; 30bit, and thus the sum of 2 should fit in a 31bit integer (+ unused |
+ ; sign bit). NOTE(review): this range is not enforced here; psubw wraps on |
+ ; 16-bit overflow -- confirm the coefficient range with callers. |
+ ; pmaddwd of a register with itself squares every 16-bit lane and adds |
+ ; adjacent pairs, leaving 32-bit partial sums in each register |
+ pmaddwd m0, m0 |
+ pmaddwd m1, m1 |
+ ; accumulate in 64bit: interleaving the 32-bit partial sums with the zero |
+ ; register m5 zero-extends them to 64-bit lanes (m3 is reused as scratch), |
+ ; which are then added into the two 64-bit lanes of m4 |
+ punpckldq m3, m0, m5 |
+ punpckhdq m0, m5 |
+ paddq m4, m3 |
+ punpckldq m3, m1, m5 |
+ paddq m4, m0 |
+ punpckhdq m1, m5 |
+ paddq m4, m3 |
+ paddq m4, m1 |
+ ; two registers of int16_t were consumed, i.e. mmsize elements; loop while |
+ ; the (negative) element index has not yet reached zero |
+ add sizeq, mmsize |
+ jl .loop |
+ |
+ ; accumulate horizontally and store in return value: fold the high 64-bit |
+ ; lane of m4 onto the low one (m5 is no longer needed as zero here) |
+ movhlps m5, m4 |
+ paddq m4, m5 |
+%if ARCH_X86_64 |
+ movq rax, m4 |
+%else |
+ ; 32-bit build: bring bits 32..63 of the sum down to the low dword of m5, |
+ ; then return the low half in eax and the high half in edx |
+ pshufd m5, m4, 0x1 |
+ movd eax, m4 |
+ movd edx, m5 |
+%endif |
+ RET