| Index: source/libvpx/vp9/common/x86/vp9_idct_ssse3.asm
|
| ===================================================================
|
| --- source/libvpx/vp9/common/x86/vp9_idct_ssse3.asm (revision 271012)
|
| +++ source/libvpx/vp9/common/x86/vp9_idct_ssse3.asm (working copy)
|
| @@ -28,6 +28,29 @@
|
| TRANSFORM_COEFFS 3196, 16069
|
| TRANSFORM_COEFFS 13623, 9102
|
|
|
| +%macro PAIR_PP_COEFFS 2
|
| +dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2 ; 8 words under label dpw_<a>_<b>: four copies of +a, then four copies of +b
|
| +%endmacro
|
| +
|
| +%macro PAIR_MP_COEFFS 2
|
| +dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2 ; 8 words under label dpw_m<a>_<b>: four copies of -a, then four copies of +b
|
| +%endmacro
|
| +
|
| +%macro PAIR_MM_COEFFS 2
|
| +dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2 ; 8 words under label dpw_m<a>_m<b>: four copies of -a, then four copies of -b
|
| +%endmacro
|
| +
|
| +PAIR_PP_COEFFS 30274, 12540 ; these three tables hold a different multiplier in each 64-bit half; they are the pmulhrsw operands of the first pass in idct8x8_12_add
|
| +PAIR_PP_COEFFS 6392, 32138
|
| +PAIR_MP_COEFFS 18204, 27246
|
| +; the six tables below repeat one multiplier across all 8 words; they are the pmulhrsw operands of the second pass in idct8x8_12_add
|
| +PAIR_PP_COEFFS 12540, 12540
|
| +PAIR_PP_COEFFS 30274, 30274
|
| +PAIR_PP_COEFFS 6392, 6392
|
| +PAIR_PP_COEFFS 32138, 32138
|
| +PAIR_MM_COEFFS 18204, 18204
|
| +PAIR_PP_COEFFS 27246, 27246
|
| +
|
| SECTION .text
|
|
|
| %if ARCH_X86_64
|
| @@ -128,6 +151,7 @@
|
| %endmacro
|
|
|
| INIT_XMM ssse3
|
| +; full inverse 8x8 2D-DCT transform
|
| cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
|
| mova m8, [pd_8192]
|
| mova m11, [pw_16]
|
| @@ -159,4 +183,118 @@
|
| ADD_STORE_8P_2X 6, 7, 9, 10, 12
|
|
|
| RET
|
| +
|
| +; inverse 8x8 2D-DCT transform, sparse-input variant: only the low four words of each of the four 16-byte input loads are used
|
| +cglobal idct8x8_12_add, 3, 5, 13, input, output, stride ; NOTE(review): this comment previously said "first 10 coeffs non-zero" while the name says 12 -- confirm the intended count
|
| + mova m8, [pd_8192] ; m8 and m11 are not referenced directly in this body; presumably consumed by the helper macros -- TODO confirm
|
| + mova m11, [pw_16]
|
| + mova m12, [pw_11585x2] ; m12 = multiplier for the "pmulhrsw mN, m12" steps; it is zeroed just before the stores
|
| +
|
| + lea r3, [2 * strideq] ; r3 = 2 * stride, the amount outputq advances after each ADD_STORE_8P_2X
|
| + ; load 16 bytes from input offsets 0, 16, 32 and 48 (presumably rows 0-3 of 16-bit coefficients -- confirm)
|
| + mova m0, [inputq + 0]
|
| + mova m1, [inputq + 16]
|
| + mova m2, [inputq + 32]
|
| + mova m3, [inputq + 48]
|
| + ; interleave the low four words of m0..m3 (a 4x4 transpose): m0 = columns 0|1, m2 = columns 2|3
|
| + punpcklwd m0, m1
|
| + punpcklwd m2, m3
|
| + punpckhdq m9, m0, m2
|
| + punpckldq m0, m2
|
| + SWAP 2, 9
|
| + ; duplicate each transposed column into both 64-bit halves of its own register:
|
| + ; m0 -> [0], [0]
|
| + ; m1 -> [1], [1]
|
| + ; m2 -> [2], [2]
|
| + ; m3 -> [3], [3]
|
| + punpckhqdq m10, m0, m0
|
| + punpcklqdq m0, m0
|
| + punpckhqdq m9, m2, m2
|
| + punpcklqdq m2, m2
|
| + SWAP 1, 10
|
| + SWAP 3, 9
|
| + ; first pass: m1..m3 get a different multiplier in each 64-bit half (dpw_<a>_<b> tables)
|
| + pmulhrsw m0, m12
|
| + pmulhrsw m2, [dpw_30274_12540]
|
| + pmulhrsw m1, [dpw_6392_32138]
|
| + pmulhrsw m3, [dpw_m18204_27246]
|
| + ; SUM_SUB is defined outside this view; presumably a = a + b, b = a - b with the third argument as scratch -- confirm
|
| + SUM_SUB 0, 2, 9
|
| + SUM_SUB 1, 3, 9
|
| + ; build m5 = m3 with its two 64-bit halves swapped
|
| + punpcklqdq m9, m3, m3
|
| + punpckhqdq m5, m3, m9
|
| +
|
| + SUM_SUB 3, 5, 9
|
| + punpckhqdq m5, m3 ; m5 = high half of m5 | high half of m3
|
| + pmulhrsw m5, m12
|
| + ; regroup: m1 = low halves of m1|m5, m5 = high halves of m1|m5
|
| + punpckhqdq m9, m1, m5
|
| + punpcklqdq m1, m5
|
| + SWAP 5, 9
|
| +
|
| + SUM_SUB 0, 5, 9
|
| + SUM_SUB 2, 1, 9
|
| + ; rearrange the first-pass results for the second pass: split out high halves, then word/dword/qword interleaves
|
| + punpckhqdq m3, m0, m0
|
| + punpckhqdq m4, m1, m1
|
| + punpckhqdq m6, m5, m5
|
| + punpckhqdq m7, m2, m2
|
| +
|
| + punpcklwd m0, m3
|
| + punpcklwd m7, m2
|
| + punpcklwd m1, m4
|
| + punpcklwd m6, m5
|
| +
|
| + punpckhdq m4, m0, m7
|
| + punpckldq m0, m7
|
| + punpckhdq m10, m1, m6
|
| + punpckldq m5, m1, m6
|
| +
|
| + punpckhqdq m1, m0, m5
|
| + punpcklqdq m0, m5
|
| + punpckhqdq m3, m4, m10
|
| + punpcklqdq m2, m4, m10
|
| +
|
| + ; second pass: the same multiplier is applied to all 8 words; m1, m2, m3 each feed two products
|
| + pmulhrsw m0, m12
|
| + pmulhrsw m6, m2, [dpw_30274_30274]
|
| + pmulhrsw m4, m2, [dpw_12540_12540]
|
| +
|
| + pmulhrsw m7, m1, [dpw_32138_32138]
|
| + pmulhrsw m1, [dpw_6392_6392]
|
| + pmulhrsw m5, m3, [dpw_m18204_m18204]
|
| + pmulhrsw m3, [dpw_27246_27246]
|
| +
|
| + mova m2, m0 ; copy m0: it is combined with both m6 and m4 below
|
| + SUM_SUB 0, 6, 9
|
| + SUM_SUB 2, 4, 9
|
| + SUM_SUB 1, 5, 9
|
| + SUM_SUB 7, 3, 9
|
| +
|
| + SUM_SUB 3, 5, 9
|
| + pmulhrsw m3, m12
|
| + pmulhrsw m5, m12
|
| +
|
| + SUM_SUB 0, 7, 9
|
| + SUM_SUB 2, 3, 9
|
| + SUM_SUB 4, 5, 9
|
| + SUM_SUB 6, 1, 9
|
| + ; reorder register names so m0..m7 line up with the store order below (presumably output rows 0-7 -- confirm)
|
| + SWAP 3, 6
|
| + SWAP 1, 2
|
| + SWAP 2, 4
|
| +
|
| +
|
| + pxor m12, m12 ; m12 = 0; passed as the last argument to ADD_STORE_8P_2X
|
| + ADD_STORE_8P_2X 0, 1, 9, 10, 12
|
| + lea outputq, [outputq + r3] ; advance output by 2 * stride
|
| + ADD_STORE_8P_2X 2, 3, 9, 10, 12
|
| + lea outputq, [outputq + r3]
|
| + ADD_STORE_8P_2X 4, 5, 9, 10, 12
|
| + lea outputq, [outputq + r3]
|
| + ADD_STORE_8P_2X 6, 7, 9, 10, 12
|
| +
|
| + RET
|
| +
|
| %endif
|
|
|