source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c - Issue 1162573005: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c

Issue 1162573005: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include <assert.h>	11 #include <assert.h>

12 #include <emmintrin.h> // SSE2	12 #include <emmintrin.h> // SSE2

	13

	14 #include "./vp9_rtcd.h"

13 #include "vp9/common/vp9_idct.h" // for cospi constants	15 #include "vp9/common/vp9_idct.h" // for cospi constants

14 #include "vp9/encoder/vp9_dct.h"	16 #include "vp9/encoder/vp9_dct.h"

15 #include "vp9/encoder/x86/vp9_dct_sse2.h"	17 #include "vp9/encoder/x86/vp9_dct_sse2.h"

16 #include "vpx_ports/mem.h"	18 #include "vpx_ports/mem.h"

17	19

18 void vp9_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {	20 void vp9_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {

19 __m128i in0, in1;	21 __m128i in0, in1;

20 __m128i tmp;	22 __m128i tmp;

21 const __m128i zero = _mm_setzero_si128();	23 const __m128i zero = _mm_setzero_si128();

22 in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));	24 in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));

(...skipping 66 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
89 res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);	91 res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);

90 res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);	92 res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

91	93

92 // 00 10 20 30 01 11 21 31	94 // 00 10 20 30 01 11 21 31

93 // 02 12 22 32 03 13 23 33	95 // 02 12 22 32 03 13 23 33

94 // only use the first 4 16-bit integers	96 // only use the first 4 16-bit integers

95 res[1] = _mm_unpackhi_epi64(res[0], res[0]);	97 res[1] = _mm_unpackhi_epi64(res[0], res[0]);

96 res[3] = _mm_unpackhi_epi64(res[2], res[2]);	98 res[3] = _mm_unpackhi_epi64(res[2], res[2]);

97 }	99 }

98	100

99 void fdct4_sse2(__m128i *in) {	101 static void fdct4_sse2(__m128i *in) {

100 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);	102 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);

101 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);	103 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);

102 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);	104 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);

103 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);	105 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);

104 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);	106 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

105	107

106 __m128i u[4], v[4];	108 __m128i u[4], v[4];

107 u[0]=_mm_unpacklo_epi16(in[0], in[1]);	109 u[0]=_mm_unpacklo_epi16(in[0], in[1]);

108 u[1]=_mm_unpacklo_epi16(in[3], in[2]);	110 u[1]=_mm_unpacklo_epi16(in[3], in[2]);

109	111

(...skipping 12 matching lines...) Expand all Loading...
122 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);	124 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);

123 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);	125 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);

124 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);	126 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);

125 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);	127 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

126	128

127 in[0] = _mm_packs_epi32(u[0], u[1]);	129 in[0] = _mm_packs_epi32(u[0], u[1]);

128 in[1] = _mm_packs_epi32(u[2], u[3]);	130 in[1] = _mm_packs_epi32(u[2], u[3]);

129 transpose_4x4(in);	131 transpose_4x4(in);

130 }	132 }

131	133

132 void fadst4_sse2(__m128i *in) {	134 static void fadst4_sse2(__m128i *in) {

133 const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);	135 const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);

134 const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);	136 const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);

135 const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);	137 const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);

136 const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);	138 const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);

137 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);	139 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);

138 const __m128i kZero = _mm_set1_epi16(0);	140 const __m128i kZero = _mm_set1_epi16(0);

139 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);	141 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

140 __m128i u[8], v[8];	142 __m128i u[8], v[8];

141 __m128i in7 = _mm_add_epi16(in[0], in[1]);	143 __m128i in7 = _mm_add_epi16(in[0], in[1]);

142	144

(...skipping 681 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
824 // 00 10 20 30 40 50 60 70	826 // 00 10 20 30 40 50 60 70

825 // 01 11 21 31 41 51 61 71	827 // 01 11 21 31 41 51 61 71

826 // 02 12 22 32 42 52 62 72	828 // 02 12 22 32 42 52 62 72

827 // 03 13 23 33 43 53 63 73	829 // 03 13 23 33 43 53 63 73

828 // 04 14 24 34 44 54 64 74	830 // 04 14 24 34 44 54 64 74

829 // 05 15 25 35 45 55 65 75	831 // 05 15 25 35 45 55 65 75

830 // 06 16 26 36 46 56 66 76	832 // 06 16 26 36 46 56 66 76

831 // 07 17 27 37 47 57 67 77	833 // 07 17 27 37 47 57 67 77

832 }	834 }

833	835

834 void fdct8_sse2(__m128i *in) {	836 static void fdct8_sse2(__m128i *in) {

835 // constants	837 // constants

836 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);	838 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);

837 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);	839 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);

838 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);	840 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);

839 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);	841 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);

840 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);	842 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);

841 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);	843 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);

842 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);	844 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);

843 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);	845 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);

844 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);	846 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

(...skipping 119 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
964	966

965 in[1] = _mm_packs_epi32(v0, v1);	967 in[1] = _mm_packs_epi32(v0, v1);

966 in[3] = _mm_packs_epi32(v4, v5);	968 in[3] = _mm_packs_epi32(v4, v5);

967 in[5] = _mm_packs_epi32(v2, v3);	969 in[5] = _mm_packs_epi32(v2, v3);

968 in[7] = _mm_packs_epi32(v6, v7);	970 in[7] = _mm_packs_epi32(v6, v7);

969	971

970 // transpose	972 // transpose

971 array_transpose_8x8(in, in);	973 array_transpose_8x8(in, in);

972 }	974 }

973	975

974 void fadst8_sse2(__m128i *in) {	976 static void fadst8_sse2(__m128i *in) {

975 // Constants	977 // Constants

976 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);	978 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);

977 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);	979 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);

978 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);	980 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);

979 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);	981 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);

980 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);	982 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);

981 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);	983 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);

982 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);	984 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);

983 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);	985 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);

984 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);	986 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);

(...skipping 361 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1346 }	1348 }

1347	1349

1348 static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {	1350 static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {

1349 // perform rounding operations	1351 // perform rounding operations

1350 right_shift_8x8(res0, 2);	1352 right_shift_8x8(res0, 2);

1351 right_shift_8x8(res0 + 8, 2);	1353 right_shift_8x8(res0 + 8, 2);

1352 right_shift_8x8(res1, 2);	1354 right_shift_8x8(res1, 2);

1353 right_shift_8x8(res1 + 8, 2);	1355 right_shift_8x8(res1 + 8, 2);

1354 }	1356 }

1355	1357

1356 void fdct16_8col(__m128i *in) {	1358 static void fdct16_8col(__m128i *in) {

1357 // perform 16x16 1-D DCT for 8 columns	1359 // perform 16x16 1-D DCT for 8 columns

1358 __m128i i[8], s[8], p[8], t[8], u[16], v[16];	1360 __m128i i[8], s[8], p[8], t[8], u[16], v[16];

1359 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);	1361 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);

1360 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);	1362 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);

1361 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);	1363 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);

1362 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);	1364 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);

1363 const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);	1365 const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);

1364 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);	1366 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);

1365 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);	1367 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);

1366 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);	1368 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);

(...skipping 301 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1668 in[1] = _mm_packs_epi32(v[0], v[1]);	1670 in[1] = _mm_packs_epi32(v[0], v[1]);

1669 in[9] = _mm_packs_epi32(v[2], v[3]);	1671 in[9] = _mm_packs_epi32(v[2], v[3]);

1670 in[5] = _mm_packs_epi32(v[4], v[5]);	1672 in[5] = _mm_packs_epi32(v[4], v[5]);

1671 in[13] = _mm_packs_epi32(v[6], v[7]);	1673 in[13] = _mm_packs_epi32(v[6], v[7]);

1672 in[3] = _mm_packs_epi32(v[8], v[9]);	1674 in[3] = _mm_packs_epi32(v[8], v[9]);

1673 in[11] = _mm_packs_epi32(v[10], v[11]);	1675 in[11] = _mm_packs_epi32(v[10], v[11]);

1674 in[7] = _mm_packs_epi32(v[12], v[13]);	1676 in[7] = _mm_packs_epi32(v[12], v[13]);

1675 in[15] = _mm_packs_epi32(v[14], v[15]);	1677 in[15] = _mm_packs_epi32(v[14], v[15]);

1676 }	1678 }

1677	1679

1678 void fadst16_8col(__m128i *in) {	1680 static void fadst16_8col(__m128i *in) {

1679 // perform 16x16 1-D ADST for 8 columns	1681 // perform 16x16 1-D ADST for 8 columns

1680 __m128i s[16], x[16], u[32], v[32];	1682 __m128i s[16], x[16], u[32], v[32];

1681 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);	1683 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);

1682 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);	1684 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);

1683 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);	1685 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);

1684 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);	1686 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);

1685 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);	1687 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);

1686 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);	1688 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);

1687 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);	1689 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);

1688 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);	1690 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);

(...skipping 449 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2138 in[8] = _mm_packs_epi32(v[2], v[3]);	2140 in[8] = _mm_packs_epi32(v[2], v[3]);

2139 in[9] = _mm_packs_epi32(v[10], v[11]);	2141 in[9] = _mm_packs_epi32(v[10], v[11]);

2140 in[10] = _mm_packs_epi32(v[14], v[15]);	2142 in[10] = _mm_packs_epi32(v[14], v[15]);

2141 in[11] = _mm_packs_epi32(v[6], v[7]);	2143 in[11] = _mm_packs_epi32(v[6], v[7]);

2142 in[12] = s[5];	2144 in[12] = s[5];

2143 in[13] = _mm_sub_epi16(kZero, s[13]);	2145 in[13] = _mm_sub_epi16(kZero, s[13]);

2144 in[14] = s[9];	2146 in[14] = s[9];

2145 in[15] = _mm_sub_epi16(kZero, s[1]);	2147 in[15] = _mm_sub_epi16(kZero, s[1]);

2146 }	2148 }

2147	2149

2148 void fdct16_sse2(__m128i *in0, __m128i *in1) {	2150 static void fdct16_sse2(__m128i *in0, __m128i *in1) {

2149 fdct16_8col(in0);	2151 fdct16_8col(in0);

2150 fdct16_8col(in1);	2152 fdct16_8col(in1);

2151 array_transpose_16x16(in0, in1);	2153 array_transpose_16x16(in0, in1);

2152 }	2154 }

2153	2155

2154 void fadst16_sse2(__m128i *in0, __m128i *in1) {	2156 static void fadst16_sse2(__m128i *in0, __m128i *in1) {

2155 fadst16_8col(in0);	2157 fadst16_8col(in0);

2156 fadst16_8col(in1);	2158 fadst16_8col(in1);

2157 array_transpose_16x16(in0, in1);	2159 array_transpose_16x16(in0, in1);

2158 }	2160 }

2159	2161

2160 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output,	2162 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output,

2161 int stride, int tx_type) {	2163 int stride, int tx_type) {

2162 __m128i in0[16], in1[16];	2164 __m128i in0[16], in1[16];

2163	2165

2164 switch (tx_type) {	2166 switch (tx_type) {

(...skipping 162 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2327 for (i = 0; i < 8; ++i) {	2329 for (i = 0; i < 8; ++i) {

2328 for (j = 0; j < 8; ++j)	2330 for (j = 0; j < 8; ++j)

2329 temp_in[j] = out[j + i * 8];	2331 temp_in[j] = out[j + i * 8];

2330 ht.rows(temp_in, temp_out);	2332 ht.rows(temp_in, temp_out);

2331 for (j = 0; j < 8; ++j)	2333 for (j = 0; j < 8; ++j)

2332 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;	2334 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;

2333 }	2335 }

2334 }	2336 }

2335 }	2337 }

2336	2338

2337 void vp9_highbd_fht16x16_sse2(int16_t *input, tran_low_t *output,	2339 void vp9_highbd_fht16x16_sse2(const int16_t *input, tran_low_t *output,

2338 int stride, int tx_type) {	2340 int stride, int tx_type) {

2339 if (tx_type == DCT_DCT) {	2341 if (tx_type == DCT_DCT) {

2340 vp9_highbd_fdct16x16_sse2(input, output, stride);	2342 vp9_highbd_fdct16x16_sse2(input, output, stride);

2341 } else {	2343 } else {

2342 tran_low_t out[256];	2344 tran_low_t out[256];

2343 tran_low_t *outptr = &out[0];	2345 tran_low_t *outptr = &out[0];

2344 int i, j;	2346 int i, j;

2345 tran_low_t temp_in[16], temp_out[16];	2347 tran_low_t temp_in[16], temp_out[16];

2346 const transform_2d ht = FHT_16[tx_type];	2348 const transform_2d ht = FHT_16[tx_type];

2347	2349

(...skipping 13 matching lines...) Expand all Loading...
2361 ht.rows(temp_in, temp_out);	2363 ht.rows(temp_in, temp_out);

2362 for (j = 0; j < 16; ++j)	2364 for (j = 0; j < 16; ++j)

2363 output[j + i * 16] = temp_out[j];	2365 output[j + i * 16] = temp_out[j];

2364 }	2366 }

2365 }	2367 }

2366 }	2368 }

2367 #endif // CONFIG_VP9_HIGHBITDEPTH	2369 #endif // CONFIG_VP9_HIGHBITDEPTH

2368	2370

2369 /*	2371 /*

2370 * The DCTnxn functions are defined using the macros below. The main code for	2372 * The DCTnxn functions are defined using the macros below. The main code for

2371 * them is in separate files (vp9/encoder/x86/vp9_dct_impl_sse2.c &	2373 * them is in separate files (vp9/encoder/x86/vp9_dct_sse2_impl.h &

2372 * vp9/encoder/x86/vp9_dct32x32_sse2.c) which are used by both the 8 bit code	2374 * vp9/encoder/x86/vp9_dct32x32_sse2_impl.h) which are used by both the 8 bit code

2373 * and the high bit depth code.	2375 * and the high bit depth code.

2374 */	2376 */

2375	2377

2376 #define DCT_HIGH_BIT_DEPTH 0	2378 #define DCT_HIGH_BIT_DEPTH 0

2377	2379

2378 #define FDCT4x4_2D vp9_fdct4x4_sse2	2380 #define FDCT4x4_2D vp9_fdct4x4_sse2

2379 #define FDCT8x8_2D vp9_fdct8x8_sse2	2381 #define FDCT8x8_2D vp9_fdct8x8_sse2

2380 #define FDCT16x16_2D vp9_fdct16x16_sse2	2382 #define FDCT16x16_2D vp9_fdct16x16_sse2

2381 #include "vp9/encoder/x86/vp9_dct_impl_sse2.c"	2383 #include "vp9/encoder/x86/vp9_dct_sse2_impl.h"

2382 #undef FDCT4x4_2D	2384 #undef FDCT4x4_2D

2383 #undef FDCT8x8_2D	2385 #undef FDCT8x8_2D

2384 #undef FDCT16x16_2D	2386 #undef FDCT16x16_2D

2385	2387

2386 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2	2388 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2

2387 #define FDCT32x32_HIGH_PRECISION 0	2389 #define FDCT32x32_HIGH_PRECISION 0

2388 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"	2390 #include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h"

2389 #undef FDCT32x32_2D	2391 #undef FDCT32x32_2D

2390 #undef FDCT32x32_HIGH_PRECISION	2392 #undef FDCT32x32_HIGH_PRECISION

2391	2393

2392 #define FDCT32x32_2D vp9_fdct32x32_sse2	2394 #define FDCT32x32_2D vp9_fdct32x32_sse2

2393 #define FDCT32x32_HIGH_PRECISION 1	2395 #define FDCT32x32_HIGH_PRECISION 1

2394 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT	2396 #include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" // NOLINT

2395 #undef FDCT32x32_2D	2397 #undef FDCT32x32_2D

2396 #undef FDCT32x32_HIGH_PRECISION	2398 #undef FDCT32x32_HIGH_PRECISION

2397	2399

2398 #undef DCT_HIGH_BIT_DEPTH	2400 #undef DCT_HIGH_BIT_DEPTH

2399	2401

2400	2402

2401 #if CONFIG_VP9_HIGHBITDEPTH	2403 #if CONFIG_VP9_HIGHBITDEPTH

2402	2404

2403 #define DCT_HIGH_BIT_DEPTH 1	2405 #define DCT_HIGH_BIT_DEPTH 1

2404	2406

2405 #define FDCT4x4_2D vp9_highbd_fdct4x4_sse2	2407 #define FDCT4x4_2D vp9_highbd_fdct4x4_sse2

2406 #define FDCT8x8_2D vp9_highbd_fdct8x8_sse2	2408 #define FDCT8x8_2D vp9_highbd_fdct8x8_sse2

2407 #define FDCT16x16_2D vp9_highbd_fdct16x16_sse2	2409 #define FDCT16x16_2D vp9_highbd_fdct16x16_sse2

2408 #include "vp9/encoder/x86/vp9_dct_impl_sse2.c" // NOLINT	2410 #include "vp9/encoder/x86/vp9_dct_sse2_impl.h" // NOLINT

2409 #undef FDCT4x4_2D	2411 #undef FDCT4x4_2D

2410 #undef FDCT8x8_2D	2412 #undef FDCT8x8_2D

2411 #undef FDCT16x16_2D	2413 #undef FDCT16x16_2D

2412	2414

2413 #define FDCT32x32_2D vp9_highbd_fdct32x32_rd_sse2	2415 #define FDCT32x32_2D vp9_highbd_fdct32x32_rd_sse2

2414 #define FDCT32x32_HIGH_PRECISION 0	2416 #define FDCT32x32_HIGH_PRECISION 0

2415 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT	2417 #include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" // NOLINT

2416 #undef FDCT32x32_2D	2418 #undef FDCT32x32_2D

2417 #undef FDCT32x32_HIGH_PRECISION	2419 #undef FDCT32x32_HIGH_PRECISION

2418	2420

2419 #define FDCT32x32_2D vp9_highbd_fdct32x32_sse2	2421 #define FDCT32x32_2D vp9_highbd_fdct32x32_sse2

2420 #define FDCT32x32_HIGH_PRECISION 1	2422 #define FDCT32x32_HIGH_PRECISION 1

2421 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT	2423 #include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" // NOLINT

2422 #undef FDCT32x32_2D	2424 #undef FDCT32x32_2D

2423 #undef FDCT32x32_HIGH_PRECISION	2425 #undef FDCT32x32_HIGH_PRECISION

2424	2426

2425 #undef DCT_HIGH_BIT_DEPTH	2427 #undef DCT_HIGH_BIT_DEPTH

2426	2428

2427 #endif // CONFIG_VP9_HIGHBITDEPTH	2429 #endif // CONFIG_VP9_HIGHBITDEPTH

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_dct_impl_sse2.c ('k') | source/libvpx/vp9/encoder/x86/vp9_dct_sse2_impl.h » ('j') | no next file with comments »