source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c - Issue 54923004: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c

Issue 54923004: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 7 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include <emmintrin.h> // SSE2	11 #include <emmintrin.h> // SSE2

12 #include "vp9/common/vp9_idct.h" // for cospi constants	12 #include "vp9/common/vp9_idct.h" // for cospi constants

13 #include "vpx_ports/mem.h"	13 #include "vpx_ports/mem.h"

14	14

15 void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {	15 void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {

16 // The 2D transform is done with two passes which are actually pretty	16 // The 2D transform is done with two passes which are actually pretty

17 // similar. In the first one, we transform the columns and transpose	17 // similar. In the first one, we transform the columns and transpose

18 // the results. In the second one, we transform the rows. To achieve that,	18 // the results. In the second one, we transform the rows. To achieve that,

19 // as the first pass results are transposed, we transpose the columns (that	19 // as the first pass results are transposed, we transpose the columns (that

20 // is the transposed rows) and transpose the results (so that it goes back	20 // is the transposed rows) and transpose the results (so that it goes back

21 // in normal/row positions).	21 // in normal/row positions).

22 const int stride = pitch >> 1;

23 int pass;	22 int pass;

24 // Constants	23 // Constants

25 // When we use them, in one case, they are all the same. In all others	24 // When we use them, in one case, they are all the same. In all others

26 // it's a pair of them that we need to repeat four times. This is done	25 // it's a pair of them that we need to repeat four times. This is done

27 // by constructing the 32 bit constant corresponding to that pair.	26 // by constructing the 32 bit constant corresponding to that pair.

28 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);	27 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);

29 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);	28 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);

30 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);	29 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);

31 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);	30 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);

32 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);	31 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

(...skipping 72 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
105 __m128i out01 = _mm_add_epi16(in0, kOne);	104 __m128i out01 = _mm_add_epi16(in0, kOne);

106 __m128i out23 = _mm_add_epi16(in2, kOne);	105 __m128i out23 = _mm_add_epi16(in2, kOne);

107 out01 = _mm_srai_epi16(out01, 2);	106 out01 = _mm_srai_epi16(out01, 2);

108 out23 = _mm_srai_epi16(out23, 2);	107 out23 = _mm_srai_epi16(out23, 2);

109 _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);	108 _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);

110 _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);	109 _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);

111 }	110 }

112 }	111 }

113 }	112 }

114	113

115 void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) {	114 static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,

116 vp9_short_fdct4x4_sse2(input, output, pitch);	115 int stride) {

117 vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch);

118 }

119

120 static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) {

121 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);	116 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);

122 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);	117 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);

123 __m128i mask;	118 __m128i mask;

124	119

125 in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));	120 in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));

126 in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));	121 in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));

127 in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));	122 in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));

128 in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));	123 in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));

129	124

130 in[0] = _mm_slli_epi16(in[0], 4);	125 in[0] = _mm_slli_epi16(in[0], 4);

(...skipping 33 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
164 // 00 10 20 30 01 11 21 31	159 // 00 10 20 30 01 11 21 31

165 // 02 12 22 32 03 13 23 33	160 // 02 12 22 32 03 13 23 33

166 // only use the first 4 16-bit integers	161 // only use the first 4 16-bit integers

167 res[1] = _mm_unpackhi_epi64(res[0], res[0]);	162 res[1] = _mm_unpackhi_epi64(res[0], res[0]);

168 res[3] = _mm_unpackhi_epi64(res[2], res[2]);	163 res[3] = _mm_unpackhi_epi64(res[2], res[2]);

169 }	164 }

170	165

171 void fdct4_1d_sse2(__m128i *in) {	166 void fdct4_1d_sse2(__m128i *in) {

172 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);	167 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);

173 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);	168 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);

174 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);	169 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);

175 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);	170 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);

176 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);	171 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

177	172

178 __m128i u[4], v[4];	173 __m128i u[4], v[4];

179 u[0] = _mm_add_epi16(in[0], in[3]);	174 u[0]=_mm_unpacklo_epi16(in[0], in[1]);

180 u[1] = _mm_add_epi16(in[1], in[2]);	175 u[1]=_mm_unpacklo_epi16(in[3], in[2]);

181 u[2] = _mm_sub_epi16(in[1], in[2]);

182 u[3] = _mm_sub_epi16(in[0], in[3]);

183	176

184 v[0] = _mm_unpacklo_epi16(u[0], u[1]);	177 v[0] = _mm_add_epi16(u[0], u[1]);

185 v[1] = _mm_unpacklo_epi16(u[2], u[3]);	178 v[1] = _mm_sub_epi16(u[0], u[1]);

	179

186 u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0	180 u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0

187 u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2	181 u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2

188 u[2] = _mm_madd_epi16(v[1], k__cospi_p24_p08); // 1	182 u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1

189 u[3] = _mm_madd_epi16(v[1], k__cospi_m08_p24); // 3	183 u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3

190	184

191 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);	185 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);

192 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);	186 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);

193 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);	187 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);

194 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);	188 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);

195 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);	189 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);

196 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);	190 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);

197 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);	191 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);

198 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);	192 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

199	193

(...skipping 42 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
242 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);	236 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);

243 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);	237 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);

244 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);	238 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);

245 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);	239 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

246	240

247 in[0] = _mm_packs_epi32(u[0], u[2]);	241 in[0] = _mm_packs_epi32(u[0], u[2]);

248 in[1] = _mm_packs_epi32(u[1], u[3]);	242 in[1] = _mm_packs_epi32(u[1], u[3]);

249 transpose_4x4(in);	243 transpose_4x4(in);

250 }	244 }

251	245

252 void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,	246 void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,

253 int stride, int tx_type) {	247 int stride, int tx_type) {

254 __m128i in[4];	248 __m128i in[4];

255 load_buffer_4x4(input, in, stride);	249 load_buffer_4x4(input, in, stride);

256 switch (tx_type) {	250 switch (tx_type) {

257 case 0: // DCT_DCT	251 case 0: // DCT_DCT

258 fdct4_1d_sse2(in);	252 fdct4_1d_sse2(in);

259 fdct4_1d_sse2(in);	253 fdct4_1d_sse2(in);

260 break;	254 break;

261 case 1: // ADST_DCT	255 case 1: // ADST_DCT

262 fadst4_1d_sse2(in);	256 fadst4_1d_sse2(in);

263 fdct4_1d_sse2(in);	257 fdct4_1d_sse2(in);

264 break;	258 break;

265 case 2: // DCT_ADST	259 case 2: // DCT_ADST

266 fdct4_1d_sse2(in);	260 fdct4_1d_sse2(in);

267 fadst4_1d_sse2(in);	261 fadst4_1d_sse2(in);

268 break;	262 break;

269 case 3: // ADST_ADST	263 case 3: // ADST_ADST

270 fadst4_1d_sse2(in);	264 fadst4_1d_sse2(in);

271 fadst4_1d_sse2(in);	265 fadst4_1d_sse2(in);

272 break;	266 break;

273 default:	267 default:

274 assert(0);	268 assert(0);

275 break;	269 break;

276 }	270 }

277 write_buffer_4x4(output, in);	271 write_buffer_4x4(output, in);

278 }	272 }

279	273

280 void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {	274 void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {

281 const int stride = pitch >> 1;

282 int pass;	275 int pass;

283 // Constants	276 // Constants

284 // When we use them, in one case, they are all the same. In all others	277 // When we use them, in one case, they are all the same. In all others

285 // it's a pair of them that we need to repeat four times. This is done	278 // it's a pair of them that we need to repeat four times. This is done

286 // by constructing the 32 bit constant corresponding to that pair.	279 // by constructing the 32 bit constant corresponding to that pair.

287 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);	280 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);

288 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);	281 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);

289 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);	282 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);

290 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);	283 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);

291 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);	284 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);

(...skipping 236 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
528 _mm_store_si128((__m128i *)(output + 2 * 8), in2);	521 _mm_store_si128((__m128i *)(output + 2 * 8), in2);

529 _mm_store_si128((__m128i *)(output + 3 * 8), in3);	522 _mm_store_si128((__m128i *)(output + 3 * 8), in3);

530 _mm_store_si128((__m128i *)(output + 4 * 8), in4);	523 _mm_store_si128((__m128i *)(output + 4 * 8), in4);

531 _mm_store_si128((__m128i *)(output + 5 * 8), in5);	524 _mm_store_si128((__m128i *)(output + 5 * 8), in5);

532 _mm_store_si128((__m128i *)(output + 6 * 8), in6);	525 _mm_store_si128((__m128i *)(output + 6 * 8), in6);

533 _mm_store_si128((__m128i *)(output + 7 * 8), in7);	526 _mm_store_si128((__m128i *)(output + 7 * 8), in7);

534 }	527 }

535 }	528 }

536	529

537 // load 8x8 array	530 // load 8x8 array

538 static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) {	531 static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,

539 in[0] = _mm_load_si128((__m128i *)(input + 0 * stride));	532 int stride) {

540 in[1] = _mm_load_si128((__m128i *)(input + 1 * stride));	533 in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));

541 in[2] = _mm_load_si128((__m128i *)(input + 2 * stride));	534 in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));

542 in[3] = _mm_load_si128((__m128i *)(input + 3 * stride));	535 in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));

543 in[4] = _mm_load_si128((__m128i *)(input + 4 * stride));	536 in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));

544 in[5] = _mm_load_si128((__m128i *)(input + 5 * stride));	537 in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));

545 in[6] = _mm_load_si128((__m128i *)(input + 6 * stride));	538 in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));

546 in[7] = _mm_load_si128((__m128i *)(input + 7 * stride));	539 in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));

	540 in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));

547	541

548 in[0] = _mm_slli_epi16(in[0], 2);	542 in[0] = _mm_slli_epi16(in[0], 2);

549 in[1] = _mm_slli_epi16(in[1], 2);	543 in[1] = _mm_slli_epi16(in[1], 2);

550 in[2] = _mm_slli_epi16(in[2], 2);	544 in[2] = _mm_slli_epi16(in[2], 2);

551 in[3] = _mm_slli_epi16(in[3], 2);	545 in[3] = _mm_slli_epi16(in[3], 2);

552 in[4] = _mm_slli_epi16(in[4], 2);	546 in[4] = _mm_slli_epi16(in[4], 2);

553 in[5] = _mm_slli_epi16(in[5], 2);	547 in[5] = _mm_slli_epi16(in[5], 2);

554 in[6] = _mm_slli_epi16(in[6], 2);	548 in[6] = _mm_slli_epi16(in[6], 2);

555 in[7] = _mm_slli_epi16(in[7], 2);	549 in[7] = _mm_slli_epi16(in[7], 2);

556 }	550 }

(...skipping 469 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1026 in[3] = _mm_sub_epi16(k__const_0, s2);	1020 in[3] = _mm_sub_epi16(k__const_0, s2);

1027 in[4] = s3;	1021 in[4] = s3;

1028 in[5] = _mm_sub_epi16(k__const_0, s7);	1022 in[5] = _mm_sub_epi16(k__const_0, s7);

1029 in[6] = s5;	1023 in[6] = s5;

1030 in[7] = _mm_sub_epi16(k__const_0, s1);	1024 in[7] = _mm_sub_epi16(k__const_0, s1);

1031	1025

1032 // transpose	1026 // transpose

1033 array_transpose_8x8(in, in);	1027 array_transpose_8x8(in, in);

1034 }	1028 }

1035	1029

1036 void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,	1030 void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,

1037 int stride, int tx_type) {	1031 int stride, int tx_type) {

1038 __m128i in[8];	1032 __m128i in[8];

1039 load_buffer_8x8(input, in, stride);	1033 load_buffer_8x8(input, in, stride);

1040 switch (tx_type) {	1034 switch (tx_type) {

1041 case 0: // DCT_DCT	1035 case 0: // DCT_DCT

1042 fdct8_1d_sse2(in);	1036 fdct8_1d_sse2(in);

1043 fdct8_1d_sse2(in);	1037 fdct8_1d_sse2(in);

1044 break;	1038 break;

1045 case 1: // ADST_DCT	1039 case 1: // ADST_DCT

1046 fadst8_1d_sse2(in);	1040 fadst8_1d_sse2(in);

1047 fdct8_1d_sse2(in);	1041 fdct8_1d_sse2(in);

1048 break;	1042 break;

1049 case 2: // DCT_ADST	1043 case 2: // DCT_ADST

1050 fdct8_1d_sse2(in);	1044 fdct8_1d_sse2(in);

1051 fadst8_1d_sse2(in);	1045 fadst8_1d_sse2(in);

1052 break;	1046 break;

1053 case 3: // ADST_ADST	1047 case 3: // ADST_ADST

1054 fadst8_1d_sse2(in);	1048 fadst8_1d_sse2(in);

1055 fadst8_1d_sse2(in);	1049 fadst8_1d_sse2(in);

1056 break;	1050 break;

1057 default:	1051 default:

1058 assert(0);	1052 assert(0);

1059 break;	1053 break;

1060 }	1054 }

1061 right_shift_8x8(in, 1);	1055 right_shift_8x8(in, 1);

1062 write_buffer_8x8(output, in, 8);	1056 write_buffer_8x8(output, in, 8);

1063 }	1057 }

1064	1058

1065 void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {	1059 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {

1066 // The 2D transform is done with two passes which are actually pretty	1060 // The 2D transform is done with two passes which are actually pretty

1067 // similar. In the first one, we transform the columns and transpose	1061 // similar. In the first one, we transform the columns and transpose

1068 // the results. In the second one, we transform the rows. To achieve that,	1062 // the results. In the second one, we transform the rows. To achieve that,

1069 // as the first pass results are transposed, we transpose the columns (that	1063 // as the first pass results are transposed, we transpose the columns (that

1070 // is the transposed rows) and transpose the results (so that it goes back	1064 // is the transposed rows) and transpose the results (so that it goes back

1071 // in normal/row positions).	1065 // in normal/row positions).

1072 const int stride = pitch >> 1;

1073 int pass;	1066 int pass;

1074 // We need an intermediate buffer between passes.	1067 // We need an intermediate buffer between passes.

1075 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);	1068 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);

1076 int16_t *in = input;	1069 const int16_t *in = input;

1077 int16_t *out = intermediate;	1070 int16_t *out = intermediate;

1078 // Constants	1071 // Constants

1079 // When we use them, in one case, they are all the same. In all others	1072 // When we use them, in one case, they are all the same. In all others

1080 // it's a pair of them that we need to repeat four times. This is done	1073 // it's a pair of them that we need to repeat four times. This is done

1081 // by constructing the 32 bit constant corresponding to that pair.	1074 // by constructing the 32 bit constant corresponding to that pair.

1082 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);	1075 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);

1083 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);	1076 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);

1084 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);	1077 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);

1085 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);	1078 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

1086 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);	1079 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);

(...skipping 594 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1681 _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);	1674 _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);

1682 }	1675 }

1683 out += 8*16;	1676 out += 8*16;

1684 }	1677 }

1685 // Setup in/out for next pass.	1678 // Setup in/out for next pass.

1686 in = intermediate;	1679 in = intermediate;

1687 out = output;	1680 out = output;

1688 }	1681 }

1689 }	1682 }

1690	1683

1691 static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0,	1684 static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,

1692 __m128i *in1, int stride) {	1685 __m128i *in1, int stride) {

1693 // load first 8 columns	1686 // load first 8 columns

1694 load_buffer_8x8(input, in0, stride);	1687 load_buffer_8x8(input, in0, stride);

1695 load_buffer_8x8(input + 8 * stride, in0 + 8, stride);	1688 load_buffer_8x8(input + 8 * stride, in0 + 8, stride);

1696	1689

1697 input += 8;	1690 input += 8;

1698 // load second 8 columns	1691 // load second 8 columns

1699 load_buffer_8x8(input, in1, stride);	1692 load_buffer_8x8(input, in1, stride);

1700 load_buffer_8x8(input + 8 * stride, in1 + 8, stride);	1693 load_buffer_8x8(input + 8 * stride, in1 + 8, stride);

1701 }	1694 }

(...skipping 831 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2533 fdct16_1d_8col(in1);	2526 fdct16_1d_8col(in1);

2534 array_transpose_16x16(in0, in1);	2527 array_transpose_16x16(in0, in1);

2535 }	2528 }

2536	2529

2537 void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {	2530 void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {

2538 fadst16_1d_8col(in0);	2531 fadst16_1d_8col(in0);

2539 fadst16_1d_8col(in1);	2532 fadst16_1d_8col(in1);

2540 array_transpose_16x16(in0, in1);	2533 array_transpose_16x16(in0, in1);

2541 }	2534 }

2542	2535

2543 void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,	2537 void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,

2544 int stride, int tx_type) {	2537 int stride, int tx_type) {

2545 __m128i in0[16], in1[16];	2538 __m128i in0[16], in1[16];

2546 load_buffer_16x16(input, in0, in1, stride);	2539 load_buffer_16x16(input, in0, in1, stride);

2547 switch (tx_type) {	2540 switch (tx_type) {

2548 case 0: // DCT_DCT	2541 case 0: // DCT_DCT

2549 fdct16_1d_sse2(in0, in1);	2542 fdct16_1d_sse2(in0, in1);

2550 right_shift_16x16(in0, in1);	2543 right_shift_16x16(in0, in1);

2551 fdct16_1d_sse2(in0, in1);	2544 fdct16_1d_sse2(in0, in1);

2552 break;	2545 break;

2553 case 1: // ADST_DCT	2546 case 1: // ADST_DCT

(...skipping 11 matching lines...) Expand all Loading...
2565 right_shift_16x16(in0, in1);	2558 right_shift_16x16(in0, in1);

2566 fadst16_1d_sse2(in0, in1);	2559 fadst16_1d_sse2(in0, in1);

2567 break;	2560 break;

2568 default:	2561 default:

2569 assert(0);	2562 assert(0);

2570 break;	2563 break;

2571 }	2564 }

2572 write_buffer_16x16(output, in0, in1, 16);	2565 write_buffer_16x16(output, in0, in1, 16);

2573 }	2566 }

2574	2567

2575 #define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2	2568 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2

2576 #define FDCT32x32_HIGH_PRECISION 0	2569 #define FDCT32x32_HIGH_PRECISION 0

2577 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"	2570 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"

2578 #undef FDCT32x32_2D	2571 #undef FDCT32x32_2D

2579 #undef FDCT32x32_HIGH_PRECISION	2572 #undef FDCT32x32_HIGH_PRECISION

2580	2573

2581 #define FDCT32x32_2D vp9_short_fdct32x32_sse2	2574 #define FDCT32x32_2D vp9_fdct32x32_sse2

2582 #define FDCT32x32_HIGH_PRECISION 1	2575 #define FDCT32x32_HIGH_PRECISION 1

2583 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT	2576 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT

2584 #undef FDCT32x32_2D	2577 #undef FDCT32x32_2D

2585 #undef FDCT32x32_HIGH_PRECISION	2578 #undef FDCT32x32_HIGH_PRECISION

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c ('k') | source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c » ('j') | no next file with comments »