source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c - Issue 168343002: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c

Issue 168343002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: libvpx: Pull from upstream Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 226 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
237 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);	237 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);

238 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);	238 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);

239 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);	239 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);

240 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);	240 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

241	241

242 in[0] = _mm_packs_epi32(u[0], u[2]);	242 in[0] = _mm_packs_epi32(u[0], u[2]);

243 in[1] = _mm_packs_epi32(u[1], u[3]);	243 in[1] = _mm_packs_epi32(u[1], u[3]);

244 transpose_4x4_avx2(in);	244 transpose_4x4_avx2(in);

245 }	245 }

246	246

247 void vp9_short_fht4x4_avx2(const int16_t *input, int16_t *output,	247 void vp9_fht4x4_avx2(const int16_t *input, int16_t *output,

248 int stride, int tx_type) {	248 int stride, int tx_type) {

249 __m128i in[4];	249 __m128i in[4];

250 load_buffer_4x4_avx2(input, in, stride);	250

251 switch (tx_type) {	251 switch (tx_type) {

252 case 0: // DCT_DCT	252 case DCT_DCT:

253 fdct4_avx2(in);	253 vp9_fdct4x4_avx2(input, output, stride);

254 fdct4_avx2(in);

255 break;	254 break;

256 case 1: // ADST_DCT	255 case ADST_DCT:

	256 load_buffer_4x4_avx2(input, in, stride);

257 fadst4_avx2(in);	257 fadst4_avx2(in);

258 fdct4_avx2(in);	258 fdct4_avx2(in);

	259 write_buffer_4x4_avx2(output, in);

259 break;	260 break;

260 case 2: // DCT_ADST	261 case DCT_ADST:

	262 load_buffer_4x4_avx2(input, in, stride);

261 fdct4_avx2(in);	263 fdct4_avx2(in);

262 fadst4_avx2(in);	264 fadst4_avx2(in);

	265 write_buffer_4x4_avx2(output, in);

263 break;	266 break;

264 case 3: // ADST_ADST	267 case ADST_ADST:

	268 load_buffer_4x4_avx2(input, in, stride);

265 fadst4_avx2(in);	269 fadst4_avx2(in);

266 fadst4_avx2(in);	270 fadst4_avx2(in);

	271 write_buffer_4x4_avx2(output, in);

267 break;	272 break;

268 default:	273 default:

269 assert(0);	274 assert(0);

270 break;	275 break;

271 }	276 }

272 write_buffer_4x4_avx2(output, in);

273 }	277 }

274	278

275 void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) {	279 void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) {

276 int pass;	280 int pass;

277 // Constants	281 // Constants

278 // When we use them, in one case, they are all the same. In all others	282 // When we use them, in one case, they are all the same. In all others

279 // it's a pair of them that we need to repeat four times. This is done	283 // it's a pair of them that we need to repeat four times. This is done

280 // by constructing the 32 bit constant corresponding to that pair.	284 // by constructing the 32 bit constant corresponding to that pair.

281 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);	285 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);

282 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);	286 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);

(...skipping 738 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1021 in[3] = _mm_sub_epi16(k__const_0, s2);	1025 in[3] = _mm_sub_epi16(k__const_0, s2);

1022 in[4] = s3;	1026 in[4] = s3;

1023 in[5] = _mm_sub_epi16(k__const_0, s7);	1027 in[5] = _mm_sub_epi16(k__const_0, s7);

1024 in[6] = s5;	1028 in[6] = s5;

1025 in[7] = _mm_sub_epi16(k__const_0, s1);	1029 in[7] = _mm_sub_epi16(k__const_0, s1);

1026	1030

1027 // transpose	1031 // transpose

1028 array_transpose_8x8_avx2(in, in);	1032 array_transpose_8x8_avx2(in, in);

1029 }	1033 }

1030	1034

1031 void vp9_short_fht8x8_avx2(const int16_t *input, int16_t *output,	1035 void vp9_fht8x8_avx2(const int16_t *input, int16_t *output,

1032 int stride, int tx_type) {	1036 int stride, int tx_type) {

1033 __m128i in[8];	1037 __m128i in[8];

1034 load_buffer_8x8_avx2(input, in, stride);	1038

1035 switch (tx_type) {	1039 switch (tx_type) {

1036 case 0: // DCT_DCT	1040 case DCT_DCT:

1037 fdct8_avx2(in);	1041 vp9_fdct8x8_avx2(input, output, stride);

1038 fdct8_avx2(in);

1039 break;	1042 break;

1040 case 1: // ADST_DCT	1043 case ADST_DCT:

	1044 load_buffer_8x8_avx2(input, in, stride);

1041 fadst8_avx2(in);	1045 fadst8_avx2(in);

1042 fdct8_avx2(in);	1046 fdct8_avx2(in);

	1047 right_shift_8x8_avx2(in, 1);

	1048 write_buffer_8x8_avx2(output, in, 8);

1043 break;	1049 break;

1044 case 2: // DCT_ADST	1050 case DCT_ADST:

	1051 load_buffer_8x8_avx2(input, in, stride);

1045 fdct8_avx2(in);	1052 fdct8_avx2(in);

1046 fadst8_avx2(in);	1053 fadst8_avx2(in);

	1054 right_shift_8x8_avx2(in, 1);

	1055 write_buffer_8x8_avx2(output, in, 8);

1047 break;	1056 break;

1048 case 3: // ADST_ADST	1057 case ADST_ADST:

	1058 load_buffer_8x8_avx2(input, in, stride);

1049 fadst8_avx2(in);	1059 fadst8_avx2(in);

1050 fadst8_avx2(in);	1060 fadst8_avx2(in);

	1061 right_shift_8x8_avx2(in, 1);

	1062 write_buffer_8x8_avx2(output, in, 8);

1051 break;	1063 break;

1052 default:	1064 default:

1053 assert(0);	1065 assert(0);

1054 break;	1066 break;

1055 }	1067 }

1056 right_shift_8x8_avx2(in, 1);

1057 write_buffer_8x8_avx2(output, in, 8);

1058 }	1068 }

1059	1069

1060 void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {	1070 void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {

1061 // The 2D transform is done with two passes which are actually pretty	1071 // The 2D transform is done with two passes which are actually pretty

1062 // similar. In the first one, we transform the columns and transpose	1072 // similar. In the first one, we transform the columns and transpose

1063 // the results. In the second one, we transform the rows. To achieve that,	1073 // the results. In the second one, we transform the rows. To achieve that,

1064 // as the first pass results are transposed, we transpose the columns (that	1074 // as the first pass results are transposed, we transpose the columns (that

1065 // is the transposed rows) and transpose the results (so that it goes back	1075 // is the transposed rows) and transpose the results (so that it goes back

1066 // in normal/row positions).	1076 // in normal/row positions).

1067 int pass;	1077 int pass;

(...skipping 1459 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2527 fdct16_8col_avx2(in1);	2537 fdct16_8col_avx2(in1);

2528 array_transpose_16x16_avx2(in0, in1);	2538 array_transpose_16x16_avx2(in0, in1);

2529 }	2539 }

2530	2540

2531 void fadst16_avx2(__m128i *in0, __m128i *in1) {	2541 void fadst16_avx2(__m128i *in0, __m128i *in1) {

2532 fadst16_8col_avx2(in0);	2542 fadst16_8col_avx2(in0);

2533 fadst16_8col_avx2(in1);	2543 fadst16_8col_avx2(in1);

2534 array_transpose_16x16_avx2(in0, in1);	2544 array_transpose_16x16_avx2(in0, in1);

2535 }	2545 }

2536	2546

2537 void vp9_short_fht16x16_avx2(const int16_t *input, int16_t *output,	2547 void vp9_fht16x16_avx2(const int16_t *input, int16_t *output,

2538 int stride, int tx_type) {	2548 int stride, int tx_type) {

2539 __m128i in0[16], in1[16];	2549 __m128i in0[16], in1[16];

2540 load_buffer_16x16_avx2(input, in0, in1, stride);	2550

2541 switch (tx_type) {	2551 switch (tx_type) {

2542 case 0: // DCT_DCT	2552 case DCT_DCT:

2543 fdct16_avx2(in0, in1);	2553 vp9_fdct16x16_avx2(input, output, stride);

2544 right_shift_16x16_avx2(in0, in1);

2545 fdct16_avx2(in0, in1);

2546 break;	2554 break;

2547 case 1: // ADST_DCT	2555 case ADST_DCT:

	2556 load_buffer_16x16_avx2(input, in0, in1, stride);

2548 fadst16_avx2(in0, in1);	2557 fadst16_avx2(in0, in1);

2549 right_shift_16x16_avx2(in0, in1);	2558 right_shift_16x16_avx2(in0, in1);

2550 fdct16_avx2(in0, in1);	2559 fdct16_avx2(in0, in1);

	2560 write_buffer_16x16_avx2(output, in0, in1, 16);

2551 break;	2561 break;

2552 case 2: // DCT_ADST	2562 case DCT_ADST:

	2563 load_buffer_16x16_avx2(input, in0, in1, stride);

2553 fdct16_avx2(in0, in1);	2564 fdct16_avx2(in0, in1);

2554 right_shift_16x16_avx2(in0, in1);	2565 right_shift_16x16_avx2(in0, in1);

2555 fadst16_avx2(in0, in1);	2566 fadst16_avx2(in0, in1);

	2567 write_buffer_16x16_avx2(output, in0, in1, 16);

2556 break;	2568 break;

2557 case 3: // ADST_ADST	2569 case ADST_ADST:

	2570 load_buffer_16x16_avx2(input, in0, in1, stride);

2558 fadst16_avx2(in0, in1);	2571 fadst16_avx2(in0, in1);

2559 right_shift_16x16_avx2(in0, in1);	2572 right_shift_16x16_avx2(in0, in1);

2560 fadst16_avx2(in0, in1);	2573 fadst16_avx2(in0, in1);

	2574 write_buffer_16x16_avx2(output, in0, in1, 16);

2561 break;	2575 break;

2562 default:	2576 default:

2563 assert(0);	2577 assert(0);

2564 break;	2578 break;

2565 }	2579 }

2566 write_buffer_16x16_avx2(output, in0, in1, 16);

2567 }	2580 }

2568	2581

2569 #define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2	2582 #define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2

2570 #define FDCT32x32_HIGH_PRECISION 0	2583 #define FDCT32x32_HIGH_PRECISION 0

2571 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c"	2584 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c"

2572 #undef FDCT32x32_2D_AVX2	2585 #undef FDCT32x32_2D_AVX2

2573 #undef FDCT32x32_HIGH_PRECISION	2586 #undef FDCT32x32_HIGH_PRECISION

2574	2587

2575 #define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2	2588 #define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2

2576 #define FDCT32x32_HIGH_PRECISION 1	2589 #define FDCT32x32_HIGH_PRECISION 1

2577 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT	2590 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT

2578 #undef FDCT32x32_2D_AVX2	2591 #undef FDCT32x32_2D_AVX2

2579 #undef FDCT32x32_HIGH_PRECISION	2592 #undef FDCT32x32_HIGH_PRECISION

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/encoder/vp9_writer.c ('k') | source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c » ('j') | no next file with comments »