source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c - Issue 168343002: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c

Issue 168343002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: libvpx: Pull from upstream Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 224 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
235 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);	235 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);

236 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);	236 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);

237 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);	237 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);

238 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);	238 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

239	239

240 in[0] = _mm_packs_epi32(u[0], u[2]);	240 in[0] = _mm_packs_epi32(u[0], u[2]);

241 in[1] = _mm_packs_epi32(u[1], u[3]);	241 in[1] = _mm_packs_epi32(u[1], u[3]);

242 transpose_4x4(in);	242 transpose_4x4(in);

243 }	243 }

244	244

245 void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,	245 void vp9_fht4x4_sse2(const int16_t *input, int16_t *output,

246 int stride, int tx_type) {	246 int stride, int tx_type) {

247 __m128i in[4];	247 __m128i in[4];

248 load_buffer_4x4(input, in, stride);	248

249 switch (tx_type) {	249 switch (tx_type) {

250 case 0: // DCT_DCT	250 case DCT_DCT:

251 fdct4_sse2(in);	251 vp9_fdct4x4_sse2(input, output, stride);

252 fdct4_sse2(in);

253 break;	252 break;

254 case 1: // ADST_DCT	253 case ADST_DCT:

	254 load_buffer_4x4(input, in, stride);

255 fadst4_sse2(in);	255 fadst4_sse2(in);

256 fdct4_sse2(in);	256 fdct4_sse2(in);

	257 write_buffer_4x4(output, in);

257 break;	258 break;

258 case 2: // DCT_ADST	259 case DCT_ADST:

	260 load_buffer_4x4(input, in, stride);

259 fdct4_sse2(in);	261 fdct4_sse2(in);

260 fadst4_sse2(in);	262 fadst4_sse2(in);

	263 write_buffer_4x4(output, in);

261 break;	264 break;

262 case 3: // ADST_ADST	265 case ADST_ADST:

	266 load_buffer_4x4(input, in, stride);

263 fadst4_sse2(in);	267 fadst4_sse2(in);

264 fadst4_sse2(in);	268 fadst4_sse2(in);

	269 write_buffer_4x4(output, in);

265 break;	270 break;

266 default:	271 default:

267 assert(0);	272 assert(0);

268 break;	273 break;

269 }	274 }

270 write_buffer_4x4(output, in);

271 }	275 }

272	276

273 void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {	277 void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {

274 int pass;	278 int pass;

275 // Constants	279 // Constants

276 // When we use them, in one case, they are all the same. In all others	280 // When we use them, in one case, they are all the same. In all others

277 // it's a pair of them that we need to repeat four times. This is done	281 // it's a pair of them that we need to repeat four times. This is done

278 // by constructing the 32 bit constant corresponding to that pair.	282 // by constructing the 32 bit constant corresponding to that pair.

279 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);	283 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);

280 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);	284 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);

(...skipping 738 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1019 in[3] = _mm_sub_epi16(k__const_0, s2);	1023 in[3] = _mm_sub_epi16(k__const_0, s2);

1020 in[4] = s3;	1024 in[4] = s3;

1021 in[5] = _mm_sub_epi16(k__const_0, s7);	1025 in[5] = _mm_sub_epi16(k__const_0, s7);

1022 in[6] = s5;	1026 in[6] = s5;

1023 in[7] = _mm_sub_epi16(k__const_0, s1);	1027 in[7] = _mm_sub_epi16(k__const_0, s1);

1024	1028

1025 // transpose	1029 // transpose

1026 array_transpose_8x8(in, in);	1030 array_transpose_8x8(in, in);

1027 }	1031 }

1028	1032

1029 void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,	1033 void vp9_fht8x8_sse2(const int16_t *input, int16_t *output,

1030 int stride, int tx_type) {	1034 int stride, int tx_type) {

1031 __m128i in[8];	1035 __m128i in[8];

1032 load_buffer_8x8(input, in, stride);	1036

1033 switch (tx_type) {	1037 switch (tx_type) {

1034 case 0: // DCT_DCT	1038 case DCT_DCT:

1035 fdct8_sse2(in);	1039 vp9_fdct8x8_sse2(input, output, stride);

1036 fdct8_sse2(in);

1037 break;	1040 break;

1038 case 1: // ADST_DCT	1041 case ADST_DCT:

	1042 load_buffer_8x8(input, in, stride);

1039 fadst8_sse2(in);	1043 fadst8_sse2(in);

1040 fdct8_sse2(in);	1044 fdct8_sse2(in);

	1045 right_shift_8x8(in, 1);

	1046 write_buffer_8x8(output, in, 8);

1041 break;	1047 break;

1042 case 2: // DCT_ADST	1048 case DCT_ADST:

	1049 load_buffer_8x8(input, in, stride);

1043 fdct8_sse2(in);	1050 fdct8_sse2(in);

1044 fadst8_sse2(in);	1051 fadst8_sse2(in);

	1052 right_shift_8x8(in, 1);

	1053 write_buffer_8x8(output, in, 8);

1045 break;	1054 break;

1046 case 3: // ADST_ADST	1055 case ADST_ADST:

	1056 load_buffer_8x8(input, in, stride);

1047 fadst8_sse2(in);	1057 fadst8_sse2(in);

1048 fadst8_sse2(in);	1058 fadst8_sse2(in);

	1059 right_shift_8x8(in, 1);

	1060 write_buffer_8x8(output, in, 8);

1049 break;	1061 break;

1050 default:	1062 default:

1051 assert(0);	1063 assert(0);

1052 break;	1064 break;

1053 }	1065 }

1054 right_shift_8x8(in, 1);

1055 write_buffer_8x8(output, in, 8);

1056 }	1066 }

1057	1067

1058 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {	1068 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {

1059 // The 2D transform is done with two passes which are actually pretty	1069 // The 2D transform is done with two passes which are actually pretty

1060 // similar. In the first one, we transform the columns and transpose	1070 // similar. In the first one, we transform the columns and transpose

1061 // the results. In the second one, we transform the rows. To achieve that,	1071 // the results. In the second one, we transform the rows. To achieve that,

1062 // as the first pass results are transposed, we tranpose the columns (that	1072 // as the first pass results are transposed, we tranpose the columns (that

1063 // is the transposed rows) and transpose the results (so that it goes back	1073 // is the transposed rows) and transpose the results (so that it goes back

1064 // in normal/row positions).	1074 // in normal/row positions).

1065 int pass;	1075 int pass;

(...skipping 1459 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2525 fdct16_8col(in1);	2535 fdct16_8col(in1);

2526 array_transpose_16x16(in0, in1);	2536 array_transpose_16x16(in0, in1);

2527 }	2537 }

2528	2538

2529 void fadst16_sse2(__m128i *in0, __m128i *in1) {	2539 void fadst16_sse2(__m128i *in0, __m128i *in1) {

2530 fadst16_8col(in0);	2540 fadst16_8col(in0);

2531 fadst16_8col(in1);	2541 fadst16_8col(in1);

2532 array_transpose_16x16(in0, in1);	2542 array_transpose_16x16(in0, in1);

2533 }	2543 }

2534	2544

2535 void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,	2545 void vp9_fht16x16_sse2(const int16_t *input, int16_t *output,

2536 int stride, int tx_type) {	2546 int stride, int tx_type) {

2537 __m128i in0[16], in1[16];	2547 __m128i in0[16], in1[16];

2538 load_buffer_16x16(input, in0, in1, stride);	2548

2539 switch (tx_type) {	2549 switch (tx_type) {

2540 case 0: // DCT_DCT	2550 case DCT_DCT:

2541 fdct16_sse2(in0, in1);	2551 vp9_fdct16x16_sse2(input, output, stride);

2542 right_shift_16x16(in0, in1);

2543 fdct16_sse2(in0, in1);

2544 break;	2552 break;

2545 case 1: // ADST_DCT	2553 case ADST_DCT:

	2554 load_buffer_16x16(input, in0, in1, stride);

2546 fadst16_sse2(in0, in1);	2555 fadst16_sse2(in0, in1);

2547 right_shift_16x16(in0, in1);	2556 right_shift_16x16(in0, in1);

2548 fdct16_sse2(in0, in1);	2557 fdct16_sse2(in0, in1);

	2558 write_buffer_16x16(output, in0, in1, 16);

2549 break;	2559 break;

2550 case 2: // DCT_ADST	2560 case DCT_ADST:

	2561 load_buffer_16x16(input, in0, in1, stride);

2551 fdct16_sse2(in0, in1);	2562 fdct16_sse2(in0, in1);

2552 right_shift_16x16(in0, in1);	2563 right_shift_16x16(in0, in1);

2553 fadst16_sse2(in0, in1);	2564 fadst16_sse2(in0, in1);

	2565 write_buffer_16x16(output, in0, in1, 16);

2554 break;	2566 break;

2555 case 3: // ADST_ADST	2567 case ADST_ADST:

	2568 load_buffer_16x16(input, in0, in1, stride);

2556 fadst16_sse2(in0, in1);	2569 fadst16_sse2(in0, in1);

2557 right_shift_16x16(in0, in1);	2570 right_shift_16x16(in0, in1);

2558 fadst16_sse2(in0, in1);	2571 fadst16_sse2(in0, in1);

	2572 write_buffer_16x16(output, in0, in1, 16);

2559 break;	2573 break;

2560 default:	2574 default:

2561 assert(0);	2575 assert(0);

2562 break;	2576 break;

2563 }	2577 }

2564 write_buffer_16x16(output, in0, in1, 16);

2565 }	2578 }

2566	2579

2567 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2	2580 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2

2568 #define FDCT32x32_HIGH_PRECISION 0	2581 #define FDCT32x32_HIGH_PRECISION 0

2569 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"	2582 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"

2570 #undef FDCT32x32_2D	2583 #undef FDCT32x32_2D

2571 #undef FDCT32x32_HIGH_PRECISION	2584 #undef FDCT32x32_HIGH_PRECISION

2572	2585

2573 #define FDCT32x32_2D vp9_fdct32x32_sse2	2586 #define FDCT32x32_2D vp9_fdct32x32_sse2

2574 #define FDCT32x32_HIGH_PRECISION 1	2587 #define FDCT32x32_HIGH_PRECISION 1

2575 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT	2588 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT

2576 #undef FDCT32x32_2D	2589 #undef FDCT32x32_2D

2577 #undef FDCT32x32_HIGH_PRECISION	2590 #undef FDCT32x32_HIGH_PRECISION

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c ('k') | source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm » ('j') | no next file with comments »