Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(71)

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c

Issue 168343002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: libvpx: Pull from upstream Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « source/libvpx/vp9/encoder/vp9_writer.c ('k') | source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 226 matching lines...) Expand 10 before | Expand all | Expand 10 after
237 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 237 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
238 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 238 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
239 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 239 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
240 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 240 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
241 241
242 in[0] = _mm_packs_epi32(u[0], u[2]); 242 in[0] = _mm_packs_epi32(u[0], u[2]);
243 in[1] = _mm_packs_epi32(u[1], u[3]); 243 in[1] = _mm_packs_epi32(u[1], u[3]);
244 transpose_4x4_avx2(in); 244 transpose_4x4_avx2(in);
245 } 245 }
246 246
247 void vp9_short_fht4x4_avx2(const int16_t *input, int16_t *output, 247 void vp9_fht4x4_avx2(const int16_t *input, int16_t *output,
248 int stride, int tx_type) { 248 int stride, int tx_type) {
249 __m128i in[4]; 249 __m128i in[4];
250 load_buffer_4x4_avx2(input, in, stride); 250
251 switch (tx_type) { 251 switch (tx_type) {
252 case 0: // DCT_DCT 252 case DCT_DCT:
253 fdct4_avx2(in); 253 vp9_fdct4x4_avx2(input, output, stride);
254 fdct4_avx2(in);
255 break; 254 break;
256 case 1: // ADST_DCT 255 case ADST_DCT:
256 load_buffer_4x4_avx2(input, in, stride);
257 fadst4_avx2(in); 257 fadst4_avx2(in);
258 fdct4_avx2(in); 258 fdct4_avx2(in);
259 write_buffer_4x4_avx2(output, in);
259 break; 260 break;
260 case 2: // DCT_ADST 261 case DCT_ADST:
262 load_buffer_4x4_avx2(input, in, stride);
261 fdct4_avx2(in); 263 fdct4_avx2(in);
262 fadst4_avx2(in); 264 fadst4_avx2(in);
265 write_buffer_4x4_avx2(output, in);
263 break; 266 break;
264 case 3: // ADST_ADST 267 case ADST_ADST:
268 load_buffer_4x4_avx2(input, in, stride);
265 fadst4_avx2(in); 269 fadst4_avx2(in);
266 fadst4_avx2(in); 270 fadst4_avx2(in);
271 write_buffer_4x4_avx2(output, in);
267 break; 272 break;
268 default: 273 default:
269 assert(0); 274 assert(0);
270 break; 275 break;
271 } 276 }
272 write_buffer_4x4_avx2(output, in);
273 } 277 }
274 278
275 void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) { 279 void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) {
276 int pass; 280 int pass;
277 // Constants 281 // Constants
278 // When we use them, in one case, they are all the same. In all others 282 // When we use them, in one case, they are all the same. In all others
279 // it's a pair of them that we need to repeat four times. This is done 283 // it's a pair of them that we need to repeat four times. This is done
280 // by constructing the 32 bit constant corresponding to that pair. 284 // by constructing the 32 bit constant corresponding to that pair.
281 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 285 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
282 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 286 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
(...skipping 738 matching lines...) Expand 10 before | Expand all | Expand 10 after
1021 in[3] = _mm_sub_epi16(k__const_0, s2); 1025 in[3] = _mm_sub_epi16(k__const_0, s2);
1022 in[4] = s3; 1026 in[4] = s3;
1023 in[5] = _mm_sub_epi16(k__const_0, s7); 1027 in[5] = _mm_sub_epi16(k__const_0, s7);
1024 in[6] = s5; 1028 in[6] = s5;
1025 in[7] = _mm_sub_epi16(k__const_0, s1); 1029 in[7] = _mm_sub_epi16(k__const_0, s1);
1026 1030
1027 // transpose 1031 // transpose
1028 array_transpose_8x8_avx2(in, in); 1032 array_transpose_8x8_avx2(in, in);
1029 } 1033 }
1030 1034
1031 void vp9_short_fht8x8_avx2(const int16_t *input, int16_t *output, 1035 void vp9_fht8x8_avx2(const int16_t *input, int16_t *output,
1032 int stride, int tx_type) { 1036 int stride, int tx_type) {
1033 __m128i in[8]; 1037 __m128i in[8];
1034 load_buffer_8x8_avx2(input, in, stride); 1038
1035 switch (tx_type) { 1039 switch (tx_type) {
1036 case 0: // DCT_DCT 1040 case DCT_DCT:
1037 fdct8_avx2(in); 1041 vp9_fdct8x8_avx2(input, output, stride);
1038 fdct8_avx2(in);
1039 break; 1042 break;
1040 case 1: // ADST_DCT 1043 case ADST_DCT:
1044 load_buffer_8x8_avx2(input, in, stride);
1041 fadst8_avx2(in); 1045 fadst8_avx2(in);
1042 fdct8_avx2(in); 1046 fdct8_avx2(in);
1047 right_shift_8x8_avx2(in, 1);
1048 write_buffer_8x8_avx2(output, in, 8);
1043 break; 1049 break;
1044 case 2: // DCT_ADST 1050 case DCT_ADST:
1051 load_buffer_8x8_avx2(input, in, stride);
1045 fdct8_avx2(in); 1052 fdct8_avx2(in);
1046 fadst8_avx2(in); 1053 fadst8_avx2(in);
1054 right_shift_8x8_avx2(in, 1);
1055 write_buffer_8x8_avx2(output, in, 8);
1047 break; 1056 break;
1048 case 3: // ADST_ADST 1057 case ADST_ADST:
1058 load_buffer_8x8_avx2(input, in, stride);
1049 fadst8_avx2(in); 1059 fadst8_avx2(in);
1050 fadst8_avx2(in); 1060 fadst8_avx2(in);
1061 right_shift_8x8_avx2(in, 1);
1062 write_buffer_8x8_avx2(output, in, 8);
1051 break; 1063 break;
1052 default: 1064 default:
1053 assert(0); 1065 assert(0);
1054 break; 1066 break;
1055 } 1067 }
1056 right_shift_8x8_avx2(in, 1);
1057 write_buffer_8x8_avx2(output, in, 8);
1058 } 1068 }
1059 1069
1060 void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { 1070 void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
1061 // The 2D transform is done with two passes which are actually pretty 1071 // The 2D transform is done with two passes which are actually pretty
1062 // similar. In the first one, we transform the columns and transpose 1072 // similar. In the first one, we transform the columns and transpose
1063 // the results. In the second one, we transform the rows. To achieve that, 1073 // the results. In the second one, we transform the rows. To achieve that,
1064 // as the first pass results are transposed, we transpose the columns (that 1074 // as the first pass results are transposed, we transpose the columns (that
1065 // is the transposed rows) and transpose the results (so that it goes back 1075 // is the transposed rows) and transpose the results (so that it goes back
1066 // in normal/row positions). 1076 // in normal/row positions).
1067 int pass; 1077 int pass;
(...skipping 1459 matching lines...) Expand 10 before | Expand all | Expand 10 after
2527 fdct16_8col_avx2(in1); 2537 fdct16_8col_avx2(in1);
2528 array_transpose_16x16_avx2(in0, in1); 2538 array_transpose_16x16_avx2(in0, in1);
2529 } 2539 }
2530 2540
2531 void fadst16_avx2(__m128i *in0, __m128i *in1) { 2541 void fadst16_avx2(__m128i *in0, __m128i *in1) {
2532 fadst16_8col_avx2(in0); 2542 fadst16_8col_avx2(in0);
2533 fadst16_8col_avx2(in1); 2543 fadst16_8col_avx2(in1);
2534 array_transpose_16x16_avx2(in0, in1); 2544 array_transpose_16x16_avx2(in0, in1);
2535 } 2545 }
2536 2546
2537 void vp9_short_fht16x16_avx2(const int16_t *input, int16_t *output, 2547 void vp9_fht16x16_avx2(const int16_t *input, int16_t *output,
2538 int stride, int tx_type) { 2548 int stride, int tx_type) {
2539 __m128i in0[16], in1[16]; 2549 __m128i in0[16], in1[16];
2540 load_buffer_16x16_avx2(input, in0, in1, stride); 2550
2541 switch (tx_type) { 2551 switch (tx_type) {
2542 case 0: // DCT_DCT 2552 case DCT_DCT:
2543 fdct16_avx2(in0, in1); 2553 vp9_fdct16x16_avx2(input, output, stride);
2544 right_shift_16x16_avx2(in0, in1);
2545 fdct16_avx2(in0, in1);
2546 break; 2554 break;
2547 case 1: // ADST_DCT 2555 case ADST_DCT:
2556 load_buffer_16x16_avx2(input, in0, in1, stride);
2548 fadst16_avx2(in0, in1); 2557 fadst16_avx2(in0, in1);
2549 right_shift_16x16_avx2(in0, in1); 2558 right_shift_16x16_avx2(in0, in1);
2550 fdct16_avx2(in0, in1); 2559 fdct16_avx2(in0, in1);
2560 write_buffer_16x16_avx2(output, in0, in1, 16);
2551 break; 2561 break;
2552 case 2: // DCT_ADST 2562 case DCT_ADST:
2563 load_buffer_16x16_avx2(input, in0, in1, stride);
2553 fdct16_avx2(in0, in1); 2564 fdct16_avx2(in0, in1);
2554 right_shift_16x16_avx2(in0, in1); 2565 right_shift_16x16_avx2(in0, in1);
2555 fadst16_avx2(in0, in1); 2566 fadst16_avx2(in0, in1);
2567 write_buffer_16x16_avx2(output, in0, in1, 16);
2556 break; 2568 break;
2557 case 3: // ADST_ADST 2569 case ADST_ADST:
2570 load_buffer_16x16_avx2(input, in0, in1, stride);
2558 fadst16_avx2(in0, in1); 2571 fadst16_avx2(in0, in1);
2559 right_shift_16x16_avx2(in0, in1); 2572 right_shift_16x16_avx2(in0, in1);
2560 fadst16_avx2(in0, in1); 2573 fadst16_avx2(in0, in1);
2574 write_buffer_16x16_avx2(output, in0, in1, 16);
2561 break; 2575 break;
2562 default: 2576 default:
2563 assert(0); 2577 assert(0);
2564 break; 2578 break;
2565 } 2579 }
2566 write_buffer_16x16_avx2(output, in0, in1, 16);
2567 } 2580 }
2568 2581
2569 #define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2 2582 #define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2
2570 #define FDCT32x32_HIGH_PRECISION 0 2583 #define FDCT32x32_HIGH_PRECISION 0
2571 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c" 2584 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c"
2572 #undef FDCT32x32_2D_AVX2 2585 #undef FDCT32x32_2D_AVX2
2573 #undef FDCT32x32_HIGH_PRECISION 2586 #undef FDCT32x32_HIGH_PRECISION
2574 2587
2575 #define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2 2588 #define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2
2576 #define FDCT32x32_HIGH_PRECISION 1 2589 #define FDCT32x32_HIGH_PRECISION 1
2577 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT 2590 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT
2578 #undef FDCT32x32_2D_AVX2 2591 #undef FDCT32x32_2D_AVX2
2579 #undef FDCT32x32_HIGH_PRECISION 2592 #undef FDCT32x32_HIGH_PRECISION
OLDNEW
« no previous file with comments | « source/libvpx/vp9/encoder/vp9_writer.c ('k') | source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698