OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 226 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
237 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); | 237 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); |
238 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); | 238 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); |
239 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); | 239 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); |
240 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); | 240 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); |
241 | 241 |
242 in[0] = _mm_packs_epi32(u[0], u[2]); | 242 in[0] = _mm_packs_epi32(u[0], u[2]); |
243 in[1] = _mm_packs_epi32(u[1], u[3]); | 243 in[1] = _mm_packs_epi32(u[1], u[3]); |
244 transpose_4x4_avx2(in); | 244 transpose_4x4_avx2(in); |
245 } | 245 } |
246 | 246 |
247 void vp9_short_fht4x4_avx2(const int16_t *input, int16_t *output, | 247 void vp9_fht4x4_avx2(const int16_t *input, int16_t *output, |
248 int stride, int tx_type) { | 248 int stride, int tx_type) { |
249 __m128i in[4]; | 249 __m128i in[4]; |
250 load_buffer_4x4_avx2(input, in, stride); | 250 |
251 switch (tx_type) { | 251 switch (tx_type) { |
252 case 0: // DCT_DCT | 252 case DCT_DCT: |
253 fdct4_avx2(in); | 253 vp9_fdct4x4_avx2(input, output, stride); |
254 fdct4_avx2(in); | |
255 break; | 254 break; |
256 case 1: // ADST_DCT | 255 case ADST_DCT: |
| 256 load_buffer_4x4_avx2(input, in, stride); |
257 fadst4_avx2(in); | 257 fadst4_avx2(in); |
258 fdct4_avx2(in); | 258 fdct4_avx2(in); |
| 259 write_buffer_4x4_avx2(output, in); |
259 break; | 260 break; |
260 case 2: // DCT_ADST | 261 case DCT_ADST: |
| 262 load_buffer_4x4_avx2(input, in, stride); |
261 fdct4_avx2(in); | 263 fdct4_avx2(in); |
262 fadst4_avx2(in); | 264 fadst4_avx2(in); |
| 265 write_buffer_4x4_avx2(output, in); |
263 break; | 266 break; |
264 case 3: // ADST_ADST | 267 case ADST_ADST: |
| 268 load_buffer_4x4_avx2(input, in, stride); |
265 fadst4_avx2(in); | 269 fadst4_avx2(in); |
266 fadst4_avx2(in); | 270 fadst4_avx2(in); |
| 271 write_buffer_4x4_avx2(output, in); |
267 break; | 272 break; |
268 default: | 273 default: |
269 assert(0); | 274 assert(0); |
270 break; | 275 break; |
271 } | 276 } |
272 write_buffer_4x4_avx2(output, in); | |
273 } | 277 } |
274 | 278 |
275 void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) { | 279 void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) { |
276 int pass; | 280 int pass; |
277 // Constants | 281 // Constants |
278 // When we use them, in one case, they are all the same. In all others | 282 // When we use them, in one case, they are all the same. In all others |
279 // it's a pair of them that we need to repeat four times. This is done | 283 // it's a pair of them that we need to repeat four times. This is done |
280 // by constructing the 32 bit constant corresponding to that pair. | 284 // by constructing the 32 bit constant corresponding to that pair. |
281 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 285 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
282 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 286 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
(...skipping 738 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1021 in[3] = _mm_sub_epi16(k__const_0, s2); | 1025 in[3] = _mm_sub_epi16(k__const_0, s2); |
1022 in[4] = s3; | 1026 in[4] = s3; |
1023 in[5] = _mm_sub_epi16(k__const_0, s7); | 1027 in[5] = _mm_sub_epi16(k__const_0, s7); |
1024 in[6] = s5; | 1028 in[6] = s5; |
1025 in[7] = _mm_sub_epi16(k__const_0, s1); | 1029 in[7] = _mm_sub_epi16(k__const_0, s1); |
1026 | 1030 |
1027 // transpose | 1031 // transpose |
1028 array_transpose_8x8_avx2(in, in); | 1032 array_transpose_8x8_avx2(in, in); |
1029 } | 1033 } |
1030 | 1034 |
1031 void vp9_short_fht8x8_avx2(const int16_t *input, int16_t *output, | 1035 void vp9_fht8x8_avx2(const int16_t *input, int16_t *output, |
1032 int stride, int tx_type) { | 1036 int stride, int tx_type) { |
1033 __m128i in[8]; | 1037 __m128i in[8]; |
1034 load_buffer_8x8_avx2(input, in, stride); | 1038 |
1035 switch (tx_type) { | 1039 switch (tx_type) { |
1036 case 0: // DCT_DCT | 1040 case DCT_DCT: |
1037 fdct8_avx2(in); | 1041 vp9_fdct8x8_avx2(input, output, stride); |
1038 fdct8_avx2(in); | |
1039 break; | 1042 break; |
1040 case 1: // ADST_DCT | 1043 case ADST_DCT: |
| 1044 load_buffer_8x8_avx2(input, in, stride); |
1041 fadst8_avx2(in); | 1045 fadst8_avx2(in); |
1042 fdct8_avx2(in); | 1046 fdct8_avx2(in); |
| 1047 right_shift_8x8_avx2(in, 1); |
| 1048 write_buffer_8x8_avx2(output, in, 8); |
1043 break; | 1049 break; |
1044 case 2: // DCT_ADST | 1050 case DCT_ADST: |
| 1051 load_buffer_8x8_avx2(input, in, stride); |
1045 fdct8_avx2(in); | 1052 fdct8_avx2(in); |
1046 fadst8_avx2(in); | 1053 fadst8_avx2(in); |
| 1054 right_shift_8x8_avx2(in, 1); |
| 1055 write_buffer_8x8_avx2(output, in, 8); |
1047 break; | 1056 break; |
1048 case 3: // ADST_ADST | 1057 case ADST_ADST: |
| 1058 load_buffer_8x8_avx2(input, in, stride); |
1049 fadst8_avx2(in); | 1059 fadst8_avx2(in); |
1050 fadst8_avx2(in); | 1060 fadst8_avx2(in); |
| 1061 right_shift_8x8_avx2(in, 1); |
| 1062 write_buffer_8x8_avx2(output, in, 8); |
1051 break; | 1063 break; |
1052 default: | 1064 default: |
1053 assert(0); | 1065 assert(0); |
1054 break; | 1066 break; |
1055 } | 1067 } |
1056 right_shift_8x8_avx2(in, 1); | |
1057 write_buffer_8x8_avx2(output, in, 8); | |
1058 } | 1068 } |
1059 | 1069 |
1060 void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { | 1070 void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { |
1061 // The 2D transform is done with two passes which are actually pretty | 1071 // The 2D transform is done with two passes which are actually pretty |
1062 // similar. In the first one, we transform the columns and transpose | 1072 // similar. In the first one, we transform the columns and transpose |
1063 // the results. In the second one, we transform the rows. To achieve that, | 1073 // the results. In the second one, we transform the rows. To achieve that, |
1064 // as the first pass results are transposed, we transpose the columns (that | 1074 // as the first pass results are transposed, we transpose the columns (that |
1065 // is the transposed rows) and transpose the results (so that it goes back | 1075 // is the transposed rows) and transpose the results (so that it goes back |
1066 // in normal/row positions). | 1076 // in normal/row positions). |
1067 int pass; | 1077 int pass; |
(...skipping 1459 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2527 fdct16_8col_avx2(in1); | 2537 fdct16_8col_avx2(in1); |
2528 array_transpose_16x16_avx2(in0, in1); | 2538 array_transpose_16x16_avx2(in0, in1); |
2529 } | 2539 } |
2530 | 2540 |
2531 void fadst16_avx2(__m128i *in0, __m128i *in1) { | 2541 void fadst16_avx2(__m128i *in0, __m128i *in1) { |
2532 fadst16_8col_avx2(in0); | 2542 fadst16_8col_avx2(in0); |
2533 fadst16_8col_avx2(in1); | 2543 fadst16_8col_avx2(in1); |
2534 array_transpose_16x16_avx2(in0, in1); | 2544 array_transpose_16x16_avx2(in0, in1); |
2535 } | 2545 } |
2536 | 2546 |
2537 void vp9_short_fht16x16_avx2(const int16_t *input, int16_t *output, | 2547 void vp9_fht16x16_avx2(const int16_t *input, int16_t *output, |
2538 int stride, int tx_type) { | 2548 int stride, int tx_type) { |
2539 __m128i in0[16], in1[16]; | 2549 __m128i in0[16], in1[16]; |
2540 load_buffer_16x16_avx2(input, in0, in1, stride); | 2550 |
2541 switch (tx_type) { | 2551 switch (tx_type) { |
2542 case 0: // DCT_DCT | 2552 case DCT_DCT: |
2543 fdct16_avx2(in0, in1); | 2553 vp9_fdct16x16_avx2(input, output, stride); |
2544 right_shift_16x16_avx2(in0, in1); | |
2545 fdct16_avx2(in0, in1); | |
2546 break; | 2554 break; |
2547 case 1: // ADST_DCT | 2555 case ADST_DCT: |
| 2556 load_buffer_16x16_avx2(input, in0, in1, stride); |
2548 fadst16_avx2(in0, in1); | 2557 fadst16_avx2(in0, in1); |
2549 right_shift_16x16_avx2(in0, in1); | 2558 right_shift_16x16_avx2(in0, in1); |
2550 fdct16_avx2(in0, in1); | 2559 fdct16_avx2(in0, in1); |
| 2560 write_buffer_16x16_avx2(output, in0, in1, 16); |
2551 break; | 2561 break; |
2552 case 2: // DCT_ADST | 2562 case DCT_ADST: |
| 2563 load_buffer_16x16_avx2(input, in0, in1, stride); |
2553 fdct16_avx2(in0, in1); | 2564 fdct16_avx2(in0, in1); |
2554 right_shift_16x16_avx2(in0, in1); | 2565 right_shift_16x16_avx2(in0, in1); |
2555 fadst16_avx2(in0, in1); | 2566 fadst16_avx2(in0, in1); |
| 2567 write_buffer_16x16_avx2(output, in0, in1, 16); |
2556 break; | 2568 break; |
2557 case 3: // ADST_ADST | 2569 case ADST_ADST: |
| 2570 load_buffer_16x16_avx2(input, in0, in1, stride); |
2558 fadst16_avx2(in0, in1); | 2571 fadst16_avx2(in0, in1); |
2559 right_shift_16x16_avx2(in0, in1); | 2572 right_shift_16x16_avx2(in0, in1); |
2560 fadst16_avx2(in0, in1); | 2573 fadst16_avx2(in0, in1); |
| 2574 write_buffer_16x16_avx2(output, in0, in1, 16); |
2561 break; | 2575 break; |
2562 default: | 2576 default: |
2563 assert(0); | 2577 assert(0); |
2564 break; | 2578 break; |
2565 } | 2579 } |
2566 write_buffer_16x16_avx2(output, in0, in1, 16); | |
2567 } | 2580 } |
2568 | 2581 |
2569 #define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2 | 2582 #define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2 |
2570 #define FDCT32x32_HIGH_PRECISION 0 | 2583 #define FDCT32x32_HIGH_PRECISION 0 |
2571 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c" | 2584 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c" |
2572 #undef FDCT32x32_2D_AVX2 | 2585 #undef FDCT32x32_2D_AVX2 |
2573 #undef FDCT32x32_HIGH_PRECISION | 2586 #undef FDCT32x32_HIGH_PRECISION |
2574 | 2587 |
2575 #define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2 | 2588 #define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2 |
2576 #define FDCT32x32_HIGH_PRECISION 1 | 2589 #define FDCT32x32_HIGH_PRECISION 1 |
2577 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT | 2590 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT |
2578 #undef FDCT32x32_2D_AVX2 | 2591 #undef FDCT32x32_2D_AVX2 |
2579 #undef FDCT32x32_HIGH_PRECISION | 2592 #undef FDCT32x32_HIGH_PRECISION |
OLD | NEW |