OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 224 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
235 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); | 235 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); |
236 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); | 236 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); |
237 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); | 237 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); |
238 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); | 238 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); |
239 | 239 |
240 in[0] = _mm_packs_epi32(u[0], u[2]); | 240 in[0] = _mm_packs_epi32(u[0], u[2]); |
241 in[1] = _mm_packs_epi32(u[1], u[3]); | 241 in[1] = _mm_packs_epi32(u[1], u[3]); |
242 transpose_4x4(in); | 242 transpose_4x4(in); |
243 } | 243 } |
244 | 244 |
245 void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output, | 245 void vp9_fht4x4_sse2(const int16_t *input, int16_t *output, |
246 int stride, int tx_type) { | 246 int stride, int tx_type) { |
247 __m128i in[4]; | 247 __m128i in[4]; |
248 load_buffer_4x4(input, in, stride); | 248 |
249 switch (tx_type) { | 249 switch (tx_type) { |
250 case 0: // DCT_DCT | 250 case DCT_DCT: |
251 fdct4_sse2(in); | 251 vp9_fdct4x4_sse2(input, output, stride); |
252 fdct4_sse2(in); | |
253 break; | 252 break; |
254 case 1: // ADST_DCT | 253 case ADST_DCT: |
| 254 load_buffer_4x4(input, in, stride); |
255 fadst4_sse2(in); | 255 fadst4_sse2(in); |
256 fdct4_sse2(in); | 256 fdct4_sse2(in); |
| 257 write_buffer_4x4(output, in); |
257 break; | 258 break; |
258 case 2: // DCT_ADST | 259 case DCT_ADST: |
| 260 load_buffer_4x4(input, in, stride); |
259 fdct4_sse2(in); | 261 fdct4_sse2(in); |
260 fadst4_sse2(in); | 262 fadst4_sse2(in); |
| 263 write_buffer_4x4(output, in); |
261 break; | 264 break; |
262 case 3: // ADST_ADST | 265 case ADST_ADST: |
| 266 load_buffer_4x4(input, in, stride); |
263 fadst4_sse2(in); | 267 fadst4_sse2(in); |
264 fadst4_sse2(in); | 268 fadst4_sse2(in); |
| 269 write_buffer_4x4(output, in); |
265 break; | 270 break; |
266 default: | 271 default: |
267 assert(0); | 272 assert(0); |
268 break; | 273 break; |
269 } | 274 } |
270 write_buffer_4x4(output, in); | |
271 } | 275 } |
272 | 276 |
273 void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { | 277 void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { |
274 int pass; | 278 int pass; |
275 // Constants | 279 // Constants |
276 // When we use them, in one case, they are all the same. In all others | 280 // When we use them, in one case, they are all the same. In all others |
277 // it's a pair of them that we need to repeat four times. This is done | 281 // it's a pair of them that we need to repeat four times. This is done |
278 // by constructing the 32 bit constant corresponding to that pair. | 282 // by constructing the 32 bit constant corresponding to that pair. |
279 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 283 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
280 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 284 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
(...skipping 738 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1019 in[3] = _mm_sub_epi16(k__const_0, s2); | 1023 in[3] = _mm_sub_epi16(k__const_0, s2); |
1020 in[4] = s3; | 1024 in[4] = s3; |
1021 in[5] = _mm_sub_epi16(k__const_0, s7); | 1025 in[5] = _mm_sub_epi16(k__const_0, s7); |
1022 in[6] = s5; | 1026 in[6] = s5; |
1023 in[7] = _mm_sub_epi16(k__const_0, s1); | 1027 in[7] = _mm_sub_epi16(k__const_0, s1); |
1024 | 1028 |
1025 // transpose | 1029 // transpose |
1026 array_transpose_8x8(in, in); | 1030 array_transpose_8x8(in, in); |
1027 } | 1031 } |
1028 | 1032 |
1029 void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output, | 1033 void vp9_fht8x8_sse2(const int16_t *input, int16_t *output, |
1030 int stride, int tx_type) { | 1034 int stride, int tx_type) { |
1031 __m128i in[8]; | 1035 __m128i in[8]; |
1032 load_buffer_8x8(input, in, stride); | 1036 |
1033 switch (tx_type) { | 1037 switch (tx_type) { |
1034 case 0: // DCT_DCT | 1038 case DCT_DCT: |
1035 fdct8_sse2(in); | 1039 vp9_fdct8x8_sse2(input, output, stride); |
1036 fdct8_sse2(in); | |
1037 break; | 1040 break; |
1038 case 1: // ADST_DCT | 1041 case ADST_DCT: |
| 1042 load_buffer_8x8(input, in, stride); |
1039 fadst8_sse2(in); | 1043 fadst8_sse2(in); |
1040 fdct8_sse2(in); | 1044 fdct8_sse2(in); |
| 1045 right_shift_8x8(in, 1); |
| 1046 write_buffer_8x8(output, in, 8); |
1041 break; | 1047 break; |
1042 case 2: // DCT_ADST | 1048 case DCT_ADST: |
| 1049 load_buffer_8x8(input, in, stride); |
1043 fdct8_sse2(in); | 1050 fdct8_sse2(in); |
1044 fadst8_sse2(in); | 1051 fadst8_sse2(in); |
| 1052 right_shift_8x8(in, 1); |
| 1053 write_buffer_8x8(output, in, 8); |
1045 break; | 1054 break; |
1046 case 3: // ADST_ADST | 1055 case ADST_ADST: |
| 1056 load_buffer_8x8(input, in, stride); |
1047 fadst8_sse2(in); | 1057 fadst8_sse2(in); |
1048 fadst8_sse2(in); | 1058 fadst8_sse2(in); |
| 1059 right_shift_8x8(in, 1); |
| 1060 write_buffer_8x8(output, in, 8); |
1049 break; | 1061 break; |
1050 default: | 1062 default: |
1051 assert(0); | 1063 assert(0); |
1052 break; | 1064 break; |
1053 } | 1065 } |
1054 right_shift_8x8(in, 1); | |
1055 write_buffer_8x8(output, in, 8); | |
1056 } | 1066 } |
1057 | 1067 |
1058 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { | 1068 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { |
1059 // The 2D transform is done with two passes which are actually pretty | 1069 // The 2D transform is done with two passes which are actually pretty |
1060 // similar. In the first one, we transform the columns and transpose | 1070 // similar. In the first one, we transform the columns and transpose |
1061 // the results. In the second one, we transform the rows. To achieve that, | 1071 // the results. In the second one, we transform the rows. To achieve that, |
1062 // as the first pass results are transposed, we transpose the columns (that | 1072 // as the first pass results are transposed, we transpose the columns (that |
1063 // is the transposed rows) and transpose the results (so that it goes back | 1073 // is the transposed rows) and transpose the results (so that it goes back |
1064 // in normal/row positions). | 1074 // in normal/row positions). |
1065 int pass; | 1075 int pass; |
(...skipping 1459 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2525 fdct16_8col(in1); | 2535 fdct16_8col(in1); |
2526 array_transpose_16x16(in0, in1); | 2536 array_transpose_16x16(in0, in1); |
2527 } | 2537 } |
2528 | 2538 |
2529 void fadst16_sse2(__m128i *in0, __m128i *in1) { | 2539 void fadst16_sse2(__m128i *in0, __m128i *in1) { |
2530 fadst16_8col(in0); | 2540 fadst16_8col(in0); |
2531 fadst16_8col(in1); | 2541 fadst16_8col(in1); |
2532 array_transpose_16x16(in0, in1); | 2542 array_transpose_16x16(in0, in1); |
2533 } | 2543 } |
2534 | 2544 |
2535 void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output, | 2545 void vp9_fht16x16_sse2(const int16_t *input, int16_t *output, |
2536 int stride, int tx_type) { | 2546 int stride, int tx_type) { |
2537 __m128i in0[16], in1[16]; | 2547 __m128i in0[16], in1[16]; |
2538 load_buffer_16x16(input, in0, in1, stride); | 2548 |
2539 switch (tx_type) { | 2549 switch (tx_type) { |
2540 case 0: // DCT_DCT | 2550 case DCT_DCT: |
2541 fdct16_sse2(in0, in1); | 2551 vp9_fdct16x16_sse2(input, output, stride); |
2542 right_shift_16x16(in0, in1); | |
2543 fdct16_sse2(in0, in1); | |
2544 break; | 2552 break; |
2545 case 1: // ADST_DCT | 2553 case ADST_DCT: |
| 2554 load_buffer_16x16(input, in0, in1, stride); |
2546 fadst16_sse2(in0, in1); | 2555 fadst16_sse2(in0, in1); |
2547 right_shift_16x16(in0, in1); | 2556 right_shift_16x16(in0, in1); |
2548 fdct16_sse2(in0, in1); | 2557 fdct16_sse2(in0, in1); |
| 2558 write_buffer_16x16(output, in0, in1, 16); |
2549 break; | 2559 break; |
2550 case 2: // DCT_ADST | 2560 case DCT_ADST: |
| 2561 load_buffer_16x16(input, in0, in1, stride); |
2551 fdct16_sse2(in0, in1); | 2562 fdct16_sse2(in0, in1); |
2552 right_shift_16x16(in0, in1); | 2563 right_shift_16x16(in0, in1); |
2553 fadst16_sse2(in0, in1); | 2564 fadst16_sse2(in0, in1); |
| 2565 write_buffer_16x16(output, in0, in1, 16); |
2554 break; | 2566 break; |
2555 case 3: // ADST_ADST | 2567 case ADST_ADST: |
| 2568 load_buffer_16x16(input, in0, in1, stride); |
2556 fadst16_sse2(in0, in1); | 2569 fadst16_sse2(in0, in1); |
2557 right_shift_16x16(in0, in1); | 2570 right_shift_16x16(in0, in1); |
2558 fadst16_sse2(in0, in1); | 2571 fadst16_sse2(in0, in1); |
| 2572 write_buffer_16x16(output, in0, in1, 16); |
2559 break; | 2573 break; |
2560 default: | 2574 default: |
2561 assert(0); | 2575 assert(0); |
2562 break; | 2576 break; |
2563 } | 2577 } |
2564 write_buffer_16x16(output, in0, in1, 16); | |
2565 } | 2578 } |
2566 | 2579 |
2567 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2 | 2580 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2 |
2568 #define FDCT32x32_HIGH_PRECISION 0 | 2581 #define FDCT32x32_HIGH_PRECISION 0 |
2569 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" | 2582 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" |
2570 #undef FDCT32x32_2D | 2583 #undef FDCT32x32_2D |
2571 #undef FDCT32x32_HIGH_PRECISION | 2584 #undef FDCT32x32_HIGH_PRECISION |
2572 | 2585 |
2573 #define FDCT32x32_2D vp9_fdct32x32_sse2 | 2586 #define FDCT32x32_2D vp9_fdct32x32_sse2 |
2574 #define FDCT32x32_HIGH_PRECISION 1 | 2587 #define FDCT32x32_HIGH_PRECISION 1 |
2575 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT | 2588 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT |
2576 #undef FDCT32x32_2D | 2589 #undef FDCT32x32_2D |
2577 #undef FDCT32x32_HIGH_PRECISION | 2590 #undef FDCT32x32_HIGH_PRECISION |
OLD | NEW |