| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <immintrin.h> // AVX2 | 11 #include <immintrin.h> // AVX2 |
| 12 #include "vp9/common/vp9_idct.h" // for cospi constants | 12 #include "vp9/common/vp9_idct.h" // for cospi constants |
| 13 #include "vpx_ports/mem.h" | 13 #include "vpx_ports/mem.h" |
| 14 | 14 |
| 15 void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) { | 15 void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) { |
| 16 // The 2D transform is done with two passes which are actually pretty | 16 // The 2D transform is done with two passes which are actually pretty |
| 17 // similar. In the first one, we transform the columns and transpose | 17 // similar. In the first one, we transform the columns and transpose |
| 18 // the results. In the second one, we transform the rows. To achieve that, | 18 // the results. In the second one, we transform the rows. To achieve that, |
| 19 // as the first pass results are transposed, we tranpose the columns (that | 19 // as the first pass results are transposed, we transpose the columns (that |
| 20 // is the transposed rows) and transpose the results (so that it goes back | 20 // is the transposed rows) and transpose the results (so that it goes back |
| 21 // in normal/row positions). | 21 // in normal/row positions). |
| 22 int pass; | 22 int pass; |
| 23 // Constants | 23 // Constants |
| 24 // When we use them, in one case, they are all the same. In all others | 24 // When we use them, in one case, they are all the same. In all others |
| 25 // it's a pair of them that we need to repeat four times. This is done | 25 // it's a pair of them that we need to repeat four times. This is done |
| 26 // by constructing the 32 bit constant corresponding to that pair. | 26 // by constructing the 32 bit constant corresponding to that pair. |
| 27 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 27 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
| 28 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 28 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| 29 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 29 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
| 30 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 30 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
| 31 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 31 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| 32 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); | 32 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); |
| 33 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); | 33 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); |
| 34 const __m128i kOne = _mm_set1_epi16(1); | 34 const __m128i kOne = _mm_set1_epi16(1); |
| 35 __m128i in0, in1, in2, in3; | 35 __m128i in0, in1, in2, in3; |
| 36 // Load inputs. | 36 // Load inputs. |
| 37 { | 37 { |
| 38 in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); | 38 in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); |
| 39 in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); | 39 in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); |
| 40 in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); | 40 in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); |
| 41 in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); | 41 in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); |
| 42 // x = x << 4 | 42 // x = x << 4 |
| 43 in0 = _mm_slli_epi16(in0, 4); | 43 in0 = _mm_slli_epi16(in0, 4); |
| 44 in1 = _mm_slli_epi16(in1, 4); | 44 in1 = _mm_slli_epi16(in1, 4); |
| 45 in2 = _mm_slli_epi16(in2, 4); | 45 in2 = _mm_slli_epi16(in2, 4); |
| 46 in3 = _mm_slli_epi16(in3, 4); | 46 in3 = _mm_slli_epi16(in3, 4); |
| 47 // if (i == 0 && input[0]) input[0] += 1; | 47 // if (i == 0 && input[0]) input[0] += 1; |
| 48 { | 48 { |
| 49 // The mask will only contain wether the first value is zero, all | 49 // The mask will only contain whether the first value is zero, all |
| 50 // other comparison will fail as something shifted by 4 (above << 4) | 50 // other comparisons will fail as something shifted by 4 (above << 4) |
| 51 // can never be equal to one. To increment in the non-zero case, we | 51 // can never be equal to one. To increment in the non-zero case, we |
| 52 // add the mask and one for the first element: | 52 // add the mask and one for the first element: |
| 53 // - if zero, mask = -1, v = v - 1 + 1 = v | 53 // - if zero, mask = -1, v = v - 1 + 1 = v |
| 54 // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 | 54 // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 |
| 55 __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); | 55 __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); |
| 56 in0 = _mm_add_epi16(in0, mask); | 56 in0 = _mm_add_epi16(in0, mask); |
| 57 in0 = _mm_add_epi16(in0, k__nonzero_bias_b); | 57 in0 = _mm_add_epi16(in0, k__nonzero_bias_b); |
| 58 } | 58 } |
| 59 } | 59 } |
| 60 // Do the two transform/transpose passes | 60 // Do the two transform/transpose passes |
| 61 for (pass = 0; pass < 2; ++pass) { | 61 for (pass = 0; pass < 2; ++pass) { |
| 62 // Transform 1/2: Add/substract | 62 // Transform 1/2: Add/subtract |
| 63 const __m128i r0 = _mm_add_epi16(in0, in3); | 63 const __m128i r0 = _mm_add_epi16(in0, in3); |
| 64 const __m128i r1 = _mm_add_epi16(in1, in2); | 64 const __m128i r1 = _mm_add_epi16(in1, in2); |
| 65 const __m128i r2 = _mm_sub_epi16(in1, in2); | 65 const __m128i r2 = _mm_sub_epi16(in1, in2); |
| 66 const __m128i r3 = _mm_sub_epi16(in0, in3); | 66 const __m128i r3 = _mm_sub_epi16(in0, in3); |
| 67 // Transform 1/2: Interleave to do the multiply by constants which gets us | 67 // Transform 1/2: Interleave to do the multiply by constants which gets us |
| 68 // into 32 bits. | 68 // into 32 bits. |
| 69 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); | 69 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); |
| 70 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); | 70 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); |
| 71 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); | 71 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); |
| 72 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); | 72 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); |
| (...skipping 237 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 310 in6 = _mm_slli_epi16(in6, 2); | 310 in6 = _mm_slli_epi16(in6, 2); |
| 311 in7 = _mm_slli_epi16(in7, 2); | 311 in7 = _mm_slli_epi16(in7, 2); |
| 312 | 312 |
| 313 // We do two passes, first the columns, then the rows. The results of the | 313 // We do two passes, first the columns, then the rows. The results of the |
| 314 // first pass are transposed so that the same column code can be reused. The | 314 // first pass are transposed so that the same column code can be reused. The |
| 315 // results of the second pass are also transposed so that the rows (processed | 315 // results of the second pass are also transposed so that the rows (processed |
| 316 // as columns) are put back in row positions. | 316 // as columns) are put back in row positions. |
| 317 for (pass = 0; pass < 2; pass++) { | 317 for (pass = 0; pass < 2; pass++) { |
| 318 // To store results of each pass before the transpose. | 318 // To store results of each pass before the transpose. |
| 319 __m128i res0, res1, res2, res3, res4, res5, res6, res7; | 319 __m128i res0, res1, res2, res3, res4, res5, res6, res7; |
| 320 // Add/substract | 320 // Add/subtract |
| 321 const __m128i q0 = _mm_add_epi16(in0, in7); | 321 const __m128i q0 = _mm_add_epi16(in0, in7); |
| 322 const __m128i q1 = _mm_add_epi16(in1, in6); | 322 const __m128i q1 = _mm_add_epi16(in1, in6); |
| 323 const __m128i q2 = _mm_add_epi16(in2, in5); | 323 const __m128i q2 = _mm_add_epi16(in2, in5); |
| 324 const __m128i q3 = _mm_add_epi16(in3, in4); | 324 const __m128i q3 = _mm_add_epi16(in3, in4); |
| 325 const __m128i q4 = _mm_sub_epi16(in3, in4); | 325 const __m128i q4 = _mm_sub_epi16(in3, in4); |
| 326 const __m128i q5 = _mm_sub_epi16(in2, in5); | 326 const __m128i q5 = _mm_sub_epi16(in2, in5); |
| 327 const __m128i q6 = _mm_sub_epi16(in1, in6); | 327 const __m128i q6 = _mm_sub_epi16(in1, in6); |
| 328 const __m128i q7 = _mm_sub_epi16(in0, in7); | 328 const __m128i q7 = _mm_sub_epi16(in0, in7); |
| 329 // Work on first four results | 329 // Work on first four results |
| 330 { | 330 { |
| 331 // Add/substract | 331 // Add/subtract |
| 332 const __m128i r0 = _mm_add_epi16(q0, q3); | 332 const __m128i r0 = _mm_add_epi16(q0, q3); |
| 333 const __m128i r1 = _mm_add_epi16(q1, q2); | 333 const __m128i r1 = _mm_add_epi16(q1, q2); |
| 334 const __m128i r2 = _mm_sub_epi16(q1, q2); | 334 const __m128i r2 = _mm_sub_epi16(q1, q2); |
| 335 const __m128i r3 = _mm_sub_epi16(q0, q3); | 335 const __m128i r3 = _mm_sub_epi16(q0, q3); |
| 336 // Interleave to do the multiply by constants which gets us into 32bits | 336 // Interleave to do the multiply by constants which gets us into 32 bits |
| 337 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); | 337 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); |
| 338 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); | 338 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); |
| 339 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); | 339 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); |
| 340 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); | 340 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); |
| 341 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); | 341 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 383 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); | 383 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); |
| 384 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); | 384 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); |
| 385 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); | 385 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); |
| 386 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); | 386 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); |
| 387 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); | 387 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); |
| 388 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); | 388 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); |
| 389 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); | 389 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); |
| 390 // Combine | 390 // Combine |
| 391 const __m128i r0 = _mm_packs_epi32(s0, s1); | 391 const __m128i r0 = _mm_packs_epi32(s0, s1); |
| 392 const __m128i r1 = _mm_packs_epi32(s2, s3); | 392 const __m128i r1 = _mm_packs_epi32(s2, s3); |
| 393 // Add/substract | 393 // Add/subtract |
| 394 const __m128i x0 = _mm_add_epi16(q4, r0); | 394 const __m128i x0 = _mm_add_epi16(q4, r0); |
| 395 const __m128i x1 = _mm_sub_epi16(q4, r0); | 395 const __m128i x1 = _mm_sub_epi16(q4, r0); |
| 396 const __m128i x2 = _mm_sub_epi16(q7, r1); | 396 const __m128i x2 = _mm_sub_epi16(q7, r1); |
| 397 const __m128i x3 = _mm_add_epi16(q7, r1); | 397 const __m128i x3 = _mm_add_epi16(q7, r1); |
| 398 // Interleave to do the multiply by constants which gets us into 32bits | 398 // Interleave to do the multiply by constants which gets us into 32 bits |
| 399 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); | 399 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); |
| 400 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); | 400 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); |
| 401 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); | 401 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); |
| 402 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); | 402 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); |
| 403 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); | 403 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); |
| (...skipping 660 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1064 default: | 1064 default: |
| 1065 assert(0); | 1065 assert(0); |
| 1066 break; | 1066 break; |
| 1067 } | 1067 } |
| 1068 } | 1068 } |
| 1069 | 1069 |
| 1070 void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { | 1070 void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { |
| 1071 // The 2D transform is done with two passes which are actually pretty | 1071 // The 2D transform is done with two passes which are actually pretty |
| 1072 // similar. In the first one, we transform the columns and transpose | 1072 // similar. In the first one, we transform the columns and transpose |
| 1073 // the results. In the second one, we transform the rows. To achieve that, | 1073 // the results. In the second one, we transform the rows. To achieve that, |
| 1074 // as the first pass results are transposed, we tranpose the columns (that | 1074 // as the first pass results are transposed, we transpose the columns (that |
| 1075 // is the transposed rows) and transpose the results (so that it goes back | 1075 // is the transposed rows) and transpose the results (so that it goes back |
| 1076 // in normal/row positions). | 1076 // in normal/row positions). |
| 1077 int pass; | 1077 int pass; |
| 1078 // We need an intermediate buffer between passes. | 1078 // We need an intermediate buffer between passes. |
| 1079 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); | 1079 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); |
| 1080 const int16_t *in = input; | 1080 const int16_t *in = input; |
| 1081 int16_t *out = intermediate; | 1081 int16_t *out = intermediate; |
| 1082 // Constants | 1082 // Constants |
| 1083 // When we use them, in one case, they are all the same. In all others | 1083 // When we use them, in one case, they are all the same. In all others |
| 1084 // it's a pair of them that we need to repeat four times. This is done | 1084 // it's a pair of them that we need to repeat four times. This is done |
| (...skipping 136 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1221 step1_1 = _mm_sub_epi16(in06, in09); | 1221 step1_1 = _mm_sub_epi16(in06, in09); |
| 1222 step1_2 = _mm_sub_epi16(in05, in10); | 1222 step1_2 = _mm_sub_epi16(in05, in10); |
| 1223 step1_3 = _mm_sub_epi16(in04, in11); | 1223 step1_3 = _mm_sub_epi16(in04, in11); |
| 1224 step1_4 = _mm_sub_epi16(in03, in12); | 1224 step1_4 = _mm_sub_epi16(in03, in12); |
| 1225 step1_5 = _mm_sub_epi16(in02, in13); | 1225 step1_5 = _mm_sub_epi16(in02, in13); |
| 1226 step1_6 = _mm_sub_epi16(in01, in14); | 1226 step1_6 = _mm_sub_epi16(in01, in14); |
| 1227 step1_7 = _mm_sub_epi16(in00, in15); | 1227 step1_7 = _mm_sub_epi16(in00, in15); |
| 1228 } | 1228 } |
| 1229 // Work on the first eight values; fdct8(input, even_results); | 1229 // Work on the first eight values; fdct8(input, even_results); |
| 1230 { | 1230 { |
| 1231 // Add/substract | 1231 // Add/subtract |
| 1232 const __m128i q0 = _mm_add_epi16(input0, input7); | 1232 const __m128i q0 = _mm_add_epi16(input0, input7); |
| 1233 const __m128i q1 = _mm_add_epi16(input1, input6); | 1233 const __m128i q1 = _mm_add_epi16(input1, input6); |
| 1234 const __m128i q2 = _mm_add_epi16(input2, input5); | 1234 const __m128i q2 = _mm_add_epi16(input2, input5); |
| 1235 const __m128i q3 = _mm_add_epi16(input3, input4); | 1235 const __m128i q3 = _mm_add_epi16(input3, input4); |
| 1236 const __m128i q4 = _mm_sub_epi16(input3, input4); | 1236 const __m128i q4 = _mm_sub_epi16(input3, input4); |
| 1237 const __m128i q5 = _mm_sub_epi16(input2, input5); | 1237 const __m128i q5 = _mm_sub_epi16(input2, input5); |
| 1238 const __m128i q6 = _mm_sub_epi16(input1, input6); | 1238 const __m128i q6 = _mm_sub_epi16(input1, input6); |
| 1239 const __m128i q7 = _mm_sub_epi16(input0, input7); | 1239 const __m128i q7 = _mm_sub_epi16(input0, input7); |
| 1240 // Work on first four results | 1240 // Work on first four results |
| 1241 { | 1241 { |
| 1242 // Add/substract | 1242 // Add/subtract |
| 1243 const __m128i r0 = _mm_add_epi16(q0, q3); | 1243 const __m128i r0 = _mm_add_epi16(q0, q3); |
| 1244 const __m128i r1 = _mm_add_epi16(q1, q2); | 1244 const __m128i r1 = _mm_add_epi16(q1, q2); |
| 1245 const __m128i r2 = _mm_sub_epi16(q1, q2); | 1245 const __m128i r2 = _mm_sub_epi16(q1, q2); |
| 1246 const __m128i r3 = _mm_sub_epi16(q0, q3); | 1246 const __m128i r3 = _mm_sub_epi16(q0, q3); |
| 1247 // Interleave to do the multiply by constants which gets us | 1247 // Interleave to do the multiply by constants which gets us |
| 1248 // into 32 bits. | 1248 // into 32 bits. |
| 1249 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); | 1249 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); |
| 1250 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); | 1250 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); |
| 1251 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); | 1251 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); |
| 1252 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); | 1252 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1296 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); | 1296 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); |
| 1297 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); | 1297 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); |
| 1298 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); | 1298 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); |
| 1299 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); | 1299 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); |
| 1300 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); | 1300 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); |
| 1301 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); | 1301 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); |
| 1302 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); | 1302 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); |
| 1303 // Combine | 1303 // Combine |
| 1304 const __m128i r0 = _mm_packs_epi32(s0, s1); | 1304 const __m128i r0 = _mm_packs_epi32(s0, s1); |
| 1305 const __m128i r1 = _mm_packs_epi32(s2, s3); | 1305 const __m128i r1 = _mm_packs_epi32(s2, s3); |
| 1306 // Add/substract | 1306 // Add/subtract |
| 1307 const __m128i x0 = _mm_add_epi16(q4, r0); | 1307 const __m128i x0 = _mm_add_epi16(q4, r0); |
| 1308 const __m128i x1 = _mm_sub_epi16(q4, r0); | 1308 const __m128i x1 = _mm_sub_epi16(q4, r0); |
| 1309 const __m128i x2 = _mm_sub_epi16(q7, r1); | 1309 const __m128i x2 = _mm_sub_epi16(q7, r1); |
| 1310 const __m128i x3 = _mm_add_epi16(q7, r1); | 1310 const __m128i x3 = _mm_add_epi16(q7, r1); |
| 1311 // Interleave to do the multiply by constants which gets us | 1311 // Interleave to do the multiply by constants which gets us |
| 1312 // into 32 bits. | 1312 // into 32 bits. |
| 1313 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); | 1313 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); |
| 1314 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); | 1314 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); |
| 1315 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); | 1315 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); |
| 1316 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); | 1316 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); |
| (...skipping 1266 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2583 #define FDCT32x32_HIGH_PRECISION 0 | 2583 #define FDCT32x32_HIGH_PRECISION 0 |
| 2584 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c" | 2584 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c" |
| 2585 #undef FDCT32x32_2D_AVX2 | 2585 #undef FDCT32x32_2D_AVX2 |
| 2586 #undef FDCT32x32_HIGH_PRECISION | 2586 #undef FDCT32x32_HIGH_PRECISION |
| 2587 | 2587 |
| 2588 #define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2 | 2588 #define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2 |
| 2589 #define FDCT32x32_HIGH_PRECISION 1 | 2589 #define FDCT32x32_HIGH_PRECISION 1 |
| 2590 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT | 2590 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT |
| 2591 #undef FDCT32x32_2D_AVX2 | 2591 #undef FDCT32x32_2D_AVX2 |
| 2592 #undef FDCT32x32_HIGH_PRECISION | 2592 #undef FDCT32x32_HIGH_PRECISION |
| OLD | NEW |