| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <emmintrin.h> // SSE2 | 11 #include <emmintrin.h> // SSE2 |
| 12 #include "vp9/common/vp9_idct.h" // for cospi constants | 12 #include "vp9/common/vp9_idct.h" // for cospi constants |
| 13 #include "vpx_ports/mem.h" | 13 #include "vpx_ports/mem.h" |
| 14 | 14 |
| 15 void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { | 15 void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { |
| 16 // The 2D transform is done with two passes which are actually pretty | 16 // The 2D transform is done with two passes which are actually pretty |
| 17 // similar. In the first one, we transform the columns and transpose | 17 // similar. In the first one, we transform the columns and transpose |
| 18 // the results. In the second one, we transform the rows. To achieve that, | 18 // the results. In the second one, we transform the rows. To achieve that, |
| 19 // as the first pass results are transposed, we tranpose the columns (that | 19 // as the first pass results are transposed, we transpose the columns (that |
| 20 // is the transposed rows) and transpose the results (so that it goes back | 20 // is the transposed rows) and transpose the results (so that it goes back |
| 21 // in normal/row positions). | 21 // in normal/row positions). |
| 22 int pass; | 22 int pass; |
| 23 // Constants | 23 // Constants |
| 24 // When we use them, in one case, they are all the same. In all others | 24 // When we use them, in one case, they are all the same. In all others |
| 25 // it's a pair of them that we need to repeat four times. This is done | 25 // it's a pair of them that we need to repeat four times. This is done |
| 26 // by constructing the 32 bit constant corresponding to that pair. | 26 // by constructing the 32 bit constant corresponding to that pair. |
| 27 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 27 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
| 28 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 28 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| 29 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); | 29 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
| (...skipping 10 matching lines...) Expand all Loading... |
| 40 (input + 1 * stride))); | 40 (input + 1 * stride))); |
| 41 in1 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); | 41 in1 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); |
| 42 in1 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *) | 42 in1 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *) |
| 43 (input + 3 * stride)), in1); | 43 (input + 3 * stride)), in1); |
| 44 | 44 |
| 45 // x = x << 4 | 45 // x = x << 4 |
| 46 in0 = _mm_slli_epi16(in0, 4); | 46 in0 = _mm_slli_epi16(in0, 4); |
| 47 in1 = _mm_slli_epi16(in1, 4); | 47 in1 = _mm_slli_epi16(in1, 4); |
| 48 // if (i == 0 && input[0]) input[0] += 1; | 48 // if (i == 0 && input[0]) input[0] += 1; |
| 49 { | 49 { |
| 50 // The mask will only contain wether the first value is zero, all | 50 // The mask will only contain whether the first value is zero, all |
| 51 // other comparisons will fail as something shifted by 4 (above << 4) | 51 // other comparisons will fail as something shifted by 4 (above << 4) |
| 52 // can never be equal to one. To increment in the non-zero case, we | 52 // can never be equal to one. To increment in the non-zero case, we |
| 53 // add the mask and one for the first element: | 53 // add the mask and one for the first element: |
| 54 // - if zero, mask = -1, v = v - 1 + 1 = v | 54 // - if zero, mask = -1, v = v - 1 + 1 = v |
| 55 // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 | 55 // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 |
| 56 __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); | 56 __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); |
| 57 in0 = _mm_add_epi16(in0, mask); | 57 in0 = _mm_add_epi16(in0, mask); |
| 58 in0 = _mm_add_epi16(in0, k__nonzero_bias_b); | 58 in0 = _mm_add_epi16(in0, k__nonzero_bias_b); |
| 59 } | 59 } |
| 60 } | 60 } |
| 61 // Do the two transform/transpose passes | 61 // Do the two transform/transpose passes |
| 62 for (pass = 0; pass < 2; ++pass) { | 62 for (pass = 0; pass < 2; ++pass) { |
| 63 // Transform 1/2: Add/substract | 63 // Transform 1/2: Add/subtract |
| 64 const __m128i r0 = _mm_add_epi16(in0, in1); | 64 const __m128i r0 = _mm_add_epi16(in0, in1); |
| 65 const __m128i r1 = _mm_sub_epi16(in0, in1); | 65 const __m128i r1 = _mm_sub_epi16(in0, in1); |
| 66 const __m128i r2 = _mm_unpacklo_epi64(r0, r1); | 66 const __m128i r2 = _mm_unpacklo_epi64(r0, r1); |
| 67 const __m128i r3 = _mm_unpackhi_epi64(r0, r1); | 67 const __m128i r3 = _mm_unpackhi_epi64(r0, r1); |
| 68 // Transform 1/2: Interleave to do the multiply by constants which gets us | 68 // Transform 1/2: Interleave to do the multiply by constants which gets us |
| 69 // into 32 bits. | 69 // into 32 bits. |
| 70 const __m128i t0 = _mm_unpacklo_epi16(r2, r3); | 70 const __m128i t0 = _mm_unpacklo_epi16(r2, r3); |
| 71 const __m128i t2 = _mm_unpackhi_epi16(r2, r3); | 71 const __m128i t2 = _mm_unpackhi_epi16(r2, r3); |
| 72 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); | 72 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); |
| 73 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); | 73 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); |
| (...skipping 234 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 308 in6 = _mm_slli_epi16(in6, 2); | 308 in6 = _mm_slli_epi16(in6, 2); |
| 309 in7 = _mm_slli_epi16(in7, 2); | 309 in7 = _mm_slli_epi16(in7, 2); |
| 310 | 310 |
| 311 // We do two passes, first the columns, then the rows. The results of the | 311 // We do two passes, first the columns, then the rows. The results of the |
| 312 // first pass are transposed so that the same column code can be reused. The | 312 // first pass are transposed so that the same column code can be reused. The |
| 313 // results of the second pass are also transposed so that the rows (processed | 313 // results of the second pass are also transposed so that the rows (processed |
| 314 // as columns) are put back in row positions. | 314 // as columns) are put back in row positions. |
| 315 for (pass = 0; pass < 2; pass++) { | 315 for (pass = 0; pass < 2; pass++) { |
| 316 // To store results of each pass before the transpose. | 316 // To store results of each pass before the transpose. |
| 317 __m128i res0, res1, res2, res3, res4, res5, res6, res7; | 317 __m128i res0, res1, res2, res3, res4, res5, res6, res7; |
| 318 // Add/substract | 318 // Add/subtract |
| 319 const __m128i q0 = _mm_add_epi16(in0, in7); | 319 const __m128i q0 = _mm_add_epi16(in0, in7); |
| 320 const __m128i q1 = _mm_add_epi16(in1, in6); | 320 const __m128i q1 = _mm_add_epi16(in1, in6); |
| 321 const __m128i q2 = _mm_add_epi16(in2, in5); | 321 const __m128i q2 = _mm_add_epi16(in2, in5); |
| 322 const __m128i q3 = _mm_add_epi16(in3, in4); | 322 const __m128i q3 = _mm_add_epi16(in3, in4); |
| 323 const __m128i q4 = _mm_sub_epi16(in3, in4); | 323 const __m128i q4 = _mm_sub_epi16(in3, in4); |
| 324 const __m128i q5 = _mm_sub_epi16(in2, in5); | 324 const __m128i q5 = _mm_sub_epi16(in2, in5); |
| 325 const __m128i q6 = _mm_sub_epi16(in1, in6); | 325 const __m128i q6 = _mm_sub_epi16(in1, in6); |
| 326 const __m128i q7 = _mm_sub_epi16(in0, in7); | 326 const __m128i q7 = _mm_sub_epi16(in0, in7); |
| 327 // Work on first four results | 327 // Work on first four results |
| 328 { | 328 { |
| 329 // Add/substract | 329 // Add/subtract |
| 330 const __m128i r0 = _mm_add_epi16(q0, q3); | 330 const __m128i r0 = _mm_add_epi16(q0, q3); |
| 331 const __m128i r1 = _mm_add_epi16(q1, q2); | 331 const __m128i r1 = _mm_add_epi16(q1, q2); |
| 332 const __m128i r2 = _mm_sub_epi16(q1, q2); | 332 const __m128i r2 = _mm_sub_epi16(q1, q2); |
| 333 const __m128i r3 = _mm_sub_epi16(q0, q3); | 333 const __m128i r3 = _mm_sub_epi16(q0, q3); |
| 334 // Interleave to do the multiply by constants which gets us into 32 bits | 334 // Interleave to do the multiply by constants which gets us into 32 bits |
| 335 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); | 335 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); |
| 336 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); | 336 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); |
| 337 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); | 337 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); |
| 338 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); | 338 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); |
| 339 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); | 339 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 381 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); | 381 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); |
| 382 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); | 382 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); |
| 383 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); | 383 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); |
| 384 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); | 384 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); |
| 385 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); | 385 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); |
| 386 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); | 386 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); |
| 387 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); | 387 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); |
| 388 // Combine | 388 // Combine |
| 389 const __m128i r0 = _mm_packs_epi32(s0, s1); | 389 const __m128i r0 = _mm_packs_epi32(s0, s1); |
| 390 const __m128i r1 = _mm_packs_epi32(s2, s3); | 390 const __m128i r1 = _mm_packs_epi32(s2, s3); |
| 391 // Add/substract | 391 // Add/subtract |
| 392 const __m128i x0 = _mm_add_epi16(q4, r0); | 392 const __m128i x0 = _mm_add_epi16(q4, r0); |
| 393 const __m128i x1 = _mm_sub_epi16(q4, r0); | 393 const __m128i x1 = _mm_sub_epi16(q4, r0); |
| 394 const __m128i x2 = _mm_sub_epi16(q7, r1); | 394 const __m128i x2 = _mm_sub_epi16(q7, r1); |
| 395 const __m128i x3 = _mm_add_epi16(q7, r1); | 395 const __m128i x3 = _mm_add_epi16(q7, r1); |
| 396 // Interleave to do the multiply by constants which gets us into 32 bits | 396 // Interleave to do the multiply by constants which gets us into 32 bits |
| 397 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); | 397 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); |
| 398 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); | 398 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); |
| 399 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); | 399 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); |
| 400 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); | 400 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); |
| 401 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); | 401 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); |
| (...skipping 660 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1062 default: | 1062 default: |
| 1063 assert(0); | 1063 assert(0); |
| 1064 break; | 1064 break; |
| 1065 } | 1065 } |
| 1066 } | 1066 } |
| 1067 | 1067 |
| 1068 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { | 1068 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { |
| 1069 // The 2D transform is done with two passes which are actually pretty | 1069 // The 2D transform is done with two passes which are actually pretty |
| 1070 // similar. In the first one, we transform the columns and transpose | 1070 // similar. In the first one, we transform the columns and transpose |
| 1071 // the results. In the second one, we transform the rows. To achieve that, | 1071 // the results. In the second one, we transform the rows. To achieve that, |
| 1072 // as the first pass results are transposed, we tranpose the columns (that | 1072 // as the first pass results are transposed, we transpose the columns (that |
| 1073 // is the transposed rows) and transpose the results (so that it goes back | 1073 // is the transposed rows) and transpose the results (so that it goes back |
| 1074 // in normal/row positions). | 1074 // in normal/row positions). |
| 1075 int pass; | 1075 int pass; |
| 1076 // We need an intermediate buffer between passes. | 1076 // We need an intermediate buffer between passes. |
| 1077 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); | 1077 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); |
| 1078 const int16_t *in = input; | 1078 const int16_t *in = input; |
| 1079 int16_t *out = intermediate; | 1079 int16_t *out = intermediate; |
| 1080 // Constants | 1080 // Constants |
| 1081 // When we use them, in one case, they are all the same. In all others | 1081 // When we use them, in one case, they are all the same. In all others |
| 1082 // it's a pair of them that we need to repeat four times. This is done | 1082 // it's a pair of them that we need to repeat four times. This is done |
| (...skipping 136 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1219 step1_1 = _mm_sub_epi16(in06, in09); | 1219 step1_1 = _mm_sub_epi16(in06, in09); |
| 1220 step1_2 = _mm_sub_epi16(in05, in10); | 1220 step1_2 = _mm_sub_epi16(in05, in10); |
| 1221 step1_3 = _mm_sub_epi16(in04, in11); | 1221 step1_3 = _mm_sub_epi16(in04, in11); |
| 1222 step1_4 = _mm_sub_epi16(in03, in12); | 1222 step1_4 = _mm_sub_epi16(in03, in12); |
| 1223 step1_5 = _mm_sub_epi16(in02, in13); | 1223 step1_5 = _mm_sub_epi16(in02, in13); |
| 1224 step1_6 = _mm_sub_epi16(in01, in14); | 1224 step1_6 = _mm_sub_epi16(in01, in14); |
| 1225 step1_7 = _mm_sub_epi16(in00, in15); | 1225 step1_7 = _mm_sub_epi16(in00, in15); |
| 1226 } | 1226 } |
| 1227 // Work on the first eight values; fdct8(input, even_results); | 1227 // Work on the first eight values; fdct8(input, even_results); |
| 1228 { | 1228 { |
| 1229 // Add/substract | 1229 // Add/subtract |
| 1230 const __m128i q0 = _mm_add_epi16(input0, input7); | 1230 const __m128i q0 = _mm_add_epi16(input0, input7); |
| 1231 const __m128i q1 = _mm_add_epi16(input1, input6); | 1231 const __m128i q1 = _mm_add_epi16(input1, input6); |
| 1232 const __m128i q2 = _mm_add_epi16(input2, input5); | 1232 const __m128i q2 = _mm_add_epi16(input2, input5); |
| 1233 const __m128i q3 = _mm_add_epi16(input3, input4); | 1233 const __m128i q3 = _mm_add_epi16(input3, input4); |
| 1234 const __m128i q4 = _mm_sub_epi16(input3, input4); | 1234 const __m128i q4 = _mm_sub_epi16(input3, input4); |
| 1235 const __m128i q5 = _mm_sub_epi16(input2, input5); | 1235 const __m128i q5 = _mm_sub_epi16(input2, input5); |
| 1236 const __m128i q6 = _mm_sub_epi16(input1, input6); | 1236 const __m128i q6 = _mm_sub_epi16(input1, input6); |
| 1237 const __m128i q7 = _mm_sub_epi16(input0, input7); | 1237 const __m128i q7 = _mm_sub_epi16(input0, input7); |
| 1238 // Work on first four results | 1238 // Work on first four results |
| 1239 { | 1239 { |
| 1240 // Add/substract | 1240 // Add/subtract |
| 1241 const __m128i r0 = _mm_add_epi16(q0, q3); | 1241 const __m128i r0 = _mm_add_epi16(q0, q3); |
| 1242 const __m128i r1 = _mm_add_epi16(q1, q2); | 1242 const __m128i r1 = _mm_add_epi16(q1, q2); |
| 1243 const __m128i r2 = _mm_sub_epi16(q1, q2); | 1243 const __m128i r2 = _mm_sub_epi16(q1, q2); |
| 1244 const __m128i r3 = _mm_sub_epi16(q0, q3); | 1244 const __m128i r3 = _mm_sub_epi16(q0, q3); |
| 1245 // Interleave to do the multiply by constants which gets us | 1245 // Interleave to do the multiply by constants which gets us |
| 1246 // into 32 bits. | 1246 // into 32 bits. |
| 1247 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); | 1247 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); |
| 1248 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); | 1248 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); |
| 1249 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); | 1249 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); |
| 1250 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); | 1250 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1294 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); | 1294 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); |
| 1295 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); | 1295 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); |
| 1296 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); | 1296 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); |
| 1297 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); | 1297 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); |
| 1298 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); | 1298 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); |
| 1299 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); | 1299 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); |
| 1300 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); | 1300 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); |
| 1301 // Combine | 1301 // Combine |
| 1302 const __m128i r0 = _mm_packs_epi32(s0, s1); | 1302 const __m128i r0 = _mm_packs_epi32(s0, s1); |
| 1303 const __m128i r1 = _mm_packs_epi32(s2, s3); | 1303 const __m128i r1 = _mm_packs_epi32(s2, s3); |
| 1304 // Add/substract | 1304 // Add/subtract |
| 1305 const __m128i x0 = _mm_add_epi16(q4, r0); | 1305 const __m128i x0 = _mm_add_epi16(q4, r0); |
| 1306 const __m128i x1 = _mm_sub_epi16(q4, r0); | 1306 const __m128i x1 = _mm_sub_epi16(q4, r0); |
| 1307 const __m128i x2 = _mm_sub_epi16(q7, r1); | 1307 const __m128i x2 = _mm_sub_epi16(q7, r1); |
| 1308 const __m128i x3 = _mm_add_epi16(q7, r1); | 1308 const __m128i x3 = _mm_add_epi16(q7, r1); |
| 1309 // Interleave to do the multiply by constants which gets us | 1309 // Interleave to do the multiply by constants which gets us |
| 1310 // into 32 bits. | 1310 // into 32 bits. |
| 1311 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); | 1311 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); |
| 1312 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); | 1312 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); |
| 1313 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); | 1313 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); |
| 1314 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); | 1314 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); |
| (...skipping 1266 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2581 #define FDCT32x32_HIGH_PRECISION 0 | 2581 #define FDCT32x32_HIGH_PRECISION 0 |
| 2582 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" | 2582 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" |
| 2583 #undef FDCT32x32_2D | 2583 #undef FDCT32x32_2D |
| 2584 #undef FDCT32x32_HIGH_PRECISION | 2584 #undef FDCT32x32_HIGH_PRECISION |
| 2585 | 2585 |
| 2586 #define FDCT32x32_2D vp9_fdct32x32_sse2 | 2586 #define FDCT32x32_2D vp9_fdct32x32_sse2 |
| 2587 #define FDCT32x32_HIGH_PRECISION 1 | 2587 #define FDCT32x32_HIGH_PRECISION 1 |
| 2588 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT | 2588 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT |
| 2589 #undef FDCT32x32_2D | 2589 #undef FDCT32x32_2D |
| 2590 #undef FDCT32x32_HIGH_PRECISION | 2590 #undef FDCT32x32_HIGH_PRECISION |
| OLD | NEW |