Chromium Code Reviews

Unified diff: source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c

Issue 181493009: libvpx: Pull from upstream (Closed)
Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 9 months ago
  1  /*
  2   * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
  3   *
  4   * Use of this source code is governed by a BSD-style license
  5   * that can be found in the LICENSE file in the root of the source
  6   * tree. An additional intellectual property rights grant can be found
  7   * in the file PATENTS. All contributing project authors may
  8   * be found in the AUTHORS file in the root of the source tree.
  9   */
  10
  11  #include <immintrin.h>  // AVX2
  12  #include "vp9/common/vp9_idct.h"  // for cospi constants
  13  #include "vpx_ports/mem.h"
  14
  15  void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) {
  16    // The 2D transform is done with two passes which are actually pretty
  17    // similar. In the first one, we transform the columns and transpose
  18    // the results. In the second one, we transform the rows. To achieve that,
- 19    // as the first pass results are transposed, we tranpose the columns (that
+ 19    // as the first pass results are transposed, we transpose the columns (that
  20    // is the transposed rows) and transpose the results (so that it goes back
  21    // in normal/row positions).
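The comment above describes the standard separable 2-D DCT: one 1-D pass over the columns, a transpose, the same pass again, and a second transpose. A scalar sketch of that structure follows; it is illustrative only (the helper name is hypothetical, it reuses the cospi/DCT_CONST constants from vp9_idct.h, and it omits the DC-bias fix-up and the final output rounding, so it shows the pass/transpose structure rather than a bit-exact equivalent of vp9_fdct4x4_avx2).

// Illustrative scalar sketch of the two-pass/transpose structure (not part
// of the patch).
static void fdct4x4_two_pass_sketch(const int16_t *input, int16_t *output,
                                    int stride) {
  int16_t buf[4 * 4];
  int16_t tmp[4 * 4];
  int pass, i, col;
  // Scale the input up by 4 bits, as the intrinsics version does below.
  for (i = 0; i < 4 * 4; ++i)
    buf[i] = (int16_t)(input[(i / 4) * stride + (i % 4)] << 4);
  for (pass = 0; pass < 2; ++pass) {
    for (col = 0; col < 4; ++col) {
      // Butterfly of one column: same math as the r0..r3 steps below.
      const int a = buf[0 * 4 + col] + buf[3 * 4 + col];
      const int b = buf[1 * 4 + col] + buf[2 * 4 + col];
      const int c = buf[1 * 4 + col] - buf[2 * 4 + col];
      const int d = buf[0 * 4 + col] - buf[3 * 4 + col];
      // Rotate by the cospi constants and round, as the madd/add/shift steps
      // below do. Writing the results to row 'col' is the transpose the
      // comment mentions.
      tmp[col * 4 + 0] = (int16_t)(((a + b) * cospi_16_64 +
                                    DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
      tmp[col * 4 + 1] = (int16_t)((c * cospi_24_64 + d * cospi_8_64 +
                                    DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
      tmp[col * 4 + 2] = (int16_t)(((a - b) * cospi_16_64 +
                                    DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
      tmp[col * 4 + 3] = (int16_t)((-c * cospi_8_64 + d * cospi_24_64 +
                                    DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
    }
    for (i = 0; i < 4 * 4; ++i)
      buf[i] = tmp[i];
  }
  for (i = 0; i < 4 * 4; ++i)
    output[i] = buf[i];
}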
  22    int pass;
  23    // Constants
  24    // When we use them, in one case, they are all the same. In all others
  25    // it's a pair of them that we need to repeat four times. This is done
  26    // by constructing the 32 bit constant corresponding to that pair.
  27    const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  28    const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  29    const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  30    const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  31    const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  32    const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  33    const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  34    const __m128i kOne = _mm_set1_epi16(1);
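The pair_set_epi16 constants exist so that a single _mm_madd_epi16 can apply two different cospi factors at once. A plausible equivalent of the macro is sketched below; the actual definition lives elsewhere in libvpx and may differ in detail.

// Plausible equivalent of pair_set_epi16 (illustrative, not the libvpx
// definition): replicate the 16-bit pair (a, b) across the register so that
// even lanes hold a and odd lanes hold b. _mm_madd_epi16(x, pair) then
// computes x[2i] * a + x[2i + 1] * b as one 32-bit result per lane pair.
#define pair_set_epi16_sketch(a, b)                                     \
  _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
                (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))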
  35    __m128i in0, in1, in2, in3;
  36    // Load inputs.
  37    {
  38      in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  39      in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  40      in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  41      in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
  42      // x = x << 4
  43      in0 = _mm_slli_epi16(in0, 4);
  44      in1 = _mm_slli_epi16(in1, 4);
  45      in2 = _mm_slli_epi16(in2, 4);
  46      in3 = _mm_slli_epi16(in3, 4);
  47      // if (i == 0 && input[0]) input[0] += 1;
  48      {
- 49        // The mask will only contain wether the first value is zero, all
+ 49        // The mask will only contain whether the first value is zero, all
  50        // other comparison will fail as something shifted by 4 (above << 4)
  51        // can never be equal to one. To increment in the non-zero case, we
  52        // add the mask and one for the first element:
  53        // - if zero, mask = -1, v = v - 1 + 1 = v
  54        // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
  55        __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
  56        in0 = _mm_add_epi16(in0, mask);
  57        in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
  58      }
  59    }
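A scalar model of the DC-bias trick above, as a sketch (the helper name is hypothetical): only lane 0 is compared against 0 via k__nonzero_bias_a, and every other lane is compared against 1, which a value already shifted left by 4 can never equal.

// Scalar model of the DC-bias trick (sketch only). scaled_dc is the DC
// coefficient after the << 4; only a non-zero DC receives the +1.
static inline int16_t dc_bias_sketch(int16_t scaled_dc) {
  const int16_t mask = (int16_t)((scaled_dc == 0) ? -1 : 0);  // lane 0 of _mm_cmpeq_epi16
  return (int16_t)(scaled_dc + mask + 1);  // add mask, then k__nonzero_bias_b
}

For example, a zero DC stays 0 (0 - 1 + 1), while an input of 2, scaled to 32, becomes 33.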
  60    // Do the two transform/transpose passes
  61    for (pass = 0; pass < 2; ++pass) {
- 62      // Transform 1/2: Add/substract
+ 62      // Transform 1/2: Add/subtract
  63      const __m128i r0 = _mm_add_epi16(in0, in3);
  64      const __m128i r1 = _mm_add_epi16(in1, in2);
  65      const __m128i r2 = _mm_sub_epi16(in1, in2);
  66      const __m128i r3 = _mm_sub_epi16(in0, in3);
  67      // Transform 1/2: Interleave to do the multiply by constants which gets us
  68      // into 32 bits.
  69      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
  70      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
  71      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
  72      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
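The unpack-then-madd idiom used here (and again in the 8x8 and 16x16 transforms below) turns two 16-bit butterfly outputs into a widened two-term dot product per result lane; a scalar model of one lane is sketched below (illustrative only).

// Scalar model of one output lane of _mm_unpacklo_epi16 + _mm_madd_epi16
// (sketch): interleaving r0/r1 and multiplying by a (ca, cb) pair constant
// is a two-term dot product widened to 32 bits, which keeps the cospi
// products from overflowing 16 bits.
static inline int32_t madd_pair_sketch(int16_t r0_lane, int16_t r1_lane,
                                       int16_t ca, int16_t cb) {
  return (int32_t)r0_lane * ca + (int32_t)r1_lane * cb;
}

With ca = cb = cospi_16_64 this is the u0 line above; with the (cospi_16_64, -cospi_16_64) pair it is u2.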
(...skipping 237 matching lines...)
  310    in6 = _mm_slli_epi16(in6, 2);
  311    in7 = _mm_slli_epi16(in7, 2);
  312
  313    // We do two passes, first the columns, then the rows. The results of the
  314    // first pass are transposed so that the same column code can be reused. The
  315    // results of the second pass are also transposed so that the rows (processed
  316    // as columns) are put back in row positions.
  317    for (pass = 0; pass < 2; pass++) {
  318      // To store results of each pass before the transpose.
  319      __m128i res0, res1, res2, res3, res4, res5, res6, res7;
- 320      // Add/substract
+ 320      // Add/subtract
  321      const __m128i q0 = _mm_add_epi16(in0, in7);
  322      const __m128i q1 = _mm_add_epi16(in1, in6);
  323      const __m128i q2 = _mm_add_epi16(in2, in5);
  324      const __m128i q3 = _mm_add_epi16(in3, in4);
  325      const __m128i q4 = _mm_sub_epi16(in3, in4);
  326      const __m128i q5 = _mm_sub_epi16(in2, in5);
  327      const __m128i q6 = _mm_sub_epi16(in1, in6);
  328      const __m128i q7 = _mm_sub_epi16(in0, in7);
  329      // Work on first four results
  330      {
- 331        // Add/substract
+ 331        // Add/subtract
  332        const __m128i r0 = _mm_add_epi16(q0, q3);
  333        const __m128i r1 = _mm_add_epi16(q1, q2);
  334        const __m128i r2 = _mm_sub_epi16(q1, q2);
  335        const __m128i r3 = _mm_sub_epi16(q0, q3);
  336        // Interleave to do the multiply by constants which gets us into 32bits
  337        const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
  338        const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
  339        const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
  340        const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
  341        const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
(...skipping 41 matching lines...)
  383        const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
  384        const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
  385        const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
  386        const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
  387        const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
  388        const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
  389        const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
  390        // Combine
  391        const __m128i r0 = _mm_packs_epi32(s0, s1);
  392        const __m128i r1 = _mm_packs_epi32(s2, s3);
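Adding k__DCT_CONST_ROUNDING and then shifting right by DCT_CONST_BITS is the usual fixed-point round-to-nearest; assuming DCT_CONST_ROUNDING is 1 << (DCT_CONST_BITS - 1), as in vp9_idct.h, each 32-bit lane is effectively the sketch below, after which _mm_packs_epi32 saturates the results back to 16 bits ("Combine").

// Scalar model of the add-rounding-then-shift pair above (sketch), assuming
// DCT_CONST_ROUNDING == 1 << (DCT_CONST_BITS - 1): round-to-nearest division
// by 2^DCT_CONST_BITS, which undoes the scaling of the cospi constants.
static inline int32_t dct_const_round_shift_sketch(int32_t x) {
  return (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
}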
- 393        // Add/substract
+ 393        // Add/subtract
  394        const __m128i x0 = _mm_add_epi16(q4, r0);
  395        const __m128i x1 = _mm_sub_epi16(q4, r0);
  396        const __m128i x2 = _mm_sub_epi16(q7, r1);
  397        const __m128i x3 = _mm_add_epi16(q7, r1);
  398        // Interleave to do the multiply by constants which gets us into 32bits
  399        const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
  400        const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
  401        const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
  402        const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
  403        const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
(...skipping 660 matching lines...)
  1064      default:
  1065        assert(0);
  1066        break;
  1067    }
  1068  }
  1069
  1070  void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
  1071    // The 2D transform is done with two passes which are actually pretty
  1072    // similar. In the first one, we transform the columns and transpose
  1073    // the results. In the second one, we transform the rows. To achieve that,
- 1074    // as the first pass results are transposed, we tranpose the columns (that
+ 1074    // as the first pass results are transposed, we transpose the columns (that
  1075    // is the transposed rows) and transpose the results (so that it goes back
  1076    // in normal/row positions).
  1077    int pass;
  1078    // We need an intermediate buffer between passes.
  1079    DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
  1080    const int16_t *in = input;
  1081    int16_t *out = intermediate;
  1082    // Constants
  1083    // When we use them, in one case, they are all the same. In all others
  1084    // it's a pair of them that we need to repeat four times. This is done
(...skipping 136 matching lines...)
  1221        step1_1 = _mm_sub_epi16(in06, in09);
  1222        step1_2 = _mm_sub_epi16(in05, in10);
  1223        step1_3 = _mm_sub_epi16(in04, in11);
  1224        step1_4 = _mm_sub_epi16(in03, in12);
  1225        step1_5 = _mm_sub_epi16(in02, in13);
  1226        step1_6 = _mm_sub_epi16(in01, in14);
  1227        step1_7 = _mm_sub_epi16(in00, in15);
  1228      }
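These differences are the odd half of the usual even/odd decomposition of a 16-point forward DCT: the sums (presumably computed in the elided lines just above and used below as input0..input7) go through an 8-point DCT and produce the even-indexed outputs, while the step1_* differences feed the odd-indexed outputs. A scalar sketch of the split (illustrative only; the even/odd names are not from the patch):

// Scalar sketch of the even/odd split used above (illustrative). The even
// sums are fed to the fdct8 step below; the odd differences correspond to
// step1_0..step1_7.
static inline void fdct16_split_sketch(const int16_t in[16],
                                       int16_t even[8], int16_t odd[8]) {
  int k;
  for (k = 0; k < 8; ++k) {
    even[k] = (int16_t)(in[k] + in[15 - k]);
    odd[k] = (int16_t)(in[7 - k] - in[8 + k]);  // e.g. step1_1 = in06 - in09
  }
}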
  1229      // Work on the first eight values; fdct8(input, even_results);
  1230      {
- 1231        // Add/substract
+ 1231        // Add/subtract
  1232        const __m128i q0 = _mm_add_epi16(input0, input7);
  1233        const __m128i q1 = _mm_add_epi16(input1, input6);
  1234        const __m128i q2 = _mm_add_epi16(input2, input5);
  1235        const __m128i q3 = _mm_add_epi16(input3, input4);
  1236        const __m128i q4 = _mm_sub_epi16(input3, input4);
  1237        const __m128i q5 = _mm_sub_epi16(input2, input5);
  1238        const __m128i q6 = _mm_sub_epi16(input1, input6);
  1239        const __m128i q7 = _mm_sub_epi16(input0, input7);
  1240        // Work on first four results
  1241        {
- 1242          // Add/substract
+ 1242          // Add/subtract
  1243          const __m128i r0 = _mm_add_epi16(q0, q3);
  1244          const __m128i r1 = _mm_add_epi16(q1, q2);
  1245          const __m128i r2 = _mm_sub_epi16(q1, q2);
  1246          const __m128i r3 = _mm_sub_epi16(q0, q3);
  1247          // Interleave to do the multiply by constants which gets us
  1248          // into 32 bits.
  1249          const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
  1250          const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
  1251          const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
  1252          const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
(...skipping 43 matching lines...)
  1296          const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
  1297          const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
  1298          const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
  1299          const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
  1300          const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
  1301          const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
  1302          const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
  1303          // Combine
  1304          const __m128i r0 = _mm_packs_epi32(s0, s1);
  1305          const __m128i r1 = _mm_packs_epi32(s2, s3);
- 1306          // Add/substract
+ 1306          // Add/subtract
  1307          const __m128i x0 = _mm_add_epi16(q4, r0);
  1308          const __m128i x1 = _mm_sub_epi16(q4, r0);
  1309          const __m128i x2 = _mm_sub_epi16(q7, r1);
  1310          const __m128i x3 = _mm_add_epi16(q7, r1);
  1311          // Interleave to do the multiply by constants which gets us
  1312          // into 32 bits.
  1313          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
  1314          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
  1315          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
  1316          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
(...skipping 1266 matching lines...)
  2583  #define FDCT32x32_HIGH_PRECISION 0
  2584  #include "vp9/encoder/x86/vp9_dct32x32_avx2.c"
  2585  #undef FDCT32x32_2D_AVX2
  2586  #undef FDCT32x32_HIGH_PRECISION
  2587
  2588  #define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2
  2589  #define FDCT32x32_HIGH_PRECISION 1
  2590  #include "vp9/encoder/x86/vp9_dct32x32_avx2.c"  // NOLINT
  2591  #undef FDCT32x32_2D_AVX2
  2592  #undef FDCT32x32_HIGH_PRECISION
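The block above compiles the same template file twice with different macro settings, so a single body generates both the reduced-precision and the high-precision 32x32 entry points (the first #define of FDCT32x32_2D_AVX2 sits in the elided lines above). A minimal sketch of the shape such a template presumably has (a hypothetical simplification, not the actual contents of vp9_dct32x32_avx2.c):

// Hypothetical simplification of the included template: one function whose
// name and precision mode are supplied by the including file via macros.
void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output, int stride) {
#if FDCT32x32_HIGH_PRECISION
  /* keep wider intermediates / extra rounding for the exact transform */
#else
  /* cheaper rounding path for the reduced-precision variant */
#endif
  (void)input;
  (void)output;
  (void)stride;
}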
