| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <immintrin.h> // AVX2 | 11 #include <immintrin.h> // AVX2 |
| 12 #include "vp9/common/vp9_idct.h" // for cospi constants | 12 #include "vp9/common/vp9_idct.h" // for cospi constants |
| 13 #include "vpx_ports/mem.h" | 13 #include "vpx_ports/mem.h" |
| 14 | 14 |
| 15 void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) { | 15 void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) { |
| 16 // The 2D transform is done with two passes which are actually pretty | 16 // The 2D transform is done with two passes which are actually pretty |
| 17 // similar. In the first one, we transform the columns and transpose | 17 // similar. In the first one, we transform the columns and transpose |
| 18 // the results. In the second one, we transform the rows. To achieve that, | 18 // the results. In the second one, we transform the rows. To achieve that, |
| 19 // as the first pass results are transposed, we tranpose the columns (that | 19 // as the first pass results are transposed, we transpose the columns (that |
| 20 // is the transposed rows) and transpose the results (so that it goes back | 20 // is the transposed rows) and transpose the results (so that it goes back |
| 21 // in normal/row positions). | 21 // in normal/row positions). |
| 22 int pass; | 22 int pass; |
| 23 // Constants | 23 // Constants |
| 24 // When we use them, in one case, they are all the same. In all others | 24 // When we use them, in one case, they are all the same. In all others |
| 25 // it's a pair of them that we need to repeat four times. This is done | 25 // it's a pair of them that we need to repeat four times. This is done |
| 26 // by constructing the 32 bit constant corresponding to that pair. | 26 // by constructing the 32 bit constant corresponding to that pair. |
| 27 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 27 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
| 28 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 28 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| 29 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 29 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
| 30 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 30 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
| 31 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 31 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| 32 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); | 32 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); |
| 33 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); | 33 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); |
| 34 const __m128i kOne = _mm_set1_epi16(1); | 34 const __m128i kOne = _mm_set1_epi16(1); |
| 35 __m128i in0, in1, in2, in3; | 35 __m128i in0, in1, in2, in3; |
| 36 // Load inputs. | 36 // Load inputs. |
| 37 { | 37 { |
| 38 in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); | 38 in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); |
| 39 in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); | 39 in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); |
| 40 in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); | 40 in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); |
| 41 in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); | 41 in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); |
| 42 // x = x << 4 | 42 // x = x << 4 |
| 43 in0 = _mm_slli_epi16(in0, 4); | 43 in0 = _mm_slli_epi16(in0, 4); |
| 44 in1 = _mm_slli_epi16(in1, 4); | 44 in1 = _mm_slli_epi16(in1, 4); |
| 45 in2 = _mm_slli_epi16(in2, 4); | 45 in2 = _mm_slli_epi16(in2, 4); |
| 46 in3 = _mm_slli_epi16(in3, 4); | 46 in3 = _mm_slli_epi16(in3, 4); |
| 47 // if (i == 0 && input[0]) input[0] += 1; | 47 // if (i == 0 && input[0]) input[0] += 1; |
| 48 { | 48 { |
| 49 // The mask will only contain wether the first value is zero, all | 49 // The mask will only contain whether the first value is zero, all |
| 50 // other comparison will fail as something shifted by 4 (above << 4) | 50 // other comparisons will fail as something shifted by 4 (above << 4) |
| 51 // can never be equal to one. To increment in the non-zero case, we | 51 // can never be equal to one. To increment in the non-zero case, we |
| 52 // add the mask and one for the first element: | 52 // add the mask and one for the first element: |
| 53 // - if zero, mask = -1, v = v - 1 + 1 = v | 53 // - if zero, mask = -1, v = v - 1 + 1 = v |
| 54 // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 | 54 // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 |
| 55 __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); | 55 __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); |
| 56 in0 = _mm_add_epi16(in0, mask); | 56 in0 = _mm_add_epi16(in0, mask); |
| 57 in0 = _mm_add_epi16(in0, k__nonzero_bias_b); | 57 in0 = _mm_add_epi16(in0, k__nonzero_bias_b); |
| 58 } | 58 } |
| 59 } | 59 } |
| 60 // Do the two transform/transpose passes | 60 // Do the two transform/transpose passes |
| 61 for (pass = 0; pass < 2; ++pass) { | 61 for (pass = 0; pass < 2; ++pass) { |
| 62 // Transform 1/2: Add/substract | 62 // Transform 1/2: Add/subtract |
| 63 const __m128i r0 = _mm_add_epi16(in0, in3); | 63 const __m128i r0 = _mm_add_epi16(in0, in3); |
| 64 const __m128i r1 = _mm_add_epi16(in1, in2); | 64 const __m128i r1 = _mm_add_epi16(in1, in2); |
| 65 const __m128i r2 = _mm_sub_epi16(in1, in2); | 65 const __m128i r2 = _mm_sub_epi16(in1, in2); |
| 66 const __m128i r3 = _mm_sub_epi16(in0, in3); | 66 const __m128i r3 = _mm_sub_epi16(in0, in3); |
| 67 // Transform 1/2: Interleave to do the multiply by constants which gets us | 67 // Transform 1/2: Interleave to do the multiply by constants which gets us |
| 68 // into 32 bits. | 68 // into 32 bits. |
| 69 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); | 69 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); |
| 70 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); | 70 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); |
| 71 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); | 71 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); |
| 72 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); | 72 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); |
| (...skipping 237 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 310 in6 = _mm_slli_epi16(in6, 2); | 310 in6 = _mm_slli_epi16(in6, 2); |
| 311 in7 = _mm_slli_epi16(in7, 2); | 311 in7 = _mm_slli_epi16(in7, 2); |
| 312 | 312 |
| 313 // We do two passes, first the columns, then the rows. The results of the | 313 // We do two passes, first the columns, then the rows. The results of the |
| 314 // first pass are transposed so that the same column code can be reused. The | 314 // first pass are transposed so that the same column code can be reused. The |
| 315 // results of the second pass are also transposed so that the rows (processed | 315 // results of the second pass are also transposed so that the rows (processed |
| 316 // as columns) are put back in row positions. | 316 // as columns) are put back in row positions. |
| 317 for (pass = 0; pass < 2; pass++) { | 317 for (pass = 0; pass < 2; pass++) { |
| 318 // To store results of each pass before the transpose. | 318 // To store results of each pass before the transpose. |
| 319 __m128i res0, res1, res2, res3, res4, res5, res6, res7; | 319 __m128i res0, res1, res2, res3, res4, res5, res6, res7; |
| 320 // Add/substract | 320 // Add/subtract |
| 321 const __m128i q0 = _mm_add_epi16(in0, in7); | 321 const __m128i q0 = _mm_add_epi16(in0, in7); |
| 322 const __m128i q1 = _mm_add_epi16(in1, in6); | 322 const __m128i q1 = _mm_add_epi16(in1, in6); |
| 323 const __m128i q2 = _mm_add_epi16(in2, in5); | 323 const __m128i q2 = _mm_add_epi16(in2, in5); |
| 324 const __m128i q3 = _mm_add_epi16(in3, in4); | 324 const __m128i q3 = _mm_add_epi16(in3, in4); |
| 325 const __m128i q4 = _mm_sub_epi16(in3, in4); | 325 const __m128i q4 = _mm_sub_epi16(in3, in4); |
| 326 const __m128i q5 = _mm_sub_epi16(in2, in5); | 326 const __m128i q5 = _mm_sub_epi16(in2, in5); |
| 327 const __m128i q6 = _mm_sub_epi16(in1, in6); | 327 const __m128i q6 = _mm_sub_epi16(in1, in6); |
| 328 const __m128i q7 = _mm_sub_epi16(in0, in7); | 328 const __m128i q7 = _mm_sub_epi16(in0, in7); |
| 329 // Work on first four results | 329 // Work on first four results |
| 330 { | 330 { |
| 331 // Add/substract | 331 // Add/subtract |
| 332 const __m128i r0 = _mm_add_epi16(q0, q3); | 332 const __m128i r0 = _mm_add_epi16(q0, q3); |
| 333 const __m128i r1 = _mm_add_epi16(q1, q2); | 333 const __m128i r1 = _mm_add_epi16(q1, q2); |
| 334 const __m128i r2 = _mm_sub_epi16(q1, q2); | 334 const __m128i r2 = _mm_sub_epi16(q1, q2); |
| 335 const __m128i r3 = _mm_sub_epi16(q0, q3); | 335 const __m128i r3 = _mm_sub_epi16(q0, q3); |
| 336 // Interleave to do the multiply by constants which gets us into 32bits | 336 // Interleave to do the multiply by constants which gets us into 32 bits |
| 337 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); | 337 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); |
| 338 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); | 338 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); |
| 339 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); | 339 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); |
| 340 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); | 340 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); |
| 341 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); | 341 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 383 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); | 383 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); |
| 384 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); | 384 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); |
| 385 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); | 385 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); |
| 386 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); | 386 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); |
| 387 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); | 387 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); |
| 388 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); | 388 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); |
| 389 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); | 389 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); |
| 390 // Combine | 390 // Combine |
| 391 const __m128i r0 = _mm_packs_epi32(s0, s1); | 391 const __m128i r0 = _mm_packs_epi32(s0, s1); |
| 392 const __m128i r1 = _mm_packs_epi32(s2, s3); | 392 const __m128i r1 = _mm_packs_epi32(s2, s3); |
| 393 // Add/substract | 393 // Add/subtract |
| 394 const __m128i x0 = _mm_add_epi16(q4, r0); | 394 const __m128i x0 = _mm_add_epi16(q4, r0); |
| 395 const __m128i x1 = _mm_sub_epi16(q4, r0); | 395 const __m128i x1 = _mm_sub_epi16(q4, r0); |
| 396 const __m128i x2 = _mm_sub_epi16(q7, r1); | 396 const __m128i x2 = _mm_sub_epi16(q7, r1); |
| 397 const __m128i x3 = _mm_add_epi16(q7, r1); | 397 const __m128i x3 = _mm_add_epi16(q7, r1); |
| 398 // Interleave to do the multiply by constants which gets us into 32bits | 398 // Interleave to do the multiply by constants which gets us into 32 bits |
| 399 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); | 399 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); |
| 400 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); | 400 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); |
| 401 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); | 401 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); |
| 402 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); | 402 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); |
| 403 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); | 403 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); |
| (...skipping 660 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1064 default: | 1064 default: |
| 1065 assert(0); | 1065 assert(0); |
| 1066 break; | 1066 break; |
| 1067 } | 1067 } |
| 1068 } | 1068 } |
| 1069 | 1069 |
| 1070 void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { | 1070 void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { |
| 1071 // The 2D transform is done with two passes which are actually pretty | 1071 // The 2D transform is done with two passes which are actually pretty |
| 1072 // similar. In the first one, we transform the columns and transpose | 1072 // similar. In the first one, we transform the columns and transpose |
| 1073 // the results. In the second one, we transform the rows. To achieve that, | 1073 // the results. In the second one, we transform the rows. To achieve that, |
| 1074 // as the first pass results are transposed, we tranpose the columns (that | 1074 // as the first pass results are transposed, we transpose the columns (that |
| 1075 // is the transposed rows) and transpose the results (so that it goes back | 1075 // is the transposed rows) and transpose the results (so that it goes back |
| 1076 // in normal/row positions). | 1076 // in normal/row positions). |
| 1077 int pass; | 1077 int pass; |
| 1078 // We need an intermediate buffer between passes. | 1078 // We need an intermediate buffer between passes. |
| 1079 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); | 1079 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); |
| 1080 const int16_t *in = input; | 1080 const int16_t *in = input; |
| 1081 int16_t *out = intermediate; | 1081 int16_t *out = intermediate; |
| 1082 // Constants | 1082 // Constants |
| 1083 // When we use them, in one case, they are all the same. In all others | 1083 // When we use them, in one case, they are all the same. In all others |
| 1084 // it's a pair of them that we need to repeat four times. This is done | 1084 // it's a pair of them that we need to repeat four times. This is done |
| (...skipping 136 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1221 step1_1 = _mm_sub_epi16(in06, in09); | 1221 step1_1 = _mm_sub_epi16(in06, in09); |
| 1222 step1_2 = _mm_sub_epi16(in05, in10); | 1222 step1_2 = _mm_sub_epi16(in05, in10); |
| 1223 step1_3 = _mm_sub_epi16(in04, in11); | 1223 step1_3 = _mm_sub_epi16(in04, in11); |
| 1224 step1_4 = _mm_sub_epi16(in03, in12); | 1224 step1_4 = _mm_sub_epi16(in03, in12); |
| 1225 step1_5 = _mm_sub_epi16(in02, in13); | 1225 step1_5 = _mm_sub_epi16(in02, in13); |
| 1226 step1_6 = _mm_sub_epi16(in01, in14); | 1226 step1_6 = _mm_sub_epi16(in01, in14); |
| 1227 step1_7 = _mm_sub_epi16(in00, in15); | 1227 step1_7 = _mm_sub_epi16(in00, in15); |
| 1228 } | 1228 } |
| 1229 // Work on the first eight values; fdct8(input, even_results); | 1229 // Work on the first eight values; fdct8(input, even_results); |
| 1230 { | 1230 { |
| 1231 // Add/substract | 1231 // Add/subtract |
| 1232 const __m128i q0 = _mm_add_epi16(input0, input7); | 1232 const __m128i q0 = _mm_add_epi16(input0, input7); |
| 1233 const __m128i q1 = _mm_add_epi16(input1, input6); | 1233 const __m128i q1 = _mm_add_epi16(input1, input6); |
| 1234 const __m128i q2 = _mm_add_epi16(input2, input5); | 1234 const __m128i q2 = _mm_add_epi16(input2, input5); |
| 1235 const __m128i q3 = _mm_add_epi16(input3, input4); | 1235 const __m128i q3 = _mm_add_epi16(input3, input4); |
| 1236 const __m128i q4 = _mm_sub_epi16(input3, input4); | 1236 const __m128i q4 = _mm_sub_epi16(input3, input4); |
| 1237 const __m128i q5 = _mm_sub_epi16(input2, input5); | 1237 const __m128i q5 = _mm_sub_epi16(input2, input5); |
| 1238 const __m128i q6 = _mm_sub_epi16(input1, input6); | 1238 const __m128i q6 = _mm_sub_epi16(input1, input6); |
| 1239 const __m128i q7 = _mm_sub_epi16(input0, input7); | 1239 const __m128i q7 = _mm_sub_epi16(input0, input7); |
| 1240 // Work on first four results | 1240 // Work on first four results |
| 1241 { | 1241 { |
| 1242 // Add/substract | 1242 // Add/subtract |
| 1243 const __m128i r0 = _mm_add_epi16(q0, q3); | 1243 const __m128i r0 = _mm_add_epi16(q0, q3); |
| 1244 const __m128i r1 = _mm_add_epi16(q1, q2); | 1244 const __m128i r1 = _mm_add_epi16(q1, q2); |
| 1245 const __m128i r2 = _mm_sub_epi16(q1, q2); | 1245 const __m128i r2 = _mm_sub_epi16(q1, q2); |
| 1246 const __m128i r3 = _mm_sub_epi16(q0, q3); | 1246 const __m128i r3 = _mm_sub_epi16(q0, q3); |
| 1247 // Interleave to do the multiply by constants which gets us | 1247 // Interleave to do the multiply by constants which gets us |
| 1248 // into 32 bits. | 1248 // into 32 bits. |
| 1249 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); | 1249 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); |
| 1250 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); | 1250 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); |
| 1251 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); | 1251 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); |
| 1252 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); | 1252 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1296 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); | 1296 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); |
| 1297 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); | 1297 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); |
| 1298 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); | 1298 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); |
| 1299 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); | 1299 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); |
| 1300 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); | 1300 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); |
| 1301 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); | 1301 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); |
| 1302 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); | 1302 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); |
| 1303 // Combine | 1303 // Combine |
| 1304 const __m128i r0 = _mm_packs_epi32(s0, s1); | 1304 const __m128i r0 = _mm_packs_epi32(s0, s1); |
| 1305 const __m128i r1 = _mm_packs_epi32(s2, s3); | 1305 const __m128i r1 = _mm_packs_epi32(s2, s3); |
| 1306 // Add/substract | 1306 // Add/subtract |
| 1307 const __m128i x0 = _mm_add_epi16(q4, r0); | 1307 const __m128i x0 = _mm_add_epi16(q4, r0); |
| 1308 const __m128i x1 = _mm_sub_epi16(q4, r0); | 1308 const __m128i x1 = _mm_sub_epi16(q4, r0); |
| 1309 const __m128i x2 = _mm_sub_epi16(q7, r1); | 1309 const __m128i x2 = _mm_sub_epi16(q7, r1); |
| 1310 const __m128i x3 = _mm_add_epi16(q7, r1); | 1310 const __m128i x3 = _mm_add_epi16(q7, r1); |
| 1311 // Interleave to do the multiply by constants which gets us | 1311 // Interleave to do the multiply by constants which gets us |
| 1312 // into 32 bits. | 1312 // into 32 bits. |
| 1313 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); | 1313 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); |
| 1314 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); | 1314 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); |
| 1315 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); | 1315 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); |
| 1316 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); | 1316 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); |
| (...skipping 1266 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2583 #define FDCT32x32_HIGH_PRECISION 0 | 2583 #define FDCT32x32_HIGH_PRECISION 0 |
| 2584 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c" | 2584 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c" |
| 2585 #undef FDCT32x32_2D_AVX2 | 2585 #undef FDCT32x32_2D_AVX2 |
| 2586 #undef FDCT32x32_HIGH_PRECISION | 2586 #undef FDCT32x32_HIGH_PRECISION |
| 2587 | 2587 |
| 2588 #define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2 | 2588 #define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2 |
| 2589 #define FDCT32x32_HIGH_PRECISION 1 | 2589 #define FDCT32x32_HIGH_PRECISION 1 |
| 2590 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT | 2590 #include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT |
| 2591 #undef FDCT32x32_2D_AVX2 | 2591 #undef FDCT32x32_2D_AVX2 |
| 2592 #undef FDCT32x32_HIGH_PRECISION | 2592 #undef FDCT32x32_HIGH_PRECISION |
| OLD | NEW |