source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c - Issue 181493009: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c

Issue 181493009: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include <emmintrin.h> // SSE2	11 #include <emmintrin.h> // SSE2

12 #include "vp9/common/vp9_idct.h" // for cospi constants	12 #include "vp9/common/vp9_idct.h" // for cospi constants

13 #include "vpx_ports/mem.h"	13 #include "vpx_ports/mem.h"

14	14

15 void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {	15 void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {

16 // The 2D transform is done with two passes which are actually pretty	16 // The 2D transform is done with two passes which are actually pretty

17 // similar. In the first one, we transform the columns and transpose	17 // similar. In the first one, we transform the columns and transpose

18 // the results. In the second one, we transform the rows. To achieve that,	18 // the results. In the second one, we transform the rows. To achieve that,

19 // as the first pass results are transposed, we tranpose the columns (that	19 // as the first pass results are transposed, we transpose the columns (that

20 // is the transposed rows) and transpose the results (so that it goes back	20 // is the transposed rows) and transpose the results (so that it goes back

21 // in normal/row positions).	21 // in normal/row positions).

22 int pass;	22 int pass;

23 // Constants	23 // Constants

24 // When we use them, in one case, they are all the same. In all others	24 // When we use them, in one case, they are all the same. In all others

25 // it's a pair of them that we need to repeat four times. This is done	25 // it's a pair of them that we need to repeat four times. This is done

26 // by constructing the 32 bit constant corresponding to that pair.	26 // by constructing the 32 bit constant corresponding to that pair.

27 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);	27 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);

28 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);	28 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);

29 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);	29 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);

(...skipping 10 matching lines...) Expand all Loading...
40 (input + 1 * stride)));	40 (input + 1 * stride)));

41 in1 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));	41 in1 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));

42 in1 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)	42 in1 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)

43 (input + 3 * stride)), in1);	43 (input + 3 * stride)), in1);

44	44

45 // x = x << 4	45 // x = x << 4

46 in0 = _mm_slli_epi16(in0, 4);	46 in0 = _mm_slli_epi16(in0, 4);

47 in1 = _mm_slli_epi16(in1, 4);	47 in1 = _mm_slli_epi16(in1, 4);

48 // if (i == 0 && input[0]) input[0] += 1;	48 // if (i == 0 && input[0]) input[0] += 1;

49 {	49 {

50 // The mask will only contain wether the first value is zero, all	50 // The mask will only contain whether the first value is zero, all

51 // other comparison will fail as something shifted by 4 (above << 4)	51 // other comparison will fail as something shifted by 4 (above << 4)

52 // can never be equal to one. To increment in the non-zero case, we	52 // can never be equal to one. To increment in the non-zero case, we

53 // add the mask and one for the first element:	53 // add the mask and one for the first element:

54 // - if zero, mask = -1, v = v - 1 + 1 = v	54 // - if zero, mask = -1, v = v - 1 + 1 = v

55 // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1	55 // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1

56 __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);	56 __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);

57 in0 = _mm_add_epi16(in0, mask);	57 in0 = _mm_add_epi16(in0, mask);

58 in0 = _mm_add_epi16(in0, k__nonzero_bias_b);	58 in0 = _mm_add_epi16(in0, k__nonzero_bias_b);

59 }	59 }

60 }	60 }

61 // Do the two transform/transpose passes	61 // Do the two transform/transpose passes

62 for (pass = 0; pass < 2; ++pass) {	62 for (pass = 0; pass < 2; ++pass) {

63 // Transform 1/2: Add/substract	63 // Transform 1/2: Add/subtract

64 const __m128i r0 = _mm_add_epi16(in0, in1);	64 const __m128i r0 = _mm_add_epi16(in0, in1);

65 const __m128i r1 = _mm_sub_epi16(in0, in1);	65 const __m128i r1 = _mm_sub_epi16(in0, in1);

66 const __m128i r2 = _mm_unpacklo_epi64(r0, r1);	66 const __m128i r2 = _mm_unpacklo_epi64(r0, r1);

67 const __m128i r3 = _mm_unpackhi_epi64(r0, r1);	67 const __m128i r3 = _mm_unpackhi_epi64(r0, r1);

68 // Transform 1/2: Interleave to do the multiply by constants which gets us	68 // Transform 1/2: Interleave to do the multiply by constants which gets us

69 // into 32 bits.	69 // into 32 bits.

70 const __m128i t0 = _mm_unpacklo_epi16(r2, r3);	70 const __m128i t0 = _mm_unpacklo_epi16(r2, r3);

71 const __m128i t2 = _mm_unpackhi_epi16(r2, r3);	71 const __m128i t2 = _mm_unpackhi_epi16(r2, r3);

72 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);	72 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);

73 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);	73 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);

(...skipping 234 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
308 in6 = _mm_slli_epi16(in6, 2);	308 in6 = _mm_slli_epi16(in6, 2);

309 in7 = _mm_slli_epi16(in7, 2);	309 in7 = _mm_slli_epi16(in7, 2);

310	310

311 // We do two passes, first the columns, then the rows. The results of the	311 // We do two passes, first the columns, then the rows. The results of the

312 // first pass are transposed so that the same column code can be reused. The	312 // first pass are transposed so that the same column code can be reused. The

313 // results of the second pass are also transposed so that the rows (processed	313 // results of the second pass are also transposed so that the rows (processed

314 // as columns) are put back in row positions.	314 // as columns) are put back in row positions.

315 for (pass = 0; pass < 2; pass++) {	315 for (pass = 0; pass < 2; pass++) {

316 // To store results of each pass before the transpose.	316 // To store results of each pass before the transpose.

317 __m128i res0, res1, res2, res3, res4, res5, res6, res7;	317 __m128i res0, res1, res2, res3, res4, res5, res6, res7;

318 // Add/substract	318 // Add/subtract

319 const __m128i q0 = _mm_add_epi16(in0, in7);	319 const __m128i q0 = _mm_add_epi16(in0, in7);

320 const __m128i q1 = _mm_add_epi16(in1, in6);	320 const __m128i q1 = _mm_add_epi16(in1, in6);

321 const __m128i q2 = _mm_add_epi16(in2, in5);	321 const __m128i q2 = _mm_add_epi16(in2, in5);

322 const __m128i q3 = _mm_add_epi16(in3, in4);	322 const __m128i q3 = _mm_add_epi16(in3, in4);

323 const __m128i q4 = _mm_sub_epi16(in3, in4);	323 const __m128i q4 = _mm_sub_epi16(in3, in4);

324 const __m128i q5 = _mm_sub_epi16(in2, in5);	324 const __m128i q5 = _mm_sub_epi16(in2, in5);

325 const __m128i q6 = _mm_sub_epi16(in1, in6);	325 const __m128i q6 = _mm_sub_epi16(in1, in6);

326 const __m128i q7 = _mm_sub_epi16(in0, in7);	326 const __m128i q7 = _mm_sub_epi16(in0, in7);

327 // Work on first four results	327 // Work on first four results

328 {	328 {

329 // Add/substract	329 // Add/subtract

330 const __m128i r0 = _mm_add_epi16(q0, q3);	330 const __m128i r0 = _mm_add_epi16(q0, q3);

331 const __m128i r1 = _mm_add_epi16(q1, q2);	331 const __m128i r1 = _mm_add_epi16(q1, q2);

332 const __m128i r2 = _mm_sub_epi16(q1, q2);	332 const __m128i r2 = _mm_sub_epi16(q1, q2);

333 const __m128i r3 = _mm_sub_epi16(q0, q3);	333 const __m128i r3 = _mm_sub_epi16(q0, q3);

334 // Interleave to do the multiply by constants which gets us into 32bits	334 // Interleave to do the multiply by constants which gets us into 32bits

335 const __m128i t0 = _mm_unpacklo_epi16(r0, r1);	335 const __m128i t0 = _mm_unpacklo_epi16(r0, r1);

336 const __m128i t1 = _mm_unpackhi_epi16(r0, r1);	336 const __m128i t1 = _mm_unpackhi_epi16(r0, r1);

337 const __m128i t2 = _mm_unpacklo_epi16(r2, r3);	337 const __m128i t2 = _mm_unpacklo_epi16(r2, r3);

338 const __m128i t3 = _mm_unpackhi_epi16(r2, r3);	338 const __m128i t3 = _mm_unpackhi_epi16(r2, r3);

339 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);	339 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);

(...skipping 41 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
381 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);	381 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);

382 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);	382 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);

383 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);	383 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);

384 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);	384 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);

385 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);	385 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);

386 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);	386 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);

387 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);	387 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);

388 // Combine	388 // Combine

389 const __m128i r0 = _mm_packs_epi32(s0, s1);	389 const __m128i r0 = _mm_packs_epi32(s0, s1);

390 const __m128i r1 = _mm_packs_epi32(s2, s3);	390 const __m128i r1 = _mm_packs_epi32(s2, s3);

391 // Add/substract	391 // Add/subtract

392 const __m128i x0 = _mm_add_epi16(q4, r0);	392 const __m128i x0 = _mm_add_epi16(q4, r0);

393 const __m128i x1 = _mm_sub_epi16(q4, r0);	393 const __m128i x1 = _mm_sub_epi16(q4, r0);

394 const __m128i x2 = _mm_sub_epi16(q7, r1);	394 const __m128i x2 = _mm_sub_epi16(q7, r1);

395 const __m128i x3 = _mm_add_epi16(q7, r1);	395 const __m128i x3 = _mm_add_epi16(q7, r1);

396 // Interleave to do the multiply by constants which gets us into 32bits	396 // Interleave to do the multiply by constants which gets us into 32bits

397 const __m128i t0 = _mm_unpacklo_epi16(x0, x3);	397 const __m128i t0 = _mm_unpacklo_epi16(x0, x3);

398 const __m128i t1 = _mm_unpackhi_epi16(x0, x3);	398 const __m128i t1 = _mm_unpackhi_epi16(x0, x3);

399 const __m128i t2 = _mm_unpacklo_epi16(x1, x2);	399 const __m128i t2 = _mm_unpacklo_epi16(x1, x2);

400 const __m128i t3 = _mm_unpackhi_epi16(x1, x2);	400 const __m128i t3 = _mm_unpackhi_epi16(x1, x2);

401 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);	401 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);

(...skipping 660 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1062 default:	1062 default:

1063 assert(0);	1063 assert(0);

1064 break;	1064 break;

1065 }	1065 }

1066 }	1066 }

1067	1067

1068 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {	1068 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {

1069 // The 2D transform is done with two passes which are actually pretty	1069 // The 2D transform is done with two passes which are actually pretty

1070 // similar. In the first one, we transform the columns and transpose	1070 // similar. In the first one, we transform the columns and transpose

1071 // the results. In the second one, we transform the rows. To achieve that,	1071 // the results. In the second one, we transform the rows. To achieve that,

1072 // as the first pass results are transposed, we tranpose the columns (that	1072 // as the first pass results are transposed, we transpose the columns (that

1073 // is the transposed rows) and transpose the results (so that it goes back	1073 // is the transposed rows) and transpose the results (so that it goes back

1074 // in normal/row positions).	1074 // in normal/row positions).

1075 int pass;	1075 int pass;

1076 // We need an intermediate buffer between passes.	1076 // We need an intermediate buffer between passes.

1077 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);	1077 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);

1078 const int16_t *in = input;	1078 const int16_t *in = input;

1079 int16_t *out = intermediate;	1079 int16_t *out = intermediate;

1080 // Constants	1080 // Constants

1081 // When we use them, in one case, they are all the same. In all others	1081 // When we use them, in one case, they are all the same. In all others

1082 // it's a pair of them that we need to repeat four times. This is done	1082 // it's a pair of them that we need to repeat four times. This is done

(...skipping 136 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1219 step1_1 = _mm_sub_epi16(in06, in09);	1219 step1_1 = _mm_sub_epi16(in06, in09);

1220 step1_2 = _mm_sub_epi16(in05, in10);	1220 step1_2 = _mm_sub_epi16(in05, in10);

1221 step1_3 = _mm_sub_epi16(in04, in11);	1221 step1_3 = _mm_sub_epi16(in04, in11);

1222 step1_4 = _mm_sub_epi16(in03, in12);	1222 step1_4 = _mm_sub_epi16(in03, in12);

1223 step1_5 = _mm_sub_epi16(in02, in13);	1223 step1_5 = _mm_sub_epi16(in02, in13);

1224 step1_6 = _mm_sub_epi16(in01, in14);	1224 step1_6 = _mm_sub_epi16(in01, in14);

1225 step1_7 = _mm_sub_epi16(in00, in15);	1225 step1_7 = _mm_sub_epi16(in00, in15);

1226 }	1226 }

1227 // Work on the first eight values; fdct8(input, even_results);	1227 // Work on the first eight values; fdct8(input, even_results);

1228 {	1228 {

1229 // Add/substract	1229 // Add/subtract

1230 const __m128i q0 = _mm_add_epi16(input0, input7);	1230 const __m128i q0 = _mm_add_epi16(input0, input7);

1231 const __m128i q1 = _mm_add_epi16(input1, input6);	1231 const __m128i q1 = _mm_add_epi16(input1, input6);

1232 const __m128i q2 = _mm_add_epi16(input2, input5);	1232 const __m128i q2 = _mm_add_epi16(input2, input5);

1233 const __m128i q3 = _mm_add_epi16(input3, input4);	1233 const __m128i q3 = _mm_add_epi16(input3, input4);

1234 const __m128i q4 = _mm_sub_epi16(input3, input4);	1234 const __m128i q4 = _mm_sub_epi16(input3, input4);

1235 const __m128i q5 = _mm_sub_epi16(input2, input5);	1235 const __m128i q5 = _mm_sub_epi16(input2, input5);

1236 const __m128i q6 = _mm_sub_epi16(input1, input6);	1236 const __m128i q6 = _mm_sub_epi16(input1, input6);

1237 const __m128i q7 = _mm_sub_epi16(input0, input7);	1237 const __m128i q7 = _mm_sub_epi16(input0, input7);

1238 // Work on first four results	1238 // Work on first four results

1239 {	1239 {

1240 // Add/substract	1240 // Add/subtract

1241 const __m128i r0 = _mm_add_epi16(q0, q3);	1241 const __m128i r0 = _mm_add_epi16(q0, q3);

1242 const __m128i r1 = _mm_add_epi16(q1, q2);	1242 const __m128i r1 = _mm_add_epi16(q1, q2);

1243 const __m128i r2 = _mm_sub_epi16(q1, q2);	1243 const __m128i r2 = _mm_sub_epi16(q1, q2);

1244 const __m128i r3 = _mm_sub_epi16(q0, q3);	1244 const __m128i r3 = _mm_sub_epi16(q0, q3);

1245 // Interleave to do the multiply by constants which gets us	1245 // Interleave to do the multiply by constants which gets us

1246 // into 32 bits.	1246 // into 32 bits.

1247 const __m128i t0 = _mm_unpacklo_epi16(r0, r1);	1247 const __m128i t0 = _mm_unpacklo_epi16(r0, r1);

1248 const __m128i t1 = _mm_unpackhi_epi16(r0, r1);	1248 const __m128i t1 = _mm_unpackhi_epi16(r0, r1);

1249 const __m128i t2 = _mm_unpacklo_epi16(r2, r3);	1249 const __m128i t2 = _mm_unpacklo_epi16(r2, r3);

1250 const __m128i t3 = _mm_unpackhi_epi16(r2, r3);	1250 const __m128i t3 = _mm_unpackhi_epi16(r2, r3);

(...skipping 43 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1294 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);	1294 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);

1295 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);	1295 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);

1296 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);	1296 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);

1297 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);	1297 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);

1298 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);	1298 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);

1299 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);	1299 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);

1300 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);	1300 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);

1301 // Combine	1301 // Combine

1302 const __m128i r0 = _mm_packs_epi32(s0, s1);	1302 const __m128i r0 = _mm_packs_epi32(s0, s1);

1303 const __m128i r1 = _mm_packs_epi32(s2, s3);	1303 const __m128i r1 = _mm_packs_epi32(s2, s3);

1304 // Add/substract	1304 // Add/subtract

1305 const __m128i x0 = _mm_add_epi16(q4, r0);	1305 const __m128i x0 = _mm_add_epi16(q4, r0);

1306 const __m128i x1 = _mm_sub_epi16(q4, r0);	1306 const __m128i x1 = _mm_sub_epi16(q4, r0);

1307 const __m128i x2 = _mm_sub_epi16(q7, r1);	1307 const __m128i x2 = _mm_sub_epi16(q7, r1);

1308 const __m128i x3 = _mm_add_epi16(q7, r1);	1308 const __m128i x3 = _mm_add_epi16(q7, r1);

1309 // Interleave to do the multiply by constants which gets us	1309 // Interleave to do the multiply by constants which gets us

1310 // into 32 bits.	1310 // into 32 bits.

1311 const __m128i t0 = _mm_unpacklo_epi16(x0, x3);	1311 const __m128i t0 = _mm_unpacklo_epi16(x0, x3);

1312 const __m128i t1 = _mm_unpackhi_epi16(x0, x3);	1312 const __m128i t1 = _mm_unpackhi_epi16(x0, x3);

1313 const __m128i t2 = _mm_unpacklo_epi16(x1, x2);	1313 const __m128i t2 = _mm_unpacklo_epi16(x1, x2);

1314 const __m128i t3 = _mm_unpackhi_epi16(x1, x2);	1314 const __m128i t3 = _mm_unpackhi_epi16(x1, x2);

(...skipping 1266 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2581 #define FDCT32x32_HIGH_PRECISION 0	2581 #define FDCT32x32_HIGH_PRECISION 0

2582 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"	2582 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"

2583 #undef FDCT32x32_2D	2583 #undef FDCT32x32_2D

2584 #undef FDCT32x32_HIGH_PRECISION	2584 #undef FDCT32x32_HIGH_PRECISION

2585	2585

2586 #define FDCT32x32_2D vp9_fdct32x32_sse2	2586 #define FDCT32x32_2D vp9_fdct32x32_sse2

2587 #define FDCT32x32_HIGH_PRECISION 1	2587 #define FDCT32x32_HIGH_PRECISION 1

2588 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT	2588 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT

2589 #undef FDCT32x32_2D	2589 #undef FDCT32x32_2D

2590 #undef FDCT32x32_HIGH_PRECISION	2590 #undef FDCT32x32_HIGH_PRECISION

OLD	NEW