Chromium Code Reviews

Unified Diff: source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c

Issue 341293003: libvpx: Pull from upstream (Closed)
Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 6 months ago
 /*
  * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
  *
  * Use of this source code is governed by a BSD-style license
  * that can be found in the LICENSE file in the root of the source
  * tree. An additional intellectual property rights grant can be found
  * in the file PATENTS. All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */

 #include <emmintrin.h>  // SSE2
 #include "vp9/common/vp9_idct.h"  // for cospi constants
 #include "vpx_ports/mem.h"

+void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
+  __m128i in0, in1;
+  __m128i tmp;
+  const __m128i zero = _mm_setzero_si128();
+  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+  in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
+                                                (input + 2 * stride)));
+  in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
+                                                (input + 3 * stride)));
+
+  tmp = _mm_add_epi16(in0, in1);
+  in0 = _mm_unpacklo_epi16(zero, tmp);
+  in1 = _mm_unpackhi_epi16(zero, tmp);
+  in0 = _mm_srai_epi32(in0, 16);
+  in1 = _mm_srai_epi32(in1, 16);
+
+  tmp = _mm_add_epi32(in0, in1);
+  in0 = _mm_unpacklo_epi32(tmp, zero);
+  in1 = _mm_unpackhi_epi32(tmp, zero);
+
+  tmp = _mm_add_epi32(in0, in1);
+  in0 = _mm_srli_si128(tmp, 8);
+
+  in1 = _mm_add_epi32(tmp, in0);
+  in0 = _mm_slli_epi32(in1, 1);
+  _mm_store_si128((__m128i *)(output), in0);
+}
+
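Note: the kernel above is the DC-only 4x4 forward transform — it sums all sixteen residuals and doubles the result (the _mm_slli_epi32 by 1 before the store); only the DC coefficient in lane 0 is meaningful. A scalar sketch of the equivalent computation, with a hypothetical name. The other *_1 kernels added below differ only in the final scale: no shift for 8x8, >> 1 for 16x16, >> 3 for 32x32, matching the shift before each store.

  static void fdct4x4_1_scalar(const int16_t *input, int16_t *output,
                               int stride) {
    int r, c, sum = 0;
    for (r = 0; r < 4; ++r)
      for (c = 0; c < 4; ++c)
        sum += input[r * stride + c];   // accumulate all 16 residuals
    output[0] = (int16_t)(sum << 1);    // only the DC term is produced
  }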
 void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
   // This 2D transform implements 4 vertical 1D transforms followed
   // by 4 horizontal 1D transforms. The multiplies and adds are as given
   // by Chen, Smith and Fralick ('77). The commands for moving the data
   // around have been minimized by hand.
   // For the purposes of the comments, the 16 inputs are referred to as i0
   // through iF (in raster order), intermediate variables are a0, b0, c0
   // through f, and correspond to the in-place computations mapped to input
   // locations. The outputs, o0 through oF, are labeled according to the
   // output locations.
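Note: for reference, the 1-D 4-point transform each pass applies is the Chen butterfly. A scalar sketch reconstructed from the cospi constants (our paraphrase, not code from this patch; fdct_round_shift stands for (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS):

  static void fdct4_scalar(const int16_t *in, int16_t *out) {
    const int s0 = in[0] + in[3];   // even butterfly
    const int s1 = in[1] + in[2];
    const int s2 = in[1] - in[2];   // odd butterfly
    const int s3 = in[0] - in[3];
    out[0] = (int16_t)fdct_round_shift((s0 + s1) * cospi_16_64);
    out[2] = (int16_t)fdct_round_shift((s0 - s1) * cospi_16_64);
    out[1] = (int16_t)fdct_round_shift(s2 * cospi_24_64 + s3 * cospi_8_64);
    out[3] = (int16_t)fdct_round_shift(s3 * cospi_24_64 - s2 * cospi_8_64);
  }

In the SSE2 code each such product pair maps to one _mm_madd_epi16 against a pair_set_epi16 constant: with rows interleaved as (x, y) 16-bit pairs, k__cospi_p24_p08 evaluates x * cospi_24_64 + y * cospi_8_64 in each 32-bit lane.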
(...skipping 345 matching lines...)
       fadst4_sse2(in);
       fadst4_sse2(in);
       write_buffer_4x4(output, in);
       break;
     default:
       assert(0);
       break;
   }
 }

+void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) {
+  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  __m128i u0, u1, sum;
+
+  u0 = _mm_add_epi16(in0, in1);
+  u1 = _mm_add_epi16(in2, in3);
+
+  in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+  sum = _mm_add_epi16(u0, u1);
+
+  in0 = _mm_add_epi16(in0, in1);
+  in2 = _mm_add_epi16(in2, in3);
+  sum = _mm_add_epi16(sum, in0);
+
+  u0 = _mm_setzero_si128();
+  sum = _mm_add_epi16(sum, in2);
+
+  in0 = _mm_unpacklo_epi16(u0, sum);
+  in1 = _mm_unpackhi_epi16(u0, sum);
+  in0 = _mm_srai_epi32(in0, 16);
+  in1 = _mm_srai_epi32(in1, 16);
+
+  sum = _mm_add_epi32(in0, in1);
+  in0 = _mm_unpacklo_epi32(sum, u0);
+  in1 = _mm_unpackhi_epi32(sum, u0);
+
+  sum = _mm_add_epi32(in0, in1);
+  in0 = _mm_srli_si128(sum, 8);
+
+  in1 = _mm_add_epi32(sum, in0);
+  _mm_store_si128((__m128i *)(output), in1);
+}
+
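Note: a recurring idiom in these *_1 kernels is worth spelling out. Interleaving the 16-bit running sums with a zero register and arithmetic-shifting each 32-bit lane right by 16 sign-extends the lanes to 32 bits; the remaining unpack/shift/add steps then fold the partial sums into lane 0. A minimal sketch of the widening step, with a hypothetical helper name:

  // Sign-extend the four low 16-bit lanes of x to 32 bits: the unpack
  // parks each 16-bit value in the high half of a 32-bit lane, and the
  // arithmetic shift brings it back down, replicating the sign bit.
  static __m128i sign_extend_lo_epi16(__m128i x) {
    const __m128i zero = _mm_setzero_si128();
    return _mm_srai_epi32(_mm_unpacklo_epi16(zero, x), 16);
  }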
 void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
   int pass;
   // Constants
   // When we use them, in one case, they are all the same. In all others
   // it's a pair of them that we need to repeat four times. This is done
   // by constructing the 32 bit constant corresponding to that pair.
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
(...skipping 771 matching lines...)
       fadst8_sse2(in);
       right_shift_8x8(in, 1);
       write_buffer_8x8(output, in, 8);
       break;
     default:
       assert(0);
       break;
   }
 }

+void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) {
+  __m128i in0, in1, in2, in3;
+  __m128i u0, u1;
+  __m128i sum = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 2; ++i) {
+    input += 8 * i;
+    in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+    in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+    in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+    in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+
+    u0 = _mm_add_epi16(in0, in1);
+    u1 = _mm_add_epi16(in2, in3);
+    sum = _mm_add_epi16(sum, u0);
+
+    in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+    in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+    in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+    in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+    sum = _mm_add_epi16(sum, u1);
+    u0 = _mm_add_epi16(in0, in1);
+    u1 = _mm_add_epi16(in2, in3);
+    sum = _mm_add_epi16(sum, u0);
+
+    in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
+    in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
+    in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
+    in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));
+
+    sum = _mm_add_epi16(sum, u1);
+    u0 = _mm_add_epi16(in0, in1);
+    u1 = _mm_add_epi16(in2, in3);
+    sum = _mm_add_epi16(sum, u0);
+
+    in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
+    in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
+    in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
+    in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));
+
+    sum = _mm_add_epi16(sum, u1);
+    u0 = _mm_add_epi16(in0, in1);
+    u1 = _mm_add_epi16(in2, in3);
+    sum = _mm_add_epi16(sum, u0);
+
+    sum = _mm_add_epi16(sum, u1);
+  }
+
+  u0 = _mm_setzero_si128();
+  in0 = _mm_unpacklo_epi16(u0, sum);
+  in1 = _mm_unpackhi_epi16(u0, sum);
+  in0 = _mm_srai_epi32(in0, 16);
+  in1 = _mm_srai_epi32(in1, 16);
+
+  sum = _mm_add_epi32(in0, in1);
+  in0 = _mm_unpacklo_epi32(sum, u0);
+  in1 = _mm_unpackhi_epi32(sum, u0);
+
+  sum = _mm_add_epi32(in0, in1);
+  in0 = _mm_srli_si128(sum, 8);
+
+  in1 = _mm_add_epi32(sum, in0);
+  in1 = _mm_srai_epi32(in1, 1);
+  _mm_store_si128((__m128i *)(output), in1);
+}
+
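Note on the traversal: the 16x16 block is summed as two 8-column halves — `input += 8 * i` is a no-op on the first iteration and advances to the right half on the second — with `sum` accumulating 16-bit lanes across both halves before the final widen-and-reduce; the stored DC value is then scaled by >> 1.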
 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
   // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;
   // We need an intermediate buffer between passes.
   DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
   const int16_t *in = input;
   int16_t *out = intermediate;
   // Constants
   // When we use them, in one case, they are all the same. In all others
   // it's a pair of them that we need to repeat four times. This is done
   // by constructing the 32 bit constant corresponding to that pair.
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
   const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
   const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
   const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
   const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
   const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
(...skipping 305 matching lines...)
       step3_7 = _mm_add_epi16(step1_7, step2_4);
     }
     // step 4
     {
       const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
       const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
       const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
       const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
       const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
       const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
-      const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
-      const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
+      const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+      const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08);
       // dct_const_round_shift
       const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
       const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
       const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
       const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
       const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
       const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
       const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
       const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
       // Combine
       step2_1 = _mm_packs_epi32(w0, w1);
       step2_2 = _mm_packs_epi32(w2, w3);
     }
     {
       const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
       const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
       const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
       const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
       const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
       const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
-      const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
-      const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+      const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24);
+      const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24);
       // dct_const_round_shift
       const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
       const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
       const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
       const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
       const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
       const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
       const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
       const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
       // Combine
       step2_6 = _mm_packs_epi32(w0, w1);
       step2_5 = _mm_packs_epi32(w2, w3);
     }
     // step 5
     {
       step1_0 = _mm_add_epi16(step3_0, step2_1);
       step1_1 = _mm_sub_epi16(step3_0, step2_1);
-      step1_2 = _mm_sub_epi16(step3_3, step2_2);
-      step1_3 = _mm_add_epi16(step3_3, step2_2);
-      step1_4 = _mm_add_epi16(step3_4, step2_5);
-      step1_5 = _mm_sub_epi16(step3_4, step2_5);
+      step1_2 = _mm_add_epi16(step3_3, step2_2);
+      step1_3 = _mm_sub_epi16(step3_3, step2_2);
+      step1_4 = _mm_sub_epi16(step3_4, step2_5);
+      step1_5 = _mm_add_epi16(step3_4, step2_5);
       step1_6 = _mm_sub_epi16(step3_7, step2_6);
       step1_7 = _mm_add_epi16(step3_7, step2_6);
     }
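Note: the step 4/step 5 changes cancel algebraically — the negated multiplier pairs become positive, and the adds/subtracts that consume them flip to compensate, since a - (-b) = a + b. Schematically (our paraphrase, identical up to the rounding in dct_const_round_shift):

  // before: step2_2 = -(x * cospi_24_64 + y * cospi_8_64);
  //         step1_2 = step3_3 - step2_2;
  // after:  step2_2 =  (x * cospi_24_64 + y * cospi_8_64);
  //         step1_2 = step3_3 + step2_2;
  // both compute step3_3 + (x * cospi_24_64 + y * cospi_8_64).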
     // step 6
     {
       const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
       const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
       const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
       const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
       const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
(...skipping 270 matching lines...)
   right_shift_8x8(res1 + 8, 2);
 }

 void fdct16_8col(__m128i *in) {
   // perform 16x16 1-D DCT for 8 columns
   __m128i i[8], s[8], p[8], t[8], u[16], v[16];
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
   const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
   const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
   const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
   const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
   const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
(...skipping 183 matching lines...)
   p[7] = _mm_add_epi16(s[7], t[4]);

   // stage 4
   u[0] = _mm_unpacklo_epi16(p[1], p[6]);
   u[1] = _mm_unpackhi_epi16(p[1], p[6]);
   u[2] = _mm_unpacklo_epi16(p[2], p[5]);
   u[3] = _mm_unpackhi_epi16(p[2], p[5]);

   v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
   v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
-  v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
-  v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
   v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
   v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);

   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

   t[1] = _mm_packs_epi32(v[0], v[1]);
   t[2] = _mm_packs_epi32(v[2], v[3]);
   t[5] = _mm_packs_epi32(v[4], v[5]);
   t[6] = _mm_packs_epi32(v[6], v[7]);

   // stage 5
   s[0] = _mm_add_epi16(p[0], t[1]);
   s[1] = _mm_sub_epi16(p[0], t[1]);
-  s[2] = _mm_sub_epi16(p[3], t[2]);
-  s[3] = _mm_add_epi16(p[3], t[2]);
-  s[4] = _mm_add_epi16(p[4], t[5]);
-  s[5] = _mm_sub_epi16(p[4], t[5]);
+  s[2] = _mm_add_epi16(p[3], t[2]);
+  s[3] = _mm_sub_epi16(p[3], t[2]);
+  s[4] = _mm_sub_epi16(p[4], t[5]);
+  s[5] = _mm_add_epi16(p[4], t[5]);
   s[6] = _mm_sub_epi16(p[7], t[6]);
   s[7] = _mm_add_epi16(p[7], t[6]);
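Note: this is the same sign refactoring as in vp9_fdct16x16_sse2 above — stage 4 drops the negated k__cospi_m24_m08/k__cospi_m08_p24 products in favor of positive pairs, and stage 5 swaps the corresponding adds and subtracts to compensate.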

   // stage 6
   u[0] = _mm_unpacklo_epi16(s[0], s[7]);
   u[1] = _mm_unpackhi_epi16(s[0], s[7]);
   u[2] = _mm_unpacklo_epi16(s[1], s[6]);
   u[3] = _mm_unpackhi_epi16(s[1], s[6]);
   u[4] = _mm_unpacklo_epi16(s[2], s[5]);
   u[5] = _mm_unpackhi_epi16(s[2], s[5]);
(...skipping 571 matching lines...)
       right_shift_16x16(in0, in1);
       fadst16_sse2(in0, in1);
       write_buffer_16x16(output, in0, in1, 16);
       break;
     default:
       assert(0);
       break;
   }
 }

+void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) {
+  __m128i in0, in1, in2, in3;
+  __m128i u0, u1;
+  __m128i sum = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 8; ++i) {
+    in0 = _mm_load_si128((const __m128i *)(input + 0));
+    in1 = _mm_load_si128((const __m128i *)(input + 8));
+    in2 = _mm_load_si128((const __m128i *)(input + 16));
+    in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+    input += stride;
+    u0 = _mm_add_epi16(in0, in1);
+    u1 = _mm_add_epi16(in2, in3);
+    sum = _mm_add_epi16(sum, u0);
+
+    in0 = _mm_load_si128((const __m128i *)(input + 0));
+    in1 = _mm_load_si128((const __m128i *)(input + 8));
+    in2 = _mm_load_si128((const __m128i *)(input + 16));
+    in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+    input += stride;
+    sum = _mm_add_epi16(sum, u1);
+    u0 = _mm_add_epi16(in0, in1);
+    u1 = _mm_add_epi16(in2, in3);
+    sum = _mm_add_epi16(sum, u0);
+
+    in0 = _mm_load_si128((const __m128i *)(input + 0));
+    in1 = _mm_load_si128((const __m128i *)(input + 8));
+    in2 = _mm_load_si128((const __m128i *)(input + 16));
+    in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+    input += stride;
+    sum = _mm_add_epi16(sum, u1);
+    u0 = _mm_add_epi16(in0, in1);
+    u1 = _mm_add_epi16(in2, in3);
+    sum = _mm_add_epi16(sum, u0);
+
+    in0 = _mm_load_si128((const __m128i *)(input + 0));
+    in1 = _mm_load_si128((const __m128i *)(input + 8));
+    in2 = _mm_load_si128((const __m128i *)(input + 16));
+    in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+    input += stride;
+    sum = _mm_add_epi16(sum, u1);
+    u0 = _mm_add_epi16(in0, in1);
+    u1 = _mm_add_epi16(in2, in3);
+    sum = _mm_add_epi16(sum, u0);
+
+    sum = _mm_add_epi16(sum, u1);
+  }
+
+  u0 = _mm_setzero_si128();
+  in0 = _mm_unpacklo_epi16(u0, sum);
+  in1 = _mm_unpackhi_epi16(u0, sum);
+  in0 = _mm_srai_epi32(in0, 16);
+  in1 = _mm_srai_epi32(in1, 16);
+
+  sum = _mm_add_epi32(in0, in1);
+  in0 = _mm_unpacklo_epi32(sum, u0);
+  in1 = _mm_unpackhi_epi32(sum, u0);
+
+  sum = _mm_add_epi32(in0, in1);
+  in0 = _mm_srli_si128(sum, 8);
+
+  in1 = _mm_add_epi32(sum, in0);
+  in1 = _mm_srai_epi32(in1, 3);
+  _mm_store_si128((__m128i *)(output), in1);
+}
+
 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2
 #define FDCT32x32_HIGH_PRECISION 0
 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
 #undef FDCT32x32_2D
 #undef FDCT32x32_HIGH_PRECISION

 #define FDCT32x32_2D vp9_fdct32x32_sse2
 #define FDCT32x32_HIGH_PRECISION 1
 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"  // NOLINT
 #undef FDCT32x32_2D
 #undef FDCT32x32_HIGH_PRECISION
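Note: the two blocks above use the include-as-template idiom — vp9_dct32x32_sse2.c is compiled twice into this translation unit, with FDCT32x32_2D naming the function each pass emits and FDCT32x32_HIGH_PRECISION selecting the rounding path inside it. Schematically (a hypothetical skeleton, not the actual file contents):

  /* vp9_dct32x32_sse2.c, in outline: */
  void FDCT32x32_2D(const int16_t *input, int16_t *output, int stride) {
  #if FDCT32x32_HIGH_PRECISION
    /* full-precision rounding between passes (vp9_fdct32x32_sse2) */
  #else
    /* cheaper rounding for RD search (vp9_fdct32x32_rd_sse2) */
  #endif
  }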