OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <emmintrin.h> // SSE2 | 11 #include <emmintrin.h> // SSE2 |
12 #include "vp9/common/vp9_idct.h" // for cospi constants | 12 #include "vp9/common/vp9_idct.h" // for cospi constants |
13 #include "vpx_ports/mem.h" | 13 #include "vpx_ports/mem.h" |
14 | 14 |
// Computes only the DC term of the 4x4 forward transform: twice the sum of
// the 16 input samples.
//   input  - 4x4 block of int16 samples; rows are `stride` elements apart.
//   output - output[0] receives the DC value, output[1] is set to 0. No other
//            element is written and no alignment is required.
//   stride - distance between consecutive input rows, in int16 elements.
void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  // A row is 4 x int16 (64 bits), so two rows share one 128-bit register.
  const __m128i row0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  const __m128i row1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  const __m128i row2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  const __m128i row3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
  const __m128i rows03 = _mm_unpacklo_epi64(row0, row3);
  const __m128i rows12 = _mm_unpacklo_epi64(row1, row2);

  // Eight 16-bit lanes, each holding the sum of two samples.
  const __m128i sum16 = _mm_add_epi16(rows03, rows12);

  // Sign-extend to 32 bits: put each value in the upper half of a 32-bit
  // lane, then shift it back down arithmetically.
  const __m128i lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, sum16), 16);
  const __m128i hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, sum16), 16);

  // Horizontal add: after the two folds lane 0 holds the sum of all lanes.
  __m128i total = _mm_add_epi32(lo, hi);
  total = _mm_add_epi32(total, _mm_srli_si128(total, 8));
  total = _mm_add_epi32(total, _mm_srli_si128(total, 4));

  // Scale by 2 in the vector domain (well defined for negative values).
  total = _mm_slli_epi32(total, 1);

  // Store just the DC. A full 128-bit store would put the sign bits of the
  // 32-bit sum into output[1] and a partial sum into output[4..5].
  output[0] = (int16_t)_mm_cvtsi128_si32(total);
  output[1] = 0;
}
| 43 |
15 void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { | 44 void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { |
16 // This 2D transform implements 4 vertical 1D transforms followed | 45 // This 2D transform implements 4 vertical 1D transforms followed |
17 // by 4 horizontal 1D transforms. The multiplies and adds are as given | 46 // by 4 horizontal 1D transforms. The multiplies and adds are as given |
18 // by Chen, Smith and Fralick ('77). The commands for moving the data | 47 // by Chen, Smith and Fralick ('77). The commands for moving the data |
19 // around have been minimized by hand. | 48 // around have been minimized by hand. |
20 // For the purposes of the comments, the 16 inputs are referred to at i0 | 49 // For the purposes of the comments, the 16 inputs are referred to at i0 |
21 // through iF (in raster order), intermediate variables are a0, b0, c0 | 50 // through iF (in raster order), intermediate variables are a0, b0, c0 |
22 // through f, and correspond to the in-place computations mapped to input | 51 // through f, and correspond to the in-place computations mapped to input |
23 // locations. The outputs, o0 through oF are labeled according to the | 52 // locations. The outputs, o0 through oF are labeled according to the |
24 // output locations. | 53 // output locations. |
(...skipping 345 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
370 fadst4_sse2(in); | 399 fadst4_sse2(in); |
371 fadst4_sse2(in); | 400 fadst4_sse2(in); |
372 write_buffer_4x4(output, in); | 401 write_buffer_4x4(output, in); |
373 break; | 402 break; |
374 default: | 403 default: |
375 assert(0); | 404 assert(0); |
376 break; | 405 break; |
377 } | 406 } |
378 } | 407 } |
379 | 408 |
// Computes only the DC term of the 8x8 forward transform: the plain sum of
// the 64 input samples.
//   input  - 8x8 block of int16 samples; every row must be 16-byte aligned
//            because rows are read with aligned 128-bit loads.
//   output - output[0] receives the DC value, output[1] is set to 0. No other
//            element is written and no alignment is required.
//   stride - distance between consecutive input rows, in int16 elements.
void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc16 = zero;
  __m128i lo, hi, total;
  int row;

  // Column sums in eight 16-bit lanes. Each lane adds 8 samples; the adds
  // wrap modulo 2^16, so the order of accumulation does not matter.
  for (row = 0; row < 8; ++row) {
    const __m128i r = _mm_load_si128((const __m128i *)(input + row * stride));
    acc16 = _mm_add_epi16(acc16, r);
  }

  // Sign-extend the lanes to 32 bits (upper-half placement + arithmetic
  // shift), then fold them into lane 0.
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, acc16), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, acc16), 16);
  total = _mm_add_epi32(lo, hi);
  total = _mm_add_epi32(total, _mm_srli_si128(total, 8));
  total = _mm_add_epi32(total, _mm_srli_si128(total, 4));

  // Store just the DC. A full 128-bit store would put the sign bits of the
  // 32-bit sum into output[1] and a partial sum into output[4..5].
  output[0] = (int16_t)_mm_cvtsi128_si32(total);
  output[1] = 0;
}
| 448 |
380 void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { | 449 void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { |
381 int pass; | 450 int pass; |
382 // Constants | 451 // Constants |
383 // When we use them, in one case, they are all the same. In all others | 452 // When we use them, in one case, they are all the same. In all others |
384 // it's a pair of them that we need to repeat four times. This is done | 453 // it's a pair of them that we need to repeat four times. This is done |
385 // by constructing the 32 bit constant corresponding to that pair. | 454 // by constructing the 32 bit constant corresponding to that pair. |
386 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 455 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
387 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 456 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
388 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 457 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
389 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 458 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
(...skipping 771 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1161 fadst8_sse2(in); | 1230 fadst8_sse2(in); |
1162 right_shift_8x8(in, 1); | 1231 right_shift_8x8(in, 1); |
1163 write_buffer_8x8(output, in, 8); | 1232 write_buffer_8x8(output, in, 8); |
1164 break; | 1233 break; |
1165 default: | 1234 default: |
1166 assert(0); | 1235 assert(0); |
1167 break; | 1236 break; |
1168 } | 1237 } |
1169 } | 1238 } |
1170 | 1239 |
// Computes only the DC term of the 16x16 forward transform: the sum of the
// 256 input samples, arithmetically shifted right by 1.
//   input  - 16x16 block of int16 samples; every 8-sample group must be
//            16-byte aligned because it is read with aligned 128-bit loads.
//   output - output[0] receives the DC value, output[1] is set to 0. No other
//            element is written and no alignment is required.
//   stride - distance between consecutive input rows, in int16 elements.
// Each 16-bit lane accumulates 32 samples before widening, so samples within
// [-255, 255] cannot overflow a lane (32 * 255 = 8160).
// NOTE(review): the intended sample range is not visible here - confirm.
void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc16 = zero;
  __m128i lo, hi, total;
  int col, row;

  // Walk the block as two 8-column halves. The 16-bit adds wrap modulo 2^16,
  // so the accumulation order has no effect on the result.
  for (col = 0; col < 16; col += 8) {
    for (row = 0; row < 16; ++row) {
      const __m128i r =
          _mm_load_si128((const __m128i *)(input + row * stride + col));
      acc16 = _mm_add_epi16(acc16, r);
    }
  }

  // Sign-extend the lanes to 32 bits (upper-half placement + arithmetic
  // shift), then fold them into lane 0.
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, acc16), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, acc16), 16);
  total = _mm_add_epi32(lo, hi);
  total = _mm_add_epi32(total, _mm_srli_si128(total, 8));
  total = _mm_add_epi32(total, _mm_srli_si128(total, 4));

  // Arithmetic shift: rounds toward negative infinity for negative sums.
  total = _mm_srai_epi32(total, 1);

  // Store just the DC. A full 128-bit store would put the sign bits of the
  // 32-bit sum into output[1] and a partial sum into output[4..5].
  output[0] = (int16_t)_mm_cvtsi128_si32(total);
  output[1] = 0;
}
| 1307 |
1171 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { | 1308 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { |
1172 // The 2D transform is done with two passes which are actually pretty | 1309 // The 2D transform is done with two passes which are actually pretty |
1173 // similar. In the first one, we transform the columns and transpose | 1310 // similar. In the first one, we transform the columns and transpose |
1174 // the results. In the second one, we transform the rows. To achieve that, | 1311 // the results. In the second one, we transform the rows. To achieve that, |
1175 // as the first pass results are transposed, we transpose the columns (that | 1312 // as the first pass results are transposed, we transpose the columns (that |
1176 // is the transposed rows) and transpose the results (so that it goes back | 1313 // is the transposed rows) and transpose the results (so that it goes back |
1177 // in normal/row positions). | 1314 // in normal/row positions). |
1178 int pass; | 1315 int pass; |
1179 // We need an intermediate buffer between passes. | 1316 // We need an intermediate buffer between passes. |
1180 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); | 1317 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); |
1181 const int16_t *in = input; | 1318 const int16_t *in = input; |
1182 int16_t *out = intermediate; | 1319 int16_t *out = intermediate; |
1183 // Constants | 1320 // Constants |
1184 // When we use them, in one case, they are all the same. In all others | 1321 // When we use them, in one case, they are all the same. In all others |
1185 // it's a pair of them that we need to repeat four times. This is done | 1322 // it's a pair of them that we need to repeat four times. This is done |
1186 // by constructing the 32 bit constant corresponding to that pair. | 1323 // by constructing the 32 bit constant corresponding to that pair. |
1187 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 1324 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
1188 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 1325 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
1189 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 1326 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
1190 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); | 1327 const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); |
1191 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 1328 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
1192 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); | 1329 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); |
1193 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); | 1330 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
1194 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); | 1331 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); |
1195 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 1332 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
1196 const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); | 1333 const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); |
1197 const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); | 1334 const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); |
1198 const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); | 1335 const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); |
1199 const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); | 1336 const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); |
1200 const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); | 1337 const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); |
(...skipping 305 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1506 step3_7 = _mm_add_epi16(step1_7, step2_4); | 1643 step3_7 = _mm_add_epi16(step1_7, step2_4); |
1507 } | 1644 } |
1508 // step 4 | 1645 // step 4 |
1509 { | 1646 { |
1510 const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); | 1647 const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); |
1511 const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); | 1648 const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); |
1512 const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); | 1649 const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); |
1513 const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); | 1650 const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); |
1514 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); | 1651 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); |
1515 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); | 1652 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); |
1516 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); | 1653 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08); |
1517 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); | 1654 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08); |
1518 // dct_const_round_shift | 1655 // dct_const_round_shift |
1519 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); | 1656 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); |
1520 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); | 1657 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); |
1521 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); | 1658 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); |
1522 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); | 1659 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); |
1523 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); | 1660 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); |
1524 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); | 1661 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); |
1525 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); | 1662 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); |
1526 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); | 1663 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); |
1527 // Combine | 1664 // Combine |
1528 step2_1 = _mm_packs_epi32(w0, w1); | 1665 step2_1 = _mm_packs_epi32(w0, w1); |
1529 step2_2 = _mm_packs_epi32(w2, w3); | 1666 step2_2 = _mm_packs_epi32(w2, w3); |
1530 } | 1667 } |
1531 { | 1668 { |
1532 const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); | 1669 const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); |
1533 const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); | 1670 const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); |
1534 const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); | 1671 const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); |
1535 const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); | 1672 const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); |
1536 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); | 1673 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); |
1537 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); | 1674 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); |
1538 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); | 1675 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24); |
1539 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24); | 1676 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24); |
1540 // dct_const_round_shift | 1677 // dct_const_round_shift |
1541 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); | 1678 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); |
1542 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); | 1679 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); |
1543 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); | 1680 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); |
1544 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); | 1681 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); |
1545 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); | 1682 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); |
1546 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); | 1683 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); |
1547 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); | 1684 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); |
1548 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); | 1685 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); |
1549 // Combine | 1686 // Combine |
1550 step2_6 = _mm_packs_epi32(w0, w1); | 1687 step2_6 = _mm_packs_epi32(w0, w1); |
1551 step2_5 = _mm_packs_epi32(w2, w3); | 1688 step2_5 = _mm_packs_epi32(w2, w3); |
1552 } | 1689 } |
1553 // step 5 | 1690 // step 5 |
1554 { | 1691 { |
1555 step1_0 = _mm_add_epi16(step3_0, step2_1); | 1692 step1_0 = _mm_add_epi16(step3_0, step2_1); |
1556 step1_1 = _mm_sub_epi16(step3_0, step2_1); | 1693 step1_1 = _mm_sub_epi16(step3_0, step2_1); |
1557 step1_2 = _mm_sub_epi16(step3_3, step2_2); | 1694 step1_2 = _mm_add_epi16(step3_3, step2_2); |
1558 step1_3 = _mm_add_epi16(step3_3, step2_2); | 1695 step1_3 = _mm_sub_epi16(step3_3, step2_2); |
1559 step1_4 = _mm_add_epi16(step3_4, step2_5); | 1696 step1_4 = _mm_sub_epi16(step3_4, step2_5); |
1560 step1_5 = _mm_sub_epi16(step3_4, step2_5); | 1697 step1_5 = _mm_add_epi16(step3_4, step2_5); |
1561 step1_6 = _mm_sub_epi16(step3_7, step2_6); | 1698 step1_6 = _mm_sub_epi16(step3_7, step2_6); |
1562 step1_7 = _mm_add_epi16(step3_7, step2_6); | 1699 step1_7 = _mm_add_epi16(step3_7, step2_6); |
1563 } | 1700 } |
1564 // step 6 | 1701 // step 6 |
1565 { | 1702 { |
1566 const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); | 1703 const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); |
1567 const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); | 1704 const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); |
1568 const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); | 1705 const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); |
1569 const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); | 1706 const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); |
1570 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02); | 1707 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02); |
(...skipping 270 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1841 right_shift_8x8(res1 + 8, 2); | 1978 right_shift_8x8(res1 + 8, 2); |
1842 } | 1979 } |
1843 | 1980 |
1844 void fdct16_8col(__m128i *in) { | 1981 void fdct16_8col(__m128i *in) { |
1845 // perform 16x16 1-D DCT for 8 columns | 1982 // perform 16x16 1-D DCT for 8 columns |
1846 __m128i i[8], s[8], p[8], t[8], u[16], v[16]; | 1983 __m128i i[8], s[8], p[8], t[8], u[16], v[16]; |
1847 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 1984 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
1848 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 1985 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
1849 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); | 1986 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
1850 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 1987 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
1851 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); | 1988 const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); |
1852 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 1989 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
1853 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); | 1990 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); |
1854 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); | 1991 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
1855 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); | 1992 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); |
1856 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 1993 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
1857 const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); | 1994 const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); |
1858 const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); | 1995 const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); |
1859 const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); | 1996 const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); |
1860 const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); | 1997 const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); |
1861 const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); | 1998 const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); |
(...skipping 183 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2045 p[7] = _mm_add_epi16(s[7], t[4]); | 2182 p[7] = _mm_add_epi16(s[7], t[4]); |
2046 | 2183 |
2047 // stage 4 | 2184 // stage 4 |
2048 u[0] = _mm_unpacklo_epi16(p[1], p[6]); | 2185 u[0] = _mm_unpacklo_epi16(p[1], p[6]); |
2049 u[1] = _mm_unpackhi_epi16(p[1], p[6]); | 2186 u[1] = _mm_unpackhi_epi16(p[1], p[6]); |
2050 u[2] = _mm_unpacklo_epi16(p[2], p[5]); | 2187 u[2] = _mm_unpacklo_epi16(p[2], p[5]); |
2051 u[3] = _mm_unpackhi_epi16(p[2], p[5]); | 2188 u[3] = _mm_unpackhi_epi16(p[2], p[5]); |
2052 | 2189 |
2053 v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); | 2190 v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); |
2054 v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); | 2191 v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); |
2055 v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); | 2192 v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08); |
2056 v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); | 2193 v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08); |
2057 v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); | 2194 v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24); |
2058 v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); | 2195 v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24); |
2059 v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); | 2196 v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); |
2060 v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); | 2197 v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); |
2061 | 2198 |
2062 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); | 2199 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); |
2063 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); | 2200 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); |
2064 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); | 2201 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); |
2065 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); | 2202 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); |
2066 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); | 2203 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); |
2067 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); | 2204 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); |
2068 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); | 2205 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); |
2069 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); | 2206 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); |
2070 | 2207 |
2071 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); | 2208 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); |
2072 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); | 2209 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); |
2073 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); | 2210 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); |
2074 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); | 2211 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); |
2075 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); | 2212 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); |
2076 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); | 2213 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); |
2077 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); | 2214 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); |
2078 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); | 2215 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); |
2079 | 2216 |
2080 t[1] = _mm_packs_epi32(v[0], v[1]); | 2217 t[1] = _mm_packs_epi32(v[0], v[1]); |
2081 t[2] = _mm_packs_epi32(v[2], v[3]); | 2218 t[2] = _mm_packs_epi32(v[2], v[3]); |
2082 t[5] = _mm_packs_epi32(v[4], v[5]); | 2219 t[5] = _mm_packs_epi32(v[4], v[5]); |
2083 t[6] = _mm_packs_epi32(v[6], v[7]); | 2220 t[6] = _mm_packs_epi32(v[6], v[7]); |
2084 | 2221 |
2085 // stage 5 | 2222 // stage 5 |
2086 s[0] = _mm_add_epi16(p[0], t[1]); | 2223 s[0] = _mm_add_epi16(p[0], t[1]); |
2087 s[1] = _mm_sub_epi16(p[0], t[1]); | 2224 s[1] = _mm_sub_epi16(p[0], t[1]); |
2088 s[2] = _mm_sub_epi16(p[3], t[2]); | 2225 s[2] = _mm_add_epi16(p[3], t[2]); |
2089 s[3] = _mm_add_epi16(p[3], t[2]); | 2226 s[3] = _mm_sub_epi16(p[3], t[2]); |
2090 s[4] = _mm_add_epi16(p[4], t[5]); | 2227 s[4] = _mm_sub_epi16(p[4], t[5]); |
2091 s[5] = _mm_sub_epi16(p[4], t[5]); | 2228 s[5] = _mm_add_epi16(p[4], t[5]); |
2092 s[6] = _mm_sub_epi16(p[7], t[6]); | 2229 s[6] = _mm_sub_epi16(p[7], t[6]); |
2093 s[7] = _mm_add_epi16(p[7], t[6]); | 2230 s[7] = _mm_add_epi16(p[7], t[6]); |
2094 | 2231 |
2095 // stage 6 | 2232 // stage 6 |
2096 u[0] = _mm_unpacklo_epi16(s[0], s[7]); | 2233 u[0] = _mm_unpacklo_epi16(s[0], s[7]); |
2097 u[1] = _mm_unpackhi_epi16(s[0], s[7]); | 2234 u[1] = _mm_unpackhi_epi16(s[0], s[7]); |
2098 u[2] = _mm_unpacklo_epi16(s[1], s[6]); | 2235 u[2] = _mm_unpacklo_epi16(s[1], s[6]); |
2099 u[3] = _mm_unpackhi_epi16(s[1], s[6]); | 2236 u[3] = _mm_unpackhi_epi16(s[1], s[6]); |
2100 u[4] = _mm_unpacklo_epi16(s[2], s[5]); | 2237 u[4] = _mm_unpacklo_epi16(s[2], s[5]); |
2101 u[5] = _mm_unpackhi_epi16(s[2], s[5]); | 2238 u[5] = _mm_unpackhi_epi16(s[2], s[5]); |
(...skipping 571 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2673 right_shift_16x16(in0, in1); | 2810 right_shift_16x16(in0, in1); |
2674 fadst16_sse2(in0, in1); | 2811 fadst16_sse2(in0, in1); |
2675 write_buffer_16x16(output, in0, in1, 16); | 2812 write_buffer_16x16(output, in0, in1, 16); |
2676 break; | 2813 break; |
2677 default: | 2814 default: |
2678 assert(0); | 2815 assert(0); |
2679 break; | 2816 break; |
2680 } | 2817 } |
2681 } | 2818 } |
2682 | 2819 |
// Computes only the DC term of the 32x32 forward transform: the sum of the
// 1024 input samples, arithmetically shifted right by 3.
//   input  - 32x32 block of int16 samples; every 8-sample group must be
//            16-byte aligned because it is read with aligned 128-bit loads.
//   output - output[0] receives the DC value, output[1] is set to 0. No other
//            element is written and no alignment is required.
//   stride - distance between consecutive input rows, in int16 elements.
// Each 16-bit lane accumulates 128 samples before widening, so samples within
// [-255, 255] stay just inside int16 range (128 * 255 = 32640).
// NOTE(review): the intended sample range is not visible here - confirm;
// wider samples would wrap the 16-bit accumulators.
void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc16 = zero;
  __m128i lo, hi, total;
  int row, col;

  // Row by row, four 8-sample groups per row. The 16-bit adds wrap modulo
  // 2^16, so the accumulation order has no effect on the result.
  for (row = 0; row < 32; ++row) {
    const int16_t *const src = input + row * stride;
    for (col = 0; col < 32; col += 8) {
      acc16 = _mm_add_epi16(acc16,
                            _mm_load_si128((const __m128i *)(src + col)));
    }
  }

  // Sign-extend the lanes to 32 bits (upper-half placement + arithmetic
  // shift), then fold them into lane 0.
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, acc16), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, acc16), 16);
  total = _mm_add_epi32(lo, hi);
  total = _mm_add_epi32(total, _mm_srli_si128(total, 8));
  total = _mm_add_epi32(total, _mm_srli_si128(total, 4));

  // Arithmetic shift: rounds toward negative infinity for negative sums.
  total = _mm_srai_epi32(total, 3);

  // Store just the DC. A full 128-bit store would put the sign bits of the
  // 32-bit sum into output[1] and a partial sum into output[4..5].
  output[0] = (int16_t)_mm_cvtsi128_si32(total);
  output[1] = 0;
}
| 2890 |
2683 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2 | 2891 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2 |
2684 #define FDCT32x32_HIGH_PRECISION 0 | 2892 #define FDCT32x32_HIGH_PRECISION 0 |
2685 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" | 2893 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" |
2686 #undef FDCT32x32_2D | 2894 #undef FDCT32x32_2D |
2687 #undef FDCT32x32_HIGH_PRECISION | 2895 #undef FDCT32x32_HIGH_PRECISION |
2688 | 2896 |
2689 #define FDCT32x32_2D vp9_fdct32x32_sse2 | 2897 #define FDCT32x32_2D vp9_fdct32x32_sse2 |
2690 #define FDCT32x32_HIGH_PRECISION 1 | 2898 #define FDCT32x32_HIGH_PRECISION 1 |
2691 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT | 2899 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT |
2692 #undef FDCT32x32_2D | 2900 #undef FDCT32x32_2D |
2693 #undef FDCT32x32_HIGH_PRECISION | 2901 #undef FDCT32x32_HIGH_PRECISION |
OLD | NEW |