OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
| 11 #include "./vpx_dsp_rtcd.h" |
11 #include "vpx_dsp/x86/inv_txfm_sse2.h" | 12 #include "vpx_dsp/x86/inv_txfm_sse2.h" |
12 #include "vpx_dsp/x86/txfm_common_sse2.h" | 13 #include "vpx_dsp/x86/txfm_common_sse2.h" |
13 | 14 |
/*
 * RECON_AND_STORE4X4(dest, in_x): add one row of four 16-bit residuals to the
 * four 8-bit values stored at `dest` and write the result back in place.
 *
 *   dest - address of four consecutive bytes; read and written through an
 *          `int` pointer cast, i.e. as a single 32-bit access.
 *   in_x - __m128i of 16-bit lanes; only the low four lanes affect the bytes
 *          that get stored.
 *
 * Steps: load the 4 bytes into the low 32 bits of a vector, widen them to
 * 16-bit lanes by interleaving with `zero`, add `in_x`, pack back to bytes
 * with unsigned saturation (results are clamped to 0..255), and store the low
 * 32 bits to `dest`.
 *
 * `zero` is not a macro parameter: the expansion site must have an __m128i
 * named `zero` in scope (the functions visible here define it with
 * _mm_setzero_si128()).
 * `dest` is expanded twice, so pass an expression without side effects.
 */
14 #define RECON_AND_STORE4X4(dest, in_x) \ | 15 #define RECON_AND_STORE4X4(dest, in_x) \ |
15 { \ | 16 { \ |
16 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ | 17 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ |
17 d0 = _mm_unpacklo_epi8(d0, zero); \ | 18 d0 = _mm_unpacklo_epi8(d0, zero); \ |
18 d0 = _mm_add_epi16(in_x, d0); \ | 19 d0 = _mm_add_epi16(in_x, d0); \ |
19 d0 = _mm_packus_epi16(d0, d0); \ | 20 d0 = _mm_packus_epi16(d0, d0); \ |
20 *(int *)(dest) = _mm_cvtsi128_si32(d0); \ | 21 *(int *)(dest) = _mm_cvtsi128_si32(d0); \ |
21 } | 22 } |
22 | 23 |
23 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 24 void vpx_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
24 const __m128i zero = _mm_setzero_si128(); | 25 const __m128i zero = _mm_setzero_si128(); |
25 const __m128i eight = _mm_set1_epi16(8); | 26 const __m128i eight = _mm_set1_epi16(8); |
26 const __m128i cst = _mm_setr_epi16( | 27 const __m128i cst = _mm_setr_epi16( |
27 (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, | 28 (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, |
28 (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, | 29 (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, |
29 (int16_t)cospi_8_64, (int16_t)cospi_24_64); | 30 (int16_t)cospi_8_64, (int16_t)cospi_24_64); |
30 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 31 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
31 __m128i input0, input1, input2, input3; | 32 __m128i input0, input1, input2, input3; |
32 | 33 |
33 // Rows | 34 // Rows |
(...skipping 109 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
143 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); | 144 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); |
144 // store input2 | 145 // store input2 |
145 d0 = _mm_srli_si128(d0, 4); | 146 d0 = _mm_srli_si128(d0, 4); |
146 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); | 147 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); |
147 // store input3 | 148 // store input3 |
148 d0 = _mm_srli_si128(d0, 4); | 149 d0 = _mm_srli_si128(d0, 4); |
149 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); | 150 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); |
150 } | 151 } |
151 } | 152 } |
152 | 153 |
153 void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 154 void vpx_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
154 __m128i dc_value; | 155 __m128i dc_value; |
155 const __m128i zero = _mm_setzero_si128(); | 156 const __m128i zero = _mm_setzero_si128(); |
156 int a; | 157 int a; |
157 | 158 |
158 a = dct_const_round_shift(input[0] * cospi_16_64); | 159 a = dct_const_round_shift(input[0] * cospi_16_64); |
159 a = dct_const_round_shift(a * cospi_16_64); | 160 a = dct_const_round_shift(a * cospi_16_64); |
160 a = ROUND_POWER_OF_TWO(a, 4); | 161 a = ROUND_POWER_OF_TWO(a, 4); |
161 | 162 |
162 dc_value = _mm_set1_epi16(a); | 163 dc_value = _mm_set1_epi16(a); |
163 | 164 |
(...skipping 277 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
441 out0 = _mm_adds_epi16(stp1_0, stp2_7); \ | 442 out0 = _mm_adds_epi16(stp1_0, stp2_7); \ |
442 out1 = _mm_adds_epi16(stp1_1, stp1_6); \ | 443 out1 = _mm_adds_epi16(stp1_1, stp1_6); \ |
443 out2 = _mm_adds_epi16(stp1_2, stp1_5); \ | 444 out2 = _mm_adds_epi16(stp1_2, stp1_5); \ |
444 out3 = _mm_adds_epi16(stp1_3, stp2_4); \ | 445 out3 = _mm_adds_epi16(stp1_3, stp2_4); \ |
445 out4 = _mm_subs_epi16(stp1_3, stp2_4); \ | 446 out4 = _mm_subs_epi16(stp1_3, stp2_4); \ |
446 out5 = _mm_subs_epi16(stp1_2, stp1_5); \ | 447 out5 = _mm_subs_epi16(stp1_2, stp1_5); \ |
447 out6 = _mm_subs_epi16(stp1_1, stp1_6); \ | 448 out6 = _mm_subs_epi16(stp1_1, stp1_6); \ |
448 out7 = _mm_subs_epi16(stp1_0, stp2_7); \ | 449 out7 = _mm_subs_epi16(stp1_0, stp2_7); \ |
449 } | 450 } |
450 | 451 |
451 void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 452 void vpx_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
452 const __m128i zero = _mm_setzero_si128(); | 453 const __m128i zero = _mm_setzero_si128(); |
453 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 454 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
454 const __m128i final_rounding = _mm_set1_epi16(1 << 4); | 455 const __m128i final_rounding = _mm_set1_epi16(1 << 4); |
455 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 456 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
456 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 457 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
457 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 458 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
458 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); | 459 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); |
459 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 460 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
460 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 461 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
461 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 462 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
(...skipping 10 matching lines...) Expand all Loading... |
472 in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); | 473 in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); |
473 in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); | 474 in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
474 in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); | 475 in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); |
475 in4 = _mm_load_si128((const __m128i *)(input + 8 * 4)); | 476 in4 = _mm_load_si128((const __m128i *)(input + 8 * 4)); |
476 in5 = _mm_load_si128((const __m128i *)(input + 8 * 5)); | 477 in5 = _mm_load_si128((const __m128i *)(input + 8 * 5)); |
477 in6 = _mm_load_si128((const __m128i *)(input + 8 * 6)); | 478 in6 = _mm_load_si128((const __m128i *)(input + 8 * 6)); |
478 in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); | 479 in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); |
479 | 480 |
480 // 2-D | 481 // 2-D |
481 for (i = 0; i < 2; i++) { | 482 for (i = 0; i < 2; i++) { |
482 // 8x8 Transpose is copied from vp9_fdct8x8_sse2() | 483 // 8x8 Transpose is copied from vpx_fdct8x8_sse2() |
483 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, | 484 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, |
484 in0, in1, in2, in3, in4, in5, in6, in7); | 485 in0, in1, in2, in3, in4, in5, in6, in7); |
485 | 486 |
486 // 4-stage 1D idct8x8 | 487 // 4-stage 1D idct8x8 |
487 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, | 488 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, |
488 in0, in1, in2, in3, in4, in5, in6, in7); | 489 in0, in1, in2, in3, in4, in5, in6, in7); |
489 } | 490 } |
490 | 491 |
491 // Final rounding and shift | 492 // Final rounding and shift |
492 in0 = _mm_adds_epi16(in0, final_rounding); | 493 in0 = _mm_adds_epi16(in0, final_rounding); |
(...skipping 17 matching lines...) Expand all Loading... |
510 RECON_AND_STORE(dest + 0 * stride, in0); | 511 RECON_AND_STORE(dest + 0 * stride, in0); |
511 RECON_AND_STORE(dest + 1 * stride, in1); | 512 RECON_AND_STORE(dest + 1 * stride, in1); |
512 RECON_AND_STORE(dest + 2 * stride, in2); | 513 RECON_AND_STORE(dest + 2 * stride, in2); |
513 RECON_AND_STORE(dest + 3 * stride, in3); | 514 RECON_AND_STORE(dest + 3 * stride, in3); |
514 RECON_AND_STORE(dest + 4 * stride, in4); | 515 RECON_AND_STORE(dest + 4 * stride, in4); |
515 RECON_AND_STORE(dest + 5 * stride, in5); | 516 RECON_AND_STORE(dest + 5 * stride, in5); |
516 RECON_AND_STORE(dest + 6 * stride, in6); | 517 RECON_AND_STORE(dest + 6 * stride, in6); |
517 RECON_AND_STORE(dest + 7 * stride, in7); | 518 RECON_AND_STORE(dest + 7 * stride, in7); |
518 } | 519 } |
519 | 520 |
520 void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 521 void vpx_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
521 __m128i dc_value; | 522 __m128i dc_value; |
522 const __m128i zero = _mm_setzero_si128(); | 523 const __m128i zero = _mm_setzero_si128(); |
523 int a; | 524 int a; |
524 | 525 |
525 a = dct_const_round_shift(input[0] * cospi_16_64); | 526 a = dct_const_round_shift(input[0] * cospi_16_64); |
526 a = dct_const_round_shift(a * cospi_16_64); | 527 a = dct_const_round_shift(a * cospi_16_64); |
527 a = ROUND_POWER_OF_TWO(a, 5); | 528 a = ROUND_POWER_OF_TWO(a, 5); |
528 | 529 |
529 dc_value = _mm_set1_epi16(a); | 530 dc_value = _mm_set1_epi16(a); |
530 | 531 |
(...skipping 16 matching lines...) Expand all Loading... |
547 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 548 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
548 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 549 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
549 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 550 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
550 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); | 551 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
551 | 552 |
552 __m128i in0, in1, in2, in3, in4, in5, in6, in7; | 553 __m128i in0, in1, in2, in3, in4, in5, in6, in7; |
553 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; | 554 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; |
554 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; | 555 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; |
555 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 556 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
556 | 557 |
557 // 8x8 Transpose is copied from vp9_fdct8x8_sse2() | 558 // 8x8 Transpose is copied from vpx_fdct8x8_sse2() |
558 TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], | 559 TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], |
559 in0, in1, in2, in3, in4, in5, in6, in7); | 560 in0, in1, in2, in3, in4, in5, in6, in7); |
560 | 561 |
561 // 4-stage 1D idct8x8 | 562 // 4-stage 1D idct8x8 |
562 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, | 563 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, |
563 in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); | 564 in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); |
564 } | 565 } |
565 | 566 |
566 void iadst8_sse2(__m128i *in) { | 567 void iadst8_sse2(__m128i *in) { |
567 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); | 568 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); |
(...skipping 216 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
784 in[0] = s0; | 785 in[0] = s0; |
785 in[1] = _mm_sub_epi16(k__const_0, s4); | 786 in[1] = _mm_sub_epi16(k__const_0, s4); |
786 in[2] = s6; | 787 in[2] = s6; |
787 in[3] = _mm_sub_epi16(k__const_0, s2); | 788 in[3] = _mm_sub_epi16(k__const_0, s2); |
788 in[4] = s3; | 789 in[4] = s3; |
789 in[5] = _mm_sub_epi16(k__const_0, s7); | 790 in[5] = _mm_sub_epi16(k__const_0, s7); |
790 in[6] = s5; | 791 in[6] = s5; |
791 in[7] = _mm_sub_epi16(k__const_0, s1); | 792 in[7] = _mm_sub_epi16(k__const_0, s1); |
792 } | 793 } |
793 | 794 |
794 void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 795 void vpx_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
795 const __m128i zero = _mm_setzero_si128(); | 796 const __m128i zero = _mm_setzero_si128(); |
796 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 797 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
797 const __m128i final_rounding = _mm_set1_epi16(1 << 4); | 798 const __m128i final_rounding = _mm_set1_epi16(1 << 4); |
798 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 799 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
799 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 800 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
800 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 801 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
801 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); | 802 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); |
802 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 803 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
803 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 804 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
804 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 805 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
(...skipping 356 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1161 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ | 1162 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ |
1162 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ | 1163 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ |
1163 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ | 1164 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ |
1164 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ | 1165 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ |
1165 \ | 1166 \ |
1166 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ | 1167 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ |
1167 stg6_0, stg4_0, stg6_0, stg4_0, \ | 1168 stg6_0, stg4_0, stg6_0, stg4_0, \ |
1168 stp2_10, stp2_13, stp2_11, stp2_12) \ | 1169 stp2_10, stp2_13, stp2_11, stp2_12) \ |
1169 } | 1170 } |
1170 | 1171 |
1171 void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, | 1172 void vpx_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, |
1172 int stride) { | 1173 int stride) { |
1173 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 1174 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
1174 const __m128i final_rounding = _mm_set1_epi16(1 << 5); | 1175 const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
1175 const __m128i zero = _mm_setzero_si128(); | 1176 const __m128i zero = _mm_setzero_si128(); |
1176 | 1177 |
1177 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); | 1178 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
1178 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); | 1179 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); |
1179 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); | 1180 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
1180 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); | 1181 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); |
1181 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); | 1182 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
(...skipping 104 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1286 // Final rounding and shift | 1287 // Final rounding and shift |
1287 in[j] = _mm_adds_epi16(in[j], final_rounding); | 1288 in[j] = _mm_adds_epi16(in[j], final_rounding); |
1288 in[j] = _mm_srai_epi16(in[j], 6); | 1289 in[j] = _mm_srai_epi16(in[j], 6); |
1289 RECON_AND_STORE(dest + j * stride, in[j]); | 1290 RECON_AND_STORE(dest + j * stride, in[j]); |
1290 } | 1291 } |
1291 | 1292 |
1292 dest += 8; | 1293 dest += 8; |
1293 } | 1294 } |
1294 } | 1295 } |
1295 | 1296 |
1296 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 1297 void vpx_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
1297 __m128i dc_value; | 1298 __m128i dc_value; |
1298 const __m128i zero = _mm_setzero_si128(); | 1299 const __m128i zero = _mm_setzero_si128(); |
1299 int a, i; | 1300 int a, i; |
1300 | 1301 |
1301 a = dct_const_round_shift(input[0] * cospi_16_64); | 1302 a = dct_const_round_shift(input[0] * cospi_16_64); |
1302 a = dct_const_round_shift(a * cospi_16_64); | 1303 a = dct_const_round_shift(a * cospi_16_64); |
1303 a = ROUND_POWER_OF_TWO(a, 6); | 1304 a = ROUND_POWER_OF_TWO(a, 6); |
1304 | 1305 |
1305 dc_value = _mm_set1_epi16(a); | 1306 dc_value = _mm_set1_epi16(a); |
1306 | 1307 |
(...skipping 837 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2144 idct16_8col(in0); | 2145 idct16_8col(in0); |
2145 idct16_8col(in1); | 2146 idct16_8col(in1); |
2146 } | 2147 } |
2147 | 2148 |
/*
 * iadst16_sse2: in-place transform step over the two __m128i arrays in0/in1.
 * It first calls array_transpose_16x16() on the pair, then runs
 * iadst16_8col() on each array separately.
 * NOTE(review): presumably in0 and in1 each hold one 8-column half of a 16x16
 * block of 16-bit coefficients -- confirm against array_transpose_16x16() and
 * iadst16_8col(), which are not visible in this excerpt.
 */
2148 void iadst16_sse2(__m128i *in0, __m128i *in1) { | 2149 void iadst16_sse2(__m128i *in0, __m128i *in1) { |
2149 array_transpose_16x16(in0, in1); | 2150 array_transpose_16x16(in0, in1); |
2150 iadst16_8col(in0); | 2151 iadst16_8col(in0); |
2151 iadst16_8col(in1); | 2152 iadst16_8col(in1); |
2152 } | 2153 } |
2153 | 2154 |
2154 void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, | 2155 void vpx_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, |
2155 int stride) { | 2156 int stride) { |
2156 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 2157 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
2157 const __m128i final_rounding = _mm_set1_epi16(1 << 5); | 2158 const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
2158 const __m128i zero = _mm_setzero_si128(); | 2159 const __m128i zero = _mm_setzero_si128(); |
2159 | 2160 |
2160 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); | 2161 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
2161 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); | 2162 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); |
2162 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); | 2163 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
2163 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); | 2164 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); |
2164 | 2165 |
(...skipping 856 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3021 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ | 3022 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ |
3022 stp1_23, stp1_24) \ | 3023 stp1_23, stp1_24) \ |
3023 \ | 3024 \ |
3024 stp1_28 = stp2_28; \ | 3025 stp1_28 = stp2_28; \ |
3025 stp1_29 = stp2_29; \ | 3026 stp1_29 = stp2_29; \ |
3026 stp1_30 = stp2_30; \ | 3027 stp1_30 = stp2_30; \ |
3027 stp1_31 = stp2_31; \ | 3028 stp1_31 = stp2_31; \ |
3028 } | 3029 } |
3029 | 3030 |
3030 // Only upper-left 8x8 has non-zero coeff | 3031 // Only upper-left 8x8 has non-zero coeff |
3031 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, | 3032 void vpx_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, |
3032 int stride) { | 3033 int stride) { |
3033 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 3034 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
3034 const __m128i final_rounding = _mm_set1_epi16(1<<5); | 3035 const __m128i final_rounding = _mm_set1_epi16(1<<5); |
3035 | 3036 |
3036 // idct constants for each stage | 3037 // idct constants for each stage |
3037 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); | 3038 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
3038 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); | 3039 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); |
3039 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); | 3040 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); |
3040 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); | 3041 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); |
3041 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); | 3042 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); |
(...skipping 138 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3180 // Final rounding and shift | 3181 // Final rounding and shift |
3181 in[j] = _mm_adds_epi16(in[j], final_rounding); | 3182 in[j] = _mm_adds_epi16(in[j], final_rounding); |
3182 in[j] = _mm_srai_epi16(in[j], 6); | 3183 in[j] = _mm_srai_epi16(in[j], 6); |
3183 RECON_AND_STORE(dest + j * stride, in[j]); | 3184 RECON_AND_STORE(dest + j * stride, in[j]); |
3184 } | 3185 } |
3185 | 3186 |
3186 dest += 8; | 3187 dest += 8; |
3187 } | 3188 } |
3188 } | 3189 } |
3189 | 3190 |
3190 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, | 3191 void vpx_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, |
3191 int stride) { | 3192 int stride) { |
3192 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 3193 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
3193 const __m128i final_rounding = _mm_set1_epi16(1 << 5); | 3194 const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
3194 const __m128i zero = _mm_setzero_si128(); | 3195 const __m128i zero = _mm_setzero_si128(); |
3195 | 3196 |
3196 // idct constants for each stage | 3197 // idct constants for each stage |
3197 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); | 3198 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
3198 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); | 3199 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); |
3199 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); | 3200 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); |
3200 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); | 3201 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); |
(...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3456 // Final rounding and shift | 3457 // Final rounding and shift |
3457 in[j] = _mm_adds_epi16(in[j], final_rounding); | 3458 in[j] = _mm_adds_epi16(in[j], final_rounding); |
3458 in[j] = _mm_srai_epi16(in[j], 6); | 3459 in[j] = _mm_srai_epi16(in[j], 6); |
3459 RECON_AND_STORE(dest + j * stride, in[j]); | 3460 RECON_AND_STORE(dest + j * stride, in[j]); |
3460 } | 3461 } |
3461 | 3462 |
3462 dest += 8; | 3463 dest += 8; |
3463 } | 3464 } |
3464 } | 3465 } |
3465 | 3466 |
3466 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 3467 void vpx_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
3467 __m128i dc_value; | 3468 __m128i dc_value; |
3468 const __m128i zero = _mm_setzero_si128(); | 3469 const __m128i zero = _mm_setzero_si128(); |
3469 int a, i; | 3470 int a, i; |
3470 | 3471 |
3471 a = dct_const_round_shift(input[0] * cospi_16_64); | 3472 a = dct_const_round_shift(input[0] * cospi_16_64); |
3472 a = dct_const_round_shift(a * cospi_16_64); | 3473 a = dct_const_round_shift(a * cospi_16_64); |
3473 a = ROUND_POWER_OF_TWO(a, 6); | 3474 a = ROUND_POWER_OF_TWO(a, 6); |
3474 | 3475 |
3475 dc_value = _mm_set1_epi16(a); | 3476 dc_value = _mm_set1_epi16(a); |
3476 | 3477 |
(...skipping 13 matching lines...) Expand all Loading... |
3490 const __m128i one = _mm_set1_epi16(1); | 3491 const __m128i one = _mm_set1_epi16(1); |
3491 const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); | 3492 const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); |
3492 ubounded = _mm_cmpgt_epi16(value, max); | 3493 ubounded = _mm_cmpgt_epi16(value, max); |
3493 retval = _mm_andnot_si128(ubounded, value); | 3494 retval = _mm_andnot_si128(ubounded, value); |
3494 ubounded = _mm_and_si128(ubounded, max); | 3495 ubounded = _mm_and_si128(ubounded, max); |
3495 retval = _mm_or_si128(retval, ubounded); | 3496 retval = _mm_or_si128(retval, ubounded); |
3496 retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); | 3497 retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); |
3497 return retval; | 3498 return retval; |
3498 } | 3499 } |
3499 | 3500 |
3500 void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, | 3501 void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, |
3501 int stride, int bd) { | 3502 int stride, int bd) { |
3502 tran_low_t out[4 * 4]; | 3503 tran_low_t out[4 * 4]; |
3503 tran_low_t *outptr = out; | 3504 tran_low_t *outptr = out; |
3504 int i, j; | 3505 int i, j; |
3505 __m128i inptr[4]; | 3506 __m128i inptr[4]; |
3506 __m128i sign_bits[2]; | 3507 __m128i sign_bits[2]; |
3507 __m128i temp_mm, min_input, max_input; | 3508 __m128i temp_mm, min_input, max_input; |
3508 int test; | 3509 int test; |
3509 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); | 3510 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
3510 int optimised_cols = 0; | 3511 int optimised_cols = 0; |
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3553 _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); | 3554 _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); |
3554 _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); | 3555 _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); |
3555 _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); | 3556 _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); |
3556 } else { | 3557 } else { |
3557 // Set to use the optimised transform for the column | 3558 // Set to use the optimised transform for the column |
3558 optimised_cols = 1; | 3559 optimised_cols = 1; |
3559 } | 3560 } |
3560 } else { | 3561 } else { |
3561 // Run the un-optimised row transform | 3562 // Run the un-optimised row transform |
3562 for (i = 0; i < 4; ++i) { | 3563 for (i = 0; i < 4; ++i) { |
3563 vp9_highbd_idct4_c(input, outptr, bd); | 3564 vpx_highbd_idct4_c(input, outptr, bd); |
3564 input += 4; | 3565 input += 4; |
3565 outptr += 4; | 3566 outptr += 4; |
3566 } | 3567 } |
3567 } | 3568 } |
3568 | 3569 |
3569 if (optimised_cols) { | 3570 if (optimised_cols) { |
3570 idct4_sse2(inptr); | 3571 idct4_sse2(inptr); |
3571 | 3572 |
3572 // Final round and shift | 3573 // Final round and shift |
3573 inptr[0] = _mm_add_epi16(inptr[0], eight); | 3574 inptr[0] = _mm_add_epi16(inptr[0], eight); |
(...skipping 23 matching lines...) Expand all Loading... |
3597 d2 = _mm_srli_si128(d2, 8); | 3598 d2 = _mm_srli_si128(d2, 8); |
3598 _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); | 3599 _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); |
3599 } | 3600 } |
3600 } else { | 3601 } else { |
3601 // Run the un-optimised column transform | 3602 // Run the un-optimised column transform |
3602 tran_low_t temp_in[4], temp_out[4]; | 3603 tran_low_t temp_in[4], temp_out[4]; |
3603 // Columns | 3604 // Columns |
3604 for (i = 0; i < 4; ++i) { | 3605 for (i = 0; i < 4; ++i) { |
3605 for (j = 0; j < 4; ++j) | 3606 for (j = 0; j < 4; ++j) |
3606 temp_in[j] = out[j * 4 + i]; | 3607 temp_in[j] = out[j * 4 + i]; |
3607 vp9_highbd_idct4_c(temp_in, temp_out, bd); | 3608 vpx_highbd_idct4_c(temp_in, temp_out, bd); |
3608 for (j = 0; j < 4; ++j) { | 3609 for (j = 0; j < 4; ++j) { |
3609 dest[j * stride + i] = highbd_clip_pixel_add( | 3610 dest[j * stride + i] = highbd_clip_pixel_add( |
3610 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); | 3611 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); |
3611 } | 3612 } |
3612 } | 3613 } |
3613 } | 3614 } |
3614 } | 3615 } |
3615 | 3616 |
3616 void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, | 3617 void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, |
3617 int stride, int bd) { | 3618 int stride, int bd) { |
3618 tran_low_t out[8 * 8]; | 3619 tran_low_t out[8 * 8]; |
3619 tran_low_t *outptr = out; | 3620 tran_low_t *outptr = out; |
3620 int i, j, test; | 3621 int i, j, test; |
3621 __m128i inptr[8]; | 3622 __m128i inptr[8]; |
3622 __m128i min_input, max_input, temp1, temp2, sign_bits; | 3623 __m128i min_input, max_input, temp1, temp2, sign_bits; |
3623 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); | 3624 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
3624 const __m128i zero = _mm_set1_epi16(0); | 3625 const __m128i zero = _mm_set1_epi16(0); |
3625 const __m128i sixteen = _mm_set1_epi16(16); | 3626 const __m128i sixteen = _mm_set1_epi16(16); |
3626 const __m128i max = _mm_set1_epi16(6201); | 3627 const __m128i max = _mm_set1_epi16(6201); |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3671 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); | 3672 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); |
3672 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); | 3673 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); |
3673 } | 3674 } |
3674 } else { | 3675 } else { |
3675 // Set to use the optimised transform for the column | 3676 // Set to use the optimised transform for the column |
3676 optimised_cols = 1; | 3677 optimised_cols = 1; |
3677 } | 3678 } |
3678 } else { | 3679 } else { |
3679 // Run the un-optimised row transform | 3680 // Run the un-optimised row transform |
3680 for (i = 0; i < 8; ++i) { | 3681 for (i = 0; i < 8; ++i) { |
3681 vp9_highbd_idct8_c(input, outptr, bd); | 3682 vpx_highbd_idct8_c(input, outptr, bd); |
3682 input += 8; | 3683 input += 8; |
3683 outptr += 8; | 3684 outptr += 8; |
3684 } | 3685 } |
3685 } | 3686 } |
3686 | 3687 |
3687 if (optimised_cols) { | 3688 if (optimised_cols) { |
3688 idct8_sse2(inptr); | 3689 idct8_sse2(inptr); |
3689 | 3690 |
3690 // Final round & shift and Reconstruction and Store | 3691 // Final round & shift and Reconstruction and Store |
3691 { | 3692 { |
3692 __m128i d[8]; | 3693 __m128i d[8]; |
3693 for (i = 0; i < 8; i++) { | 3694 for (i = 0; i < 8; i++) { |
3694 inptr[i] = _mm_add_epi16(inptr[i], sixteen); | 3695 inptr[i] = _mm_add_epi16(inptr[i], sixteen); |
3695 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); | 3696 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); |
3696 inptr[i] = _mm_srai_epi16(inptr[i], 5); | 3697 inptr[i] = _mm_srai_epi16(inptr[i], 5); |
3697 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); | 3698 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); |
3698 // Store | 3699 // Store |
3699 _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); | 3700 _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); |
3700 } | 3701 } |
3701 } | 3702 } |
3702 } else { | 3703 } else { |
3703 // Run the un-optimised column transform | 3704 // Run the un-optimised column transform |
3704 tran_low_t temp_in[8], temp_out[8]; | 3705 tran_low_t temp_in[8], temp_out[8]; |
3705 for (i = 0; i < 8; ++i) { | 3706 for (i = 0; i < 8; ++i) { |
3706 for (j = 0; j < 8; ++j) | 3707 for (j = 0; j < 8; ++j) |
3707 temp_in[j] = out[j * 8 + i]; | 3708 temp_in[j] = out[j * 8 + i]; |
3708 vp9_highbd_idct8_c(temp_in, temp_out, bd); | 3709 vpx_highbd_idct8_c(temp_in, temp_out, bd); |
3709 for (j = 0; j < 8; ++j) { | 3710 for (j = 0; j < 8; ++j) { |
3710 dest[j * stride + i] = highbd_clip_pixel_add( | 3711 dest[j * stride + i] = highbd_clip_pixel_add( |
3711 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); | 3712 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); |
3712 } | 3713 } |
3713 } | 3714 } |
3714 } | 3715 } |
3715 } | 3716 } |
3716 | 3717 |
3717 void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, | 3718 void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, |
3718 int stride, int bd) { | 3719 int stride, int bd) { |
3719 tran_low_t out[8 * 8] = { 0 }; | 3720 tran_low_t out[8 * 8] = { 0 }; |
3720 tran_low_t *outptr = out; | 3721 tran_low_t *outptr = out; |
3721 int i, j, test; | 3722 int i, j, test; |
3722 __m128i inptr[8]; | 3723 __m128i inptr[8]; |
3723 __m128i min_input, max_input, temp1, temp2, sign_bits; | 3724 __m128i min_input, max_input, temp1, temp2, sign_bits; |
3724 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); | 3725 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
3725 const __m128i zero = _mm_set1_epi16(0); | 3726 const __m128i zero = _mm_set1_epi16(0); |
3726 const __m128i sixteen = _mm_set1_epi16(16); | 3727 const __m128i sixteen = _mm_set1_epi16(16); |
3727 const __m128i max = _mm_set1_epi16(6201); | 3728 const __m128i max = _mm_set1_epi16(6201); |
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3775 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); | 3776 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); |
3776 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); | 3777 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); |
3777 } | 3778 } |
3778 } else { | 3779 } else { |
3779 // Set to use the optimised transform for the column | 3780 // Set to use the optimised transform for the column |
3780 optimised_cols = 1; | 3781 optimised_cols = 1; |
3781 } | 3782 } |
3782 } else { | 3783 } else { |
3783 // Run the un-optimised row transform | 3784 // Run the un-optimised row transform |
3784 for (i = 0; i < 4; ++i) { | 3785 for (i = 0; i < 4; ++i) { |
3785 vp9_highbd_idct8_c(input, outptr, bd); | 3786 vpx_highbd_idct8_c(input, outptr, bd); |
3786 input += 8; | 3787 input += 8; |
3787 outptr += 8; | 3788 outptr += 8; |
3788 } | 3789 } |
3789 } | 3790 } |
3790 | 3791 |
3791 if (optimised_cols) { | 3792 if (optimised_cols) { |
3792 idct8_sse2(inptr); | 3793 idct8_sse2(inptr); |
3793 | 3794 |
3794 // Final round & shift and Reconstruction and Store | 3795 // Final round & shift and Reconstruction and Store |
3795 { | 3796 { |
3796 __m128i d[8]; | 3797 __m128i d[8]; |
3797 for (i = 0; i < 8; i++) { | 3798 for (i = 0; i < 8; i++) { |
3798 inptr[i] = _mm_add_epi16(inptr[i], sixteen); | 3799 inptr[i] = _mm_add_epi16(inptr[i], sixteen); |
3799 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); | 3800 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); |
3800 inptr[i] = _mm_srai_epi16(inptr[i], 5); | 3801 inptr[i] = _mm_srai_epi16(inptr[i], 5); |
3801 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); | 3802 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); |
3802 // Store | 3803 // Store |
3803 _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); | 3804 _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); |
3804 } | 3805 } |
3805 } | 3806 } |
3806 } else { | 3807 } else { |
3807 // Run the un-optimised column transform | 3808 // Run the un-optimised column transform |
3808 tran_low_t temp_in[8], temp_out[8]; | 3809 tran_low_t temp_in[8], temp_out[8]; |
3809 for (i = 0; i < 8; ++i) { | 3810 for (i = 0; i < 8; ++i) { |
3810 for (j = 0; j < 8; ++j) | 3811 for (j = 0; j < 8; ++j) |
3811 temp_in[j] = out[j * 8 + i]; | 3812 temp_in[j] = out[j * 8 + i]; |
3812 vp9_highbd_idct8_c(temp_in, temp_out, bd); | 3813 vpx_highbd_idct8_c(temp_in, temp_out, bd); |
3813 for (j = 0; j < 8; ++j) { | 3814 for (j = 0; j < 8; ++j) { |
3814 dest[j * stride + i] = highbd_clip_pixel_add( | 3815 dest[j * stride + i] = highbd_clip_pixel_add( |
3815 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); | 3816 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); |
3816 } | 3817 } |
3817 } | 3818 } |
3818 } | 3819 } |
3819 } | 3820 } |
3820 | 3821 |
3821 void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, | 3822 void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, |
3822 int stride, int bd) { | 3823 int stride, int bd) { |
3823 tran_low_t out[16 * 16]; | 3824 tran_low_t out[16 * 16]; |
3824 tran_low_t *outptr = out; | 3825 tran_low_t *outptr = out; |
3825 int i, j, test; | 3826 int i, j, test; |
3826 __m128i inptr[32]; | 3827 __m128i inptr[32]; |
3827 __m128i min_input, max_input, temp1, temp2, sign_bits; | 3828 __m128i min_input, max_input, temp1, temp2, sign_bits; |
3828 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); | 3829 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
3829 const __m128i zero = _mm_set1_epi16(0); | 3830 const __m128i zero = _mm_set1_epi16(0); |
3830 const __m128i rounding = _mm_set1_epi16(32); | 3831 const __m128i rounding = _mm_set1_epi16(32); |
3831 const __m128i max = _mm_set1_epi16(3155); | 3832 const __m128i max = _mm_set1_epi16(3155); |
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3884 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); | 3885 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); |
3885 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); | 3886 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); |
3886 } | 3887 } |
3887 } else { | 3888 } else { |
3888 // Set to use the optimised transform for the column | 3889 // Set to use the optimised transform for the column |
3889 optimised_cols = 1; | 3890 optimised_cols = 1; |
3890 } | 3891 } |
3891 } else { | 3892 } else { |
3892 // Run the un-optimised row transform | 3893 // Run the un-optimised row transform |
3893 for (i = 0; i < 16; ++i) { | 3894 for (i = 0; i < 16; ++i) { |
3894 vp9_highbd_idct16_c(input, outptr, bd); | 3895 vpx_highbd_idct16_c(input, outptr, bd); |
3895 input += 16; | 3896 input += 16; |
3896 outptr += 16; | 3897 outptr += 16; |
3897 } | 3898 } |
3898 } | 3899 } |
3899 | 3900 |
3900 if (optimised_cols) { | 3901 if (optimised_cols) { |
3901 idct16_sse2(inptr, inptr + 16); | 3902 idct16_sse2(inptr, inptr + 16); |
3902 | 3903 |
3903 // Final round & shift and Reconstruction and Store | 3904 // Final round & shift and Reconstruction and Store |
3904 { | 3905 { |
(...skipping 11 matching lines...) Expand all Loading... |
3916 _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); | 3917 _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); |
3917 _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); | 3918 _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); |
3918 } | 3919 } |
3919 } | 3920 } |
3920 } else { | 3921 } else { |
3921 // Run the un-optimised column transform | 3922 // Run the un-optimised column transform |
3922 tran_low_t temp_in[16], temp_out[16]; | 3923 tran_low_t temp_in[16], temp_out[16]; |
3923 for (i = 0; i < 16; ++i) { | 3924 for (i = 0; i < 16; ++i) { |
3924 for (j = 0; j < 16; ++j) | 3925 for (j = 0; j < 16; ++j) |
3925 temp_in[j] = out[j * 16 + i]; | 3926 temp_in[j] = out[j * 16 + i]; |
3926 vp9_highbd_idct16_c(temp_in, temp_out, bd); | 3927 vpx_highbd_idct16_c(temp_in, temp_out, bd); |
3927 for (j = 0; j < 16; ++j) { | 3928 for (j = 0; j < 16; ++j) { |
3928 dest[j * stride + i] = highbd_clip_pixel_add( | 3929 dest[j * stride + i] = highbd_clip_pixel_add( |
3929 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); | 3930 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); |
3930 } | 3931 } |
3931 } | 3932 } |
3932 } | 3933 } |
3933 } | 3934 } |
3934 | 3935 |
3935 void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, | 3936 void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, |
3936 int stride, int bd) { | 3937 int stride, int bd) { |
3937 tran_low_t out[16 * 16] = { 0 }; | 3938 tran_low_t out[16 * 16] = { 0 }; |
3938 tran_low_t *outptr = out; | 3939 tran_low_t *outptr = out; |
3939 int i, j, test; | 3940 int i, j, test; |
3940 __m128i inptr[32]; | 3941 __m128i inptr[32]; |
3941 __m128i min_input, max_input, temp1, temp2, sign_bits; | 3942 __m128i min_input, max_input, temp1, temp2, sign_bits; |
3942 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); | 3943 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
3943 const __m128i zero = _mm_set1_epi16(0); | 3944 const __m128i zero = _mm_set1_epi16(0); |
3944 const __m128i rounding = _mm_set1_epi16(32); | 3945 const __m128i rounding = _mm_set1_epi16(32); |
3945 const __m128i max = _mm_set1_epi16(3155); | 3946 const __m128i max = _mm_set1_epi16(3155); |
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4003 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); | 4004 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); |
4004 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); | 4005 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); |
4005 } | 4006 } |
4006 } else { | 4007 } else { |
4007 // Set to use the optimised transform for the column | 4008 // Set to use the optimised transform for the column |
4008 optimised_cols = 1; | 4009 optimised_cols = 1; |
4009 } | 4010 } |
4010 } else { | 4011 } else { |
4011 // Run the un-optimised row transform | 4012 // Run the un-optimised row transform |
4012 for (i = 0; i < 4; ++i) { | 4013 for (i = 0; i < 4; ++i) { |
4013 vp9_highbd_idct16_c(input, outptr, bd); | 4014 vpx_highbd_idct16_c(input, outptr, bd); |
4014 input += 16; | 4015 input += 16; |
4015 outptr += 16; | 4016 outptr += 16; |
4016 } | 4017 } |
4017 } | 4018 } |
4018 | 4019 |
4019 if (optimised_cols) { | 4020 if (optimised_cols) { |
4020 idct16_sse2(inptr, inptr + 16); | 4021 idct16_sse2(inptr, inptr + 16); |
4021 | 4022 |
4022 // Final round & shift and Reconstruction and Store | 4023 // Final round & shift and Reconstruction and Store |
4023 { | 4024 { |
(...skipping 11 matching lines...) Expand all Loading... |
4035 _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); | 4036 _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); |
4036 _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); | 4037 _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); |
4037 } | 4038 } |
4038 } | 4039 } |
4039 } else { | 4040 } else { |
4040 // Run the un-optimised column transform | 4041 // Run the un-optimised column transform |
4041 tran_low_t temp_in[16], temp_out[16]; | 4042 tran_low_t temp_in[16], temp_out[16]; |
4042 for (i = 0; i < 16; ++i) { | 4043 for (i = 0; i < 16; ++i) { |
4043 for (j = 0; j < 16; ++j) | 4044 for (j = 0; j < 16; ++j) |
4044 temp_in[j] = out[j * 16 + i]; | 4045 temp_in[j] = out[j * 16 + i]; |
4045 vp9_highbd_idct16_c(temp_in, temp_out, bd); | 4046 vpx_highbd_idct16_c(temp_in, temp_out, bd); |
4046 for (j = 0; j < 16; ++j) { | 4047 for (j = 0; j < 16; ++j) { |
4047 dest[j * stride + i] = highbd_clip_pixel_add( | 4048 dest[j * stride + i] = highbd_clip_pixel_add( |
4048 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); | 4049 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); |
4049 } | 4050 } |
4050 } | 4051 } |
4051 } | 4052 } |
4052 } | 4053 } |
4053 #endif // CONFIG_VP9_HIGHBITDEPTH | 4054 #endif // CONFIG_VP9_HIGHBITDEPTH |
OLD | NEW |