| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| | 11 #include "./vpx_dsp_rtcd.h" |
| 11 #include "vpx_dsp/x86/inv_txfm_sse2.h" | 12 #include "vpx_dsp/x86/inv_txfm_sse2.h" |
| 12 #include "vpx_dsp/x86/txfm_common_sse2.h" | 13 #include "vpx_dsp/x86/txfm_common_sse2.h" |
| 13 | 14 |
| 14 #define RECON_AND_STORE4X4(dest, in_x) \ | 15 #define RECON_AND_STORE4X4(dest, in_x) \ |
| 15 { \ | 16 { \ |
| 16 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ | 17 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ |
| 17 d0 = _mm_unpacklo_epi8(d0, zero); \ | 18 d0 = _mm_unpacklo_epi8(d0, zero); \ |
| 18 d0 = _mm_add_epi16(in_x, d0); \ | 19 d0 = _mm_add_epi16(in_x, d0); \ |
| 19 d0 = _mm_packus_epi16(d0, d0); \ | 20 d0 = _mm_packus_epi16(d0, d0); \ |
| 20 *(int *)(dest) = _mm_cvtsi128_si32(d0); \ | 21 *(int *)(dest) = _mm_cvtsi128_si32(d0); \ |
| 21 } | 22 } |
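Reviewer note: for readers skimming the rename, a rough scalar equivalent of
RECON_AND_STORE4X4 for one 4-pixel row (the helper name and loop are
illustrative only; the macro performs the same add-and-saturate via
_mm_unpacklo_epi8 / _mm_packus_epi16):

    #include <stdint.h>
    static void recon_and_store4x4_row(uint8_t *dest, const int16_t *in) {
      int k;
      for (k = 0; k < 4; ++k) {
        const int v = dest[k] + in[k];  /* prediction + residual */
        dest[k] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  /* packus clamp */
      }
    }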
| 22 | 23 |
| 23 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 24 void vpx_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
| 24 const __m128i zero = _mm_setzero_si128(); | 25 const __m128i zero = _mm_setzero_si128(); |
| 25 const __m128i eight = _mm_set1_epi16(8); | 26 const __m128i eight = _mm_set1_epi16(8); |
| 26 const __m128i cst = _mm_setr_epi16( | 27 const __m128i cst = _mm_setr_epi16( |
| 27 (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, | 28 (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, |
| 28 (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, | 29 (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, |
| 29 (int16_t)cospi_8_64, (int16_t)cospi_24_64); | 30 (int16_t)cospi_8_64, (int16_t)cospi_24_64); |
| 30 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 31 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| 31 __m128i input0, input1, input2, input3; | 32 __m128i input0, input1, input2, input3; |
| 32 | 33 |
| 33 // Rows | 34 // Rows |
| (...skipping 109 matching lines...) |
| 143 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); | 144 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); |
| 144 // store input2 | 145 // store input2 |
| 145 d0 = _mm_srli_si128(d0, 4); | 146 d0 = _mm_srli_si128(d0, 4); |
| 146 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); | 147 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); |
| 147 // store input3 | 148 // store input3 |
| 148 d0 = _mm_srli_si128(d0, 4); | 149 d0 = _mm_srli_si128(d0, 4); |
| 149 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); | 150 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); |
| 150 } | 151 } |
| 151 } | 152 } |
| 152 | 153 |
| 153 void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 154 void vpx_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
| 154 __m128i dc_value; | 155 __m128i dc_value; |
| 155 const __m128i zero = _mm_setzero_si128(); | 156 const __m128i zero = _mm_setzero_si128(); |
| 156 int a; | 157 int a; |
| 157 | 158 |
| 158 a = dct_const_round_shift(input[0] * cospi_16_64); | 159 a = dct_const_round_shift(input[0] * cospi_16_64); |
| 159 a = dct_const_round_shift(a * cospi_16_64); | 160 a = dct_const_round_shift(a * cospi_16_64); |
| 160 a = ROUND_POWER_OF_TWO(a, 4); | 161 a = ROUND_POWER_OF_TWO(a, 4); |
| 161 | 162 |
| 162 dc_value = _mm_set1_epi16(a); | 163 dc_value = _mm_set1_epi16(a); |
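Reviewer note: the DC-only path collapses both 1-D passes into one multiply
each. Assuming the txfm_common.h constants (cospi_16_64 == 11585,
DCT_CONST_BITS == 14, so DCT_CONST_ROUNDING == 1 << 13), the scalar
arithmetic behind `a` is:

    #include <stdint.h>
    static int16_t dc_only_4x4(int16_t dc) {
      int a = (dc * 11585 + (1 << 13)) >> 14;  /* dct_const_round_shift, pass 1 */
      a = (a * 11585 + (1 << 13)) >> 14;       /* dct_const_round_shift, pass 2 */
      return (int16_t)((a + 8) >> 4);          /* ROUND_POWER_OF_TWO(a, 4) */
    }

The 8x8/16x16/32x32 DC functions below differ only in the final shift
(5 and 6 respectively).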
| 163 | 164 |
| (...skipping 277 matching lines...) |
| 441 out0 = _mm_adds_epi16(stp1_0, stp2_7); \ | 442 out0 = _mm_adds_epi16(stp1_0, stp2_7); \ |
| 442 out1 = _mm_adds_epi16(stp1_1, stp1_6); \ | 443 out1 = _mm_adds_epi16(stp1_1, stp1_6); \ |
| 443 out2 = _mm_adds_epi16(stp1_2, stp1_5); \ | 444 out2 = _mm_adds_epi16(stp1_2, stp1_5); \ |
| 444 out3 = _mm_adds_epi16(stp1_3, stp2_4); \ | 445 out3 = _mm_adds_epi16(stp1_3, stp2_4); \ |
| 445 out4 = _mm_subs_epi16(stp1_3, stp2_4); \ | 446 out4 = _mm_subs_epi16(stp1_3, stp2_4); \ |
| 446 out5 = _mm_subs_epi16(stp1_2, stp1_5); \ | 447 out5 = _mm_subs_epi16(stp1_2, stp1_5); \ |
| 447 out6 = _mm_subs_epi16(stp1_1, stp1_6); \ | 448 out6 = _mm_subs_epi16(stp1_1, stp1_6); \ |
| 448 out7 = _mm_subs_epi16(stp1_0, stp2_7); \ | 449 out7 = _mm_subs_epi16(stp1_0, stp2_7); \ |
| 449 } | 450 } |
| 450 | 451 |
| 451 void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 452 void vpx_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
| 452 const __m128i zero = _mm_setzero_si128(); | 453 const __m128i zero = _mm_setzero_si128(); |
| 453 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 454 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| 454 const __m128i final_rounding = _mm_set1_epi16(1 << 4); | 455 const __m128i final_rounding = _mm_set1_epi16(1 << 4); |
| 455 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 456 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
| 456 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 457 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
| 457 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 458 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
| 458 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); | 459 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); |
| 459 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 460 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
| 460 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 461 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| 461 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 462 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
| (...skipping 10 matching lines...) |
| 472 in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); | 473 in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); |
| 473 in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); | 474 in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
| 474 in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); | 475 in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); |
| 475 in4 = _mm_load_si128((const __m128i *)(input + 8 * 4)); | 476 in4 = _mm_load_si128((const __m128i *)(input + 8 * 4)); |
| 476 in5 = _mm_load_si128((const __m128i *)(input + 8 * 5)); | 477 in5 = _mm_load_si128((const __m128i *)(input + 8 * 5)); |
| 477 in6 = _mm_load_si128((const __m128i *)(input + 8 * 6)); | 478 in6 = _mm_load_si128((const __m128i *)(input + 8 * 6)); |
| 478 in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); | 479 in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); |
| 479 | 480 |
| 480 // 2-D | 481 // 2-D |
| 481 for (i = 0; i < 2; i++) { | 482 for (i = 0; i < 2; i++) { |
| 482 // 8x8 Transpose is copied from vp9_fdct8x8_sse2() | 483 // 8x8 Transpose is copied from vpx_fdct8x8_sse2() |
| 483 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, | 484 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, |
| 484 in0, in1, in2, in3, in4, in5, in6, in7); | 485 in0, in1, in2, in3, in4, in5, in6, in7); |
| 485 | 486 |
| 486 // 4-stage 1D idct8x8 | 487 // 4-stage 1D idct8x8 |
| 487 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, | 488 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, |
| 488 in0, in1, in2, in3, in4, in5, in6, in7); | 489 in0, in1, in2, in3, in4, in5, in6, in7); |
| 489 } | 490 } |
| 490 | 491 |
| 491 // Final rounding and shift | 492 // Final rounding and shift |
| 492 in0 = _mm_adds_epi16(in0, final_rounding); | 493 in0 = _mm_adds_epi16(in0, final_rounding); |
| (...skipping 17 matching lines...) |
| 510 RECON_AND_STORE(dest + 0 * stride, in0); | 511 RECON_AND_STORE(dest + 0 * stride, in0); |
| 511 RECON_AND_STORE(dest + 1 * stride, in1); | 512 RECON_AND_STORE(dest + 1 * stride, in1); |
| 512 RECON_AND_STORE(dest + 2 * stride, in2); | 513 RECON_AND_STORE(dest + 2 * stride, in2); |
| 513 RECON_AND_STORE(dest + 3 * stride, in3); | 514 RECON_AND_STORE(dest + 3 * stride, in3); |
| 514 RECON_AND_STORE(dest + 4 * stride, in4); | 515 RECON_AND_STORE(dest + 4 * stride, in4); |
| 515 RECON_AND_STORE(dest + 5 * stride, in5); | 516 RECON_AND_STORE(dest + 5 * stride, in5); |
| 516 RECON_AND_STORE(dest + 6 * stride, in6); | 517 RECON_AND_STORE(dest + 6 * stride, in6); |
| 517 RECON_AND_STORE(dest + 7 * stride, in7); | 518 RECON_AND_STORE(dest + 7 * stride, in7); |
| 518 } | 519 } |
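Reviewer note: every stg*_ constant above feeds the same
pair_set_epi16 + _mm_madd_epi16 butterfly that the IDCT8 stages are built
from. A minimal sketch of the low half of one butterfly, assuming the
2^14-scaled cosine constants from vpx_dsp/txfm_common.h (the real macro
also processes the unpackhi half and packs back to int16 with
_mm_packs_epi32):

    #include <emmintrin.h>
    static __m128i butterfly_lo(__m128i a, __m128i b, __m128i cst,
                                __m128i rounding) {
      const __m128i lo = _mm_unpacklo_epi16(a, b);  /* pairs (a[k], b[k]) */
      __m128i u = _mm_madd_epi16(lo, cst);          /* a*c_even + b*c_odd */
      u = _mm_add_epi32(u, rounding);               /* + DCT_CONST_ROUNDING */
      return _mm_srai_epi32(u, 14);                 /* >> DCT_CONST_BITS */
    }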
| 519 | 520 |
| 520 void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 521 void vpx_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
| 521 __m128i dc_value; | 522 __m128i dc_value; |
| 522 const __m128i zero = _mm_setzero_si128(); | 523 const __m128i zero = _mm_setzero_si128(); |
| 523 int a; | 524 int a; |
| 524 | 525 |
| 525 a = dct_const_round_shift(input[0] * cospi_16_64); | 526 a = dct_const_round_shift(input[0] * cospi_16_64); |
| 526 a = dct_const_round_shift(a * cospi_16_64); | 527 a = dct_const_round_shift(a * cospi_16_64); |
| 527 a = ROUND_POWER_OF_TWO(a, 5); | 528 a = ROUND_POWER_OF_TWO(a, 5); |
| 528 | 529 |
| 529 dc_value = _mm_set1_epi16(a); | 530 dc_value = _mm_set1_epi16(a); |
| 530 | 531 |
| (...skipping 16 matching lines...) |
| 547 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 548 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
| 548 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 549 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| 549 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 550 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
| 550 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); | 551 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
| 551 | 552 |
| 552 __m128i in0, in1, in2, in3, in4, in5, in6, in7; | 553 __m128i in0, in1, in2, in3, in4, in5, in6, in7; |
| 553 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; | 554 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; |
| 554 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; | 555 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; |
| 555 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 556 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
| 556 | 557 |
| 557 // 8x8 Transpose is copied from vp9_fdct8x8_sse2() | 558 // 8x8 Transpose is copied from vpx_fdct8x8_sse2() |
| 558 TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], | 559 TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], |
| 559 in0, in1, in2, in3, in4, in5, in6, in7); | 560 in0, in1, in2, in3, in4, in5, in6, in7); |
| 560 | 561 |
| 561 // 4-stage 1D idct8x8 | 562 // 4-stage 1D idct8x8 |
| 562 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, | 563 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, |
| 563 in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); | 564 in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); |
| 564 } | 565 } |
| 565 | 566 |
| 566 void iadst8_sse2(__m128i *in) { | 567 void iadst8_sse2(__m128i *in) { |
| 567 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); | 568 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); |
| (...skipping 216 matching lines...) |
| 784 in[0] = s0; | 785 in[0] = s0; |
| 785 in[1] = _mm_sub_epi16(k__const_0, s4); | 786 in[1] = _mm_sub_epi16(k__const_0, s4); |
| 786 in[2] = s6; | 787 in[2] = s6; |
| 787 in[3] = _mm_sub_epi16(k__const_0, s2); | 788 in[3] = _mm_sub_epi16(k__const_0, s2); |
| 788 in[4] = s3; | 789 in[4] = s3; |
| 789 in[5] = _mm_sub_epi16(k__const_0, s7); | 790 in[5] = _mm_sub_epi16(k__const_0, s7); |
| 790 in[6] = s5; | 791 in[6] = s5; |
| 791 in[7] = _mm_sub_epi16(k__const_0, s1); | 792 in[7] = _mm_sub_epi16(k__const_0, s1); |
| 792 } | 793 } |
| 793 | 794 |
| 794 void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 795 void vpx_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
| 795 const __m128i zero = _mm_setzero_si128(); | 796 const __m128i zero = _mm_setzero_si128(); |
| 796 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 797 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| 797 const __m128i final_rounding = _mm_set1_epi16(1 << 4); | 798 const __m128i final_rounding = _mm_set1_epi16(1 << 4); |
| 798 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 799 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
| 799 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 800 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
| 800 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 801 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
| 801 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); | 802 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); |
| 802 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 803 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
| 803 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 804 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| 804 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 805 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
| (...skipping 356 matching lines...) |
| 1161 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ | 1162 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ |
| 1162 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ | 1163 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ |
| 1163 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ | 1164 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ |
| 1164 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ | 1165 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ |
| 1165 \ | 1166 \ |
| 1166 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ | 1167 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ |
| 1167 stg6_0, stg4_0, stg6_0, stg4_0, \ | 1168 stg6_0, stg4_0, stg6_0, stg4_0, \ |
| 1168 stp2_10, stp2_13, stp2_11, stp2_12) \ | 1169 stp2_10, stp2_13, stp2_11, stp2_12) \ |
| 1169 } | 1170 } |
| 1170 | 1171 |
| 1171 void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, | 1172 void vpx_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, |
| 1172 int stride) { | 1173 int stride) { |
| 1173 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 1174 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| 1174 const __m128i final_rounding = _mm_set1_epi16(1 << 5); | 1175 const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
| 1175 const __m128i zero = _mm_setzero_si128(); | 1176 const __m128i zero = _mm_setzero_si128(); |
| 1176 | 1177 |
| 1177 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); | 1178 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
| 1178 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); | 1179 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); |
| 1179 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); | 1180 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
| 1180 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); | 1181 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); |
| 1181 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); | 1182 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
| (...skipping 104 matching lines...) |
| 1286 // Final rounding and shift | 1287 // Final rounding and shift |
| 1287 in[j] = _mm_adds_epi16(in[j], final_rounding); | 1288 in[j] = _mm_adds_epi16(in[j], final_rounding); |
| 1288 in[j] = _mm_srai_epi16(in[j], 6); | 1289 in[j] = _mm_srai_epi16(in[j], 6); |
| 1289 RECON_AND_STORE(dest + j * stride, in[j]); | 1290 RECON_AND_STORE(dest + j * stride, in[j]); |
| 1290 } | 1291 } |
| 1291 | 1292 |
| 1292 dest += 8; | 1293 dest += 8; |
| 1293 } | 1294 } |
| 1294 } | 1295 } |
| 1295 | 1296 |
| 1296 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 1297 void vpx_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
| 1297 __m128i dc_value; | 1298 __m128i dc_value; |
| 1298 const __m128i zero = _mm_setzero_si128(); | 1299 const __m128i zero = _mm_setzero_si128(); |
| 1299 int a, i; | 1300 int a, i; |
| 1300 | 1301 |
| 1301 a = dct_const_round_shift(input[0] * cospi_16_64); | 1302 a = dct_const_round_shift(input[0] * cospi_16_64); |
| 1302 a = dct_const_round_shift(a * cospi_16_64); | 1303 a = dct_const_round_shift(a * cospi_16_64); |
| 1303 a = ROUND_POWER_OF_TWO(a, 6); | 1304 a = ROUND_POWER_OF_TWO(a, 6); |
| 1304 | 1305 |
| 1305 dc_value = _mm_set1_epi16(a); | 1306 dc_value = _mm_set1_epi16(a); |
| 1306 | 1307 |
| (...skipping 837 matching lines...) |
| 2144 idct16_8col(in0); | 2145 idct16_8col(in0); |
| 2145 idct16_8col(in1); | 2146 idct16_8col(in1); |
| 2146 } | 2147 } |
| 2147 | 2148 |
| 2148 void iadst16_sse2(__m128i *in0, __m128i *in1) { | 2149 void iadst16_sse2(__m128i *in0, __m128i *in1) { |
| 2149 array_transpose_16x16(in0, in1); | 2150 array_transpose_16x16(in0, in1); |
| 2150 iadst16_8col(in0); | 2151 iadst16_8col(in0); |
| 2151 iadst16_8col(in1); | 2152 iadst16_8col(in1); |
| 2152 } | 2153 } |
| 2153 | 2154 |
| 2154 void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, | 2155 void vpx_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, |
| 2155 int stride) { | 2156 int stride) { |
| 2156 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 2157 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| 2157 const __m128i final_rounding = _mm_set1_epi16(1 << 5); | 2158 const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
| 2158 const __m128i zero = _mm_setzero_si128(); | 2159 const __m128i zero = _mm_setzero_si128(); |
| 2159 | 2160 |
| 2160 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); | 2161 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
| 2161 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); | 2162 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); |
| 2162 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); | 2163 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
| 2163 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); | 2164 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); |
| 2164 | 2165 |
| (...skipping 856 matching lines...) |
| 3021 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ | 3022 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ |
| 3022 stp1_23, stp1_24) \ | 3023 stp1_23, stp1_24) \ |
| 3023 \ | 3024 \ |
| 3024 stp1_28 = stp2_28; \ | 3025 stp1_28 = stp2_28; \ |
| 3025 stp1_29 = stp2_29; \ | 3026 stp1_29 = stp2_29; \ |
| 3026 stp1_30 = stp2_30; \ | 3027 stp1_30 = stp2_30; \ |
| 3027 stp1_31 = stp2_31; \ | 3028 stp1_31 = stp2_31; \ |
| 3028 } | 3029 } |
| 3029 | 3030 |
| 3030 // Only upper-left 8x8 has non-zero coeff | 3031 // Only upper-left 8x8 has non-zero coeff |
| 3031 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, | 3032 void vpx_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, |
| 3032 int stride) { | 3033 int stride) { |
| 3033 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 3034 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| 3034 const __m128i final_rounding = _mm_set1_epi16(1<<5); | 3035 const __m128i final_rounding = _mm_set1_epi16(1<<5); |
| 3035 | 3036 |
| 3036 // idct constants for each stage | 3037 // idct constants for each stage |
| 3037 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); | 3038 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
| 3038 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); | 3039 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); |
| 3039 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); | 3040 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); |
| 3040 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); | 3041 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); |
| 3041 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); | 3042 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); |
| (...skipping 138 matching lines...) |
| 3180 // Final rounding and shift | 3181 // Final rounding and shift |
| 3181 in[j] = _mm_adds_epi16(in[j], final_rounding); | 3182 in[j] = _mm_adds_epi16(in[j], final_rounding); |
| 3182 in[j] = _mm_srai_epi16(in[j], 6); | 3183 in[j] = _mm_srai_epi16(in[j], 6); |
| 3183 RECON_AND_STORE(dest + j * stride, in[j]); | 3184 RECON_AND_STORE(dest + j * stride, in[j]); |
| 3184 } | 3185 } |
| 3185 | 3186 |
| 3186 dest += 8; | 3187 dest += 8; |
| 3187 } | 3188 } |
| 3188 } | 3189 } |
| 3189 | 3190 |
| 3190 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, | 3191 void vpx_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, |
| 3191 int stride) { | 3192 int stride) { |
| 3192 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 3193 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| 3193 const __m128i final_rounding = _mm_set1_epi16(1 << 5); | 3194 const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
| 3194 const __m128i zero = _mm_setzero_si128(); | 3195 const __m128i zero = _mm_setzero_si128(); |
| 3195 | 3196 |
| 3196 // idct constants for each stage | 3197 // idct constants for each stage |
| 3197 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); | 3198 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
| 3198 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); | 3199 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); |
| 3199 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); | 3200 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); |
| 3200 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); | 3201 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); |
| (...skipping 255 matching lines...) |
| 3456 // Final rounding and shift | 3457 // Final rounding and shift |
| 3457 in[j] = _mm_adds_epi16(in[j], final_rounding); | 3458 in[j] = _mm_adds_epi16(in[j], final_rounding); |
| 3458 in[j] = _mm_srai_epi16(in[j], 6); | 3459 in[j] = _mm_srai_epi16(in[j], 6); |
| 3459 RECON_AND_STORE(dest + j * stride, in[j]); | 3460 RECON_AND_STORE(dest + j * stride, in[j]); |
| 3460 } | 3461 } |
| 3461 | 3462 |
| 3462 dest += 8; | 3463 dest += 8; |
| 3463 } | 3464 } |
| 3464 } | 3465 } |
| 3465 | 3466 |
| 3466 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 3467 void vpx_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
| 3467 __m128i dc_value; | 3468 __m128i dc_value; |
| 3468 const __m128i zero = _mm_setzero_si128(); | 3469 const __m128i zero = _mm_setzero_si128(); |
| 3469 int a, i; | 3470 int a, i; |
| 3470 | 3471 |
| 3471 a = dct_const_round_shift(input[0] * cospi_16_64); | 3472 a = dct_const_round_shift(input[0] * cospi_16_64); |
| 3472 a = dct_const_round_shift(a * cospi_16_64); | 3473 a = dct_const_round_shift(a * cospi_16_64); |
| 3473 a = ROUND_POWER_OF_TWO(a, 6); | 3474 a = ROUND_POWER_OF_TWO(a, 6); |
| 3474 | 3475 |
| 3475 dc_value = _mm_set1_epi16(a); | 3476 dc_value = _mm_set1_epi16(a); |
| 3476 | 3477 |
| (...skipping 13 matching lines...) |
| 3490 const __m128i one = _mm_set1_epi16(1); | 3491 const __m128i one = _mm_set1_epi16(1); |
| 3491 const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); | 3492 const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); |
| 3492 ubounded = _mm_cmpgt_epi16(value, max); | 3493 ubounded = _mm_cmpgt_epi16(value, max); |
| 3493 retval = _mm_andnot_si128(ubounded, value); | 3494 retval = _mm_andnot_si128(ubounded, value); |
| 3494 ubounded = _mm_and_si128(ubounded, max); | 3495 ubounded = _mm_and_si128(ubounded, max); |
| 3495 retval = _mm_or_si128(retval, ubounded); | 3496 retval = _mm_or_si128(retval, ubounded); |
| 3496 retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); | 3497 retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); |
| 3497 return retval; | 3498 return retval; |
| 3498 } | 3499 } |
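Reviewer note: clamp_high_sse2 is a branchless select; per 16-bit lane it
is equivalent to this scalar clamp to the bd-bit pixel range:

    #include <stdint.h>
    static int16_t clamp_high_scalar(int16_t value, int bd) {
      const int16_t max = (int16_t)((1 << bd) - 1);
      return value < 0 ? 0 : (value > max ? max : value);
    }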
| 3499 | 3500 |
| 3500 void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, | 3501 void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, |
| 3501 int stride, int bd) { | 3502 int stride, int bd) { |
| 3502 tran_low_t out[4 * 4]; | 3503 tran_low_t out[4 * 4]; |
| 3503 tran_low_t *outptr = out; | 3504 tran_low_t *outptr = out; |
| 3504 int i, j; | 3505 int i, j; |
| 3505 __m128i inptr[4]; | 3506 __m128i inptr[4]; |
| 3506 __m128i sign_bits[2]; | 3507 __m128i sign_bits[2]; |
| 3507 __m128i temp_mm, min_input, max_input; | 3508 __m128i temp_mm, min_input, max_input; |
| 3508 int test; | 3509 int test; |
| 3509 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); | 3510 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 3510 int optimised_cols = 0; | 3511 int optimised_cols = 0; |
| (...skipping 42 matching lines...) |
| 3553 _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); | 3554 _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); |
| 3554 _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); | 3555 _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); |
| 3555 _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); | 3556 _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); |
| 3556 } else { | 3557 } else { |
| 3557 // Set to use the optimised transform for the column | 3558 // Set to use the optimised transform for the column |
| 3558 optimised_cols = 1; | 3559 optimised_cols = 1; |
| 3559 } | 3560 } |
| 3560 } else { | 3561 } else { |
| 3561 // Run the un-optimised row transform | 3562 // Run the un-optimised row transform |
| 3562 for (i = 0; i < 4; ++i) { | 3563 for (i = 0; i < 4; ++i) { |
| 3563 vp9_highbd_idct4_c(input, outptr, bd); | 3564 vpx_highbd_idct4_c(input, outptr, bd); |
| 3564 input += 4; | 3565 input += 4; |
| 3565 outptr += 4; | 3566 outptr += 4; |
| 3566 } | 3567 } |
| 3567 } | 3568 } |
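Reviewer note: the `test` / `optimised_cols` pattern used by all of these
highbd functions first checks that the int32 input coefficients fit in 16
bits, then range-checks the SSE2 row-pass output before committing to the
int16 column pass; either failure falls back to the C transforms
(vpx_highbd_idct4_c etc.), as in the else branches above. The core check,
in scalar form, amounts to:

    #include <stdint.h>
    static int fits_int16(const int32_t *coeffs, int n) {
      int i;
      for (i = 0; i < n; ++i)
        if (coeffs[i] < INT16_MIN || coeffs[i] > INT16_MAX) return 0;
      return 1;
    }

The 8x8 and 16x16 variants bound the row-pass output with the tighter
per-transform constants (+/-6201 and +/-3155) declared in those functions.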
| 3568 | 3569 |
| 3569 if (optimised_cols) { | 3570 if (optimised_cols) { |
| 3570 idct4_sse2(inptr); | 3571 idct4_sse2(inptr); |
| 3571 | 3572 |
| 3572 // Final round and shift | 3573 // Final round and shift |
| 3573 inptr[0] = _mm_add_epi16(inptr[0], eight); | 3574 inptr[0] = _mm_add_epi16(inptr[0], eight); |
| (...skipping 23 matching lines...) |
| 3597 d2 = _mm_srli_si128(d2, 8); | 3598 d2 = _mm_srli_si128(d2, 8); |
| 3598 _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); | 3599 _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); |
| 3599 } | 3600 } |
| 3600 } else { | 3601 } else { |
| 3601 // Run the un-optimised column transform | 3602 // Run the un-optimised column transform |
| 3602 tran_low_t temp_in[4], temp_out[4]; | 3603 tran_low_t temp_in[4], temp_out[4]; |
| 3603 // Columns | 3604 // Columns |
| 3604 for (i = 0; i < 4; ++i) { | 3605 for (i = 0; i < 4; ++i) { |
| 3605 for (j = 0; j < 4; ++j) | 3606 for (j = 0; j < 4; ++j) |
| 3606 temp_in[j] = out[j * 4 + i]; | 3607 temp_in[j] = out[j * 4 + i]; |
| 3607 vp9_highbd_idct4_c(temp_in, temp_out, bd); | 3608 vpx_highbd_idct4_c(temp_in, temp_out, bd); |
| 3608 for (j = 0; j < 4; ++j) { | 3609 for (j = 0; j < 4; ++j) { |
| 3609 dest[j * stride + i] = highbd_clip_pixel_add( | 3610 dest[j * stride + i] = highbd_clip_pixel_add( |
| 3610 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); | 3611 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); |
| 3611 } | 3612 } |
| 3612 } | 3613 } |
| 3613 } | 3614 } |
| 3614 } | 3615 } |
| 3615 | 3616 |
| 3616 void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, | 3617 void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, |
| 3617 int stride, int bd) { | 3618 int stride, int bd) { |
| 3618 tran_low_t out[8 * 8]; | 3619 tran_low_t out[8 * 8]; |
| 3619 tran_low_t *outptr = out; | 3620 tran_low_t *outptr = out; |
| 3620 int i, j, test; | 3621 int i, j, test; |
| 3621 __m128i inptr[8]; | 3622 __m128i inptr[8]; |
| 3622 __m128i min_input, max_input, temp1, temp2, sign_bits; | 3623 __m128i min_input, max_input, temp1, temp2, sign_bits; |
| 3623 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); | 3624 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 3624 const __m128i zero = _mm_set1_epi16(0); | 3625 const __m128i zero = _mm_set1_epi16(0); |
| 3625 const __m128i sixteen = _mm_set1_epi16(16); | 3626 const __m128i sixteen = _mm_set1_epi16(16); |
| 3626 const __m128i max = _mm_set1_epi16(6201); | 3627 const __m128i max = _mm_set1_epi16(6201); |
| (...skipping 44 matching lines...) |
| 3671 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); | 3672 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); |
| 3672 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); | 3673 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); |
| 3673 } | 3674 } |
| 3674 } else { | 3675 } else { |
| 3675 // Set to use the optimised transform for the column | 3676 // Set to use the optimised transform for the column |
| 3676 optimised_cols = 1; | 3677 optimised_cols = 1; |
| 3677 } | 3678 } |
| 3678 } else { | 3679 } else { |
| 3679 // Run the un-optimised row transform | 3680 // Run the un-optimised row transform |
| 3680 for (i = 0; i < 8; ++i) { | 3681 for (i = 0; i < 8; ++i) { |
| 3681 vp9_highbd_idct8_c(input, outptr, bd); | 3682 vpx_highbd_idct8_c(input, outptr, bd); |
| 3682 input += 8; | 3683 input += 8; |
| 3683 outptr += 8; | 3684 outptr += 8; |
| 3684 } | 3685 } |
| 3685 } | 3686 } |
| 3686 | 3687 |
| 3687 if (optimised_cols) { | 3688 if (optimised_cols) { |
| 3688 idct8_sse2(inptr); | 3689 idct8_sse2(inptr); |
| 3689 | 3690 |
| 3690 // Final round & shift and Reconstruction and Store | 3691 // Final round & shift and Reconstruction and Store |
| 3691 { | 3692 { |
| 3692 __m128i d[8]; | 3693 __m128i d[8]; |
| 3693 for (i = 0; i < 8; i++) { | 3694 for (i = 0; i < 8; i++) { |
| 3694 inptr[i] = _mm_add_epi16(inptr[i], sixteen); | 3695 inptr[i] = _mm_add_epi16(inptr[i], sixteen); |
| 3695 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); | 3696 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); |
| 3696 inptr[i] = _mm_srai_epi16(inptr[i], 5); | 3697 inptr[i] = _mm_srai_epi16(inptr[i], 5); |
| 3697 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); | 3698 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); |
| 3698 // Store | 3699 // Store |
| 3699 _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); | 3700 _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); |
| 3700 } | 3701 } |
| 3701 } | 3702 } |
| 3702 } else { | 3703 } else { |
| 3703 // Run the un-optimised column transform | 3704 // Run the un-optimised column transform |
| 3704 tran_low_t temp_in[8], temp_out[8]; | 3705 tran_low_t temp_in[8], temp_out[8]; |
| 3705 for (i = 0; i < 8; ++i) { | 3706 for (i = 0; i < 8; ++i) { |
| 3706 for (j = 0; j < 8; ++j) | 3707 for (j = 0; j < 8; ++j) |
| 3707 temp_in[j] = out[j * 8 + i]; | 3708 temp_in[j] = out[j * 8 + i]; |
| 3708 vp9_highbd_idct8_c(temp_in, temp_out, bd); | 3709 vpx_highbd_idct8_c(temp_in, temp_out, bd); |
| 3709 for (j = 0; j < 8; ++j) { | 3710 for (j = 0; j < 8; ++j) { |
| 3710 dest[j * stride + i] = highbd_clip_pixel_add( | 3711 dest[j * stride + i] = highbd_clip_pixel_add( |
| 3711 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); | 3712 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); |
| 3712 } | 3713 } |
| 3713 } | 3714 } |
| 3714 } | 3715 } |
| 3715 } | 3716 } |
| 3716 | 3717 |
| 3717 void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, | 3718 void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, |
| 3718 int stride, int bd) { | 3719 int stride, int bd) { |
| 3719 tran_low_t out[8 * 8] = { 0 }; | 3720 tran_low_t out[8 * 8] = { 0 }; |
| 3720 tran_low_t *outptr = out; | 3721 tran_low_t *outptr = out; |
| 3721 int i, j, test; | 3722 int i, j, test; |
| 3722 __m128i inptr[8]; | 3723 __m128i inptr[8]; |
| 3723 __m128i min_input, max_input, temp1, temp2, sign_bits; | 3724 __m128i min_input, max_input, temp1, temp2, sign_bits; |
| 3724 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); | 3725 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 3725 const __m128i zero = _mm_set1_epi16(0); | 3726 const __m128i zero = _mm_set1_epi16(0); |
| 3726 const __m128i sixteen = _mm_set1_epi16(16); | 3727 const __m128i sixteen = _mm_set1_epi16(16); |
| 3727 const __m128i max = _mm_set1_epi16(6201); | 3728 const __m128i max = _mm_set1_epi16(6201); |
| (...skipping 47 matching lines...) |
| 3775 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); | 3776 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); |
| 3776 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); | 3777 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); |
| 3777 } | 3778 } |
| 3778 } else { | 3779 } else { |
| 3779 // Set to use the optimised transform for the column | 3780 // Set to use the optimised transform for the column |
| 3780 optimised_cols = 1; | 3781 optimised_cols = 1; |
| 3781 } | 3782 } |
| 3782 } else { | 3783 } else { |
| 3783 // Run the un-optimised row transform | 3784 // Run the un-optimised row transform |
| 3784 for (i = 0; i < 4; ++i) { | 3785 for (i = 0; i < 4; ++i) { |
| 3785 vp9_highbd_idct8_c(input, outptr, bd); | 3786 vpx_highbd_idct8_c(input, outptr, bd); |
| 3786 input += 8; | 3787 input += 8; |
| 3787 outptr += 8; | 3788 outptr += 8; |
| 3788 } | 3789 } |
| 3789 } | 3790 } |
| 3790 | 3791 |
| 3791 if (optimised_cols) { | 3792 if (optimised_cols) { |
| 3792 idct8_sse2(inptr); | 3793 idct8_sse2(inptr); |
| 3793 | 3794 |
| 3794 // Final round & shift and Reconstruction and Store | 3795 // Final round & shift and Reconstruction and Store |
| 3795 { | 3796 { |
| 3796 __m128i d[8]; | 3797 __m128i d[8]; |
| 3797 for (i = 0; i < 8; i++) { | 3798 for (i = 0; i < 8; i++) { |
| 3798 inptr[i] = _mm_add_epi16(inptr[i], sixteen); | 3799 inptr[i] = _mm_add_epi16(inptr[i], sixteen); |
| 3799 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); | 3800 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); |
| 3800 inptr[i] = _mm_srai_epi16(inptr[i], 5); | 3801 inptr[i] = _mm_srai_epi16(inptr[i], 5); |
| 3801 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); | 3802 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); |
| 3802 // Store | 3803 // Store |
| 3803 _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); | 3804 _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); |
| 3804 } | 3805 } |
| 3805 } | 3806 } |
| 3806 } else { | 3807 } else { |
| 3807 // Run the un-optimised column transform | 3808 // Run the un-optimised column transform |
| 3808 tran_low_t temp_in[8], temp_out[8]; | 3809 tran_low_t temp_in[8], temp_out[8]; |
| 3809 for (i = 0; i < 8; ++i) { | 3810 for (i = 0; i < 8; ++i) { |
| 3810 for (j = 0; j < 8; ++j) | 3811 for (j = 0; j < 8; ++j) |
| 3811 temp_in[j] = out[j * 8 + i]; | 3812 temp_in[j] = out[j * 8 + i]; |
| 3812 vp9_highbd_idct8_c(temp_in, temp_out, bd); | 3813 vpx_highbd_idct8_c(temp_in, temp_out, bd); |
| 3813 for (j = 0; j < 8; ++j) { | 3814 for (j = 0; j < 8; ++j) { |
| 3814 dest[j * stride + i] = highbd_clip_pixel_add( | 3815 dest[j * stride + i] = highbd_clip_pixel_add( |
| 3815 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); | 3816 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); |
| 3816 } | 3817 } |
| 3817 } | 3818 } |
| 3818 } | 3819 } |
| 3819 } | 3820 } |
| 3820 | 3821 |
| 3821 void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, | 3822 void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, |
| 3822 int stride, int bd) { | 3823 int stride, int bd) { |
| 3823 tran_low_t out[16 * 16]; | 3824 tran_low_t out[16 * 16]; |
| 3824 tran_low_t *outptr = out; | 3825 tran_low_t *outptr = out; |
| 3825 int i, j, test; | 3826 int i, j, test; |
| 3826 __m128i inptr[32]; | 3827 __m128i inptr[32]; |
| 3827 __m128i min_input, max_input, temp1, temp2, sign_bits; | 3828 __m128i min_input, max_input, temp1, temp2, sign_bits; |
| 3828 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); | 3829 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 3829 const __m128i zero = _mm_set1_epi16(0); | 3830 const __m128i zero = _mm_set1_epi16(0); |
| 3830 const __m128i rounding = _mm_set1_epi16(32); | 3831 const __m128i rounding = _mm_set1_epi16(32); |
| 3831 const __m128i max = _mm_set1_epi16(3155); | 3832 const __m128i max = _mm_set1_epi16(3155); |
| (...skipping 52 matching lines...) |
| 3884 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); | 3885 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); |
| 3885 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); | 3886 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); |
| 3886 } | 3887 } |
| 3887 } else { | 3888 } else { |
| 3888 // Set to use the optimised transform for the column | 3889 // Set to use the optimised transform for the column |
| 3889 optimised_cols = 1; | 3890 optimised_cols = 1; |
| 3890 } | 3891 } |
| 3891 } else { | 3892 } else { |
| 3892 // Run the un-optimised row transform | 3893 // Run the un-optimised row transform |
| 3893 for (i = 0; i < 16; ++i) { | 3894 for (i = 0; i < 16; ++i) { |
| 3894 vp9_highbd_idct16_c(input, outptr, bd); | 3895 vpx_highbd_idct16_c(input, outptr, bd); |
| 3895 input += 16; | 3896 input += 16; |
| 3896 outptr += 16; | 3897 outptr += 16; |
| 3897 } | 3898 } |
| 3898 } | 3899 } |
| 3899 | 3900 |
| 3900 if (optimised_cols) { | 3901 if (optimised_cols) { |
| 3901 idct16_sse2(inptr, inptr + 16); | 3902 idct16_sse2(inptr, inptr + 16); |
| 3902 | 3903 |
| 3903 // Final round & shift and Reconstruction and Store | 3904 // Final round & shift and Reconstruction and Store |
| 3904 { | 3905 { |
| (...skipping 11 matching lines...) |
| 3916 _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); | 3917 _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); |
| 3917 _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); | 3918 _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); |
| 3918 } | 3919 } |
| 3919 } | 3920 } |
| 3920 } else { | 3921 } else { |
| 3921 // Run the un-optimised column transform | 3922 // Run the un-optimised column transform |
| 3922 tran_low_t temp_in[16], temp_out[16]; | 3923 tran_low_t temp_in[16], temp_out[16]; |
| 3923 for (i = 0; i < 16; ++i) { | 3924 for (i = 0; i < 16; ++i) { |
| 3924 for (j = 0; j < 16; ++j) | 3925 for (j = 0; j < 16; ++j) |
| 3925 temp_in[j] = out[j * 16 + i]; | 3926 temp_in[j] = out[j * 16 + i]; |
| 3926 vp9_highbd_idct16_c(temp_in, temp_out, bd); | 3927 vpx_highbd_idct16_c(temp_in, temp_out, bd); |
| 3927 for (j = 0; j < 16; ++j) { | 3928 for (j = 0; j < 16; ++j) { |
| 3928 dest[j * stride + i] = highbd_clip_pixel_add( | 3929 dest[j * stride + i] = highbd_clip_pixel_add( |
| 3929 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); | 3930 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); |
| 3930 } | 3931 } |
| 3931 } | 3932 } |
| 3932 } | 3933 } |
| 3933 } | 3934 } |
| 3934 | 3935 |
| 3935 void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, | 3936 void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, |
| 3936 int stride, int bd) { | 3937 int stride, int bd) { |
| 3937 tran_low_t out[16 * 16] = { 0 }; | 3938 tran_low_t out[16 * 16] = { 0 }; |
| 3938 tran_low_t *outptr = out; | 3939 tran_low_t *outptr = out; |
| 3939 int i, j, test; | 3940 int i, j, test; |
| 3940 __m128i inptr[32]; | 3941 __m128i inptr[32]; |
| 3941 __m128i min_input, max_input, temp1, temp2, sign_bits; | 3942 __m128i min_input, max_input, temp1, temp2, sign_bits; |
| 3942 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); | 3943 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 3943 const __m128i zero = _mm_set1_epi16(0); | 3944 const __m128i zero = _mm_set1_epi16(0); |
| 3944 const __m128i rounding = _mm_set1_epi16(32); | 3945 const __m128i rounding = _mm_set1_epi16(32); |
| 3945 const __m128i max = _mm_set1_epi16(3155); | 3946 const __m128i max = _mm_set1_epi16(3155); |
| (...skipping 57 matching lines...) |
| 4003 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); | 4004 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); |
| 4004 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); | 4005 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); |
| 4005 } | 4006 } |
| 4006 } else { | 4007 } else { |
| 4007 // Set to use the optimised transform for the column | 4008 // Set to use the optimised transform for the column |
| 4008 optimised_cols = 1; | 4009 optimised_cols = 1; |
| 4009 } | 4010 } |
| 4010 } else { | 4011 } else { |
| 4011 // Run the un-optimised row transform | 4012 // Run the un-optimised row transform |
| 4012 for (i = 0; i < 4; ++i) { | 4013 for (i = 0; i < 4; ++i) { |
| 4013 vp9_highbd_idct16_c(input, outptr, bd); | 4014 vpx_highbd_idct16_c(input, outptr, bd); |
| 4014 input += 16; | 4015 input += 16; |
| 4015 outptr += 16; | 4016 outptr += 16; |
| 4016 } | 4017 } |
| 4017 } | 4018 } |
| 4018 | 4019 |
| 4019 if (optimised_cols) { | 4020 if (optimised_cols) { |
| 4020 idct16_sse2(inptr, inptr + 16); | 4021 idct16_sse2(inptr, inptr + 16); |
| 4021 | 4022 |
| 4022 // Final round & shift and Reconstruction and Store | 4023 // Final round & shift and Reconstruction and Store |
| 4023 { | 4024 { |
| (...skipping 11 matching lines...) |
| 4035 _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); | 4036 _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); |
| 4036 _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); | 4037 _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); |
| 4037 } | 4038 } |
| 4038 } | 4039 } |
| 4039 } else { | 4040 } else { |
| 4040 // Run the un-optimised column transform | 4041 // Run the un-optimised column transform |
| 4041 tran_low_t temp_in[16], temp_out[16]; | 4042 tran_low_t temp_in[16], temp_out[16]; |
| 4042 for (i = 0; i < 16; ++i) { | 4043 for (i = 0; i < 16; ++i) { |
| 4043 for (j = 0; j < 16; ++j) | 4044 for (j = 0; j < 16; ++j) |
| 4044 temp_in[j] = out[j * 16 + i]; | 4045 temp_in[j] = out[j * 16 + i]; |
| 4045 vp9_highbd_idct16_c(temp_in, temp_out, bd); | 4046 vpx_highbd_idct16_c(temp_in, temp_out, bd); |
| 4046 for (j = 0; j < 16; ++j) { | 4047 for (j = 0; j < 16; ++j) { |
| 4047 dest[j * stride + i] = highbd_clip_pixel_add( | 4048 dest[j * stride + i] = highbd_clip_pixel_add( |
| 4048 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); | 4049 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); |
| 4049 } | 4050 } |
| 4050 } | 4051 } |
| 4051 } | 4052 } |
| 4052 } | 4053 } |
| 4053 #endif // CONFIG_VP9_HIGHBITDEPTH | 4054 #endif // CONFIG_VP9_HIGHBITDEPTH |