| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 694 matching lines...) | (...skipping 694 matching lines...) |
| 705 in[1] = _mm_slli_epi16(in[1], 2); | 705 in[1] = _mm_slli_epi16(in[1], 2); |
| 706 in[2] = _mm_slli_epi16(in[2], 2); | 706 in[2] = _mm_slli_epi16(in[2], 2); |
| 707 in[3] = _mm_slli_epi16(in[3], 2); | 707 in[3] = _mm_slli_epi16(in[3], 2); |
| 708 in[4] = _mm_slli_epi16(in[4], 2); | 708 in[4] = _mm_slli_epi16(in[4], 2); |
| 709 in[5] = _mm_slli_epi16(in[5], 2); | 709 in[5] = _mm_slli_epi16(in[5], 2); |
| 710 in[6] = _mm_slli_epi16(in[6], 2); | 710 in[6] = _mm_slli_epi16(in[6], 2); |
| 711 in[7] = _mm_slli_epi16(in[7], 2); | 711 in[7] = _mm_slli_epi16(in[7], 2); |
| 712 } | 712 } |
| 713 | 713 |
| 714 // right shift and rounding | 714 // right shift and rounding |
| 715 static INLINE void right_shift_8x8(__m128i *res, int const bit) { | 715 static INLINE void right_shift_8x8(__m128i *res, const int bit) { |
| 716 const __m128i kOne = _mm_set1_epi16(1); | |
| 717 const int bit_m02 = bit - 2; | |
| 718 __m128i sign0 = _mm_srai_epi16(res[0], 15); | 716 __m128i sign0 = _mm_srai_epi16(res[0], 15); |
| 719 __m128i sign1 = _mm_srai_epi16(res[1], 15); | 717 __m128i sign1 = _mm_srai_epi16(res[1], 15); |
| 720 __m128i sign2 = _mm_srai_epi16(res[2], 15); | 718 __m128i sign2 = _mm_srai_epi16(res[2], 15); |
| 721 __m128i sign3 = _mm_srai_epi16(res[3], 15); | 719 __m128i sign3 = _mm_srai_epi16(res[3], 15); |
| 722 __m128i sign4 = _mm_srai_epi16(res[4], 15); | 720 __m128i sign4 = _mm_srai_epi16(res[4], 15); |
| 723 __m128i sign5 = _mm_srai_epi16(res[5], 15); | 721 __m128i sign5 = _mm_srai_epi16(res[5], 15); |
| 724 __m128i sign6 = _mm_srai_epi16(res[6], 15); | 722 __m128i sign6 = _mm_srai_epi16(res[6], 15); |
| 725 __m128i sign7 = _mm_srai_epi16(res[7], 15); | 723 __m128i sign7 = _mm_srai_epi16(res[7], 15); |
| 726 | 724 |
| 727 if (bit_m02 >= 0) { | 725 if (bit == 2) { |
| 728 __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02); | 726 const __m128i const_rounding = _mm_set1_epi16(1); |
| 729 res[0] = _mm_add_epi16(res[0], k_const_rounding); | 727 res[0] = _mm_add_epi16(res[0], const_rounding); |
| 730 res[1] = _mm_add_epi16(res[1], k_const_rounding); | 728 res[1] = _mm_add_epi16(res[1], const_rounding); |
| 731 res[2] = _mm_add_epi16(res[2], k_const_rounding); | 729 res[2] = _mm_add_epi16(res[2], const_rounding); |
| 732 res[3] = _mm_add_epi16(res[3], k_const_rounding); | 730 res[3] = _mm_add_epi16(res[3], const_rounding); |
| 733 res[4] = _mm_add_epi16(res[4], k_const_rounding); | 731 res[4] = _mm_add_epi16(res[4], const_rounding); |
| 734 res[5] = _mm_add_epi16(res[5], k_const_rounding); | 732 res[5] = _mm_add_epi16(res[5], const_rounding); |
| 735 res[6] = _mm_add_epi16(res[6], k_const_rounding); | 733 res[6] = _mm_add_epi16(res[6], const_rounding); |
| 736 res[7] = _mm_add_epi16(res[7], k_const_rounding); | 734 res[7] = _mm_add_epi16(res[7], const_rounding); |
| 737 } | 735 } |
| 738 | 736 |
| 739 res[0] = _mm_sub_epi16(res[0], sign0); | 737 res[0] = _mm_sub_epi16(res[0], sign0); |
| 740 res[1] = _mm_sub_epi16(res[1], sign1); | 738 res[1] = _mm_sub_epi16(res[1], sign1); |
| 741 res[2] = _mm_sub_epi16(res[2], sign2); | 739 res[2] = _mm_sub_epi16(res[2], sign2); |
| 742 res[3] = _mm_sub_epi16(res[3], sign3); | 740 res[3] = _mm_sub_epi16(res[3], sign3); |
| 743 res[4] = _mm_sub_epi16(res[4], sign4); | 741 res[4] = _mm_sub_epi16(res[4], sign4); |
| 744 res[5] = _mm_sub_epi16(res[5], sign5); | 742 res[5] = _mm_sub_epi16(res[5], sign5); |
| 745 res[6] = _mm_sub_epi16(res[6], sign6); | 743 res[6] = _mm_sub_epi16(res[6], sign6); |
| 746 res[7] = _mm_sub_epi16(res[7], sign7); | 744 res[7] = _mm_sub_epi16(res[7], sign7); |
| 747 | 745 |
| 748 res[0] = _mm_srai_epi16(res[0], bit); | 746 if (bit == 1) { |
| 749 res[1] = _mm_srai_epi16(res[1], bit); | 747 res[0] = _mm_srai_epi16(res[0], 1); |
| 750 res[2] = _mm_srai_epi16(res[2], bit); | 748 res[1] = _mm_srai_epi16(res[1], 1); |
| 751 res[3] = _mm_srai_epi16(res[3], bit); | 749 res[2] = _mm_srai_epi16(res[2], 1); |
| 752 res[4] = _mm_srai_epi16(res[4], bit); | 750 res[3] = _mm_srai_epi16(res[3], 1); |
| 753 res[5] = _mm_srai_epi16(res[5], bit); | 751 res[4] = _mm_srai_epi16(res[4], 1); |
| 754 res[6] = _mm_srai_epi16(res[6], bit); | 752 res[5] = _mm_srai_epi16(res[5], 1); |
| 755 res[7] = _mm_srai_epi16(res[7], bit); | 753 res[6] = _mm_srai_epi16(res[6], 1); |
| 754 res[7] = _mm_srai_epi16(res[7], 1); |
| 755 } else { |
| 756 res[0] = _mm_srai_epi16(res[0], 2); |
| 757 res[1] = _mm_srai_epi16(res[1], 2); |
| 758 res[2] = _mm_srai_epi16(res[2], 2); |
| 759 res[3] = _mm_srai_epi16(res[3], 2); |
| 760 res[4] = _mm_srai_epi16(res[4], 2); |
| 761 res[5] = _mm_srai_epi16(res[5], 2); |
| 762 res[6] = _mm_srai_epi16(res[6], 2); |
| 763 res[7] = _mm_srai_epi16(res[7], 2); |
| 764 } |
| 756 } | 765 } |
| 757 | 766 |
| 758 // write 8x8 array | 767 // write 8x8 array |
| 759 static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, | 768 static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, |
| 760 int stride) { | 769 int stride) { |
| 761 store_output(&res[0], (output + 0 * stride)); | 770 store_output(&res[0], (output + 0 * stride)); |
| 762 store_output(&res[1], (output + 1 * stride)); | 771 store_output(&res[1], (output + 1 * stride)); |
| 763 store_output(&res[2], (output + 2 * stride)); | 772 store_output(&res[2], (output + 2 * stride)); |
| 764 store_output(&res[3], (output + 3 * stride)); | 773 store_output(&res[3], (output + 3 * stride)); |
| 765 store_output(&res[4], (output + 4 * stride)); | 774 store_output(&res[4], (output + 4 * stride)); |
| (...skipping 1641 matching lines...) | (...skipping 1641 matching lines...) |
| 2407 | 2416 |
| 2408 #define FDCT32x32_2D vp9_highbd_fdct32x32_sse2 | 2417 #define FDCT32x32_2D vp9_highbd_fdct32x32_sse2 |
| 2409 #define FDCT32x32_HIGH_PRECISION 1 | 2418 #define FDCT32x32_HIGH_PRECISION 1 |
| 2410 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT | 2419 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT |
| 2411 #undef FDCT32x32_2D | 2420 #undef FDCT32x32_2D |
| 2412 #undef FDCT32x32_HIGH_PRECISION | 2421 #undef FDCT32x32_HIGH_PRECISION |
| 2413 | 2422 |
| 2414 #undef DCT_HIGH_BIT_DEPTH | 2423 #undef DCT_HIGH_BIT_DEPTH |
| 2415 | 2424 |
| 2416 #endif // CONFIG_VP9_HIGHBITDEPTH | 2425 #endif // CONFIG_VP9_HIGHBITDEPTH |
| OLD | NEW |