OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 694 matching lines...)
705 in[1] = _mm_slli_epi16(in[1], 2); | 705 in[1] = _mm_slli_epi16(in[1], 2); |
706 in[2] = _mm_slli_epi16(in[2], 2); | 706 in[2] = _mm_slli_epi16(in[2], 2); |
707 in[3] = _mm_slli_epi16(in[3], 2); | 707 in[3] = _mm_slli_epi16(in[3], 2); |
708 in[4] = _mm_slli_epi16(in[4], 2); | 708 in[4] = _mm_slli_epi16(in[4], 2); |
709 in[5] = _mm_slli_epi16(in[5], 2); | 709 in[5] = _mm_slli_epi16(in[5], 2); |
710 in[6] = _mm_slli_epi16(in[6], 2); | 710 in[6] = _mm_slli_epi16(in[6], 2); |
711 in[7] = _mm_slli_epi16(in[7], 2); | 711 in[7] = _mm_slli_epi16(in[7], 2); |
712 } | 712 } |
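This fragment is the tail of the input load for the 8x8 forward transform: each row of eight 16-bit residuals is scaled up by 4 (a left shift of 2) before the transform passes, and the right-shift helper below scales the result back down afterwards. A minimal scalar sketch of the same pre-scale, assuming a plain int16_t block and a hypothetical helper name (prescale_8x8), not code from the patch:

    static void prescale_8x8(int16_t block[8 * 8]) {
      int i;
      for (i = 0; i < 8 * 8; ++i)
        block[i] = (int16_t)(block[i] * 4);  /* same effect as << 2 on each lane */
    }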
713 | 713 |
714 // right shift and rounding | 714 // right shift and rounding |
715 static INLINE void right_shift_8x8(__m128i *res, int const bit) { | 715 static INLINE void right_shift_8x8(__m128i *res, const int bit) { |
716 const __m128i kOne = _mm_set1_epi16(1); | |
717 const int bit_m02 = bit - 2; | |
718 __m128i sign0 = _mm_srai_epi16(res[0], 15); | 716 __m128i sign0 = _mm_srai_epi16(res[0], 15); |
719 __m128i sign1 = _mm_srai_epi16(res[1], 15); | 717 __m128i sign1 = _mm_srai_epi16(res[1], 15); |
720 __m128i sign2 = _mm_srai_epi16(res[2], 15); | 718 __m128i sign2 = _mm_srai_epi16(res[2], 15); |
721 __m128i sign3 = _mm_srai_epi16(res[3], 15); | 719 __m128i sign3 = _mm_srai_epi16(res[3], 15); |
722 __m128i sign4 = _mm_srai_epi16(res[4], 15); | 720 __m128i sign4 = _mm_srai_epi16(res[4], 15); |
723 __m128i sign5 = _mm_srai_epi16(res[5], 15); | 721 __m128i sign5 = _mm_srai_epi16(res[5], 15); |
724 __m128i sign6 = _mm_srai_epi16(res[6], 15); | 722 __m128i sign6 = _mm_srai_epi16(res[6], 15); |
725 __m128i sign7 = _mm_srai_epi16(res[7], 15); | 723 __m128i sign7 = _mm_srai_epi16(res[7], 15); |
726 | 724 |
727 if (bit_m02 >= 0) { | 725 if (bit == 2) { |
728 __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02); | 726 const __m128i const_rounding = _mm_set1_epi16(1); |
729 res[0] = _mm_add_epi16(res[0], k_const_rounding); | 727 res[0] = _mm_add_epi16(res[0], const_rounding); |
730 res[1] = _mm_add_epi16(res[1], k_const_rounding); | 728 res[1] = _mm_add_epi16(res[1], const_rounding); |
731 res[2] = _mm_add_epi16(res[2], k_const_rounding); | 729 res[2] = _mm_add_epi16(res[2], const_rounding); |
732 res[3] = _mm_add_epi16(res[3], k_const_rounding); | 730 res[3] = _mm_add_epi16(res[3], const_rounding); |
733 res[4] = _mm_add_epi16(res[4], k_const_rounding); | 731 res[4] = _mm_add_epi16(res[4], const_rounding); |
734 res[5] = _mm_add_epi16(res[5], k_const_rounding); | 732 res[5] = _mm_add_epi16(res[5], const_rounding); |
735 res[6] = _mm_add_epi16(res[6], k_const_rounding); | 733 res[6] = _mm_add_epi16(res[6], const_rounding); |
736 res[7] = _mm_add_epi16(res[7], k_const_rounding); | 734 res[7] = _mm_add_epi16(res[7], const_rounding); |
737 } | 735 } |
738 | 736 |
739 res[0] = _mm_sub_epi16(res[0], sign0); | 737 res[0] = _mm_sub_epi16(res[0], sign0); |
740 res[1] = _mm_sub_epi16(res[1], sign1); | 738 res[1] = _mm_sub_epi16(res[1], sign1); |
741 res[2] = _mm_sub_epi16(res[2], sign2); | 739 res[2] = _mm_sub_epi16(res[2], sign2); |
742 res[3] = _mm_sub_epi16(res[3], sign3); | 740 res[3] = _mm_sub_epi16(res[3], sign3); |
743 res[4] = _mm_sub_epi16(res[4], sign4); | 741 res[4] = _mm_sub_epi16(res[4], sign4); |
744 res[5] = _mm_sub_epi16(res[5], sign5); | 742 res[5] = _mm_sub_epi16(res[5], sign5); |
745 res[6] = _mm_sub_epi16(res[6], sign6); | 743 res[6] = _mm_sub_epi16(res[6], sign6); |
746 res[7] = _mm_sub_epi16(res[7], sign7); | 744 res[7] = _mm_sub_epi16(res[7], sign7); |
747 | 745 |
748 res[0] = _mm_srai_epi16(res[0], bit); | 746 if (bit == 1) { |
749 res[1] = _mm_srai_epi16(res[1], bit); | 747 res[0] = _mm_srai_epi16(res[0], 1); |
750 res[2] = _mm_srai_epi16(res[2], bit); | 748 res[1] = _mm_srai_epi16(res[1], 1); |
751 res[3] = _mm_srai_epi16(res[3], bit); | 749 res[2] = _mm_srai_epi16(res[2], 1); |
752 res[4] = _mm_srai_epi16(res[4], bit); | 750 res[3] = _mm_srai_epi16(res[3], 1); |
753 res[5] = _mm_srai_epi16(res[5], bit); | 751 res[4] = _mm_srai_epi16(res[4], 1); |
754 res[6] = _mm_srai_epi16(res[6], bit); | 752 res[5] = _mm_srai_epi16(res[5], 1); |
755 res[7] = _mm_srai_epi16(res[7], bit); | 753 res[6] = _mm_srai_epi16(res[6], 1); |
| 754 res[7] = _mm_srai_epi16(res[7], 1); |
| 755 } else { |
| 756 res[0] = _mm_srai_epi16(res[0], 2); |
| 757 res[1] = _mm_srai_epi16(res[1], 2); |
| 758 res[2] = _mm_srai_epi16(res[2], 2); |
| 759 res[3] = _mm_srai_epi16(res[3], 2); |
| 760 res[4] = _mm_srai_epi16(res[4], 2); |
| 761 res[5] = _mm_srai_epi16(res[5], 2); |
| 762 res[6] = _mm_srai_epi16(res[6], 2); |
| 763 res[7] = _mm_srai_epi16(res[7], 2); |
| 764 } |
756 } | 765 } |
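For reference, the two specialized branches compute the same per-lane arithmetic as the old generic path: sign is -1 for negative lanes and 0 otherwise, so the result is (x + (x < 0)) >> 1 when bit == 1 and (x + 1 + (x < 0)) >> 2 when bit == 2. A minimal scalar sketch of one lane, using a hypothetical helper name (round_shift_lane), not part of the patch:

    static int16_t round_shift_lane(int16_t x, int bit) {
      const int16_t sign = (int16_t)(x >> 15);  /* -1 if x < 0, else 0 */
      int16_t v = x;
      if (bit == 2) v = (int16_t)(v + 1);       /* rounding term added only for bit == 2 */
      v = (int16_t)(v - sign);                  /* adds 1 for negative inputs */
      return (int16_t)(v >> bit);               /* arithmetic shift, like _mm_srai_epi16 */
    }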
757 | 766 |
758 // write 8x8 array | 767 // write 8x8 array |
759 static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, | 768 static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, |
760 int stride) { | 769 int stride) { |
761 store_output(&res[0], (output + 0 * stride)); | 770 store_output(&res[0], (output + 0 * stride)); |
762 store_output(&res[1], (output + 1 * stride)); | 771 store_output(&res[1], (output + 1 * stride)); |
763 store_output(&res[2], (output + 2 * stride)); | 772 store_output(&res[2], (output + 2 * stride)); |
764 store_output(&res[3], (output + 3 * stride)); | 773 store_output(&res[3], (output + 3 * stride)); |
765 store_output(&res[4], (output + 4 * stride)); | 774 store_output(&res[4], (output + 4 * stride)); |
(...skipping 1641 matching lines...)
2407 | 2416 |
2408 #define FDCT32x32_2D vp9_highbd_fdct32x32_sse2 | 2417 #define FDCT32x32_2D vp9_highbd_fdct32x32_sse2 |
2409 #define FDCT32x32_HIGH_PRECISION 1 | 2418 #define FDCT32x32_HIGH_PRECISION 1 |
2410 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT | 2419 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT |
2411 #undef FDCT32x32_2D | 2420 #undef FDCT32x32_2D |
2412 #undef FDCT32x32_HIGH_PRECISION | 2421 #undef FDCT32x32_HIGH_PRECISION |
2413 | 2422 |
2414 #undef DCT_HIGH_BIT_DEPTH | 2423 #undef DCT_HIGH_BIT_DEPTH |
2415 | 2424 |
2416 #endif // CONFIG_VP9_HIGHBITDEPTH | 2425 #endif // CONFIG_VP9_HIGHBITDEPTH |
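The #define/#include block above treats vp9_dct32x32_sse2.c as a macro-parameterized template: the included body apparently takes its function name from FDCT32x32_2D and selects code paths with FDCT32x32_HIGH_PRECISION, so the same source can be compiled under different names and precision settings. A standalone sketch of that pattern with hypothetical names (template_body.inc, MY_FN, MY_HIGH_PRECISION), not the actual libvpx files:

    /* template_body.inc -- body compiled under the name given by MY_FN */
    void MY_FN(const int *in, int *out, int n) {
      int i;
      for (i = 0; i < n; ++i) {
    #if MY_HIGH_PRECISION
        out[i] = (in[i] + 1) >> 1;   /* rounded halving in the high-precision build */
    #else
        out[i] = in[i] >> 1;         /* truncating halving otherwise */
    #endif
      }
    }

    /* caller.c -- instantiate the same body twice under two names */
    #define MY_FN scale_lp
    #define MY_HIGH_PRECISION 0
    #include "template_body.inc"
    #undef MY_FN
    #undef MY_HIGH_PRECISION

    #define MY_FN scale_hp
    #define MY_HIGH_PRECISION 1
    #include "template_body.inc"
    #undef MY_FN
    #undef MY_HIGH_PRECISION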