| OLD | NEW |
| 1 // Copyright 2012 Google Inc. All Rights Reserved. | 1 // Copyright 2012 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 9 // | 9 // |
| 10 // ARM NEON version of speed-critical encoding functions. | 10 // ARM NEON version of speed-critical encoding functions. |
| 11 // | 11 // |
| 12 // adapted from libvpx (http://www.webmproject.org/code/) | 12 // adapted from libvpx (http://www.webmproject.org/code/) |
| 13 | 13 |
| 14 #include "./dsp.h" | 14 #include "./dsp.h" |
| 15 | 15 |
| 16 #if defined(WEBP_USE_NEON) | 16 #if defined(WEBP_USE_NEON) |
| 17 | 17 |
| 18 #include <assert.h> | 18 #include <assert.h> |
| 19 | 19 |
| 20 #include "./neon.h" | 20 #include "./neon.h" |
| 21 #include "../enc/vp8enci.h" | 21 #include "../enc/vp8i_enc.h" |
| 22 | 22 |
| 23 //------------------------------------------------------------------------------ | 23 //------------------------------------------------------------------------------ |
| 24 // Transforms (Paragraph 14.4) | 24 // Transforms (Paragraph 14.4) |
| 25 | 25 |
| 26 // Inverse transform. | 26 // Inverse transform. |
| 27 // This code is pretty much the same as TransformOne in the dec_neon.c, except | 27 // This code is pretty much the same as TransformOne in the dec_neon.c, except |
| 28 // for subtraction to *ref. See the comments there for algorithmic explanations. | 28 // for subtraction to *ref. See the comments there for algorithmic explanations. |
| 29 | 29 |
| 30 static const int16_t kC1 = 20091; | 30 static const int16_t kC1 = 20091; |
| 31 static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. | 31 static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. |
| (...skipping 707 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 739 } | 739 } |
| 740 | 740 |
| 741 //------------------------------------------------------------------------------ | 741 //------------------------------------------------------------------------------ |
| 742 | 742 |
| 743 static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a, | 743 static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a, |
| 744 const uint8_t* const b, | 744 const uint8_t* const b, |
| 745 uint32x4_t* const sum) { | 745 uint32x4_t* const sum) { |
| 746 const uint8x16_t a0 = vld1q_u8(a); | 746 const uint8x16_t a0 = vld1q_u8(a); |
| 747 const uint8x16_t b0 = vld1q_u8(b); | 747 const uint8x16_t b0 = vld1q_u8(b); |
| 748 const uint8x16_t abs_diff = vabdq_u8(a0, b0); | 748 const uint8x16_t abs_diff = vabdq_u8(a0, b0); |
| 749 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); | 749 const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff), |
| 750 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff)); | 750 vget_low_u8(abs_diff)); |
| 751 *sum = vpadalq_u16(*sum, prod); // pair-wise add and accumulate | 751 const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff), |
| 752 vget_high_u8(abs_diff)); |
| 753 /* pair-wise adds and widen */ |
| 754 const uint32x4_t sum1 = vpaddlq_u16(prod1); |
| 755 const uint32x4_t sum2 = vpaddlq_u16(prod2); |
| 756 *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2)); |
| 752 } | 757 } |
| 753 | 758 |
| 754 // Horizontal sum of all four uint32_t values in 'sum'. | 759 // Horizontal sum of all four uint32_t values in 'sum'. |
| 755 static int SumToInt(uint32x4_t sum) { | 760 static int SumToInt(uint32x4_t sum) { |
| 756 const uint64x2_t sum2 = vpaddlq_u32(sum); | 761 const uint64x2_t sum2 = vpaddlq_u32(sum); |
| 757 const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1); | 762 const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1); |
| 758 return (int)sum3; | 763 return (int)sum3; |
| 759 } | 764 } |
| 760 | 765 |
| 761 static int SSE16x16(const uint8_t* a, const uint8_t* b) { | 766 static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) { |
| 762 uint32x4_t sum = vdupq_n_u32(0); | 767 uint32x4_t sum = vdupq_n_u32(0); |
| 763 int y; | 768 int y; |
| 764 for (y = 0; y < 16; ++y) { | 769 for (y = 0; y < 16; ++y) { |
| 765 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); | 770 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); |
| 766 } | 771 } |
| 767 return SumToInt(sum); | 772 return SumToInt(sum); |
| 768 } | 773 } |
| 769 | 774 |
| 770 static int SSE16x8(const uint8_t* a, const uint8_t* b) { | 775 static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) { |
| 771 uint32x4_t sum = vdupq_n_u32(0); | 776 uint32x4_t sum = vdupq_n_u32(0); |
| 772 int y; | 777 int y; |
| 773 for (y = 0; y < 8; ++y) { | 778 for (y = 0; y < 8; ++y) { |
| 774 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); | 779 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); |
| 775 } | 780 } |
| 776 return SumToInt(sum); | 781 return SumToInt(sum); |
| 777 } | 782 } |
| 778 | 783 |
| 779 static int SSE8x8(const uint8_t* a, const uint8_t* b) { | 784 static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) { |
| 780 uint32x4_t sum = vdupq_n_u32(0); | 785 uint32x4_t sum = vdupq_n_u32(0); |
| 781 int y; | 786 int y; |
| 782 for (y = 0; y < 8; ++y) { | 787 for (y = 0; y < 8; ++y) { |
| 783 const uint8x8_t a0 = vld1_u8(a + y * BPS); | 788 const uint8x8_t a0 = vld1_u8(a + y * BPS); |
| 784 const uint8x8_t b0 = vld1_u8(b + y * BPS); | 789 const uint8x8_t b0 = vld1_u8(b + y * BPS); |
| 785 const uint8x8_t abs_diff = vabd_u8(a0, b0); | 790 const uint8x8_t abs_diff = vabd_u8(a0, b0); |
| 786 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff); | 791 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff); |
| 787 sum = vpadalq_u16(sum, prod); | 792 sum = vpadalq_u16(sum, prod); |
| 788 } | 793 } |
| 789 return SumToInt(sum); | 794 return SumToInt(sum); |
| 790 } | 795 } |
| 791 | 796 |
| 792 static int SSE4x4(const uint8_t* a, const uint8_t* b) { | 797 static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) { |
| 793 const uint8x16_t a0 = Load4x4(a); | 798 const uint8x16_t a0 = Load4x4(a); |
| 794 const uint8x16_t b0 = Load4x4(b); | 799 const uint8x16_t b0 = Load4x4(b); |
| 795 const uint8x16_t abs_diff = vabdq_u8(a0, b0); | 800 const uint8x16_t abs_diff = vabdq_u8(a0, b0); |
| 796 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); | 801 const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff), |
| 797 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff)); | 802 vget_low_u8(abs_diff)); |
| 798 return SumToInt(vpaddlq_u16(prod)); | 803 const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff), |
| 804 vget_high_u8(abs_diff)); |
| 805 /* pair-wise adds and widen */ |
| 806 const uint32x4_t sum1 = vpaddlq_u16(prod1); |
| 807 const uint32x4_t sum2 = vpaddlq_u16(prod2); |
| 808 return SumToInt(vaddq_u32(sum1, sum2)); |
| 799 } | 809 } |
| 800 | 810 |
| 801 //------------------------------------------------------------------------------ | 811 //------------------------------------------------------------------------------ |
| 802 | 812 |
| 803 // Compilation with gcc-4.6.x is problematic for now. | 813 // Compilation with gcc-4.6.x is problematic for now. |
| 804 #if !defined(WORK_AROUND_GCC) | 814 #if !defined(WORK_AROUND_GCC) |
| 805 | 815 |
| 806 static int16x8_t Quantize(int16_t* const in, | 816 static int16x8_t Quantize(int16_t* const in, |
| 807 const VP8Matrix* const mtx, int offset) { | 817 const VP8Matrix* const mtx, int offset) { |
| 808 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); | 818 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); |
| (...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 896 | 906 |
| 897 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) { | 907 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) { |
| 898 VP8ITransform = ITransform; | 908 VP8ITransform = ITransform; |
| 899 VP8FTransform = FTransform; | 909 VP8FTransform = FTransform; |
| 900 | 910 |
| 901 VP8FTransformWHT = FTransformWHT; | 911 VP8FTransformWHT = FTransformWHT; |
| 902 | 912 |
| 903 VP8TDisto4x4 = Disto4x4; | 913 VP8TDisto4x4 = Disto4x4; |
| 904 VP8TDisto16x16 = Disto16x16; | 914 VP8TDisto16x16 = Disto16x16; |
| 905 VP8CollectHistogram = CollectHistogram; | 915 VP8CollectHistogram = CollectHistogram; |
| 906 VP8SSE16x16 = SSE16x16; | 916 |
| 907 VP8SSE16x8 = SSE16x8; | 917 VP8SSE16x16 = SSE16x16_NEON; |
| 908 VP8SSE8x8 = SSE8x8; | 918 VP8SSE16x8 = SSE16x8_NEON; |
| 909 VP8SSE4x4 = SSE4x4; | 919 VP8SSE8x8 = SSE8x8_NEON; |
| 920 VP8SSE4x4 = SSE4x4_NEON; |
| 921 |
| 910 #if !defined(WORK_AROUND_GCC) | 922 #if !defined(WORK_AROUND_GCC) |
| 911 VP8EncQuantizeBlock = QuantizeBlock; | 923 VP8EncQuantizeBlock = QuantizeBlock; |
| 912 VP8EncQuantize2Blocks = Quantize2Blocks; | 924 VP8EncQuantize2Blocks = Quantize2Blocks; |
| 913 #endif | 925 #endif |
| 914 } | 926 } |
| 915 | 927 |
| 916 #else // !WEBP_USE_NEON | 928 #else // !WEBP_USE_NEON |
| 917 | 929 |
| 918 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON) | 930 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON) |
| 919 | 931 |
| 920 #endif // WEBP_USE_NEON | 932 #endif // WEBP_USE_NEON |
| OLD | NEW |