Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(556)

Side by Side Diff: third_party/libwebp/dsp/enc_neon.c

Issue 2651883004: libwebp-0.6.0-rc1 (Closed)
Patch Set: Created 3 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/libwebp/dsp/enc_msa.c ('k') | third_party/libwebp/dsp/enc_sse2.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 Google Inc. All Rights Reserved. 1 // Copyright 2012 Google Inc. All Rights Reserved.
2 // 2 //
3 // Use of this source code is governed by a BSD-style license 3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source 4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found 5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may 6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree. 7 // be found in the AUTHORS file in the root of the source tree.
8 // ----------------------------------------------------------------------------- 8 // -----------------------------------------------------------------------------
9 // 9 //
10 // ARM NEON version of speed-critical encoding functions. 10 // ARM NEON version of speed-critical encoding functions.
11 // 11 //
12 // adapted from libvpx (http://www.webmproject.org/code/) 12 // adapted from libvpx (http://www.webmproject.org/code/)
13 13
14 #include "./dsp.h" 14 #include "./dsp.h"
15 15
16 #if defined(WEBP_USE_NEON) 16 #if defined(WEBP_USE_NEON)
17 17
18 #include <assert.h> 18 #include <assert.h>
19 19
20 #include "./neon.h" 20 #include "./neon.h"
21 #include "../enc/vp8enci.h" 21 #include "../enc/vp8i_enc.h"
22 22
23 //------------------------------------------------------------------------------ 23 //------------------------------------------------------------------------------
24 // Transforms (Paragraph 14.4) 24 // Transforms (Paragraph 14.4)
25 25
26 // Inverse transform. 26 // Inverse transform.
27 // This code is pretty much the same as TransformOne in the dec_neon.c, except 27 // This code is pretty much the same as TransformOne in the dec_neon.c, except
28 // for subtraction to *ref. See the comments there for algorithmic explanations. 28 // for subtraction to *ref. See the comments there for algorithmic explanations.
29 29
30 static const int16_t kC1 = 20091; 30 static const int16_t kC1 = 20091;
31 static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. 31 static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
(...skipping 707 matching lines...) Expand 10 before | Expand all | Expand 10 after
739 } 739 }
740 740
741 //------------------------------------------------------------------------------ 741 //------------------------------------------------------------------------------
742 742
743 static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a, 743 static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
744 const uint8_t* const b, 744 const uint8_t* const b,
745 uint32x4_t* const sum) { 745 uint32x4_t* const sum) {
746 const uint8x16_t a0 = vld1q_u8(a); 746 const uint8x16_t a0 = vld1q_u8(a);
747 const uint8x16_t b0 = vld1q_u8(b); 747 const uint8x16_t b0 = vld1q_u8(b);
748 const uint8x16_t abs_diff = vabdq_u8(a0, b0); 748 const uint8x16_t abs_diff = vabdq_u8(a0, b0);
749 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); 749 const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
750 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff)); 750 vget_low_u8(abs_diff));
751 *sum = vpadalq_u16(*sum, prod); // pair-wise add and accumulate 751 const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
752 vget_high_u8(abs_diff));
753 /* pair-wise adds and widen */
754 const uint32x4_t sum1 = vpaddlq_u16(prod1);
755 const uint32x4_t sum2 = vpaddlq_u16(prod2);
756 *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
752 } 757 }
753 758
754 // Horizontal sum of all four uint32_t values in 'sum'. 759 // Horizontal sum of all four uint32_t values in 'sum'.
755 static int SumToInt(uint32x4_t sum) { 760 static int SumToInt(uint32x4_t sum) {
756 const uint64x2_t sum2 = vpaddlq_u32(sum); 761 const uint64x2_t sum2 = vpaddlq_u32(sum);
757 const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1); 762 const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
758 return (int)sum3; 763 return (int)sum3;
759 } 764 }
760 765
761 static int SSE16x16(const uint8_t* a, const uint8_t* b) { 766 static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
762 uint32x4_t sum = vdupq_n_u32(0); 767 uint32x4_t sum = vdupq_n_u32(0);
763 int y; 768 int y;
764 for (y = 0; y < 16; ++y) { 769 for (y = 0; y < 16; ++y) {
765 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); 770 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
766 } 771 }
767 return SumToInt(sum); 772 return SumToInt(sum);
768 } 773 }
769 774
770 static int SSE16x8(const uint8_t* a, const uint8_t* b) { 775 static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
771 uint32x4_t sum = vdupq_n_u32(0); 776 uint32x4_t sum = vdupq_n_u32(0);
772 int y; 777 int y;
773 for (y = 0; y < 8; ++y) { 778 for (y = 0; y < 8; ++y) {
774 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); 779 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
775 } 780 }
776 return SumToInt(sum); 781 return SumToInt(sum);
777 } 782 }
778 783
779 static int SSE8x8(const uint8_t* a, const uint8_t* b) { 784 static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
780 uint32x4_t sum = vdupq_n_u32(0); 785 uint32x4_t sum = vdupq_n_u32(0);
781 int y; 786 int y;
782 for (y = 0; y < 8; ++y) { 787 for (y = 0; y < 8; ++y) {
783 const uint8x8_t a0 = vld1_u8(a + y * BPS); 788 const uint8x8_t a0 = vld1_u8(a + y * BPS);
784 const uint8x8_t b0 = vld1_u8(b + y * BPS); 789 const uint8x8_t b0 = vld1_u8(b + y * BPS);
785 const uint8x8_t abs_diff = vabd_u8(a0, b0); 790 const uint8x8_t abs_diff = vabd_u8(a0, b0);
786 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff); 791 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
787 sum = vpadalq_u16(sum, prod); 792 sum = vpadalq_u16(sum, prod);
788 } 793 }
789 return SumToInt(sum); 794 return SumToInt(sum);
790 } 795 }
791 796
792 static int SSE4x4(const uint8_t* a, const uint8_t* b) { 797 static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
793 const uint8x16_t a0 = Load4x4(a); 798 const uint8x16_t a0 = Load4x4(a);
794 const uint8x16_t b0 = Load4x4(b); 799 const uint8x16_t b0 = Load4x4(b);
795 const uint8x16_t abs_diff = vabdq_u8(a0, b0); 800 const uint8x16_t abs_diff = vabdq_u8(a0, b0);
796 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); 801 const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
797 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff)); 802 vget_low_u8(abs_diff));
798 return SumToInt(vpaddlq_u16(prod)); 803 const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
804 vget_high_u8(abs_diff));
805 /* pair-wise adds and widen */
806 const uint32x4_t sum1 = vpaddlq_u16(prod1);
807 const uint32x4_t sum2 = vpaddlq_u16(prod2);
808 return SumToInt(vaddq_u32(sum1, sum2));
799 } 809 }
800 810
801 //------------------------------------------------------------------------------ 811 //------------------------------------------------------------------------------
802 812
803 // Compilation with gcc-4.6.x is problematic for now. 813 // Compilation with gcc-4.6.x is problematic for now.
804 #if !defined(WORK_AROUND_GCC) 814 #if !defined(WORK_AROUND_GCC)
805 815
806 static int16x8_t Quantize(int16_t* const in, 816 static int16x8_t Quantize(int16_t* const in,
807 const VP8Matrix* const mtx, int offset) { 817 const VP8Matrix* const mtx, int offset) {
808 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); 818 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after
896 906
897 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) { 907 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
898 VP8ITransform = ITransform; 908 VP8ITransform = ITransform;
899 VP8FTransform = FTransform; 909 VP8FTransform = FTransform;
900 910
901 VP8FTransformWHT = FTransformWHT; 911 VP8FTransformWHT = FTransformWHT;
902 912
903 VP8TDisto4x4 = Disto4x4; 913 VP8TDisto4x4 = Disto4x4;
904 VP8TDisto16x16 = Disto16x16; 914 VP8TDisto16x16 = Disto16x16;
905 VP8CollectHistogram = CollectHistogram; 915 VP8CollectHistogram = CollectHistogram;
906 VP8SSE16x16 = SSE16x16; 916
907 VP8SSE16x8 = SSE16x8; 917 VP8SSE16x16 = SSE16x16_NEON;
908 VP8SSE8x8 = SSE8x8; 918 VP8SSE16x8 = SSE16x8_NEON;
909 VP8SSE4x4 = SSE4x4; 919 VP8SSE8x8 = SSE8x8_NEON;
920 VP8SSE4x4 = SSE4x4_NEON;
921
910 #if !defined(WORK_AROUND_GCC) 922 #if !defined(WORK_AROUND_GCC)
911 VP8EncQuantizeBlock = QuantizeBlock; 923 VP8EncQuantizeBlock = QuantizeBlock;
912 VP8EncQuantize2Blocks = Quantize2Blocks; 924 VP8EncQuantize2Blocks = Quantize2Blocks;
913 #endif 925 #endif
914 } 926 }
915 927
916 #else // !WEBP_USE_NEON 928 #else // !WEBP_USE_NEON
917 929
918 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON) 930 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
919 931
920 #endif // WEBP_USE_NEON 932 #endif // WEBP_USE_NEON
OLDNEW
« no previous file with comments | « third_party/libwebp/dsp/enc_msa.c ('k') | third_party/libwebp/dsp/enc_sse2.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698