third_party/libwebp/dsp/enc_neon.c - Issue 2651883004: libwebp-0.6.0-rc1

Side by Side Diff: third_party/libwebp/dsp/enc_neon.c

Issue 2651883004: libwebp-0.6.0-rc1 (Closed)

Patch Set: Created 3 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2012 Google Inc. All Rights Reserved.	1 // Copyright 2012 Google Inc. All Rights Reserved.

2 //	2 //

3 // Use of this source code is governed by a BSD-style license	3 // Use of this source code is governed by a BSD-style license

4 // that can be found in the COPYING file in the root of the source	4 // that can be found in the COPYING file in the root of the source

5 // tree. An additional intellectual property rights grant can be found	5 // tree. An additional intellectual property rights grant can be found

6 // in the file PATENTS. All contributing project authors may	6 // in the file PATENTS. All contributing project authors may

7 // be found in the AUTHORS file in the root of the source tree.	7 // be found in the AUTHORS file in the root of the source tree.

8 // -----------------------------------------------------------------------------	8 // -----------------------------------------------------------------------------

9 //	9 //

10 // ARM NEON version of speed-critical encoding functions.	10 // ARM NEON version of speed-critical encoding functions.

11 //	11 //

12 // adapted from libvpx (http://www.webmproject.org/code/)	12 // adapted from libvpx (http://www.webmproject.org/code/)

13	13

14 #include "./dsp.h"	14 #include "./dsp.h"

15	15

16 #if defined(WEBP_USE_NEON)	16 #if defined(WEBP_USE_NEON)

17	17

18 #include <assert.h>	18 #include <assert.h>

19	19

20 #include "./neon.h"	20 #include "./neon.h"

21 #include "../enc/vp8enci.h"	21 #include "../enc/vp8i_enc.h"

22	22

23 //------------------------------------------------------------------------------	23 //------------------------------------------------------------------------------

24 // Transforms (Paragraph 14.4)	24 // Transforms (Paragraph 14.4)

25	25

26 // Inverse transform.	26 // Inverse transform.

27 // This code is pretty much the same as TransformOne in the dec_neon.c, except	27 // This code is pretty much the same as TransformOne in the dec_neon.c, except

28 // for subtraction to *ref. See the comments there for algorithmic explanations.	28 // for subtraction to *ref. See the comments there for algorithmic explanations.

29	29

30 static const int16_t kC1 = 20091;	30 static const int16_t kC1 = 20091;

31 static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.	31 static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.

(...skipping 707 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
739 }	739 }

740	740

741 //------------------------------------------------------------------------------	741 //------------------------------------------------------------------------------

742	742

743 static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,	743 static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,

744 const uint8_t* const b,	744 const uint8_t* const b,

745 uint32x4_t* const sum) {	745 uint32x4_t* const sum) {

746 const uint8x16_t a0 = vld1q_u8(a);	746 const uint8x16_t a0 = vld1q_u8(a);

747 const uint8x16_t b0 = vld1q_u8(b);	747 const uint8x16_t b0 = vld1q_u8(b);

748 const uint8x16_t abs_diff = vabdq_u8(a0, b0);	748 const uint8x16_t abs_diff = vabdq_u8(a0, b0);

749 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));	749 const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),

750 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));	750 vget_low_u8(abs_diff));

751 sum = vpadalq_u16(sum, prod); // pair-wise add and accumulate	751 const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),

	752 vget_high_u8(abs_diff));

	753 /* pair-wise adds and widen */

	754 const uint32x4_t sum1 = vpaddlq_u16(prod1);

	755 const uint32x4_t sum2 = vpaddlq_u16(prod2);

	756 sum = vaddq_u32(sum, vaddq_u32(sum1, sum2));

752 }	757 }

753	758

754 // Horizontal sum of all four uint32_t values in 'sum'.	759 // Horizontal sum of all four uint32_t values in 'sum'.

755 static int SumToInt(uint32x4_t sum) {	760 static int SumToInt(uint32x4_t sum) {

756 const uint64x2_t sum2 = vpaddlq_u32(sum);	761 const uint64x2_t sum2 = vpaddlq_u32(sum);

757 const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);	762 const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);

758 return (int)sum3;	763 return (int)sum3;

759 }	764 }

760	765

761 static int SSE16x16(const uint8_t* a, const uint8_t* b) {	766 static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {

762 uint32x4_t sum = vdupq_n_u32(0);	767 uint32x4_t sum = vdupq_n_u32(0);

763 int y;	768 int y;

764 for (y = 0; y < 16; ++y) {	769 for (y = 0; y < 16; ++y) {

765 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);	770 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);

766 }	771 }

767 return SumToInt(sum);	772 return SumToInt(sum);

768 }	773 }

769	774

770 static int SSE16x8(const uint8_t* a, const uint8_t* b) {	775 static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {

771 uint32x4_t sum = vdupq_n_u32(0);	776 uint32x4_t sum = vdupq_n_u32(0);

772 int y;	777 int y;

773 for (y = 0; y < 8; ++y) {	778 for (y = 0; y < 8; ++y) {

774 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);	779 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);

775 }	780 }

776 return SumToInt(sum);	781 return SumToInt(sum);

777 }	782 }

778	783

779 static int SSE8x8(const uint8_t* a, const uint8_t* b) {	784 static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {

780 uint32x4_t sum = vdupq_n_u32(0);	785 uint32x4_t sum = vdupq_n_u32(0);

781 int y;	786 int y;

782 for (y = 0; y < 8; ++y) {	787 for (y = 0; y < 8; ++y) {

783 const uint8x8_t a0 = vld1_u8(a + y * BPS);	788 const uint8x8_t a0 = vld1_u8(a + y * BPS);

784 const uint8x8_t b0 = vld1_u8(b + y * BPS);	789 const uint8x8_t b0 = vld1_u8(b + y * BPS);

785 const uint8x8_t abs_diff = vabd_u8(a0, b0);	790 const uint8x8_t abs_diff = vabd_u8(a0, b0);

786 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);	791 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);

787 sum = vpadalq_u16(sum, prod);	792 sum = vpadalq_u16(sum, prod);

788 }	793 }

789 return SumToInt(sum);	794 return SumToInt(sum);

790 }	795 }

791	796

792 static int SSE4x4(const uint8_t* a, const uint8_t* b) {	797 static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {

793 const uint8x16_t a0 = Load4x4(a);	798 const uint8x16_t a0 = Load4x4(a);

794 const uint8x16_t b0 = Load4x4(b);	799 const uint8x16_t b0 = Load4x4(b);

795 const uint8x16_t abs_diff = vabdq_u8(a0, b0);	800 const uint8x16_t abs_diff = vabdq_u8(a0, b0);

796 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));	801 const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),

797 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));	802 vget_low_u8(abs_diff));

798 return SumToInt(vpaddlq_u16(prod));	803 const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),

	804 vget_high_u8(abs_diff));

	805 /* pair-wise adds and widen */

	806 const uint32x4_t sum1 = vpaddlq_u16(prod1);

	807 const uint32x4_t sum2 = vpaddlq_u16(prod2);

	808 return SumToInt(vaddq_u32(sum1, sum2));

799 }	809 }

800	810

801 //------------------------------------------------------------------------------	811 //------------------------------------------------------------------------------

802	812

803 // Compilation with gcc-4.6.x is problematic for now.	813 // Compilation with gcc-4.6.x is problematic for now.

804 #if !defined(WORK_AROUND_GCC)	814 #if !defined(WORK_AROUND_GCC)

805	815

806 static int16x8_t Quantize(int16_t* const in,	816 static int16x8_t Quantize(int16_t* const in,

807 const VP8Matrix* const mtx, int offset) {	817 const VP8Matrix* const mtx, int offset) {

808 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);	818 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);

(...skipping 87 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
896	906

897 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {	907 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {

898 VP8ITransform = ITransform;	908 VP8ITransform = ITransform;

899 VP8FTransform = FTransform;	909 VP8FTransform = FTransform;

900	910

901 VP8FTransformWHT = FTransformWHT;	911 VP8FTransformWHT = FTransformWHT;

902	912

903 VP8TDisto4x4 = Disto4x4;	913 VP8TDisto4x4 = Disto4x4;

904 VP8TDisto16x16 = Disto16x16;	914 VP8TDisto16x16 = Disto16x16;

905 VP8CollectHistogram = CollectHistogram;	915 VP8CollectHistogram = CollectHistogram;

906 VP8SSE16x16 = SSE16x16;	916

907 VP8SSE16x8 = SSE16x8;	917 VP8SSE16x16 = SSE16x16_NEON;

908 VP8SSE8x8 = SSE8x8;	918 VP8SSE16x8 = SSE16x8_NEON;

909 VP8SSE4x4 = SSE4x4;	919 VP8SSE8x8 = SSE8x8_NEON;

	920 VP8SSE4x4 = SSE4x4_NEON;

	921

910 #if !defined(WORK_AROUND_GCC)	922 #if !defined(WORK_AROUND_GCC)

911 VP8EncQuantizeBlock = QuantizeBlock;	923 VP8EncQuantizeBlock = QuantizeBlock;

912 VP8EncQuantize2Blocks = Quantize2Blocks;	924 VP8EncQuantize2Blocks = Quantize2Blocks;

913 #endif	925 #endif

914 }	926 }

915	927

916 #else // !WEBP_USE_NEON	928 #else // !WEBP_USE_NEON

917	929

918 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)	930 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)

919	931

920 #endif // WEBP_USE_NEON	932 #endif // WEBP_USE_NEON

OLD	NEW

« no previous file with comments | « third_party/libwebp/dsp/enc_msa.c ('k') | third_party/libwebp/dsp/enc_sse2.c » ('j') | no next file with comments »