third_party/libwebp/dsp/enc_neon.c - Issue 653803003: libwebp: update to 0.4.2-rc2

Side by Side Diff: third_party/libwebp/dsp/enc_neon.c

Issue 653803003: libwebp: update to 0.4.2-rc2 (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 Google Inc. All Rights Reserved.	1 // Copyright 2012 Google Inc. All Rights Reserved.

2 //	2 //

3 // Use of this source code is governed by a BSD-style license	3 // Use of this source code is governed by a BSD-style license

4 // that can be found in the COPYING file in the root of the source	4 // that can be found in the COPYING file in the root of the source

5 // tree. An additional intellectual property rights grant can be found	5 // tree. An additional intellectual property rights grant can be found

6 // in the file PATENTS. All contributing project authors may	6 // in the file PATENTS. All contributing project authors may

7 // be found in the AUTHORS file in the root of the source tree.	7 // be found in the AUTHORS file in the root of the source tree.

8 // -----------------------------------------------------------------------------	8 // -----------------------------------------------------------------------------

9 //	9 //

10 // ARM NEON version of speed-critical encoding functions.	10 // ARM NEON version of speed-critical encoding functions.

(...skipping 235 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
246 static void ITransform(const uint8_t* ref,	246 static void ITransform(const uint8_t* ref,

247 const int16_t* in, uint8_t* dst, int do_two) {	247 const int16_t* in, uint8_t* dst, int do_two) {

248 ITransformOne(ref, in, dst);	248 ITransformOne(ref, in, dst);

249 if (do_two) {	249 if (do_two) {

250 ITransformOne(ref + 4, in + 16, dst + 4);	250 ITransformOne(ref + 4, in + 16, dst + 4);

251 }	251 }

252 }	252 }

253	253

254 // Load all 4x4 pixels into a single uint8x16_t variable.	254 // Load all 4x4 pixels into a single uint8x16_t variable.

255 static uint8x16_t Load4x4(const uint8_t* src) {	255 static uint8x16_t Load4x4(const uint8_t* src) {

256 uint32x4_t out = { 0, 0, 0, 0 };	256 uint32x4_t out = vdupq_n_u32(0);

257 out = vld1q_lane_u32((const uint32_t)(src + 0 BPS), out, 0);	257 out = vld1q_lane_u32((const uint32_t)(src + 0 BPS), out, 0);

258 out = vld1q_lane_u32((const uint32_t)(src + 1 BPS), out, 1);	258 out = vld1q_lane_u32((const uint32_t)(src + 1 BPS), out, 1);

259 out = vld1q_lane_u32((const uint32_t)(src + 2 BPS), out, 2);	259 out = vld1q_lane_u32((const uint32_t)(src + 2 BPS), out, 2);

260 out = vld1q_lane_u32((const uint32_t)(src + 3 BPS), out, 3);	260 out = vld1q_lane_u32((const uint32_t)(src + 3 BPS), out, 3);

261 return vreinterpretq_u8_u32(out);	261 return vreinterpretq_u8_u32(out);

262 }	262 }

263	263

264 // Forward transform.	264 // Forward transform.

265	265

266 #if defined(USE_INTRINSICS)	266 #if defined(USE_INTRINSICS)

(...skipping 655 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
922 }	922 }

923	923

924 // Horizontal sum of all four uint32_t values in 'sum'.	924 // Horizontal sum of all four uint32_t values in 'sum'.

925 static int SumToInt(uint32x4_t sum) {	925 static int SumToInt(uint32x4_t sum) {

926 const uint64x2_t sum2 = vpaddlq_u32(sum);	926 const uint64x2_t sum2 = vpaddlq_u32(sum);

927 const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);	927 const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);

928 return (int)sum3;	928 return (int)sum3;

929 }	929 }

930	930

931 static int SSE16x16(const uint8_t* a, const uint8_t* b) {	931 static int SSE16x16(const uint8_t* a, const uint8_t* b) {

932 uint32x4_t sum = { 0, 0, 0, 0 };	932 uint32x4_t sum = vdupq_n_u32(0);

933 int y;	933 int y;

934 for (y = 0; y < 16; ++y) {	934 for (y = 0; y < 16; ++y) {

935 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);	935 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);

936 }	936 }

937 return SumToInt(sum);	937 return SumToInt(sum);

938 }	938 }

939	939

940 static int SSE16x8(const uint8_t* a, const uint8_t* b) {	940 static int SSE16x8(const uint8_t* a, const uint8_t* b) {

941 uint32x4_t sum = { 0, 0, 0, 0 };	941 uint32x4_t sum = vdupq_n_u32(0);

942 int y;	942 int y;

943 for (y = 0; y < 8; ++y) {	943 for (y = 0; y < 8; ++y) {

944 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);	944 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);

945 }	945 }

946 return SumToInt(sum);	946 return SumToInt(sum);

947 }	947 }

948	948

949 static int SSE8x8(const uint8_t* a, const uint8_t* b) {	949 static int SSE8x8(const uint8_t* a, const uint8_t* b) {

950 uint32x4_t sum = { 0, 0, 0, 0 };	950 uint32x4_t sum = vdupq_n_u32(0);

951 int y;	951 int y;

952 for (y = 0; y < 8; ++y) {	952 for (y = 0; y < 8; ++y) {

953 const uint8x8_t a0 = vld1_u8(a + y * BPS);	953 const uint8x8_t a0 = vld1_u8(a + y * BPS);

954 const uint8x8_t b0 = vld1_u8(b + y * BPS);	954 const uint8x8_t b0 = vld1_u8(b + y * BPS);

955 const uint8x8_t abs_diff = vabd_u8(a0, b0);	955 const uint8x8_t abs_diff = vabd_u8(a0, b0);

956 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);	956 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);

957 sum = vpadalq_u16(sum, prod);	957 sum = vpadalq_u16(sum, prod);

958 }	958 }

959 return SumToInt(sum);	959 return SumToInt(sum);

960 }	960 }

961	961

962 static int SSE4x4(const uint8_t* a, const uint8_t* b) {	962 static int SSE4x4(const uint8_t* a, const uint8_t* b) {

963 const uint8x16_t a0 = Load4x4(a);	963 const uint8x16_t a0 = Load4x4(a);

964 const uint8x16_t b0 = Load4x4(b);	964 const uint8x16_t b0 = Load4x4(b);

965 const uint8x16_t abs_diff = vabdq_u8(a0, b0);	965 const uint8x16_t abs_diff = vabdq_u8(a0, b0);

966 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));	966 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));

967 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));	967 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));

968 return SumToInt(vpaddlq_u16(prod));	968 return SumToInt(vpaddlq_u16(prod));

969 }	969 }

970	970

971 //------------------------------------------------------------------------------	971 //------------------------------------------------------------------------------

972	972

973 // Compilation with gcc-4.6.x is problematic for now and vtbl? are unavailable	973 // Compilation with gcc-4.6.x is problematic for now.

974 // in iOS/arm64 builds. Disable this function in those cases.	974 #if !defined(WORK_AROUND_GCC)

975 #if !(defined(WORK_AROUND_GCC) \|\| defined(__aarch64__))

976	975

977 static int16x8_t Quantize(int16_t* const in,	976 static int16x8_t Quantize(int16_t* const in,

978 const VP8Matrix* const mtx, int offset) {	977 const VP8Matrix* const mtx, int offset) {

979 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);	978 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);

980 const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);	979 const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);

981 const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);	980 const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);

982 const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);	981 const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);

983 const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);	982 const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);

984	983

985 const int16x8_t a = vld1q_s16(in + offset); // in	984 const int16x8_t a = vld1q_s16(in + offset); // in

986 const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in)	985 const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in)

987 const int16x8_t sign = vshrq_n_s16(a, 15); // sign	986 const int16x8_t sign = vshrq_n_s16(a, 15); // sign

988 const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen	987 const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen

989 const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));	988 const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));

990 const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));	989 const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));

991 const uint32x4_t m2 = vhaddq_u32(m0, bias0);	990 const uint32x4_t m2 = vhaddq_u32(m0, bias0);

992 const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1	991 const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1

993 const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),	992 const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),

994 vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1	993 vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1

995 const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));	994 const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));

996 const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);	995 const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);

997 const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign	996 const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign

998 const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));	997 const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));

999 vst1q_s16(in + offset, c4);	998 vst1q_s16(in + offset, c4);

1000 assert(QFIX == 17); // this function can't work as is if QFIX != 16+1	999 assert(QFIX == 17); // this function can't work as is if QFIX != 16+1

1001 return c3;	1000 return c3;

1002 }	1001 }

1003	1002

1004 static const uint8_t kShuffles[4][8] = {	1003 static const uint8_t kShuffles[4][8] = {

1005 { 0, 1, 2, 3, 8, 9, 16, 17 },	1004 { 0, 1, 2, 3, 8, 9, 16, 17 },

1006 { 10, 11, 4, 5, 6, 7, 12, 13 },	1005 { 10, 11, 4, 5, 6, 7, 12, 13 },

1007 { 18, 19, 24, 25, 26, 27, 20, 21 },	1006 { 18, 19, 24, 25, 26, 27, 20, 21 },

1008 { 14, 15, 22, 23, 28, 29, 30, 31 }	1007 { 14, 15, 22, 23, 28, 29, 30, 31 }

1009 };	1008 };

1010	1009

1011 static int QuantizeBlock(int16_t in[16], int16_t out[16],	1010 static int QuantizeBlock(int16_t in[16], int16_t out[16],

1012 const VP8Matrix* const mtx) {	1011 const VP8Matrix* const mtx) {

1013 const int16x8_t out0 = Quantize(in, mtx, 0);	1012 const int16x8_t out0 = Quantize(in, mtx, 0);

1014 const int16x8_t out1 = Quantize(in, mtx, 8);	1013 const int16x8_t out1 = Quantize(in, mtx, 8);

	1014 uint8x8x4_t shuffles;

	1015 // vtbl4_u8 is marked unavailable for iOS arm64, use wider versions there.

	1016 #if defined(__APPLE__) && defined(__aarch64__)

	1017 uint8x16x2_t all_out;

	1018 INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));

	1019 INIT_VECTOR4(shuffles,

	1020 vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),

	1021 vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),

	1022 vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),

	1023 vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));

	1024 #else

1015 uint8x8x4_t all_out;	1025 uint8x8x4_t all_out;

1016 INIT_VECTOR4(all_out,	1026 INIT_VECTOR4(all_out,

1017 vreinterpret_u8_s16(vget_low_s16(out0)),	1027 vreinterpret_u8_s16(vget_low_s16(out0)),

1018 vreinterpret_u8_s16(vget_high_s16(out0)),	1028 vreinterpret_u8_s16(vget_high_s16(out0)),

1019 vreinterpret_u8_s16(vget_low_s16(out1)),	1029 vreinterpret_u8_s16(vget_low_s16(out1)),

1020 vreinterpret_u8_s16(vget_high_s16(out1)));	1030 vreinterpret_u8_s16(vget_high_s16(out1)));

	1031 INIT_VECTOR4(shuffles,

	1032 vtbl4_u8(all_out, vld1_u8(kShuffles[0])),

	1033 vtbl4_u8(all_out, vld1_u8(kShuffles[1])),

	1034 vtbl4_u8(all_out, vld1_u8(kShuffles[2])),

	1035 vtbl4_u8(all_out, vld1_u8(kShuffles[3])));

	1036 #endif

1021 // Zigzag reordering	1037 // Zigzag reordering

1022 vst1_u8((uint8_t*)(out + 0), vtbl4_u8(all_out, vld1_u8(kShuffles[0])));	1038 vst1_u8((uint8_t*)(out + 0), shuffles.val[0]);

1023 vst1_u8((uint8_t*)(out + 4), vtbl4_u8(all_out, vld1_u8(kShuffles[1])));	1039 vst1_u8((uint8_t*)(out + 4), shuffles.val[1]);

1024 vst1_u8((uint8_t*)(out + 8), vtbl4_u8(all_out, vld1_u8(kShuffles[2])));	1040 vst1_u8((uint8_t*)(out + 8), shuffles.val[2]);

1025 vst1_u8((uint8_t*)(out + 12), vtbl4_u8(all_out, vld1_u8(kShuffles[3])));	1041 vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);

1026 // test zeros	1042 // test zeros

1027 if ((uint64_t)(out + 0) != 0) return 1;	1043 if ((uint64_t)(out + 0) != 0) return 1;

1028 if ((uint64_t)(out + 4) != 0) return 1;	1044 if ((uint64_t)(out + 4) != 0) return 1;

1029 if ((uint64_t)(out + 8) != 0) return 1;	1045 if ((uint64_t)(out + 8) != 0) return 1;

1030 if ((uint64_t)(out + 12) != 0) return 1;	1046 if ((uint64_t)(out + 12) != 0) return 1;

1031 return 0;	1047 return 0;

1032 }	1048 }

1033	1049

1034 #endif // !WORK_AROUND_GCC && !__aarch64__	1050 #endif // !WORK_AROUND_GCC

1035	1051

1036 #endif // WEBP_USE_NEON	1052 #endif // WEBP_USE_NEON

1037	1053

1038 //------------------------------------------------------------------------------	1054 //------------------------------------------------------------------------------

1039 // Entry point	1055 // Entry point

1040	1056

1041 extern void VP8EncDspInitNEON(void);	1057 extern void VP8EncDspInitNEON(void);

1042	1058

1043 void VP8EncDspInitNEON(void) {	1059 void VP8EncDspInitNEON(void) {

1044 #if defined(WEBP_USE_NEON)	1060 #if defined(WEBP_USE_NEON)

1045 VP8ITransform = ITransform;	1061 VP8ITransform = ITransform;

1046 VP8FTransform = FTransform;	1062 VP8FTransform = FTransform;

1047	1063

1048 VP8FTransformWHT = FTransformWHT;	1064 VP8FTransformWHT = FTransformWHT;

1049	1065

1050 VP8TDisto4x4 = Disto4x4;	1066 VP8TDisto4x4 = Disto4x4;

1051 VP8TDisto16x16 = Disto16x16;	1067 VP8TDisto16x16 = Disto16x16;

1052 VP8CollectHistogram = CollectHistogram;	1068 VP8CollectHistogram = CollectHistogram;

1053 VP8SSE16x16 = SSE16x16;	1069 VP8SSE16x16 = SSE16x16;

1054 VP8SSE16x8 = SSE16x8;	1070 VP8SSE16x8 = SSE16x8;

1055 VP8SSE8x8 = SSE8x8;	1071 VP8SSE8x8 = SSE8x8;

1056 VP8SSE4x4 = SSE4x4;	1072 VP8SSE4x4 = SSE4x4;

1057 #if !(defined(WORK_AROUND_GCC) \|\| defined(__aarch64__))	1073 #if !defined(WORK_AROUND_GCC)

1058 VP8EncQuantizeBlock = QuantizeBlock;	1074 VP8EncQuantizeBlock = QuantizeBlock;

1059 #endif	1075 #endif

1060 #endif // WEBP_USE_NEON	1076 #endif // WEBP_USE_NEON

1061 }	1077 }

OLD	NEW

« no previous file with comments | « third_party/libwebp/dsp/dsp.h ('k') | third_party/libwebp/dsp/lossless.h » ('j') | no next file with comments »