Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(84)

Side by Side Diff: third_party/libwebp/dsp/enc_neon.c

Issue 653803003: libwebp: update to 0.4.2-rc2 (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/libwebp/dsp/dsp.h ('k') | third_party/libwebp/dsp/lossless.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 Google Inc. All Rights Reserved. 1 // Copyright 2012 Google Inc. All Rights Reserved.
2 // 2 //
3 // Use of this source code is governed by a BSD-style license 3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source 4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found 5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may 6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree. 7 // be found in the AUTHORS file in the root of the source tree.
8 // ----------------------------------------------------------------------------- 8 // -----------------------------------------------------------------------------
9 // 9 //
10 // ARM NEON version of speed-critical encoding functions. 10 // ARM NEON version of speed-critical encoding functions.
(...skipping 235 matching lines...) Expand 10 before | Expand all | Expand 10 after
246 static void ITransform(const uint8_t* ref, 246 static void ITransform(const uint8_t* ref,
247 const int16_t* in, uint8_t* dst, int do_two) { 247 const int16_t* in, uint8_t* dst, int do_two) {
248 ITransformOne(ref, in, dst); 248 ITransformOne(ref, in, dst);
249 if (do_two) { 249 if (do_two) {
250 ITransformOne(ref + 4, in + 16, dst + 4); 250 ITransformOne(ref + 4, in + 16, dst + 4);
251 } 251 }
252 } 252 }
253 253
254 // Load all 4x4 pixels into a single uint8x16_t variable. 254 // Load all 4x4 pixels into a single uint8x16_t variable.
255 static uint8x16_t Load4x4(const uint8_t* src) { 255 static uint8x16_t Load4x4(const uint8_t* src) {
256 uint32x4_t out = { 0, 0, 0, 0 }; 256 uint32x4_t out = vdupq_n_u32(0);
257 out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0); 257 out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
258 out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1); 258 out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
259 out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2); 259 out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
260 out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3); 260 out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
261 return vreinterpretq_u8_u32(out); 261 return vreinterpretq_u8_u32(out);
262 } 262 }
263 263
264 // Forward transform. 264 // Forward transform.
265 265
266 #if defined(USE_INTRINSICS) 266 #if defined(USE_INTRINSICS)
(...skipping 655 matching lines...) Expand 10 before | Expand all | Expand 10 after
922 } 922 }
923 923
924 // Horizontal sum of all four uint32_t values in 'sum'. 924 // Horizontal sum of all four uint32_t values in 'sum'.
925 static int SumToInt(uint32x4_t sum) { 925 static int SumToInt(uint32x4_t sum) {
926 const uint64x2_t sum2 = vpaddlq_u32(sum); 926 const uint64x2_t sum2 = vpaddlq_u32(sum);
927 const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1); 927 const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
928 return (int)sum3; 928 return (int)sum3;
929 } 929 }
930 930
931 static int SSE16x16(const uint8_t* a, const uint8_t* b) { 931 static int SSE16x16(const uint8_t* a, const uint8_t* b) {
932 uint32x4_t sum = { 0, 0, 0, 0 }; 932 uint32x4_t sum = vdupq_n_u32(0);
933 int y; 933 int y;
934 for (y = 0; y < 16; ++y) { 934 for (y = 0; y < 16; ++y) {
935 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); 935 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
936 } 936 }
937 return SumToInt(sum); 937 return SumToInt(sum);
938 } 938 }
939 939
940 static int SSE16x8(const uint8_t* a, const uint8_t* b) { 940 static int SSE16x8(const uint8_t* a, const uint8_t* b) {
941 uint32x4_t sum = { 0, 0, 0, 0 }; 941 uint32x4_t sum = vdupq_n_u32(0);
942 int y; 942 int y;
943 for (y = 0; y < 8; ++y) { 943 for (y = 0; y < 8; ++y) {
944 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); 944 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
945 } 945 }
946 return SumToInt(sum); 946 return SumToInt(sum);
947 } 947 }
948 948
949 static int SSE8x8(const uint8_t* a, const uint8_t* b) { 949 static int SSE8x8(const uint8_t* a, const uint8_t* b) {
950 uint32x4_t sum = { 0, 0, 0, 0 }; 950 uint32x4_t sum = vdupq_n_u32(0);
951 int y; 951 int y;
952 for (y = 0; y < 8; ++y) { 952 for (y = 0; y < 8; ++y) {
953 const uint8x8_t a0 = vld1_u8(a + y * BPS); 953 const uint8x8_t a0 = vld1_u8(a + y * BPS);
954 const uint8x8_t b0 = vld1_u8(b + y * BPS); 954 const uint8x8_t b0 = vld1_u8(b + y * BPS);
955 const uint8x8_t abs_diff = vabd_u8(a0, b0); 955 const uint8x8_t abs_diff = vabd_u8(a0, b0);
956 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff); 956 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
957 sum = vpadalq_u16(sum, prod); 957 sum = vpadalq_u16(sum, prod);
958 } 958 }
959 return SumToInt(sum); 959 return SumToInt(sum);
960 } 960 }
961 961
962 static int SSE4x4(const uint8_t* a, const uint8_t* b) { 962 static int SSE4x4(const uint8_t* a, const uint8_t* b) {
963 const uint8x16_t a0 = Load4x4(a); 963 const uint8x16_t a0 = Load4x4(a);
964 const uint8x16_t b0 = Load4x4(b); 964 const uint8x16_t b0 = Load4x4(b);
965 const uint8x16_t abs_diff = vabdq_u8(a0, b0); 965 const uint8x16_t abs_diff = vabdq_u8(a0, b0);
966 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); 966 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
967 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff)); 967 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
968 return SumToInt(vpaddlq_u16(prod)); 968 return SumToInt(vpaddlq_u16(prod));
969 } 969 }
970 970
971 //------------------------------------------------------------------------------ 971 //------------------------------------------------------------------------------
972 972
973 // Compilation with gcc-4.6.x is problematic for now and vtbl? are unavailable 973 // Compilation with gcc-4.6.x is problematic for now.
974 // in iOS/arm64 builds. Disable this function in those cases. 974 #if !defined(WORK_AROUND_GCC)
975 #if !(defined(WORK_AROUND_GCC) || defined(__aarch64__))
976 975
977 static int16x8_t Quantize(int16_t* const in, 976 static int16x8_t Quantize(int16_t* const in,
978 const VP8Matrix* const mtx, int offset) { 977 const VP8Matrix* const mtx, int offset) {
979 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); 978 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
980 const uint16x8_t q = vld1q_u16(&mtx->q_[offset]); 979 const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
981 const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]); 980 const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
982 const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]); 981 const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
983 const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]); 982 const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
984 983
985 const int16x8_t a = vld1q_s16(in + offset); // in 984 const int16x8_t a = vld1q_s16(in + offset); // in
986 const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in) 985 const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in)
987 const int16x8_t sign = vshrq_n_s16(a, 15); // sign 986 const int16x8_t sign = vshrq_n_s16(a, 15); // sign
988 const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen 987 const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen
989 const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq)); 988 const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
990 const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq)); 989 const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
991 const uint32x4_t m2 = vhaddq_u32(m0, bias0); 990 const uint32x4_t m2 = vhaddq_u32(m0, bias0);
992 const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1 991 const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1
993 const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16), 992 const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
994 vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1 993 vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1
995 const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL)); 994 const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
996 const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign); 995 const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
997 const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign 996 const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign
998 const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q)); 997 const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
999 vst1q_s16(in + offset, c4); 998 vst1q_s16(in + offset, c4);
1000 assert(QFIX == 17); // this function can't work as is if QFIX != 16+1 999 assert(QFIX == 17); // this function can't work as is if QFIX != 16+1
1001 return c3; 1000 return c3;
1002 } 1001 }
1003 1002
1004 static const uint8_t kShuffles[4][8] = { 1003 static const uint8_t kShuffles[4][8] = {
1005 { 0, 1, 2, 3, 8, 9, 16, 17 }, 1004 { 0, 1, 2, 3, 8, 9, 16, 17 },
1006 { 10, 11, 4, 5, 6, 7, 12, 13 }, 1005 { 10, 11, 4, 5, 6, 7, 12, 13 },
1007 { 18, 19, 24, 25, 26, 27, 20, 21 }, 1006 { 18, 19, 24, 25, 26, 27, 20, 21 },
1008 { 14, 15, 22, 23, 28, 29, 30, 31 } 1007 { 14, 15, 22, 23, 28, 29, 30, 31 }
1009 }; 1008 };
1010 1009
1011 static int QuantizeBlock(int16_t in[16], int16_t out[16], 1010 static int QuantizeBlock(int16_t in[16], int16_t out[16],
1012 const VP8Matrix* const mtx) { 1011 const VP8Matrix* const mtx) {
1013 const int16x8_t out0 = Quantize(in, mtx, 0); 1012 const int16x8_t out0 = Quantize(in, mtx, 0);
1014 const int16x8_t out1 = Quantize(in, mtx, 8); 1013 const int16x8_t out1 = Quantize(in, mtx, 8);
1014 uint8x8x4_t shuffles;
1015 // vtbl4_u8 is marked unavailable for iOS arm64, use wider versions there.
1016 #if defined(__APPLE__) && defined(__aarch64__)
1017 uint8x16x2_t all_out;
1018 INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
1019 INIT_VECTOR4(shuffles,
1020 vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
1021 vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
1022 vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
1023 vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
1024 #else
1015 uint8x8x4_t all_out; 1025 uint8x8x4_t all_out;
1016 INIT_VECTOR4(all_out, 1026 INIT_VECTOR4(all_out,
1017 vreinterpret_u8_s16(vget_low_s16(out0)), 1027 vreinterpret_u8_s16(vget_low_s16(out0)),
1018 vreinterpret_u8_s16(vget_high_s16(out0)), 1028 vreinterpret_u8_s16(vget_high_s16(out0)),
1019 vreinterpret_u8_s16(vget_low_s16(out1)), 1029 vreinterpret_u8_s16(vget_low_s16(out1)),
1020 vreinterpret_u8_s16(vget_high_s16(out1))); 1030 vreinterpret_u8_s16(vget_high_s16(out1)));
1031 INIT_VECTOR4(shuffles,
1032 vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
1033 vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
1034 vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
1035 vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
1036 #endif
1021 // Zigzag reordering 1037 // Zigzag reordering
1022 vst1_u8((uint8_t*)(out + 0), vtbl4_u8(all_out, vld1_u8(kShuffles[0]))); 1038 vst1_u8((uint8_t*)(out + 0), shuffles.val[0]);
1023 vst1_u8((uint8_t*)(out + 4), vtbl4_u8(all_out, vld1_u8(kShuffles[1]))); 1039 vst1_u8((uint8_t*)(out + 4), shuffles.val[1]);
1024 vst1_u8((uint8_t*)(out + 8), vtbl4_u8(all_out, vld1_u8(kShuffles[2]))); 1040 vst1_u8((uint8_t*)(out + 8), shuffles.val[2]);
1025 vst1_u8((uint8_t*)(out + 12), vtbl4_u8(all_out, vld1_u8(kShuffles[3]))); 1041 vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
1026 // test zeros 1042 // test zeros
1027 if (*(uint64_t*)(out + 0) != 0) return 1; 1043 if (*(uint64_t*)(out + 0) != 0) return 1;
1028 if (*(uint64_t*)(out + 4) != 0) return 1; 1044 if (*(uint64_t*)(out + 4) != 0) return 1;
1029 if (*(uint64_t*)(out + 8) != 0) return 1; 1045 if (*(uint64_t*)(out + 8) != 0) return 1;
1030 if (*(uint64_t*)(out + 12) != 0) return 1; 1046 if (*(uint64_t*)(out + 12) != 0) return 1;
1031 return 0; 1047 return 0;
1032 } 1048 }
1033 1049
1034 #endif // !WORK_AROUND_GCC && !__aarch64__ 1050 #endif // !WORK_AROUND_GCC
1035 1051
1036 #endif // WEBP_USE_NEON 1052 #endif // WEBP_USE_NEON
1037 1053
1038 //------------------------------------------------------------------------------ 1054 //------------------------------------------------------------------------------
1039 // Entry point 1055 // Entry point
1040 1056
1041 extern void VP8EncDspInitNEON(void); 1057 extern void VP8EncDspInitNEON(void);
1042 1058
1043 void VP8EncDspInitNEON(void) { 1059 void VP8EncDspInitNEON(void) {
1044 #if defined(WEBP_USE_NEON) 1060 #if defined(WEBP_USE_NEON)
1045 VP8ITransform = ITransform; 1061 VP8ITransform = ITransform;
1046 VP8FTransform = FTransform; 1062 VP8FTransform = FTransform;
1047 1063
1048 VP8FTransformWHT = FTransformWHT; 1064 VP8FTransformWHT = FTransformWHT;
1049 1065
1050 VP8TDisto4x4 = Disto4x4; 1066 VP8TDisto4x4 = Disto4x4;
1051 VP8TDisto16x16 = Disto16x16; 1067 VP8TDisto16x16 = Disto16x16;
1052 VP8CollectHistogram = CollectHistogram; 1068 VP8CollectHistogram = CollectHistogram;
1053 VP8SSE16x16 = SSE16x16; 1069 VP8SSE16x16 = SSE16x16;
1054 VP8SSE16x8 = SSE16x8; 1070 VP8SSE16x8 = SSE16x8;
1055 VP8SSE8x8 = SSE8x8; 1071 VP8SSE8x8 = SSE8x8;
1056 VP8SSE4x4 = SSE4x4; 1072 VP8SSE4x4 = SSE4x4;
1057 #if !(defined(WORK_AROUND_GCC) || defined(__aarch64__)) 1073 #if !defined(WORK_AROUND_GCC)
1058 VP8EncQuantizeBlock = QuantizeBlock; 1074 VP8EncQuantizeBlock = QuantizeBlock;
1059 #endif 1075 #endif
1060 #endif // WEBP_USE_NEON 1076 #endif // WEBP_USE_NEON
1061 } 1077 }
OLDNEW
« no previous file with comments | « third_party/libwebp/dsp/dsp.h ('k') | third_party/libwebp/dsp/lossless.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698