| OLD | NEW |
| 1 // Copyright 2012 Google Inc. All Rights Reserved. | 1 // Copyright 2012 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 9 // | 9 // |
| 10 // ARM NEON version of speed-critical encoding functions. | 10 // ARM NEON version of speed-critical encoding functions. |
| (...skipping 235 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 246 static void ITransform(const uint8_t* ref, | 246 static void ITransform(const uint8_t* ref, |
| 247 const int16_t* in, uint8_t* dst, int do_two) { | 247 const int16_t* in, uint8_t* dst, int do_two) { |
| 248 ITransformOne(ref, in, dst); | 248 ITransformOne(ref, in, dst); |
| 249 if (do_two) { | 249 if (do_two) { |
| 250 ITransformOne(ref + 4, in + 16, dst + 4); | 250 ITransformOne(ref + 4, in + 16, dst + 4); |
| 251 } | 251 } |
| 252 } | 252 } |
| 253 | 253 |
| 254 // Load all 4x4 pixels into a single uint8x16_t variable. | 254 // Load all 4x4 pixels into a single uint8x16_t variable. |
| 255 static uint8x16_t Load4x4(const uint8_t* src) { | 255 static uint8x16_t Load4x4(const uint8_t* src) { |
| 256 uint32x4_t out = { 0, 0, 0, 0 }; | 256 uint32x4_t out = vdupq_n_u32(0); |
| 257 out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0); | 257 out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0); |
| 258 out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1); | 258 out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1); |
| 259 out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2); | 259 out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2); |
| 260 out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3); | 260 out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3); |
| 261 return vreinterpretq_u8_u32(out); | 261 return vreinterpretq_u8_u32(out); |
| 262 } | 262 } |
| 263 | 263 |
| 264 // Forward transform. | 264 // Forward transform. |
| 265 | 265 |
| 266 #if defined(USE_INTRINSICS) | 266 #if defined(USE_INTRINSICS) |
| (...skipping 655 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 922 } | 922 } |
| 923 | 923 |
| 924 // Horizontal sum of all four uint32_t values in 'sum'. | 924 // Horizontal sum of all four uint32_t values in 'sum'. |
| 925 static int SumToInt(uint32x4_t sum) { | 925 static int SumToInt(uint32x4_t sum) { |
| 926 const uint64x2_t sum2 = vpaddlq_u32(sum); | 926 const uint64x2_t sum2 = vpaddlq_u32(sum); |
| 927 const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1); | 927 const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1); |
| 928 return (int)sum3; | 928 return (int)sum3; |
| 929 } | 929 } |
| 930 | 930 |
| 931 static int SSE16x16(const uint8_t* a, const uint8_t* b) { | 931 static int SSE16x16(const uint8_t* a, const uint8_t* b) { |
| 932 uint32x4_t sum = { 0, 0, 0, 0 }; | 932 uint32x4_t sum = vdupq_n_u32(0); |
| 933 int y; | 933 int y; |
| 934 for (y = 0; y < 16; ++y) { | 934 for (y = 0; y < 16; ++y) { |
| 935 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); | 935 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); |
| 936 } | 936 } |
| 937 return SumToInt(sum); | 937 return SumToInt(sum); |
| 938 } | 938 } |
| 939 | 939 |
| 940 static int SSE16x8(const uint8_t* a, const uint8_t* b) { | 940 static int SSE16x8(const uint8_t* a, const uint8_t* b) { |
| 941 uint32x4_t sum = { 0, 0, 0, 0 }; | 941 uint32x4_t sum = vdupq_n_u32(0); |
| 942 int y; | 942 int y; |
| 943 for (y = 0; y < 8; ++y) { | 943 for (y = 0; y < 8; ++y) { |
| 944 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); | 944 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); |
| 945 } | 945 } |
| 946 return SumToInt(sum); | 946 return SumToInt(sum); |
| 947 } | 947 } |
| 948 | 948 |
| 949 static int SSE8x8(const uint8_t* a, const uint8_t* b) { | 949 static int SSE8x8(const uint8_t* a, const uint8_t* b) { |
| 950 uint32x4_t sum = { 0, 0, 0, 0 }; | 950 uint32x4_t sum = vdupq_n_u32(0); |
| 951 int y; | 951 int y; |
| 952 for (y = 0; y < 8; ++y) { | 952 for (y = 0; y < 8; ++y) { |
| 953 const uint8x8_t a0 = vld1_u8(a + y * BPS); | 953 const uint8x8_t a0 = vld1_u8(a + y * BPS); |
| 954 const uint8x8_t b0 = vld1_u8(b + y * BPS); | 954 const uint8x8_t b0 = vld1_u8(b + y * BPS); |
| 955 const uint8x8_t abs_diff = vabd_u8(a0, b0); | 955 const uint8x8_t abs_diff = vabd_u8(a0, b0); |
| 956 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff); | 956 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff); |
| 957 sum = vpadalq_u16(sum, prod); | 957 sum = vpadalq_u16(sum, prod); |
| 958 } | 958 } |
| 959 return SumToInt(sum); | 959 return SumToInt(sum); |
| 960 } | 960 } |
| 961 | 961 |
| 962 static int SSE4x4(const uint8_t* a, const uint8_t* b) { | 962 static int SSE4x4(const uint8_t* a, const uint8_t* b) { |
| 963 const uint8x16_t a0 = Load4x4(a); | 963 const uint8x16_t a0 = Load4x4(a); |
| 964 const uint8x16_t b0 = Load4x4(b); | 964 const uint8x16_t b0 = Load4x4(b); |
| 965 const uint8x16_t abs_diff = vabdq_u8(a0, b0); | 965 const uint8x16_t abs_diff = vabdq_u8(a0, b0); |
| 966 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); | 966 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); |
| 967 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff)); | 967 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff)); |
| 968 return SumToInt(vpaddlq_u16(prod)); | 968 return SumToInt(vpaddlq_u16(prod)); |
| 969 } | 969 } |
| 970 | 970 |
| 971 //------------------------------------------------------------------------------ | 971 //------------------------------------------------------------------------------ |
| 972 | 972 |
| 973 // Compilation with gcc-4.6.x is problematic for now and vtbl? are unavailable | 973 // Compilation with gcc-4.6.x is problematic for now. |
| 974 // in iOS/arm64 builds. Disable this function in those cases. | 974 #if !defined(WORK_AROUND_GCC) |
| 975 #if !(defined(WORK_AROUND_GCC) || defined(__aarch64__)) | |
| 976 | 975 |
| 977 static int16x8_t Quantize(int16_t* const in, | 976 static int16x8_t Quantize(int16_t* const in, |
| 978 const VP8Matrix* const mtx, int offset) { | 977 const VP8Matrix* const mtx, int offset) { |
| 979 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); | 978 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); |
| 980 const uint16x8_t q = vld1q_u16(&mtx->q_[offset]); | 979 const uint16x8_t q = vld1q_u16(&mtx->q_[offset]); |
| 981 const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]); | 980 const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]); |
| 982 const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]); | 981 const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]); |
| 983 const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]); | 982 const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]); |
| 984 | 983 |
| 985 const int16x8_t a = vld1q_s16(in + offset); // in | 984 const int16x8_t a = vld1q_s16(in + offset); // in |
| 986 const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in) | 985 const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in) |
| 987 const int16x8_t sign = vshrq_n_s16(a, 15); // sign | 986 const int16x8_t sign = vshrq_n_s16(a, 15); // sign |
| 988 const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen | 987 const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen |
| 989 const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq)); | 988 const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq)); |
| 990 const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq)); | 989 const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq)); |
| 991 const uint32x4_t m2 = vhaddq_u32(m0, bias0); | 990 const uint32x4_t m2 = vhaddq_u32(m0, bias0); |
| 992 const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1 | 991 const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1 |
| 993 const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16), | 992 const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16), |
| 994 vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1 | 993 vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1 |
| 995 const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL)); | 994 const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL)); |
| 996 const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign); | 995 const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign); |
| 997 const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign | 996 const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign |
| 998 const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q)); | 997 const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q)); |
| 999 vst1q_s16(in + offset, c4); | 998 vst1q_s16(in + offset, c4); |
| 1000 assert(QFIX == 17); // this function can't work as is if QFIX != 16+1 | 999 assert(QFIX == 17); // this function can't work as is if QFIX != 16+1 |
| 1001 return c3; | 1000 return c3; |
| 1002 } | 1001 } |
| 1003 | 1002 |
| 1004 static const uint8_t kShuffles[4][8] = { | 1003 static const uint8_t kShuffles[4][8] = { |
| 1005 { 0, 1, 2, 3, 8, 9, 16, 17 }, | 1004 { 0, 1, 2, 3, 8, 9, 16, 17 }, |
| 1006 { 10, 11, 4, 5, 6, 7, 12, 13 }, | 1005 { 10, 11, 4, 5, 6, 7, 12, 13 }, |
| 1007 { 18, 19, 24, 25, 26, 27, 20, 21 }, | 1006 { 18, 19, 24, 25, 26, 27, 20, 21 }, |
| 1008 { 14, 15, 22, 23, 28, 29, 30, 31 } | 1007 { 14, 15, 22, 23, 28, 29, 30, 31 } |
| 1009 }; | 1008 }; |
| 1010 | 1009 |
| 1011 static int QuantizeBlock(int16_t in[16], int16_t out[16], | 1010 static int QuantizeBlock(int16_t in[16], int16_t out[16], |
| 1012 const VP8Matrix* const mtx) { | 1011 const VP8Matrix* const mtx) { |
| 1013 const int16x8_t out0 = Quantize(in, mtx, 0); | 1012 const int16x8_t out0 = Quantize(in, mtx, 0); |
| 1014 const int16x8_t out1 = Quantize(in, mtx, 8); | 1013 const int16x8_t out1 = Quantize(in, mtx, 8); |
| 1014 uint8x8x4_t shuffles; |
| 1015 // vtbl4_u8 is marked unavailable for iOS arm64, use wider versions there. |
| 1016 #if defined(__APPLE__) && defined(__aarch64__) |
| 1017 uint8x16x2_t all_out; |
| 1018 INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1)); |
| 1019 INIT_VECTOR4(shuffles, |
| 1020 vtbl2q_u8(all_out, vld1_u8(kShuffles[0])), |
| 1021 vtbl2q_u8(all_out, vld1_u8(kShuffles[1])), |
| 1022 vtbl2q_u8(all_out, vld1_u8(kShuffles[2])), |
| 1023 vtbl2q_u8(all_out, vld1_u8(kShuffles[3]))); |
| 1024 #else |
| 1015 uint8x8x4_t all_out; | 1025 uint8x8x4_t all_out; |
| 1016 INIT_VECTOR4(all_out, | 1026 INIT_VECTOR4(all_out, |
| 1017 vreinterpret_u8_s16(vget_low_s16(out0)), | 1027 vreinterpret_u8_s16(vget_low_s16(out0)), |
| 1018 vreinterpret_u8_s16(vget_high_s16(out0)), | 1028 vreinterpret_u8_s16(vget_high_s16(out0)), |
| 1019 vreinterpret_u8_s16(vget_low_s16(out1)), | 1029 vreinterpret_u8_s16(vget_low_s16(out1)), |
| 1020 vreinterpret_u8_s16(vget_high_s16(out1))); | 1030 vreinterpret_u8_s16(vget_high_s16(out1))); |
| 1031 INIT_VECTOR4(shuffles, |
| 1032 vtbl4_u8(all_out, vld1_u8(kShuffles[0])), |
| 1033 vtbl4_u8(all_out, vld1_u8(kShuffles[1])), |
| 1034 vtbl4_u8(all_out, vld1_u8(kShuffles[2])), |
| 1035 vtbl4_u8(all_out, vld1_u8(kShuffles[3]))); |
| 1036 #endif |
| 1021 // Zigzag reordering | 1037 // Zigzag reordering |
| 1022 vst1_u8((uint8_t*)(out + 0), vtbl4_u8(all_out, vld1_u8(kShuffles[0]))); | 1038 vst1_u8((uint8_t*)(out + 0), shuffles.val[0]); |
| 1023 vst1_u8((uint8_t*)(out + 4), vtbl4_u8(all_out, vld1_u8(kShuffles[1]))); | 1039 vst1_u8((uint8_t*)(out + 4), shuffles.val[1]); |
| 1024 vst1_u8((uint8_t*)(out + 8), vtbl4_u8(all_out, vld1_u8(kShuffles[2]))); | 1040 vst1_u8((uint8_t*)(out + 8), shuffles.val[2]); |
| 1025 vst1_u8((uint8_t*)(out + 12), vtbl4_u8(all_out, vld1_u8(kShuffles[3]))); | 1041 vst1_u8((uint8_t*)(out + 12), shuffles.val[3]); |
| 1026 // test zeros | 1042 // test zeros |
| 1027 if (*(uint64_t*)(out + 0) != 0) return 1; | 1043 if (*(uint64_t*)(out + 0) != 0) return 1; |
| 1028 if (*(uint64_t*)(out + 4) != 0) return 1; | 1044 if (*(uint64_t*)(out + 4) != 0) return 1; |
| 1029 if (*(uint64_t*)(out + 8) != 0) return 1; | 1045 if (*(uint64_t*)(out + 8) != 0) return 1; |
| 1030 if (*(uint64_t*)(out + 12) != 0) return 1; | 1046 if (*(uint64_t*)(out + 12) != 0) return 1; |
| 1031 return 0; | 1047 return 0; |
| 1032 } | 1048 } |
| 1033 | 1049 |
| 1034 #endif // !WORK_AROUND_GCC && !__aarch64__ | 1050 #endif // !WORK_AROUND_GCC |
| 1035 | 1051 |
| 1036 #endif // WEBP_USE_NEON | 1052 #endif // WEBP_USE_NEON |
| 1037 | 1053 |
| 1038 //------------------------------------------------------------------------------ | 1054 //------------------------------------------------------------------------------ |
| 1039 // Entry point | 1055 // Entry point |
| 1040 | 1056 |
| 1041 extern void VP8EncDspInitNEON(void); | 1057 extern void VP8EncDspInitNEON(void); |
| 1042 | 1058 |
| 1043 void VP8EncDspInitNEON(void) { | 1059 void VP8EncDspInitNEON(void) { |
| 1044 #if defined(WEBP_USE_NEON) | 1060 #if defined(WEBP_USE_NEON) |
| 1045 VP8ITransform = ITransform; | 1061 VP8ITransform = ITransform; |
| 1046 VP8FTransform = FTransform; | 1062 VP8FTransform = FTransform; |
| 1047 | 1063 |
| 1048 VP8FTransformWHT = FTransformWHT; | 1064 VP8FTransformWHT = FTransformWHT; |
| 1049 | 1065 |
| 1050 VP8TDisto4x4 = Disto4x4; | 1066 VP8TDisto4x4 = Disto4x4; |
| 1051 VP8TDisto16x16 = Disto16x16; | 1067 VP8TDisto16x16 = Disto16x16; |
| 1052 VP8CollectHistogram = CollectHistogram; | 1068 VP8CollectHistogram = CollectHistogram; |
| 1053 VP8SSE16x16 = SSE16x16; | 1069 VP8SSE16x16 = SSE16x16; |
| 1054 VP8SSE16x8 = SSE16x8; | 1070 VP8SSE16x8 = SSE16x8; |
| 1055 VP8SSE8x8 = SSE8x8; | 1071 VP8SSE8x8 = SSE8x8; |
| 1056 VP8SSE4x4 = SSE4x4; | 1072 VP8SSE4x4 = SSE4x4; |
| 1057 #if !(defined(WORK_AROUND_GCC) || defined(__aarch64__)) | 1073 #if !defined(WORK_AROUND_GCC) |
| 1058 VP8EncQuantizeBlock = QuantizeBlock; | 1074 VP8EncQuantizeBlock = QuantizeBlock; |
| 1059 #endif | 1075 #endif |
| 1060 #endif // WEBP_USE_NEON | 1076 #endif // WEBP_USE_NEON |
| 1061 } | 1077 } |
| OLD | NEW |