OLD | NEW |
1 // Copyright 2012 Google Inc. All Rights Reserved. | 1 // Copyright 2012 Google Inc. All Rights Reserved. |
2 // | 2 // |
3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
9 // | 9 // |
10 // ARM NEON version of speed-critical encoding functions. | 10 // ARM NEON version of speed-critical encoding functions. |
(...skipping 235 matching lines...)
246 static void ITransform(const uint8_t* ref, | 246 static void ITransform(const uint8_t* ref, |
247 const int16_t* in, uint8_t* dst, int do_two) { | 247 const int16_t* in, uint8_t* dst, int do_two) { |
248 ITransformOne(ref, in, dst); | 248 ITransformOne(ref, in, dst); |
249 if (do_two) { | 249 if (do_two) { |
250 ITransformOne(ref + 4, in + 16, dst + 4); | 250 ITransformOne(ref + 4, in + 16, dst + 4); |
251 } | 251 } |
252 } | 252 } |
253 | 253 |
254 // Load all 4x4 pixels into a single uint8x16_t variable. | 254 // Load all 4x4 pixels into a single uint8x16_t variable. |
255 static uint8x16_t Load4x4(const uint8_t* src) { | 255 static uint8x16_t Load4x4(const uint8_t* src) { |
256 uint32x4_t out = { 0, 0, 0, 0 }; | 256 uint32x4_t out = vdupq_n_u32(0); |
257 out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0); | 257 out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0); |
258 out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1); | 258 out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1); |
259 out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2); | 259 out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2); |
260 out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3); | 260 out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3); |
261 return vreinterpretq_u8_u32(out); | 261 return vreinterpretq_u8_u32(out); |
262 } | 262 } |
263 | 263 |
264 // Forward transform. | 264 // Forward transform. |
265 | 265 |
266 #if defined(USE_INTRINSICS) | 266 #if defined(USE_INTRINSICS) |
(...skipping 655 matching lines...)
922 } | 922 } |
923 | 923 |
924 // Horizontal sum of all four uint32_t values in 'sum'. | 924 // Horizontal sum of all four uint32_t values in 'sum'. |
925 static int SumToInt(uint32x4_t sum) { | 925 static int SumToInt(uint32x4_t sum) { |
926 const uint64x2_t sum2 = vpaddlq_u32(sum); | 926 const uint64x2_t sum2 = vpaddlq_u32(sum); |
927 const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1); | 927 const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1); |
928 return (int)sum3; | 928 return (int)sum3; |
929 } | 929 } |
930 | 930 |
931 static int SSE16x16(const uint8_t* a, const uint8_t* b) { | 931 static int SSE16x16(const uint8_t* a, const uint8_t* b) { |
932 uint32x4_t sum = { 0, 0, 0, 0 }; | 932 uint32x4_t sum = vdupq_n_u32(0); |
933 int y; | 933 int y; |
934 for (y = 0; y < 16; ++y) { | 934 for (y = 0; y < 16; ++y) { |
935 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); | 935 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); |
936 } | 936 } |
937 return SumToInt(sum); | 937 return SumToInt(sum); |
938 } | 938 } |
939 | 939 |
940 static int SSE16x8(const uint8_t* a, const uint8_t* b) { | 940 static int SSE16x8(const uint8_t* a, const uint8_t* b) { |
941 uint32x4_t sum = { 0, 0, 0, 0 }; | 941 uint32x4_t sum = vdupq_n_u32(0); |
942 int y; | 942 int y; |
943 for (y = 0; y < 8; ++y) { | 943 for (y = 0; y < 8; ++y) { |
944 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); | 944 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); |
945 } | 945 } |
946 return SumToInt(sum); | 946 return SumToInt(sum); |
947 } | 947 } |
948 | 948 |
949 static int SSE8x8(const uint8_t* a, const uint8_t* b) { | 949 static int SSE8x8(const uint8_t* a, const uint8_t* b) { |
950 uint32x4_t sum = { 0, 0, 0, 0 }; | 950 uint32x4_t sum = vdupq_n_u32(0); |
951 int y; | 951 int y; |
952 for (y = 0; y < 8; ++y) { | 952 for (y = 0; y < 8; ++y) { |
953 const uint8x8_t a0 = vld1_u8(a + y * BPS); | 953 const uint8x8_t a0 = vld1_u8(a + y * BPS); |
954 const uint8x8_t b0 = vld1_u8(b + y * BPS); | 954 const uint8x8_t b0 = vld1_u8(b + y * BPS); |
955 const uint8x8_t abs_diff = vabd_u8(a0, b0); | 955 const uint8x8_t abs_diff = vabd_u8(a0, b0); |
956 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff); | 956 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff); |
957 sum = vpadalq_u16(sum, prod); | 957 sum = vpadalq_u16(sum, prod); |
958 } | 958 } |
959 return SumToInt(sum); | 959 return SumToInt(sum); |
960 } | 960 } |
961 | 961 |
962 static int SSE4x4(const uint8_t* a, const uint8_t* b) { | 962 static int SSE4x4(const uint8_t* a, const uint8_t* b) { |
963 const uint8x16_t a0 = Load4x4(a); | 963 const uint8x16_t a0 = Load4x4(a); |
964 const uint8x16_t b0 = Load4x4(b); | 964 const uint8x16_t b0 = Load4x4(b); |
965 const uint8x16_t abs_diff = vabdq_u8(a0, b0); | 965 const uint8x16_t abs_diff = vabdq_u8(a0, b0); |
966 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); | 966 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); |
967 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff)); | 967 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff)); |
968 return SumToInt(vpaddlq_u16(prod)); | 968 return SumToInt(vpaddlq_u16(prod)); |
969 } | 969 } |
970 | 970 |
971 //------------------------------------------------------------------------------ | 971 //------------------------------------------------------------------------------ |
972 | 972 |
973 // Compilation with gcc-4.6.x is problematic for now and vtbl? are unavailable | 973 // Compilation with gcc-4.6.x is problematic for now. |
974 // in iOS/arm64 builds. Disable this function in those cases. | 974 #if !defined(WORK_AROUND_GCC) |
975 #if !(defined(WORK_AROUND_GCC) || defined(__aarch64__)) | |
976 | 975 |
977 static int16x8_t Quantize(int16_t* const in, | 976 static int16x8_t Quantize(int16_t* const in, |
978 const VP8Matrix* const mtx, int offset) { | 977 const VP8Matrix* const mtx, int offset) { |
979 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); | 978 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); |
980 const uint16x8_t q = vld1q_u16(&mtx->q_[offset]); | 979 const uint16x8_t q = vld1q_u16(&mtx->q_[offset]); |
981 const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]); | 980 const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]); |
982 const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]); | 981 const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]); |
983 const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]); | 982 const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]); |
984 | 983 |
985 const int16x8_t a = vld1q_s16(in + offset); // in | 984 const int16x8_t a = vld1q_s16(in + offset); // in |
986 const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in) | 985 const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in) |
987 const int16x8_t sign = vshrq_n_s16(a, 15); // sign | 986 const int16x8_t sign = vshrq_n_s16(a, 15); // sign |
988 const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen | 987 const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen |
989 const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq)); | 988 const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq)); |
990 const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq)); | 989 const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq)); |
991 const uint32x4_t m2 = vhaddq_u32(m0, bias0); | 990 const uint32x4_t m2 = vhaddq_u32(m0, bias0); |
992 const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1 | 991 const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1 |
993 const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16), | 992 const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16), |
994 vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1 | 993 vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1 |
995 const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL)); | 994 const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL)); |
996 const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign); | 995 const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign); |
997 const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign | 996 const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign |
998 const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q)); | 997 const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q)); |
999 vst1q_s16(in + offset, c4); | 998 vst1q_s16(in + offset, c4); |
1000 assert(QFIX == 17); // this function can't work as is if QFIX != 16+1 | 999 assert(QFIX == 17); // this function can't work as is if QFIX != 16+1 |
1001 return c3; | 1000 return c3; |
1002 } | 1001 } |
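For readers following the intrinsics, here is a minimal scalar sketch of what Quantize() above computes per coefficient (illustration only, not part of the patch; the helper name is hypothetical and QFIX == 17 is assumed, per the assert above):

    static int QuantizeOneScalar(int16_t* const in, const VP8Matrix* const mtx,
                                 int i) {
      const int sign = (in[i] < 0);
      const uint32_t coeff = (sign ? -in[i] : in[i]) + mtx->sharpen_[i];
      int level = (int)((coeff * mtx->iq_[i] + mtx->bias_[i]) >> QFIX);
      if (level > MAX_LEVEL) level = MAX_LEVEL;  // clamp, as vminq_u16 does
      if (sign) level = -level;                  // restore sign (c2/c3)
      in[i] = level * mtx->q_[i];                // write back dequantized value (c4)
      return level;                              // quantized level (c3)
    }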
1003 | 1002 |
1004 static const uint8_t kShuffles[4][8] = { | 1003 static const uint8_t kShuffles[4][8] = { |
1005 { 0, 1, 2, 3, 8, 9, 16, 17 }, | 1004 { 0, 1, 2, 3, 8, 9, 16, 17 }, |
1006 { 10, 11, 4, 5, 6, 7, 12, 13 }, | 1005 { 10, 11, 4, 5, 6, 7, 12, 13 }, |
1007 { 18, 19, 24, 25, 26, 27, 20, 21 }, | 1006 { 18, 19, 24, 25, 26, 27, 20, 21 }, |
1008 { 14, 15, 22, 23, 28, 29, 30, 31 } | 1007 { 14, 15, 22, 23, 28, 29, 30, 31 } |
1009 }; | 1008 }; |
1010 | 1009 |
1011 static int QuantizeBlock(int16_t in[16], int16_t out[16], | 1010 static int QuantizeBlock(int16_t in[16], int16_t out[16], |
1012 const VP8Matrix* const mtx) { | 1011 const VP8Matrix* const mtx) { |
1013 const int16x8_t out0 = Quantize(in, mtx, 0); | 1012 const int16x8_t out0 = Quantize(in, mtx, 0); |
1014 const int16x8_t out1 = Quantize(in, mtx, 8); | 1013 const int16x8_t out1 = Quantize(in, mtx, 8); |
| 1014 uint8x8x4_t shuffles; |
 | 1015   // vtbl4_u8 is marked unavailable for iOS arm64; use wider versions there. |
| 1016 #if defined(__APPLE__) && defined(__aarch64__) |
| 1017 uint8x16x2_t all_out; |
| 1018 INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1)); |
| 1019 INIT_VECTOR4(shuffles, |
| 1020 vtbl2q_u8(all_out, vld1_u8(kShuffles[0])), |
| 1021 vtbl2q_u8(all_out, vld1_u8(kShuffles[1])), |
| 1022 vtbl2q_u8(all_out, vld1_u8(kShuffles[2])), |
| 1023 vtbl2q_u8(all_out, vld1_u8(kShuffles[3]))); |
| 1024 #else |
1015 uint8x8x4_t all_out; | 1025 uint8x8x4_t all_out; |
1016 INIT_VECTOR4(all_out, | 1026 INIT_VECTOR4(all_out, |
1017 vreinterpret_u8_s16(vget_low_s16(out0)), | 1027 vreinterpret_u8_s16(vget_low_s16(out0)), |
1018 vreinterpret_u8_s16(vget_high_s16(out0)), | 1028 vreinterpret_u8_s16(vget_high_s16(out0)), |
1019 vreinterpret_u8_s16(vget_low_s16(out1)), | 1029 vreinterpret_u8_s16(vget_low_s16(out1)), |
1020 vreinterpret_u8_s16(vget_high_s16(out1))); | 1030 vreinterpret_u8_s16(vget_high_s16(out1))); |
| 1031 INIT_VECTOR4(shuffles, |
| 1032 vtbl4_u8(all_out, vld1_u8(kShuffles[0])), |
| 1033 vtbl4_u8(all_out, vld1_u8(kShuffles[1])), |
| 1034 vtbl4_u8(all_out, vld1_u8(kShuffles[2])), |
| 1035 vtbl4_u8(all_out, vld1_u8(kShuffles[3]))); |
| 1036 #endif |
1021 // Zigzag reordering | 1037 // Zigzag reordering |
1022 vst1_u8((uint8_t*)(out + 0), vtbl4_u8(all_out, vld1_u8(kShuffles[0]))); | 1038 vst1_u8((uint8_t*)(out + 0), shuffles.val[0]); |
1023 vst1_u8((uint8_t*)(out + 4), vtbl4_u8(all_out, vld1_u8(kShuffles[1]))); | 1039 vst1_u8((uint8_t*)(out + 4), shuffles.val[1]); |
1024 vst1_u8((uint8_t*)(out + 8), vtbl4_u8(all_out, vld1_u8(kShuffles[2]))); | 1040 vst1_u8((uint8_t*)(out + 8), shuffles.val[2]); |
1025 vst1_u8((uint8_t*)(out + 12), vtbl4_u8(all_out, vld1_u8(kShuffles[3]))); | 1041 vst1_u8((uint8_t*)(out + 12), shuffles.val[3]); |
1026 // test zeros | 1042 // test zeros |
1027 if (*(uint64_t*)(out + 0) != 0) return 1; | 1043 if (*(uint64_t*)(out + 0) != 0) return 1; |
1028 if (*(uint64_t*)(out + 4) != 0) return 1; | 1044 if (*(uint64_t*)(out + 4) != 0) return 1; |
1029 if (*(uint64_t*)(out + 8) != 0) return 1; | 1045 if (*(uint64_t*)(out + 8) != 0) return 1; |
1030 if (*(uint64_t*)(out + 12) != 0) return 1; | 1046 if (*(uint64_t*)(out + 12) != 0) return 1; |
1031 return 0; | 1047 return 0; |
1032 } | 1048 } |
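A note on the reordering: each byte pair in kShuffles selects one 16-bit coefficient out of the 32 bytes of out0 followed by out1, so the even byte index divided by two is the coefficient's raster position, and the four rows together spell out VP8's zigzag scan. A small standalone sketch of that relationship (hypothetical check code, not part of the patch; assumes the usual VP8 zigzag table and <assert.h>):

    static const uint8_t kZigzag[16] = {
      0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
    };

    static void CheckShuffles(void) {
      int i, j;
      for (i = 0; i < 4; ++i) {
        for (j = 0; j < 4; ++j) {
          // byte pair (2*j, 2*j+1) of row i selects coefficient kZigzag[4*i + j]
          assert(kShuffles[i][2 * j] / 2 == kZigzag[4 * i + j]);
          assert(kShuffles[i][2 * j + 1] == kShuffles[i][2 * j] + 1);
        }
      }
    }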
1033 | 1049 |
1034 #endif // !WORK_AROUND_GCC && !__aarch64__ | 1050 #endif // !WORK_AROUND_GCC |
1035 | 1051 |
1036 #endif // WEBP_USE_NEON | 1052 #endif // WEBP_USE_NEON |
1037 | 1053 |
1038 //------------------------------------------------------------------------------ | 1054 //------------------------------------------------------------------------------ |
1039 // Entry point | 1055 // Entry point |
1040 | 1056 |
1041 extern void VP8EncDspInitNEON(void); | 1057 extern void VP8EncDspInitNEON(void); |
1042 | 1058 |
1043 void VP8EncDspInitNEON(void) { | 1059 void VP8EncDspInitNEON(void) { |
1044 #if defined(WEBP_USE_NEON) | 1060 #if defined(WEBP_USE_NEON) |
1045 VP8ITransform = ITransform; | 1061 VP8ITransform = ITransform; |
1046 VP8FTransform = FTransform; | 1062 VP8FTransform = FTransform; |
1047 | 1063 |
1048 VP8FTransformWHT = FTransformWHT; | 1064 VP8FTransformWHT = FTransformWHT; |
1049 | 1065 |
1050 VP8TDisto4x4 = Disto4x4; | 1066 VP8TDisto4x4 = Disto4x4; |
1051 VP8TDisto16x16 = Disto16x16; | 1067 VP8TDisto16x16 = Disto16x16; |
1052 VP8CollectHistogram = CollectHistogram; | 1068 VP8CollectHistogram = CollectHistogram; |
1053 VP8SSE16x16 = SSE16x16; | 1069 VP8SSE16x16 = SSE16x16; |
1054 VP8SSE16x8 = SSE16x8; | 1070 VP8SSE16x8 = SSE16x8; |
1055 VP8SSE8x8 = SSE8x8; | 1071 VP8SSE8x8 = SSE8x8; |
1056 VP8SSE4x4 = SSE4x4; | 1072 VP8SSE4x4 = SSE4x4; |
1057 #if !(defined(WORK_AROUND_GCC) || defined(__aarch64__)) | 1073 #if !defined(WORK_AROUND_GCC) |
1058 VP8EncQuantizeBlock = QuantizeBlock; | 1074 VP8EncQuantizeBlock = QuantizeBlock; |
1059 #endif | 1075 #endif |
1060 #endif // WEBP_USE_NEON | 1076 #endif // WEBP_USE_NEON |
1061 } | 1077 } |