Index: third_party/libwebp/dsp/dec_neon.c
diff --git a/third_party/libwebp/dsp/dec_neon.c b/third_party/libwebp/dsp/dec_neon.c
index 4afae076c6d94e5ce8704b011a8791f4c7cf4e27..a63f43fe172dbba84f7e7ecc3bebc76c2a30b85b 100644
--- a/third_party/libwebp/dsp/dec_neon.c
+++ b/third_party/libwebp/dsp/dec_neon.c
@@ -389,9 +389,9 @@ static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
#endif // !WORK_AROUND_GCC
-// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
-static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
- return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
+// Zero extend 'v' to an int16x8_t.
+static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) {
+ return vreinterpretq_s16_u16(vmovl_u8(v));
}
// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
@@ -423,8 +423,8 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
{
// Convert to 16b.
- const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
- const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
+ const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01));
+ const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23));
// Descale with rounding.
const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
@@ -479,6 +479,21 @@ static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
//------------------------------------------------------------------------------
+static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s,
+ const int8x16_t delta,
+ int8x16_t* const op0, int8x16_t* const oq0) {
+ const int8x16_t kCst3 = vdupq_n_s8(0x03);
+ const int8x16_t kCst4 = vdupq_n_s8(0x04);
+ const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
+ const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
+ const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
+ const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
+ *op0 = vqaddq_s8(p0s, delta3);
+ *oq0 = vqsubq_s8(q0s, delta4);
+}
+
+#if defined(WEBP_USE_INTRINSICS)
+
static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
const int8x16_t delta,
uint8x16_t* const op0, uint8x16_t* const oq0) {
@@ -494,8 +509,6 @@ static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
*oq0 = FlipSignBack(sq0);
}
-#if defined(USE_INTRINSICS)
-
static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
const uint8x16_t q0, const uint8x16_t q1,
const uint8x16_t mask,
@@ -626,7 +639,7 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
);
}
-#endif // USE_INTRINSICS
+#endif // WEBP_USE_INTRINSICS
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
uint32_t k;
@@ -721,11 +734,7 @@ static void DoFilter4(
const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
const int8x16_t simple_lf_delta =
vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
- uint8x16_t tmp_p0, tmp_q0;
- ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
- // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
- p0s = FlipSign(tmp_p0);
- q0s = FlipSign(tmp_q0);
+ ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
}
// do_filter4 part (complex loopfilter on pixels without hev)
@@ -797,11 +806,7 @@ static void DoFilter6(
{
const int8x16_t simple_lf_delta =
vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
- uint8x16_t tmp_p0, tmp_q0;
- ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
- // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
- p0s = FlipSign(tmp_p0);
- q0s = FlipSign(tmp_q0);
+ ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
}
// do_filter6 part (complex loopfilter on pixels without hev)
@@ -986,7 +991,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
-#if defined(USE_INTRINSICS)
+#if defined(WEBP_USE_INTRINSICS)
static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
int16x8x2_t* const out) {
// a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
@@ -1163,7 +1168,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
);
}
-#endif // USE_INTRINSICS
+#endif // WEBP_USE_INTRINSICS
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
TransformOne(in, dst);
@@ -1241,7 +1246,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
static void TransformAC3(const int16_t* in, uint8_t* dst) {
static const int kC1_full = 20091 + (1 << 16);
static const int kC2_full = 35468;
- const int16x4_t A = vdup_n_s16(in[0]);
+ const int16x4_t A = vld1_dup_s16(in);
const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
const int c1 = MUL(in[1], kC2_full);
@@ -1258,15 +1263,330 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
}
#undef MUL
-#endif // WEBP_USE_NEON
+//------------------------------------------------------------------------------
+// 4x4
+
+static void DC4(uint8_t* dst) { // DC
+ const uint8x8_t A = vld1_u8(dst - BPS); // top row
+ const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
+ const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
+ const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
+ const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
+ const uint16x8_t s0 = vaddq_u16(L0, L1);
+ const uint16x8_t s1 = vaddq_u16(L2, L3);
+ const uint16x8_t s01 = vaddq_u16(s0, s1);
+ const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3
+ const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
+ }
+}
+
+// TrueMotion (4x4 + 8x8)
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+ const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
+ const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]'
+ const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL)); // A[c] - A[-1]
+ int y;
+ for (y = 0; y < size; y += 4) {
+ // left edge
+ const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
+ const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
+ const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
+ const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
+ const int16x8_t r0 = vaddq_s16(L0, d); // L[r] + A[c] - A[-1]
+ const int16x8_t r1 = vaddq_s16(L1, d);
+ const int16x8_t r2 = vaddq_s16(L2, d);
+ const int16x8_t r3 = vaddq_s16(L3, d);
+ // Saturate and store the result.
+ const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
+ const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
+ const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
+ const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
+ if (size == 4) {
+ vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
+ vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
+ vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
+ vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
+ } else {
+ vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
+ vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
+ vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
+ vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
+ }
+ dst += 4 * BPS;
+ }
+}
+
+static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
+
+static void VE4(uint8_t* dst) { // vertical
+ // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
+ const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1)); // top row
+ const uint64x1_t A1 = vshr_n_u64(A0, 8);
+ const uint64x1_t A2 = vshr_n_u64(A0, 16);
+ const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
+ const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
+ const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
+ const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
+ const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
+ }
+}
+
+static void RD4(uint8_t* dst) { // Down-right
+ const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
+ const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
+ const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
+ const uint32_t I = dst[-1 + 0 * BPS];
+ const uint32_t J = dst[-1 + 1 * BPS];
+ const uint32_t K = dst[-1 + 2 * BPS];
+ const uint32_t L = dst[-1 + 3 * BPS];
+ const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
+ const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
+ const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
+ const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
+ const uint8_t D = vget_lane_u8(XABCD_u8, 4);
+ const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
+ const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
+ const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
+ const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
+ const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+ const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
+ const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+ const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+ const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+ vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
+ vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
+ vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
+ vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
+}
+
+static void LD4(uint8_t* dst) { // Down-left
+ // Note using the same shift trick as VE4() is slower here.
+ const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
+ const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
+ const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
+ const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
+ const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
+ const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
+ const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+ const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
+ const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+ const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+ const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+ vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
+ vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
+ vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
+ vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
+}
+
+//------------------------------------------------------------------------------
+// Chroma
+
+static void VE8uv(uint8_t* dst) { // vertical
+ const uint8x8_t top = vld1_u8(dst - BPS);
+ int j;
+ for (j = 0; j < 8; ++j) {
+ vst1_u8(dst + j * BPS, top);
+ }
+}
+
+static void HE8uv(uint8_t* dst) { // horizontal
+ int j;
+ for (j = 0; j < 8; ++j) {
+ const uint8x8_t left = vld1_dup_u8(dst - 1);
+ vst1_u8(dst, left);
+ dst += BPS;
+ }
+}
+
+static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_top) {
+ const uint8x8_t A = vld1_u8(dst - BPS); // top row
+ const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ sum_top = vcombine_u16(p2, p2);
+ }
+
+ if (do_left) {
+ const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
+ const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
+ const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
+ const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
+ const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
+ const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
+ const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
+ const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
+ const uint16x8_t s0 = vaddq_u16(L0, L1);
+ const uint16x8_t s1 = vaddq_u16(L2, L3);
+ const uint16x8_t s2 = vaddq_u16(L4, L5);
+ const uint16x8_t s3 = vaddq_u16(L6, L7);
+ const uint16x8_t s01 = vaddq_u16(s0, s1);
+ const uint16x8_t s23 = vaddq_u16(s2, s3);
+ sum_left = vaddq_u16(s01, s23);
+ }
+
+ if (do_top && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 4);
+ } else if (do_top) {
+ dc0 = vrshrn_n_u16(sum_top, 3);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 3);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 8; ++i) {
+ vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
+ }
+ }
+}
+
+static void DC8uv(uint8_t* dst) { DC8(dst, 1, 1); }
+static void DC8uvNoTop(uint8_t* dst) { DC8(dst, 0, 1); }
+static void DC8uvNoLeft(uint8_t* dst) { DC8(dst, 1, 0); }
+static void DC8uvNoTopLeft(uint8_t* dst) { DC8(dst, 0, 0); }
+
+static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+
+//------------------------------------------------------------------------------
+// 16x16
+
+static void VE16(uint8_t* dst) { // vertical
+ const uint8x16_t top = vld1q_u8(dst - BPS);
+ int j;
+ for (j = 0; j < 16; ++j) {
+ vst1q_u8(dst + j * BPS, top);
+ }
+}
+
+static void HE16(uint8_t* dst) { // horizontal
+ int j;
+ for (j = 0; j < 16; ++j) {
+ const uint8x16_t left = vld1q_dup_u8(dst - 1);
+ vst1q_u8(dst, left);
+ dst += BPS;
+ }
+}
+
+static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_top) {
+ const uint8x16_t A = vld1q_u8(dst - BPS); // top row
+ const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ const uint16x4_t p3 = vpadd_u16(p2, p2);
+ sum_top = vcombine_u16(p3, p3);
+ }
+
+ if (do_left) {
+ int i;
+ sum_left = vdupq_n_u16(0);
+ for (i = 0; i < 16; i += 8) {
+ const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
+ const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
+ const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
+ const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
+ const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
+ const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
+ const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
+ const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
+ const uint16x8_t s0 = vaddq_u16(L0, L1);
+ const uint16x8_t s1 = vaddq_u16(L2, L3);
+ const uint16x8_t s2 = vaddq_u16(L4, L5);
+ const uint16x8_t s3 = vaddq_u16(L6, L7);
+ const uint16x8_t s01 = vaddq_u16(s0, s1);
+ const uint16x8_t s23 = vaddq_u16(s2, s3);
+ const uint16x8_t sum = vaddq_u16(s01, s23);
+ sum_left = vaddq_u16(sum_left, sum);
+ }
+ }
+
+ if (do_top && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 5);
+ } else if (do_top) {
+ dc0 = vrshrn_n_u16(sum_top, 4);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 4);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 16; ++i) {
+ vst1q_u8(dst + i * BPS, dc);
+ }
+ }
+}
+
+static void DC16TopLeft(uint8_t* dst) { DC16(dst, 1, 1); }
+static void DC16NoTop(uint8_t* dst) { DC16(dst, 0, 1); }
+static void DC16NoLeft(uint8_t* dst) { DC16(dst, 1, 0); }
+static void DC16NoTopLeft(uint8_t* dst) { DC16(dst, 0, 0); }
+
+static void TM16(uint8_t* dst) {
+ const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
+ const uint8x16_t T = vld1q_u8(dst - BPS); // top row 'A[0..15]'
+ // A[c] - A[-1]
+ const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
+ const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
+ int y;
+ for (y = 0; y < 16; y += 4) {
+ // left edge
+ const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
+ const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
+ const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
+ const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
+ const int16x8_t r0_lo = vaddq_s16(L0, d_lo); // L[r] + A[c] - A[-1]
+ const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
+ const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
+ const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
+ const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
+ const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
+ const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
+ const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
+ // Saturate and store the result.
+ const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
+ const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
+ const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
+ const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
+ vst1q_u8(dst + 0 * BPS, row0);
+ vst1q_u8(dst + 1 * BPS, row1);
+ vst1q_u8(dst + 2 * BPS, row2);
+ vst1q_u8(dst + 3 * BPS, row3);
+ dst += 4 * BPS;
+ }
+}
//------------------------------------------------------------------------------
// Entry point
extern void VP8DspInitNEON(void);
-void VP8DspInitNEON(void) {
-#if defined(WEBP_USE_NEON)
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
VP8Transform = TransformTwo;
VP8TransformAC3 = TransformAC3;
VP8TransformDC = TransformDC;
@@ -1288,5 +1608,32 @@ void VP8DspInitNEON(void) {
VP8SimpleHFilter16 = SimpleHFilter16;
VP8SimpleVFilter16i = SimpleVFilter16i;
VP8SimpleHFilter16i = SimpleHFilter16i;
-#endif // WEBP_USE_NEON
+
+ VP8PredLuma4[0] = DC4;
+ VP8PredLuma4[1] = TM4;
+ VP8PredLuma4[2] = VE4;
+ VP8PredLuma4[4] = RD4;
+ VP8PredLuma4[6] = LD4;
+
+ VP8PredLuma16[0] = DC16TopLeft;
+ VP8PredLuma16[1] = TM16;
+ VP8PredLuma16[2] = VE16;
+ VP8PredLuma16[3] = HE16;
+ VP8PredLuma16[4] = DC16NoTop;
+ VP8PredLuma16[5] = DC16NoLeft;
+ VP8PredLuma16[6] = DC16NoTopLeft;
+
+ VP8PredChroma8[0] = DC8uv;
+ VP8PredChroma8[1] = TM8uv;
+ VP8PredChroma8[2] = VE8uv;
+ VP8PredChroma8[3] = HE8uv;
+ VP8PredChroma8[4] = DC8uvNoTop;
+ VP8PredChroma8[5] = DC8uvNoLeft;
+ VP8PredChroma8[6] = DC8uvNoTopLeft;
}
+
+#else // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8DspInitNEON)
+
+#endif // WEBP_USE_NEON |