| Index: third_party/libwebp/dsp/dec_neon.c
|
| diff --git a/third_party/libwebp/dsp/dec_neon.c b/third_party/libwebp/dsp/dec_neon.c
|
| index 4afae076c6d94e5ce8704b011a8791f4c7cf4e27..a63f43fe172dbba84f7e7ecc3bebc76c2a30b85b 100644
|
| --- a/third_party/libwebp/dsp/dec_neon.c
|
| +++ b/third_party/libwebp/dsp/dec_neon.c
|
| @@ -389,9 +389,9 @@ static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
|
|
|
| #endif // !WORK_AROUND_GCC
|
|
|
| -// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
|
| -static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
|
| - return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
|
| +// Zero extend 'v' to an int16x8_t.
|
| +static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) {
|
| + return vreinterpretq_s16_u16(vmovl_u8(v));
|
| }
|
|
|
| // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
|
| @@ -423,8 +423,8 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
|
|
|
| {
|
| // Convert to 16b.
|
| - const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
|
| - const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
|
| + const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01));
|
| + const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23));
|
|
|
| // Descale with rounding.
|
| const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
|
| @@ -479,6 +479,21 @@ static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
|
|
|
| //------------------------------------------------------------------------------
|
|
|
| +static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s,
|
| + const int8x16_t delta,
|
| + int8x16_t* const op0, int8x16_t* const oq0) {
|
| + const int8x16_t kCst3 = vdupq_n_s8(0x03);
|
| + const int8x16_t kCst4 = vdupq_n_s8(0x04);
|
| + const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
|
| + const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
|
| + const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
|
| + const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
|
| + *op0 = vqaddq_s8(p0s, delta3);
|
| + *oq0 = vqsubq_s8(q0s, delta4);
|
| +}
|
| +
|
| +#if defined(WEBP_USE_INTRINSICS)
|
| +
|
| static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
|
| const int8x16_t delta,
|
| uint8x16_t* const op0, uint8x16_t* const oq0) {
|
| @@ -494,8 +509,6 @@ static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
|
| *oq0 = FlipSignBack(sq0);
|
| }
|
|
|
| -#if defined(USE_INTRINSICS)
|
| -
|
| static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
|
| const uint8x16_t q0, const uint8x16_t q1,
|
| const uint8x16_t mask,
|
| @@ -626,7 +639,7 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
|
| );
|
| }
|
|
|
| -#endif // USE_INTRINSICS
|
| +#endif // WEBP_USE_INTRINSICS
|
|
|
| static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
|
| uint32_t k;
|
| @@ -721,11 +734,7 @@ static void DoFilter4(
|
| const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
|
| const int8x16_t simple_lf_delta =
|
| vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
|
| - uint8x16_t tmp_p0, tmp_q0;
|
| - ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
|
| - // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
|
| - p0s = FlipSign(tmp_p0);
|
| - q0s = FlipSign(tmp_q0);
|
| + ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
|
| }
|
|
|
| // do_filter4 part (complex loopfilter on pixels without hev)
|
| @@ -797,11 +806,7 @@ static void DoFilter6(
|
| {
|
| const int8x16_t simple_lf_delta =
|
| vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
|
| - uint8x16_t tmp_p0, tmp_q0;
|
| - ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
|
| - // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
|
| - p0s = FlipSign(tmp_p0);
|
| - q0s = FlipSign(tmp_q0);
|
| + ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
|
| }
|
|
|
| // do_filter6 part (complex loopfilter on pixels without hev)
|
| @@ -986,7 +991,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
|
| static const int16_t kC1 = 20091;
|
| static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
|
|
|
| -#if defined(USE_INTRINSICS)
|
| +#if defined(WEBP_USE_INTRINSICS)
|
| static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
|
| int16x8x2_t* const out) {
|
| // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
|
| @@ -1163,7 +1168,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
|
| );
|
| }
|
|
|
| -#endif // USE_INTRINSICS
|
| +#endif // WEBP_USE_INTRINSICS
|
|
|
| static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
|
| TransformOne(in, dst);
|
| @@ -1241,7 +1246,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
|
| static void TransformAC3(const int16_t* in, uint8_t* dst) {
|
| static const int kC1_full = 20091 + (1 << 16);
|
| static const int kC2_full = 35468;
|
| - const int16x4_t A = vdup_n_s16(in[0]);
|
| + const int16x4_t A = vld1_dup_s16(in);
|
| const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
|
| const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
|
| const int c1 = MUL(in[1], kC2_full);
|
| @@ -1258,15 +1263,330 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
|
| }
|
| #undef MUL
|
|
|
| -#endif // WEBP_USE_NEON
|
| +//------------------------------------------------------------------------------
|
| +// 4x4
|
| +
|
| +static void DC4(uint8_t* dst) { // DC
|
| + const uint8x8_t A = vld1_u8(dst - BPS); // top row
|
| + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
|
| + const uint16x4_t p1 = vpadd_u16(p0, p0);
|
| + const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
|
| + const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
|
| + const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
|
| + const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
|
| + const uint16x8_t s0 = vaddq_u16(L0, L1);
|
| + const uint16x8_t s1 = vaddq_u16(L2, L3);
|
| + const uint16x8_t s01 = vaddq_u16(s0, s1);
|
| + const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
|
| + const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3
|
| + const uint8x8_t dc = vdup_lane_u8(dc0, 0);
|
| + int i;
|
| + for (i = 0; i < 4; ++i) {
|
| + vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
|
| + }
|
| +}
|
| +
|
| +// TrueMotion (4x4 + 8x8)
|
| +static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
|
| + const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
|
| + const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]'
|
| + const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL)); // A[c] - A[-1]
|
| + int y;
|
| + for (y = 0; y < size; y += 4) {
|
| + // left edge
|
| + const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
|
| + const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
|
| + const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
|
| + const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
|
| + const int16x8_t r0 = vaddq_s16(L0, d); // L[r] + A[c] - A[-1]
|
| + const int16x8_t r1 = vaddq_s16(L1, d);
|
| + const int16x8_t r2 = vaddq_s16(L2, d);
|
| + const int16x8_t r3 = vaddq_s16(L3, d);
|
| + // Saturate and store the result.
|
| + const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
|
| + const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
|
| + const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
|
| + const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
|
| + if (size == 4) {
|
| + vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
|
| + vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
|
| + vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
|
| + vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
|
| + } else {
|
| + vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
|
| + vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
|
| + vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
|
| + vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
|
| + }
|
| + dst += 4 * BPS;
|
| + }
|
| +}
|
| +
|
| +static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
|
| +
|
| +static void VE4(uint8_t* dst) { // vertical
|
| + // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
|
| + const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1)); // top row
|
| + const uint64x1_t A1 = vshr_n_u64(A0, 8);
|
| + const uint64x1_t A2 = vshr_n_u64(A0, 16);
|
| + const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
|
| + const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
|
| + const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
|
| + const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
|
| + const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
|
| + int i;
|
| + for (i = 0; i < 4; ++i) {
|
| + vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
|
| + }
|
| +}
|
| +
|
| +static void RD4(uint8_t* dst) { // Down-right
|
| + const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
|
| + const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
|
| + const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
|
| + const uint32_t I = dst[-1 + 0 * BPS];
|
| + const uint32_t J = dst[-1 + 1 * BPS];
|
| + const uint32_t K = dst[-1 + 2 * BPS];
|
| + const uint32_t L = dst[-1 + 3 * BPS];
|
| + const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
|
| + const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
|
| + const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
|
| + const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
|
| + const uint8_t D = vget_lane_u8(XABCD_u8, 4);
|
| + const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
|
| + const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
|
| + const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
|
| + const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
|
| + const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
|
| + const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
|
| + const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
|
| + const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
|
| + const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
|
| + vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
|
| + vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
|
| + vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
|
| + vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
|
| +}
|
| +
|
| +static void LD4(uint8_t* dst) { // Down-left
|
| + // Note using the same shift trick as VE4() is slower here.
|
| + const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
|
| + const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
|
| + const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
|
| + const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
|
| + const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
|
| + const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
|
| + const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
|
| + const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
|
| + const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
|
| + const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
|
| + const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
|
| + vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
|
| + vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
|
| + vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
|
| + vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
|
| +}
|
| +
|
| +//------------------------------------------------------------------------------
|
| +// Chroma
|
| +
|
| +static void VE8uv(uint8_t* dst) { // vertical
|
| + const uint8x8_t top = vld1_u8(dst - BPS);
|
| + int j;
|
| + for (j = 0; j < 8; ++j) {
|
| + vst1_u8(dst + j * BPS, top);
|
| + }
|
| +}
|
| +
|
| +static void HE8uv(uint8_t* dst) { // horizontal
|
| + int j;
|
| + for (j = 0; j < 8; ++j) {
|
| + const uint8x8_t left = vld1_dup_u8(dst - 1);
|
| + vst1_u8(dst, left);
|
| + dst += BPS;
|
| + }
|
| +}
|
| +
|
| +static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) {
|
| + uint16x8_t sum_top;
|
| + uint16x8_t sum_left;
|
| + uint8x8_t dc0;
|
| +
|
| + if (do_top) {
|
| + const uint8x8_t A = vld1_u8(dst - BPS); // top row
|
| + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
|
| + const uint16x4_t p1 = vpadd_u16(p0, p0);
|
| + const uint16x4_t p2 = vpadd_u16(p1, p1);
|
| + sum_top = vcombine_u16(p2, p2);
|
| + }
|
| +
|
| + if (do_left) {
|
| + const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
|
| + const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
|
| + const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
|
| + const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
|
| + const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
|
| + const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
|
| + const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
|
| + const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
|
| + const uint16x8_t s0 = vaddq_u16(L0, L1);
|
| + const uint16x8_t s1 = vaddq_u16(L2, L3);
|
| + const uint16x8_t s2 = vaddq_u16(L4, L5);
|
| + const uint16x8_t s3 = vaddq_u16(L6, L7);
|
| + const uint16x8_t s01 = vaddq_u16(s0, s1);
|
| + const uint16x8_t s23 = vaddq_u16(s2, s3);
|
| + sum_left = vaddq_u16(s01, s23);
|
| + }
|
| +
|
| + if (do_top && do_left) {
|
| + const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
|
| + dc0 = vrshrn_n_u16(sum, 4);
|
| + } else if (do_top) {
|
| + dc0 = vrshrn_n_u16(sum_top, 3);
|
| + } else if (do_left) {
|
| + dc0 = vrshrn_n_u16(sum_left, 3);
|
| + } else {
|
| + dc0 = vdup_n_u8(0x80);
|
| + }
|
| +
|
| + {
|
| + const uint8x8_t dc = vdup_lane_u8(dc0, 0);
|
| + int i;
|
| + for (i = 0; i < 8; ++i) {
|
| + vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
|
| + }
|
| + }
|
| +}
|
| +
|
| +static void DC8uv(uint8_t* dst) { DC8(dst, 1, 1); }
|
| +static void DC8uvNoTop(uint8_t* dst) { DC8(dst, 0, 1); }
|
| +static void DC8uvNoLeft(uint8_t* dst) { DC8(dst, 1, 0); }
|
| +static void DC8uvNoTopLeft(uint8_t* dst) { DC8(dst, 0, 0); }
|
| +
|
| +static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
|
| +
|
| +//------------------------------------------------------------------------------
|
| +// 16x16
|
| +
|
| +static void VE16(uint8_t* dst) { // vertical
|
| + const uint8x16_t top = vld1q_u8(dst - BPS);
|
| + int j;
|
| + for (j = 0; j < 16; ++j) {
|
| + vst1q_u8(dst + j * BPS, top);
|
| + }
|
| +}
|
| +
|
| +static void HE16(uint8_t* dst) { // horizontal
|
| + int j;
|
| + for (j = 0; j < 16; ++j) {
|
| + const uint8x16_t left = vld1q_dup_u8(dst - 1);
|
| + vst1q_u8(dst, left);
|
| + dst += BPS;
|
| + }
|
| +}
|
| +
|
| +static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) {
|
| + uint16x8_t sum_top;
|
| + uint16x8_t sum_left;
|
| + uint8x8_t dc0;
|
| +
|
| + if (do_top) {
|
| + const uint8x16_t A = vld1q_u8(dst - BPS); // top row
|
| + const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
|
| + const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
|
| + const uint16x4_t p2 = vpadd_u16(p1, p1);
|
| + const uint16x4_t p3 = vpadd_u16(p2, p2);
|
| + sum_top = vcombine_u16(p3, p3);
|
| + }
|
| +
|
| + if (do_left) {
|
| + int i;
|
| + sum_left = vdupq_n_u16(0);
|
| + for (i = 0; i < 16; i += 8) {
|
| + const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
|
| + const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
|
| + const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
|
| + const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
|
| + const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
|
| + const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
|
| + const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
|
| + const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
|
| + const uint16x8_t s0 = vaddq_u16(L0, L1);
|
| + const uint16x8_t s1 = vaddq_u16(L2, L3);
|
| + const uint16x8_t s2 = vaddq_u16(L4, L5);
|
| + const uint16x8_t s3 = vaddq_u16(L6, L7);
|
| + const uint16x8_t s01 = vaddq_u16(s0, s1);
|
| + const uint16x8_t s23 = vaddq_u16(s2, s3);
|
| + const uint16x8_t sum = vaddq_u16(s01, s23);
|
| + sum_left = vaddq_u16(sum_left, sum);
|
| + }
|
| + }
|
| +
|
| + if (do_top && do_left) {
|
| + const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
|
| + dc0 = vrshrn_n_u16(sum, 5);
|
| + } else if (do_top) {
|
| + dc0 = vrshrn_n_u16(sum_top, 4);
|
| + } else if (do_left) {
|
| + dc0 = vrshrn_n_u16(sum_left, 4);
|
| + } else {
|
| + dc0 = vdup_n_u8(0x80);
|
| + }
|
| +
|
| + {
|
| + const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
|
| + int i;
|
| + for (i = 0; i < 16; ++i) {
|
| + vst1q_u8(dst + i * BPS, dc);
|
| + }
|
| + }
|
| +}
|
| +
|
| +static void DC16TopLeft(uint8_t* dst) { DC16(dst, 1, 1); }
|
| +static void DC16NoTop(uint8_t* dst) { DC16(dst, 0, 1); }
|
| +static void DC16NoLeft(uint8_t* dst) { DC16(dst, 1, 0); }
|
| +static void DC16NoTopLeft(uint8_t* dst) { DC16(dst, 0, 0); }
|
| +
|
| +static void TM16(uint8_t* dst) {
|
| + const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
|
| + const uint8x16_t T = vld1q_u8(dst - BPS); // top row 'A[0..15]'
|
| + // A[c] - A[-1]
|
| + const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
|
| + const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
|
| + int y;
|
| + for (y = 0; y < 16; y += 4) {
|
| + // left edge
|
| + const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
|
| + const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
|
| + const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
|
| + const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
|
| + const int16x8_t r0_lo = vaddq_s16(L0, d_lo); // L[r] + A[c] - A[-1]
|
| + const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
|
| + const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
|
| + const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
|
| + const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
|
| + const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
|
| + const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
|
| + const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
|
| + // Saturate and store the result.
|
| + const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
|
| + const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
|
| + const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
|
| + const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
|
| + vst1q_u8(dst + 0 * BPS, row0);
|
| + vst1q_u8(dst + 1 * BPS, row1);
|
| + vst1q_u8(dst + 2 * BPS, row2);
|
| + vst1q_u8(dst + 3 * BPS, row3);
|
| + dst += 4 * BPS;
|
| + }
|
| +}
|
|
|
| //------------------------------------------------------------------------------
|
| // Entry point
|
|
|
| extern void VP8DspInitNEON(void);
|
|
|
| -void VP8DspInitNEON(void) {
|
| -#if defined(WEBP_USE_NEON)
|
| +WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
|
| VP8Transform = TransformTwo;
|
| VP8TransformAC3 = TransformAC3;
|
| VP8TransformDC = TransformDC;
|
| @@ -1288,5 +1608,32 @@ void VP8DspInitNEON(void) {
|
| VP8SimpleHFilter16 = SimpleHFilter16;
|
| VP8SimpleVFilter16i = SimpleVFilter16i;
|
| VP8SimpleHFilter16i = SimpleHFilter16i;
|
| -#endif // WEBP_USE_NEON
|
| +
|
| + VP8PredLuma4[0] = DC4;
|
| + VP8PredLuma4[1] = TM4;
|
| + VP8PredLuma4[2] = VE4;
|
| + VP8PredLuma4[4] = RD4;
|
| + VP8PredLuma4[6] = LD4;
|
| +
|
| + VP8PredLuma16[0] = DC16TopLeft;
|
| + VP8PredLuma16[1] = TM16;
|
| + VP8PredLuma16[2] = VE16;
|
| + VP8PredLuma16[3] = HE16;
|
| + VP8PredLuma16[4] = DC16NoTop;
|
| + VP8PredLuma16[5] = DC16NoLeft;
|
| + VP8PredLuma16[6] = DC16NoTopLeft;
|
| +
|
| + VP8PredChroma8[0] = DC8uv;
|
| + VP8PredChroma8[1] = TM8uv;
|
| + VP8PredChroma8[2] = VE8uv;
|
| + VP8PredChroma8[3] = HE8uv;
|
| + VP8PredChroma8[4] = DC8uvNoTop;
|
| + VP8PredChroma8[5] = DC8uvNoLeft;
|
| + VP8PredChroma8[6] = DC8uvNoTopLeft;
|
| }
|
| +
|
| +#else // !WEBP_USE_NEON
|
| +
|
| +WEBP_DSP_INIT_STUB(VP8DspInitNEON)
|
| +
|
| +#endif // WEBP_USE_NEON
|
|
|