| OLD | NEW |
| 1 // Copyright 2012 Google Inc. All Rights Reserved. | 1 // Copyright 2012 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 9 // | 9 // |
| 10 // ARM NEON version of speed-critical encoding functions. | 10 // ARM NEON version of speed-critical encoding functions. |
| (...skipping 14 matching lines...) |
| 25 | 25 |
| 26 // Inverse transform. | 26 // Inverse transform. |
| 27 // This code is pretty much the same as TransformOne in dec_neon.c, except | 27 // This code is pretty much the same as TransformOne in dec_neon.c, except |
| 28 // for the subtraction to *ref. See the comments there for algorithmic explanations. | 28 // for the subtraction to *ref. See the comments there for algorithmic explanations. |
| 29 | 29 |
| 30 static const int16_t kC1 = 20091; | 30 static const int16_t kC1 = 20091; |
| 31 static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. | 31 static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. |
| 32 | 32 |
| 33 // This code works but is *slower* than the inlined-asm version below | 33 // This code works but is *slower* than the inlined-asm version below |
| 34 // (with gcc-4.6). So we disable it for now. Later, it'll be conditional to | 34 // (with gcc-4.6). So we disable it for now. Later, it'll be conditional to |
| 35 // USE_INTRINSICS define. | 35 // WEBP_USE_INTRINSICS define. |
| 36 // With gcc-4.8, it's a little faster than the inlined assembly. | 36 // With gcc-4.8, it's a little faster than the inlined assembly. |
| 37 #if defined(USE_INTRINSICS) | 37 #if defined(WEBP_USE_INTRINSICS) |
| 38 | 38 |
| 39 // Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t. | 39 // Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t. |
| 40 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) { | 40 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) { |
| 41 return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v))); | 41 return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v))); |
| 42 } | 42 } |
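The double reinterpret here is free at runtime: 'v' is just eight packed bytes, vmovl_u8 zero-extends them to 16 bits, and the unsigned result is re-tagged as signed (0..255 always fits). A minimal scalar sketch of the conversion:

```c
#include <stdint.h>

// Scalar sketch of ConvertU8ToS16: zero-extend eight unsigned bytes
// to eight signed 16-bit values, as vmovl_u8 + the reinterprets do.
static void ConvertU8ToS16_Scalar(const uint8_t in[8], int16_t out[8]) {
  int i;
  for (i = 0; i < 8; ++i) out[i] = (int16_t)in[i];  // 0..255 fits in int16_t
}
```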
| 43 | 43 |
| 44 // Performs unsigned 8b saturation on 'dst01' and 'dst23', storing the result | 44 // Performs unsigned 8b saturation on 'dst01' and 'dst23', storing the result |
| 45 // to the corresponding rows of 'dst'. | 45 // to the corresponding rows of 'dst'. |
| 46 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, | 46 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, |
| 47 const int16x8_t dst01, | 47 const int16x8_t dst01, |
| (...skipping 186 matching lines...) |
| 234 "vst1.32 d0[1], [%[dst]], %[kBPS] \n" | 234 "vst1.32 d0[1], [%[dst]], %[kBPS] \n" |
| 235 "vst1.32 d1[0], [%[dst]], %[kBPS] \n" | 235 "vst1.32 d1[0], [%[dst]], %[kBPS] \n" |
| 236 "vst1.32 d1[1], [%[dst]] \n" | 236 "vst1.32 d1[1], [%[dst]] \n" |
| 237 | 237 |
| 238 : [in] "+r"(in), [dst] "+r"(dst) // modified registers | 238 : [in] "+r"(in), [dst] "+r"(dst) // modified registers |
| 239 : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants | 239 : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants |
| 240 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered | 240 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered |
| 241 ); | 241 ); |
| 242 } | 242 } |
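For readers new to GCC extended asm, the three colon-separated lists above are what make the block safe to schedule: "+r" operands are both read and written (the pointers are post-incremented by the stride), the second list holds read-only inputs, and the clobber list names every register plus the memory the block touches. A minimal sketch of the same pattern, assuming an ARM/NEON target (the helper itself is illustrative, not from this file):

```c
#include <stdint.h>

// Illustrative only: the extended-asm constraint pattern used above,
// reduced to a 4-byte copy through lane 0 of d0.
static void CopyRow4(uint8_t* const dst, const uint8_t* const src) {
  __asm__ volatile (
    "vld1.32 d0[0], [%[src]] \n"
    "vst1.32 d0[0], [%[dst]] \n"
    :                                   // no modified registers
    : [dst] "r"(dst), [src] "r"(src)    // read-only inputs
    : "memory", "d0"                    // clobbered
  );
}
```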
| 243 | 243 |
| 244 #endif // USE_INTRINSICS | 244 #endif // WEBP_USE_INTRINSICS |
| 245 | 245 |
| 246 static void ITransform(const uint8_t* ref, | 246 static void ITransform(const uint8_t* ref, |
| 247 const int16_t* in, uint8_t* dst, int do_two) { | 247 const int16_t* in, uint8_t* dst, int do_two) { |
| 248 ITransformOne(ref, in, dst); | 248 ITransformOne(ref, in, dst); |
| 249 if (do_two) { | 249 if (do_two) { |
| 250 ITransformOne(ref + 4, in + 16, dst + 4); | 250 ITransformOne(ref + 4, in + 16, dst + 4); |
| 251 } | 251 } |
| 252 } | 252 } |
| 253 | 253 |
| 254 // Load all 4x4 pixels into a single uint8x16_t variable. | 254 // Load all 4x4 pixels into a single uint8x16_t variable. |
| 255 static uint8x16_t Load4x4(const uint8_t* src) { | 255 static uint8x16_t Load4x4(const uint8_t* src) { |
| 256 uint32x4_t out = vdupq_n_u32(0); | 256 uint32x4_t out = vdupq_n_u32(0); |
| 257 out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0); | 257 out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0); |
| 258 out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1); | 258 out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1); |
| 259 out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2); | 259 out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2); |
| 260 out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3); | 260 out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3); |
| 261 return vreinterpretq_u8_u32(out); | 261 return vreinterpretq_u8_u32(out); |
| 262 } | 262 } |
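Load4x4 gathers four rows sitting BPS bytes apart into one 16-byte q register through per-lane 32-bit loads. A scalar sketch of the same gather:

```c
#include <stdint.h>

// Scalar sketch of Load4x4: four 4-byte rows, one stride apart,
// packed into the contiguous 16 bytes a q register would hold.
static void Load4x4_Scalar(const uint8_t* src, int bps, uint8_t out[16]) {
  int j, i;
  for (j = 0; j < 4; ++j) {
    for (i = 0; i < 4; ++i) out[j * 4 + i] = src[j * bps + i];
  }
}
```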
| 263 | 263 |
| 264 // Forward transform. | 264 // Forward transform. |
| 265 | 265 |
| 266 #if defined(USE_INTRINSICS) | 266 #if defined(WEBP_USE_INTRINSICS) |
| 267 | 267 |
| 268 static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B, | 268 static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B, |
| 269 const int16x4_t C, const int16x4_t D, | 269 const int16x4_t C, const int16x4_t D, |
| 270 int16x8_t* const out01, | 270 int16x8_t* const out01, |
| 271 int16x8_t* const out32) { | 271 int16x8_t* const out32) { |
| 272 const int16x4x2_t AB = vtrn_s16(A, B); | 272 const int16x4x2_t AB = vtrn_s16(A, B); |
| 273 const int16x4x2_t CD = vtrn_s16(C, D); | 273 const int16x4x2_t CD = vtrn_s16(C, D); |
| 274 const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]), | 274 const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]), |
| 275 vreinterpret_s32_s16(CD.val[0])); | 275 vreinterpret_s32_s16(CD.val[0])); |
| 276 const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]), | 276 const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]), |
| (...skipping 264 matching lines...) |
| 541 } | 541 } |
| 542 } | 542 } |
| 543 #undef LOAD_LANE_16b | 543 #undef LOAD_LANE_16b |
| 544 | 544 |
| 545 //------------------------------------------------------------------------------ | 545 //------------------------------------------------------------------------------ |
| 546 // Texture distortion | 546 // Texture distortion |
| 547 // | 547 // |
| 548 // We try to match the spectral content (weighted) between source and | 548 // We try to match the spectral content (weighted) between source and |
| 549 // reconstructed samples. | 549 // reconstructed samples. |
| 550 | 550 |
| 551 // This code works but is *slower* than the inlined-asm version below | 551 // a 0123, b 0123 |
| 552 // (with gcc-4.6). So we disable it for now. Later, it'll be conditional to | 552 // a 4567, b 4567 |
| 553 // USE_INTRINSICS define. | 553 // a 89ab, b 89ab |
| 554 // With gcc-4.8, it's only slightly slower than the inlined assembly. | 554 // a cdef, b cdef |
| 555 #if defined(USE_INTRINSICS) | 555 // |
| 556 // transpose |
| 557 // |
| 558 // a 048c, b 048c |
| 559 // a 159d, b 159d |
| 560 // a 26ae, b 26ae |
| 561 // a 37bf, b 37bf |
| 562 // |
| 563 static WEBP_INLINE uint8x8x4_t DistoTranspose4x4U8(uint8x8x4_t d4_in) { |
| 564 const uint8x8x2_t d2_tmp0 = vtrn_u8(d4_in.val[0], d4_in.val[1]); |
| 565 const uint8x8x2_t d2_tmp1 = vtrn_u8(d4_in.val[2], d4_in.val[3]); |
| 566 const uint16x4x2_t d2_tmp2 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[0]), |
| 567 vreinterpret_u16_u8(d2_tmp1.val[0])); |
| 568 const uint16x4x2_t d2_tmp3 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[1]), |
| 569 vreinterpret_u16_u8(d2_tmp1.val[1])); |
| 556 | 570 |
| 557 // Zero extend an uint16x4_t 'v' to an int32x4_t. | 571 d4_in.val[0] = vreinterpret_u8_u16(d2_tmp2.val[0]); |
| 558 static WEBP_INLINE int32x4_t ConvertU16ToS32(uint16x4_t v) { | 572 d4_in.val[2] = vreinterpret_u8_u16(d2_tmp2.val[1]); |
| 559 return vreinterpretq_s32_u32(vmovl_u16(v)); | 573 d4_in.val[1] = vreinterpret_u8_u16(d2_tmp3.val[0]); |
| 574 d4_in.val[3] = vreinterpret_u8_u16(d2_tmp3.val[1]); |
| 575 return d4_in; |
| 560 } | 576 } |
| 561 | 577 |
| 562 // Does a regular 4x4 transpose followed by an adjustment of the upper columns | 578 static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) { |
| 563 // in the inner rows to restore the source order of differences, | 579 const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]); |
| 564 // i.e., a0 - a1 | a3 - a2. | 580 const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]); |
| 565 static WEBP_INLINE int32x4x4_t DistoTranspose4x4(const int32x4x4_t rows) { | 581 const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]), |
| 566 int32x4x4_t out = Transpose4x4(rows); | 582 vreinterpretq_s32_s16(q2_tmp1.val[0])); |
| 567 // restore source order in the columns containing differences. | 583 const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]), |
| 568 const int32x2_t r1h = vget_high_s32(out.val[1]); | 584 vreinterpretq_s32_s16(q2_tmp1.val[1])); |
| 569 const int32x2_t r2h = vget_high_s32(out.val[2]); | 585 q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]); |
| 570 out.val[1] = vcombine_s32(vget_low_s32(out.val[1]), r2h); | 586 q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]); |
| 571 out.val[2] = vcombine_s32(vget_low_s32(out.val[2]), r1h); | 587 q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]); |
| 572 return out; | 588 q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]); |
| 589 return q4_in; |
| 573 } | 590 } |
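Both helpers are the classic two-level vtrn transpose: the element-size trn pairs adjacent elements, the double-size trn pairs adjacent pairs, and the out-of-order val[] assignments at the end restore row order. Per 4x4 block the net effect is an ordinary transpose (the u8 version handles two blocks at once, since each d register carries a row of 'a' in its low half and of 'b' in its high half). A scalar sketch:

```c
#include <stdint.h>

// Scalar sketch of the 4x4 transpose the vtrn pairs implement:
// out[i][j] = in[j][i].
static void Transpose4x4_Scalar(const uint8_t in[4][4], uint8_t out[4][4]) {
  int i, j;
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) out[i][j] = in[j][i];
  }
}
```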
| 574 | 591 |
| 575 static WEBP_INLINE int32x4x4_t DistoHorizontalPass(const uint8x8_t r0r1, | 592 static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const uint8x8x4_t d4_in) { |
| 576 const uint8x8_t r2r3) { | 593 // {a0, a1} = {in[0] + in[2], in[1] + in[3]} |
| 577 // a0 = in[0] + in[2] | a1 = in[1] + in[3] | 594 // {a3, a2} = {in[0] - in[2], in[1] - in[3]} |
| 578 const uint16x8_t a0a1 = vaddl_u8(r0r1, r2r3); | 595 const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[0], |
| 579 // a3 = in[0] - in[2] | a2 = in[1] - in[3] | 596 d4_in.val[2])); |
| 580 const uint16x8_t a3a2 = vsubl_u8(r0r1, r2r3); | 597 const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[1], |
| 581 const int32x4_t tmp0 = vpaddlq_s16(vreinterpretq_s16_u16(a0a1)); // a0 + a1 | 598 d4_in.val[3])); |
| 582 const int32x4_t tmp1 = vpaddlq_s16(vreinterpretq_s16_u16(a3a2)); // a3 + a2 | 599 const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[0], |
| 583 // no pairwise subtraction; reorder to perform tmp[2]/tmp[3] calculations. | 600 d4_in.val[2])); |
| 584 // a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 | 601 const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[1], |
| 585 // a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 | 602 d4_in.val[3])); |
| 586 const int16x8x2_t transpose = | 603 int16x8x4_t q4_out; |
| 587 vtrnq_s16(vreinterpretq_s16_u16(a0a1), vreinterpretq_s16_u16(a3a2)); | 604 // tmp[0] = a0 + a1 |
| 588 // tmp[3] = a0 - a1 | tmp[2] = a3 - a2 | 605 // tmp[1] = a3 + a2 |
| 589 const int32x4_t tmp32_1 = vsubl_s16(vget_low_s16(transpose.val[0]), | 606 // tmp[2] = a3 - a2 |
| 590 vget_low_s16(transpose.val[1])); | 607 // tmp[3] = a0 - a1 |
| 591 const int32x4_t tmp32_2 = vsubl_s16(vget_high_s16(transpose.val[0]), | 608 INIT_VECTOR4(q4_out, |
| 592 vget_high_s16(transpose.val[1])); | 609 vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2), |
| 593 // [0]: tmp[3] [1]: tmp[2] | 610 vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1)); |
| 594 const int32x4x2_t split = vtrnq_s32(tmp32_1, tmp32_2); | 611 return q4_out; |
| 595 const int32x4x4_t res = { { tmp0, tmp1, split.val[1], split.val[0] } }; | |
| 596 return res; | |
| 597 } | 612 } |
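Because the input was transposed first, each widening add/subtract above performs the row butterfly for all four rows of both blocks at once. Per row it is the standard Hadamard butterfly; a scalar sketch using the a0..a3/tmp naming from the comments:

```c
#include <stdint.h>

// Scalar sketch of the horizontal pass for one 4-pixel row.
static void HadamardRow(const uint8_t in[4], int tmp[4]) {
  const int a0 = in[0] + in[2];
  const int a1 = in[1] + in[3];
  const int a2 = in[1] - in[3];
  const int a3 = in[0] - in[2];
  tmp[0] = a0 + a1;
  tmp[1] = a3 + a2;
  tmp[2] = a3 - a2;
  tmp[3] = a0 - a1;
}
```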
| 598 | 613 |
| 599 static WEBP_INLINE int32x4x4_t DistoVerticalPass(const int32x4x4_t rows) { | 614 static WEBP_INLINE int16x8x4_t DistoVerticalPass(int16x8x4_t q4_in) { |
| 600 // a0 = tmp[0 + i] + tmp[8 + i]; | 615 const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]); |
| 601 const int32x4_t a0 = vaddq_s32(rows.val[0], rows.val[1]); | 616 const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]); |
| 602 // a1 = tmp[4 + i] + tmp[12+ i]; | 617 const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]); |
| 603 const int32x4_t a1 = vaddq_s32(rows.val[2], rows.val[3]); | 618 const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]); |
| 604 // a2 = tmp[4 + i] - tmp[12+ i]; | 619 |
| 605 const int32x4_t a2 = vsubq_s32(rows.val[2], rows.val[3]); | 620 q4_in.val[0] = vaddq_s16(q_a0, q_a1); |
| 606 // a3 = tmp[0 + i] - tmp[8 + i]; | 621 q4_in.val[1] = vaddq_s16(q_a3, q_a2); |
| 607 const int32x4_t a3 = vsubq_s32(rows.val[0], rows.val[1]); | 622 q4_in.val[2] = vabdq_s16(q_a3, q_a2); |
| 608 const int32x4_t b0 = vqabsq_s32(vaddq_s32(a0, a1)); // abs(a0 + a1) | 623 q4_in.val[3] = vabdq_s16(q_a0, q_a1); |
| 609 const int32x4_t b1 = vqabsq_s32(vaddq_s32(a3, a2)); // abs(a3 + a2) | 624 q4_in.val[0] = vabsq_s16(q4_in.val[0]); |
| 610 const int32x4_t b2 = vabdq_s32(a3, a2); // abs(a3 - a2) | 625 q4_in.val[1] = vabsq_s16(q4_in.val[1]); |
| 611 const int32x4_t b3 = vabdq_s32(a0, a1); // abs(a0 - a1) | 626 return q4_in; |
| 612 const int32x4x4_t res = { { b0, b1, b2, b3 } }; | |
| 613 return res; | |
| 614 } | 627 } |
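The vertical pass runs the same butterfly along the other axis and folds the absolute values in directly (vabdq_s16 fuses the subtract and the abs for the last two outputs). A scalar sketch for one column, using the tmp[] indexing from the comments in the old intrinsics:

```c
#include <stdlib.h>  // abs()

// Scalar sketch of the vertical pass for column 'col' of tmp[16],
// producing the four absolute transformed coefficients b0..b3.
static void HadamardColAbs(const int tmp[16], int col, int b[4]) {
  const int a0 = tmp[0 + col] + tmp[8 + col];
  const int a1 = tmp[4 + col] + tmp[12 + col];
  const int a2 = tmp[4 + col] - tmp[12 + col];
  const int a3 = tmp[0 + col] - tmp[8 + col];
  b[0] = abs(a0 + a1);
  b[1] = abs(a3 + a2);
  b[2] = abs(a3 - a2);
  b[3] = abs(a0 - a1);
}
```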
| 615 | 628 |
| 616 // Calculate the weighted sum of the rows in 'b'. | 629 static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) { |
| 617 static WEBP_INLINE int64x1_t DistoSum(const int32x4x4_t b, | 630 const uint16x8_t q_w07 = vld1q_u16(&w[0]); |
| 618 const int32x4_t w0, const int32x4_t w1, | 631 const uint16x8_t q_w8f = vld1q_u16(&w[8]); |
| 619 const int32x4_t w2, const int32x4_t w3) { | 632 int16x4x4_t d4_w; |
| 620 const int32x4_t s0 = vmulq_s32(w0, b.val[0]); | 633 INIT_VECTOR4(d4_w, |
| 621 const int32x4_t s1 = vmlaq_s32(s0, w1, b.val[1]); | 634 vget_low_s16(vreinterpretq_s16_u16(q_w07)), |
| 622 const int32x4_t s2 = vmlaq_s32(s1, w2, b.val[2]); | 635 vget_high_s16(vreinterpretq_s16_u16(q_w07)), |
| 623 const int32x4_t s3 = vmlaq_s32(s2, w3, b.val[3]); | 636 vget_low_s16(vreinterpretq_s16_u16(q_w8f)), |
| 624 const int64x2_t sum1 = vpaddlq_s32(s3); | 637 vget_high_s16(vreinterpretq_s16_u16(q_w8f))); |
| 625 const int64x1_t sum2 = vadd_s64(vget_low_s64(sum1), vget_high_s64(sum1)); | 638 return d4_w; |
| 626 return sum2; | 639 } |
| 640 |
| 641 static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in, |
| 642 const int16x4x4_t d4_w) { |
| 643 int32x2_t d_sum; |
| 644 // sum += w[ 0] * abs(b0); |
| 645 // sum += w[ 4] * abs(b1); |
| 646 // sum += w[ 8] * abs(b2); |
| 647 // sum += w[12] * abs(b3); |
| 648 int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0])); |
| 649 int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1])); |
| 650 int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2])); |
| 651 int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3])); |
| 652 q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0])); |
| 653 q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1])); |
| 654 q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2])); |
| 655 q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3])); |
| 656 |
| 657 q_sum0 = vaddq_s32(q_sum0, q_sum1); |
| 658 q_sum2 = vaddq_s32(q_sum2, q_sum3); |
| 659 q_sum2 = vaddq_s32(q_sum0, q_sum2); |
| 660 d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2)); |
| 661 d_sum = vpadd_s32(d_sum, d_sum); |
| 662 return d_sum; |
| 627 } | 663 } |
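DistoSum leans on the packed layout: each q register carries block 'a' coefficients in its low half and block 'b' in its high half, so vmull accumulates +w*|coeff_a| while vmlsl folds in -w*|coeff_b|, and the pairwise adds reduce everything to sum_a - sum_b in one lane. A scalar sketch of that reduction, with w[k] paired to transformed coefficient k as in the comments:

```c
#include <stdint.h>

// Scalar sketch of DistoSum: one weighted pass computes the signed
// difference of the two blocks' sums directly (Disto4x4 then takes
// abs(diff) >> 5).
static int DistoSum_Scalar(const int abs_a[16], const int abs_b[16],
                           const uint16_t w[16]) {
  int k, diff = 0;
  for (k = 0; k < 16; ++k) diff += (int)w[k] * (abs_a[k] - abs_b[k]);
  return diff;
}
```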
| 628 | 664 |
| 629 #define LOAD_LANE_32b(src, VALUE, LANE) \ | 665 #define LOAD_LANE_32b(src, VALUE, LANE) \ |
| 630 (VALUE) = vld1q_lane_u32((const uint32_t*)(src), (VALUE), (LANE)) | 666 (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE)) |
| 631 | 667 |
| 632 // Hadamard transform | 668 // Hadamard transform |
| 633 // Returns the weighted sum of the absolute value of transformed coefficients. | 669 // Returns the weighted sum of the absolute value of transformed coefficients. |
| 634 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, | 670 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, |
| 635 const uint16_t* const w) { | 671 const uint16_t* const w) { |
| 636 uint32x4_t d0d1 = { 0, 0, 0, 0 }; | 672 uint32x2_t d_in_ab_0123 = vdup_n_u32(0); |
| 637 uint32x4_t d2d3 = { 0, 0, 0, 0 }; | 673 uint32x2_t d_in_ab_4567 = vdup_n_u32(0); |
| 638 LOAD_LANE_32b(a + 0 * BPS, d0d1, 0); // a00 a01 a02 a03 | 674 uint32x2_t d_in_ab_89ab = vdup_n_u32(0); |
| 639 LOAD_LANE_32b(a + 1 * BPS, d0d1, 1); // a10 a11 a12 a13 | 675 uint32x2_t d_in_ab_cdef = vdup_n_u32(0); |
| 640 LOAD_LANE_32b(b + 0 * BPS, d0d1, 2); // b00 b01 b02 b03 | 676 uint8x8x4_t d4_in; |
| 641 LOAD_LANE_32b(b + 1 * BPS, d0d1, 3); // b10 b11 b12 b13 | 677 |
| 642 LOAD_LANE_32b(a + 2 * BPS, d2d3, 0); // a20 a21 a22 a23 | 678 // load data a, b |
| 643 LOAD_LANE_32b(a + 3 * BPS, d2d3, 1); // a30 a31 a32 a33 | 679 LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0); |
| 644 LOAD_LANE_32b(b + 2 * BPS, d2d3, 2); // b20 b21 b22 b23 | 680 LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0); |
| 645 LOAD_LANE_32b(b + 3 * BPS, d2d3, 3); // b30 b31 b32 b33 | 681 LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0); |
| 682 LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0); |
| 683 LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1); |
| 684 LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1); |
| 685 LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1); |
| 686 LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1); |
| 687 INIT_VECTOR4(d4_in, |
| 688 vreinterpret_u8_u32(d_in_ab_0123), |
| 689 vreinterpret_u8_u32(d_in_ab_4567), |
| 690 vreinterpret_u8_u32(d_in_ab_89ab), |
| 691 vreinterpret_u8_u32(d_in_ab_cdef)); |
| 646 | 692 |
| 647 { | 693 { |
| 648 // a00 a01 a20 a21 a10 a11 a30 a31 b00 b01 b20 b21 b10 b11 b30 b31 | 694 // horizontal pass |
| 649 // a02 a03 a22 a23 a12 a13 a32 a33 b02 b03 b22 b23 b12 b13 b32 b33 | 695 const uint8x8x4_t d4_t = DistoTranspose4x4U8(d4_in); |
| 650 const uint16x8x2_t tmp = | 696 const int16x8x4_t q4_h = DistoHorizontalPass(d4_t); |
| 651 vtrnq_u16(vreinterpretq_u16_u32(d0d1), vreinterpretq_u16_u32(d2d3)); | 697 const int16x4x4_t d4_w = DistoLoadW(w); |
| 652 const uint8x16_t d0d1u8 = vreinterpretq_u8_u16(tmp.val[0]); | 698 // vertical pass |
| 653 const uint8x16_t d2d3u8 = vreinterpretq_u8_u16(tmp.val[1]); | 699 const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_h); |
| 654 const int32x4x4_t hpass_a = DistoHorizontalPass(vget_low_u8(d0d1u8), | 700 const int16x8x4_t q4_v = DistoVerticalPass(q4_t); |
| 655 vget_low_u8(d2d3u8)); | 701 int32x2_t d_sum = DistoSum(q4_v, d4_w); |
| 656 const int32x4x4_t hpass_b = DistoHorizontalPass(vget_high_u8(d0d1u8), | 702 |
| 657 vget_high_u8(d2d3u8)); | 703 // abs(sum2 - sum1) >> 5 |
| 658 const int32x4x4_t tmp_a = DistoTranspose4x4(hpass_a); | 704 d_sum = vabs_s32(d_sum); |
| 659 const int32x4x4_t tmp_b = DistoTranspose4x4(hpass_b); | 705 d_sum = vshr_n_s32(d_sum, 5); |
| 660 const int32x4x4_t vpass_a = DistoVerticalPass(tmp_a); | 706 return vget_lane_s32(d_sum, 0); |
| 661 const int32x4x4_t vpass_b = DistoVerticalPass(tmp_b); | |
| 662 const int32x4_t w0 = ConvertU16ToS32(vld1_u16(w + 0)); | |
| 663 const int32x4_t w1 = ConvertU16ToS32(vld1_u16(w + 4)); | |
| 664 const int32x4_t w2 = ConvertU16ToS32(vld1_u16(w + 8)); | |
| 665 const int32x4_t w3 = ConvertU16ToS32(vld1_u16(w + 12)); | |
| 666 const int64x1_t sum1 = DistoSum(vpass_a, w0, w1, w2, w3); | |
| 667 const int64x1_t sum2 = DistoSum(vpass_b, w0, w1, w2, w3); | |
| 668 const int32x2_t diff = vabd_s32(vreinterpret_s32_s64(sum1), | |
| 669 vreinterpret_s32_s64(sum2)); | |
| 670 const int32x2_t res = vshr_n_s32(diff, 5); | |
| 671 return vget_lane_s32(res, 0); | |
| 672 } | 707 } |
| 673 } | 708 } |
| 674 | |
| 675 #undef LOAD_LANE_32b | 709 #undef LOAD_LANE_32b |
| 676 | 710 |
| 677 #else | |
| 678 | |
| 679 // Hadamard transform | |
| 680 // Returns the weighted sum of the absolute value of transformed coefficients. | |
| 681 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, | |
| 682 const uint16_t* const w) { | |
| 683 const int kBPS = BPS; | |
| 684 const uint8_t* A = a; | |
| 685 const uint8_t* B = b; | |
| 686 const uint16_t* W = w; | |
| 687 int sum; | |
| 688 __asm__ volatile ( | |
| 689 "vld1.32 d0[0], [%[a]], %[kBPS] \n" | |
| 690 "vld1.32 d0[1], [%[a]], %[kBPS] \n" | |
| 691 "vld1.32 d2[0], [%[a]], %[kBPS] \n" | |
| 692 "vld1.32 d2[1], [%[a]] \n" | |
| 693 | |
| 694 "vld1.32 d1[0], [%[b]], %[kBPS] \n" | |
| 695 "vld1.32 d1[1], [%[b]], %[kBPS] \n" | |
| 696 "vld1.32 d3[0], [%[b]], %[kBPS] \n" | |
| 697 "vld1.32 d3[1], [%[b]] \n" | |
| 698 | |
| 699 // a d0/d2, b d1/d3 | |
| 700 // d0/d1: 01 01 01 01 | |
| 701 // d2/d3: 23 23 23 23 | |
| 702 // But: it goes 01 45 23 67 | |
| 703 // Notice the middle values are transposed | |
| 704 "vtrn.16 q0, q1 \n" | |
| 705 | |
| 706 // {a0, a1} = {in[0] + in[2], in[1] + in[3]} | |
| 707 "vaddl.u8 q2, d0, d2 \n" | |
| 708 "vaddl.u8 q10, d1, d3 \n" | |
| 709 // {a3, a2} = {in[0] - in[2], in[1] - in[3]} | |
| 710 "vsubl.u8 q3, d0, d2 \n" | |
| 711 "vsubl.u8 q11, d1, d3 \n" | |
| 712 | |
| 713 // tmp[0] = a0 + a1 | |
| 714 "vpaddl.s16 q0, q2 \n" | |
| 715 "vpaddl.s16 q8, q10 \n" | |
| 716 | |
| 717 // tmp[1] = a3 + a2 | |
| 718 "vpaddl.s16 q1, q3 \n" | |
| 719 "vpaddl.s16 q9, q11 \n" | |
| 720 | |
| 721 // No pair subtract | |
| 722 // q2 = {a0, a3} | |
| 723 // q3 = {a1, a2} | |
| 724 "vtrn.16 q2, q3 \n" | |
| 725 "vtrn.16 q10, q11 \n" | |
| 726 | |
| 727 // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2} | |
| 728 "vsubl.s16 q12, d4, d6 \n" | |
| 729 "vsubl.s16 q13, d5, d7 \n" | |
| 730 "vsubl.s16 q14, d20, d22 \n" | |
| 731 "vsubl.s16 q15, d21, d23 \n" | |
| 732 | |
| 733 // separate tmp[3] and tmp[2] | |
| 734 // q12 = tmp[3] | |
| 735 // q13 = tmp[2] | |
| 736 "vtrn.32 q12, q13 \n" | |
| 737 "vtrn.32 q14, q15 \n" | |
| 738 | |
| 739 // Transpose tmp for a | |
| 740 "vswp d1, d26 \n" // vtrn.64 | |
| 741 "vswp d3, d24 \n" // vtrn.64 | |
| 742 "vtrn.32 q0, q1 \n" | |
| 743 "vtrn.32 q13, q12 \n" | |
| 744 | |
| 745 // Transpose tmp for b | |
| 746 "vswp d17, d30 \n" // vtrn.64 | |
| 747 "vswp d19, d28 \n" // vtrn.64 | |
| 748 "vtrn.32 q8, q9 \n" | |
| 749 "vtrn.32 q15, q14 \n" | |
| 750 | |
| 751 // The first Q register is a, the second b. | |
| 752 // q0/8 tmp[0-3] | |
| 753 // q13/15 tmp[4-7] | |
| 754 // q1/9 tmp[8-11] | |
| 755 // q12/14 tmp[12-15] | |
| 756 | |
| 757 // These are still in 01 45 23 67 order. We fix it easily in the addition | |
| 758 // case but the subtraction propagates them. | |
| 759 "vswp d3, d27 \n" | |
| 760 "vswp d19, d31 \n" | |
| 761 | |
| 762 // a0 = tmp[0] + tmp[8] | |
| 763 "vadd.s32 q2, q0, q1 \n" | |
| 764 "vadd.s32 q3, q8, q9 \n" | |
| 765 | |
| 766 // a1 = tmp[4] + tmp[12] | |
| 767 "vadd.s32 q10, q13, q12 \n" | |
| 768 "vadd.s32 q11, q15, q14 \n" | |
| 769 | |
| 770 // a2 = tmp[4] - tmp[12] | |
| 771 "vsub.s32 q13, q13, q12 \n" | |
| 772 "vsub.s32 q15, q15, q14 \n" | |
| 773 | |
| 774 // a3 = tmp[0] - tmp[8] | |
| 775 "vsub.s32 q0, q0, q1 \n" | |
| 776 "vsub.s32 q8, q8, q9 \n" | |
| 777 | |
| 778 // b0 = a0 + a1 | |
| 779 "vadd.s32 q1, q2, q10 \n" | |
| 780 "vadd.s32 q9, q3, q11 \n" | |
| 781 | |
| 782 // b1 = a3 + a2 | |
| 783 "vadd.s32 q12, q0, q13 \n" | |
| 784 "vadd.s32 q14, q8, q15 \n" | |
| 785 | |
| 786 // b2 = a3 - a2 | |
| 787 "vsub.s32 q0, q0, q13 \n" | |
| 788 "vsub.s32 q8, q8, q15 \n" | |
| 789 | |
| 790 // b3 = a0 - a1 | |
| 791 "vsub.s32 q2, q2, q10 \n" | |
| 792 "vsub.s32 q3, q3, q11 \n" | |
| 793 | |
| 794 "vld1.64 {q10, q11}, [%[w]] \n" | |
| 795 | |
| 796 // abs(b0) | |
| 797 "vabs.s32 q1, q1 \n" | |
| 798 "vabs.s32 q9, q9 \n" | |
| 799 // abs(b1) | |
| 800 "vabs.s32 q12, q12 \n" | |
| 801 "vabs.s32 q14, q14 \n" | |
| 802 // abs(b2) | |
| 803 "vabs.s32 q0, q0 \n" | |
| 804 "vabs.s32 q8, q8 \n" | |
| 805 // abs(b3) | |
| 806 "vabs.s32 q2, q2 \n" | |
| 807 "vabs.s32 q3, q3 \n" | |
| 808 | |
| 809 // expand w before using. | |
| 810 "vmovl.u16 q13, d20 \n" | |
| 811 "vmovl.u16 q15, d21 \n" | |
| 812 | |
| 813 // w[0] * abs(b0) | |
| 814 "vmul.u32 q1, q1, q13 \n" | |
| 815 "vmul.u32 q9, q9, q13 \n" | |
| 816 | |
| 817 // w[4] * abs(b1) | |
| 818 "vmla.u32 q1, q12, q15 \n" | |
| 819 "vmla.u32 q9, q14, q15 \n" | |
| 820 | |
| 821 // expand w before using. | |
| 822 "vmovl.u16 q13, d22 \n" | |
| 823 "vmovl.u16 q15, d23 \n" | |
| 824 | |
| 825 // w[8] * abs(b2) | |
| 826 "vmla.u32 q1, q0, q13 \n" | |
| 827 "vmla.u32 q9, q8, q13 \n" | |
| 828 | |
| 829 // w[12] * abs(b3) | |
| 830 "vmla.u32 q1, q2, q15 \n" | |
| 831 "vmla.u32 q9, q3, q15 \n" | |
| 832 | |
| 833 // Sum the arrays | |
| 834 "vpaddl.u32 q1, q1 \n" | |
| 835 "vpaddl.u32 q9, q9 \n" | |
| 836 "vadd.u64 d2, d3 \n" | |
| 837 "vadd.u64 d18, d19 \n" | |
| 838 | |
| 839 // The Hadamard transform needs 4 bits of extra precision (2 bits in each | |
| 840 // direction) for dynamic range. Weights w[] are 16 bits at most, so the maximum | |
| 841 // precision for coeff is 8 bits of input + 4 bits of Hadamard transform + | |
| 842 // 16 bits for w[] + 2 bits of abs() summation. | |
| 843 // | |
| 844 // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is | |
| 845 // A-OK. | |
| 846 | |
| 847 // sum2 - sum1 | |
| 848 "vsub.u32 d0, d2, d18 \n" | |
| 849 // abs(sum2 - sum1) | |
| 850 "vabs.s32 d0, d0 \n" | |
| 851 // abs(sum2 - sum1) >> 5 | |
| 852 "vshr.u32 d0, #5 \n" | |
| 853 | |
| 854 // It would be better to move the value straight into r0 but I'm not | |
| 855 // entirely sure how this works with inline assembly. | |
| 856 "vmov.32 %[sum], d0[0] \n" | |
| 857 | |
| 858 : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W) | |
| 859 : [kBPS] "r"(kBPS) | |
| 860 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", | |
| 861 "q10", "q11", "q12", "q13", "q14", "q15" // clobbered | |
| 862 ) ; | |
| 863 | |
| 864 return sum; | |
| 865 } | |
| 866 | |
| 867 #endif // USE_INTRINSICS | |
| 868 | |
| 869 static int Disto16x16(const uint8_t* const a, const uint8_t* const b, | 711 static int Disto16x16(const uint8_t* const a, const uint8_t* const b, |
| 870 const uint16_t* const w) { | 712 const uint16_t* const w) { |
| 871 int D = 0; | 713 int D = 0; |
| 872 int x, y; | 714 int x, y; |
| 873 for (y = 0; y < 16 * BPS; y += 4 * BPS) { | 715 for (y = 0; y < 16 * BPS; y += 4 * BPS) { |
| 874 for (x = 0; x < 16; x += 4) { | 716 for (x = 0; x < 16; x += 4) { |
| 875 D += Disto4x4(a + x + y, b + x + y, w); | 717 D += Disto4x4(a + x + y, b + x + y, w); |
| 876 } | 718 } |
| 877 } | 719 } |
| 878 return D; | 720 return D; |
| 879 } | 721 } |
| 880 | 722 |
| 881 //------------------------------------------------------------------------------ | 723 //------------------------------------------------------------------------------ |
| 882 | 724 |
| 883 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, | 725 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, |
| 884 int start_block, int end_block, | 726 int start_block, int end_block, |
| 885 VP8Histogram* const histo) { | 727 VP8Histogram* const histo) { |
| 886 const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH); | 728 const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH); |
| 887 int j; | 729 int j; |
| 730 int distribution[MAX_COEFF_THRESH + 1] = { 0 }; |
| 888 for (j = start_block; j < end_block; ++j) { | 731 for (j = start_block; j < end_block; ++j) { |
| 889 int16_t out[16]; | 732 int16_t out[16]; |
| 890 FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); | 733 FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); |
| 891 { | 734 { |
| 892 int k; | 735 int k; |
| 893 const int16x8_t a0 = vld1q_s16(out + 0); | 736 const int16x8_t a0 = vld1q_s16(out + 0); |
| 894 const int16x8_t b0 = vld1q_s16(out + 8); | 737 const int16x8_t b0 = vld1q_s16(out + 8); |
| 895 const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0)); | 738 const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0)); |
| 896 const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0)); | 739 const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0)); |
| 897 const uint16x8_t a2 = vshrq_n_u16(a1, 3); | 740 const uint16x8_t a2 = vshrq_n_u16(a1, 3); |
| 898 const uint16x8_t b2 = vshrq_n_u16(b1, 3); | 741 const uint16x8_t b2 = vshrq_n_u16(b1, 3); |
| 899 const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh); | 742 const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh); |
| 900 const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh); | 743 const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh); |
| 901 vst1q_s16(out + 0, vreinterpretq_s16_u16(a3)); | 744 vst1q_s16(out + 0, vreinterpretq_s16_u16(a3)); |
| 902 vst1q_s16(out + 8, vreinterpretq_s16_u16(b3)); | 745 vst1q_s16(out + 8, vreinterpretq_s16_u16(b3)); |
| 903 // Convert coefficients to bin. | 746 // Convert coefficients to bin. |
| 904 for (k = 0; k < 16; ++k) { | 747 for (k = 0; k < 16; ++k) { |
| 905 histo->distribution[out[k]]++; | 748 ++distribution[out[k]]; |
| 906 } | 749 } |
| 907 } | 750 } |
| 908 } | 751 } |
| 752 VP8SetHistogramData(distribution, histo); |
| 909 } | 753 } |
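The vector section above turns each coefficient into a histogram bin as min(|coeff| >> 3, MAX_COEFF_THRESH) before the scalar counting loop. A per-coefficient scalar sketch (the constant's value is an assumption here; the real definition lives in the dsp headers):

```c
#include <stdint.h>

#define MAX_COEFF_THRESH 31  // assumed value; defined in the real headers

// Scalar sketch of the binning done by vabsq/vshrq/vminq above.
static int CoeffToBin(int16_t coeff) {
  int v = (coeff < 0) ? -coeff : coeff;                  // vabsq_s16
  v >>= 3;                                               // vshrq_n_u16(., 3)
  return (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;  // vminq_u16
}
```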
| 910 | 754 |
| 911 //------------------------------------------------------------------------------ | 755 //------------------------------------------------------------------------------ |
| 912 | 756 |
| 913 static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a, | 757 static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a, |
| 914 const uint8_t* const b, | 758 const uint8_t* const b, |
| 915 uint32x4_t* const sum) { | 759 uint32x4_t* const sum) { |
| 916 const uint8x16_t a0 = vld1q_u8(a); | 760 const uint8x16_t a0 = vld1q_u8(a); |
| 917 const uint8x16_t b0 = vld1q_u8(b); | 761 const uint8x16_t b0 = vld1q_u8(b); |
| 918 const uint8x16_t abs_diff = vabdq_u8(a0, b0); | 762 const uint8x16_t abs_diff = vabdq_u8(a0, b0); |
| (...skipping 123 matching lines...) |
| 1042 vst1_u8((uint8_t*)(out + 8), shuffles.val[2]); | 886 vst1_u8((uint8_t*)(out + 8), shuffles.val[2]); |
| 1043 vst1_u8((uint8_t*)(out + 12), shuffles.val[3]); | 887 vst1_u8((uint8_t*)(out + 12), shuffles.val[3]); |
| 1044 // test zeros | 888 // test zeros |
| 1045 if (*(uint64_t*)(out + 0) != 0) return 1; | 889 if (*(uint64_t*)(out + 0) != 0) return 1; |
| 1046 if (*(uint64_t*)(out + 4) != 0) return 1; | 890 if (*(uint64_t*)(out + 4) != 0) return 1; |
| 1047 if (*(uint64_t*)(out + 8) != 0) return 1; | 891 if (*(uint64_t*)(out + 8) != 0) return 1; |
| 1048 if (*(uint64_t*)(out + 12) != 0) return 1; | 892 if (*(uint64_t*)(out + 12) != 0) return 1; |
| 1049 return 0; | 893 return 0; |
| 1050 } | 894 } |
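The zero test above reads the 16 stored coefficients back as four 64-bit words, so a block can be declared all-zero in four compares. A scalar sketch of the same test; memcpy is used here to sidestep the strict-aliasing casts of the original:

```c
#include <stdint.h>
#include <string.h>

// Scalar sketch of the zero test: any nonzero 8-byte word means the
// quantized block still has coefficients to encode.
static int HasNonZero(const int16_t out[16]) {
  int i;
  for (i = 0; i < 16; i += 4) {
    uint64_t v;
    memcpy(&v, out + i, sizeof(v));
    if (v != 0) return 1;
  }
  return 0;
}
```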
| 1051 | 895 |
| 896 static int Quantize2Blocks(int16_t in[32], int16_t out[32], |
| 897 const VP8Matrix* const mtx) { |
| 898 int nz; |
| 899 nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; |
| 900 nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; |
| 901 return nz; |
| 902 } |
| 903 |
| 1052 #endif // !WORK_AROUND_GCC | 904 #endif // !WORK_AROUND_GCC |
| 1053 | 905 |
| 1054 #endif // WEBP_USE_NEON | |
| 1055 | |
| 1056 //------------------------------------------------------------------------------ | 906 //------------------------------------------------------------------------------ |
| 1057 // Entry point | 907 // Entry point |
| 1058 | 908 |
| 1059 extern void VP8EncDspInitNEON(void); | 909 extern void VP8EncDspInitNEON(void); |
| 1060 | 910 |
| 1061 void VP8EncDspInitNEON(void) { | 911 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) { |
| 1062 #if defined(WEBP_USE_NEON) | |
| 1063 VP8ITransform = ITransform; | 912 VP8ITransform = ITransform; |
| 1064 VP8FTransform = FTransform; | 913 VP8FTransform = FTransform; |
| 1065 | 914 |
| 1066 VP8FTransformWHT = FTransformWHT; | 915 VP8FTransformWHT = FTransformWHT; |
| 1067 | 916 |
| 1068 VP8TDisto4x4 = Disto4x4; | 917 VP8TDisto4x4 = Disto4x4; |
| 1069 VP8TDisto16x16 = Disto16x16; | 918 VP8TDisto16x16 = Disto16x16; |
| 1070 VP8CollectHistogram = CollectHistogram; | 919 VP8CollectHistogram = CollectHistogram; |
| 1071 VP8SSE16x16 = SSE16x16; | 920 VP8SSE16x16 = SSE16x16; |
| 1072 VP8SSE16x8 = SSE16x8; | 921 VP8SSE16x8 = SSE16x8; |
| 1073 VP8SSE8x8 = SSE8x8; | 922 VP8SSE8x8 = SSE8x8; |
| 1074 VP8SSE4x4 = SSE4x4; | 923 VP8SSE4x4 = SSE4x4; |
| 1075 #if !defined(WORK_AROUND_GCC) | 924 #if !defined(WORK_AROUND_GCC) |
| 1076 VP8EncQuantizeBlock = QuantizeBlock; | 925 VP8EncQuantizeBlock = QuantizeBlock; |
| 926 VP8EncQuantize2Blocks = Quantize2Blocks; |
| 1077 #endif | 927 #endif |
| 1078 #endif // WEBP_USE_NEON | |
| 1079 } | 928 } |
| 929 |
| 930 #else // !WEBP_USE_NEON |
| 931 |
| 932 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON) |
| 933 |
| 934 #endif // WEBP_USE_NEON |