| OLD | NEW |
| 1 // Copyright 2012 Google Inc. All Rights Reserved. | 1 // Copyright 2012 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 9 // | 9 // |
| 10 // ARM NEON version of dsp functions and loop filtering. | 10 // ARM NEON version of dsp functions and loop filtering. |
| 11 // | 11 // |
| 12 // Authors: Somnath Banerjee (somnath@google.com) | 12 // Authors: Somnath Banerjee (somnath@google.com) |
| 13 // Johann Koenig (johannkoenig@google.com) | 13 // Johann Koenig (johannkoenig@google.com) |
| 14 | 14 |
| 15 #include "./dsp.h" | 15 #include "./dsp.h" |
| 16 | 16 |
| 17 #if defined(WEBP_USE_NEON) | 17 #if defined(WEBP_USE_NEON) |
| 18 | 18 |
| 19 #include "./neon.h" | 19 #include "./neon.h" |
| 20 #include "../dec/vp8i.h" | 20 #include "../dec/vp8i_dec.h" |
| 21 | 21 |
| 22 //------------------------------------------------------------------------------ | 22 //------------------------------------------------------------------------------ |
| 23 // NxM Loading functions | 23 // NxM Loading functions |
| 24 | 24 |
| 25 // Load/Store vertical edge | 25 // Load/Store vertical edge |
| 26 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \ | 26 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \ |
| 27 "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \ | 27 "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \ |
| 28 "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \ | 28 "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \ |
| 29 "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \ | 29 "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \ |
| 30 "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \ | 30 "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \ |
| (...skipping 628 matching lines...) |
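
A note on the LOAD8x4 macro above: each `vld4.8 {c1[i], c2[i], c3[i], c4[i]}` reads four consecutive bytes from one row of the vertical edge and de-interleaves them into lane `i` of the four destination registers, so after the eight lane loads (the remaining ones are in the elided lines) c1..c4 hold the four pixel columns across eight rows. A rough scalar sketch of the same transposing gather, using a hypothetical helper name and a single row pointer instead of the macro's two alternating base registers:

```c
#include <stdint.h>

// Scalar sketch only (hypothetical helper, not part of the patch): gather a
// 4-byte-wide vertical edge spanning 8 rows into four 8-lane "columns",
// which is the net effect of the LOAD8x4 vld4.8 lane loads.
static void Load8x4_Sketch(const uint8_t* src, int stride,
                           uint8_t c1[8], uint8_t c2[8],
                           uint8_t c3[8], uint8_t c4[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    const uint8_t* const row = src + i * stride;
    c1[i] = row[0];  // lane i of the first register
    c2[i] = row[1];
    c3[i] = row[2];
    c4[i] = row[3];
  }
}
```
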
| 659 | 659 |
| 660 //------------------------------------------------------------------------------ | 660 //------------------------------------------------------------------------------ |
| 661 // Complex In-loop filtering (Paragraph 15.3) | 661 // Complex In-loop filtering (Paragraph 15.3) |
| 662 | 662 |
| 663 static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0, | 663 static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0, |
| 664 const uint8x16_t q0, const uint8x16_t q1, | 664 const uint8x16_t q0, const uint8x16_t q1, |
| 665 int hev_thresh) { | 665 int hev_thresh) { |
| 666 const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh); | 666 const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh); |
| 667 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0) | 667 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0) |
| 668 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0) | 668 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0) |
| 669 const uint8x16_t mask1 = vcgtq_u8(a_p1_p0, hev_thresh_v); | 669 const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0); |
| 670 const uint8x16_t mask2 = vcgtq_u8(a_q1_q0, hev_thresh_v); | 670 const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v); |
| 671 const uint8x16_t mask = vorrq_u8(mask1, mask2); | |
| 672 return mask; | 671 return mask; |
| 673 } | 672 } |
| 674 | 673 |
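
The rewritten NeedsHev (new lines 669-670) leans on the identity `(a > t) || (b > t) == (max(a, b) > t)`: one vmaxq_u8 plus a single vcgtq_u8 replaces the two comparisons and the vorrq_u8 of the old code. A minimal scalar check of that identity, written as a hypothetical standalone test rather than anything in the patch:

```c
#include <assert.h>

int main(void) {
  // Exhaustively verify: (a > t) || (b > t)  <=>  max(a, b) > t
  // for all uint8 values, mirroring the vmaxq_u8 + vcgtq_u8 rewrite.
  int a, b, t;
  for (a = 0; a < 256; ++a) {
    for (b = 0; b < 256; ++b) {
      for (t = 0; t < 256; ++t) {
        const int old_mask = (a > t) || (b > t);   // vcgt, vcgt, vorr
        const int max_ab = (a > b) ? a : b;        // vmaxq_u8
        const int new_mask = (max_ab > t);         // single vcgt
        assert(old_mask == new_mask);
      }
    }
  }
  return 0;
}
```
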
| 675 static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2, | 674 static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2, |
| 676 const uint8x16_t p1, const uint8x16_t p0, | 675 const uint8x16_t p1, const uint8x16_t p0, |
| 677 const uint8x16_t q0, const uint8x16_t q1, | 676 const uint8x16_t q0, const uint8x16_t q1, |
| 678 const uint8x16_t q2, const uint8x16_t q3, | 677 const uint8x16_t q2, const uint8x16_t q3, |
| 679 int ithresh, int thresh) { | 678 int ithresh, int thresh) { |
| 680 const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh); | 679 const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh); |
| 681 const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2) | 680 const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2) |
| (...skipping 67 matching lines...) |
| 749 } | 748 } |
| 750 | 749 |
| 751 // 6-point filter | 750 // 6-point filter |
| 752 | 751 |
| 753 static void ApplyFilter6( | 752 static void ApplyFilter6( |
| 754 const int8x16_t p2, const int8x16_t p1, const int8x16_t p0, | 753 const int8x16_t p2, const int8x16_t p1, const int8x16_t p0, |
| 755 const int8x16_t q0, const int8x16_t q1, const int8x16_t q2, | 754 const int8x16_t q0, const int8x16_t q1, const int8x16_t q2, |
| 756 const int8x16_t delta, | 755 const int8x16_t delta, |
| 757 uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0, | 756 uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0, |
| 758 uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) { | 757 uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) { |
| 759 const int16x8_t kCst63 = vdupq_n_s16(63); | 758 // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7 |
| 760 const int8x8_t kCst27 = vdup_n_s8(27); | 759 // Turns out, there's a common sub-expression S=9 * a - 1 that can be used |
| 761 const int8x8_t kCst18 = vdup_n_s8(18); | 760 // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction: |
| 762 const int8x8_t kCst9 = vdup_n_s8(9); | 761 // X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7 |
| 763 const int8x8_t delta_lo = vget_low_s8(delta); | 762 const int8x8_t delta_lo = vget_low_s8(delta); |
| 764 const int8x8_t delta_hi = vget_high_s8(delta); | 763 const int8x8_t delta_hi = vget_high_s8(delta); |
| 765 const int16x8_t s1_lo = vmlal_s8(kCst63, kCst27, delta_lo); // 63 + 27 * a | 764 const int8x8_t kCst9 = vdup_n_s8(9); |
| 766 const int16x8_t s1_hi = vmlal_s8(kCst63, kCst27, delta_hi); // 63 + 27 * a | 765 const int16x8_t kCstm1 = vdupq_n_s16(-1); |
| 767 const int16x8_t s2_lo = vmlal_s8(kCst63, kCst18, delta_lo); // 63 + 18 * a | 766 const int8x8_t kCst18 = vdup_n_s8(18); |
| 768 const int16x8_t s2_hi = vmlal_s8(kCst63, kCst18, delta_hi); // 63 + 18 * a | 767 const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo); // S = 9 * a - 1 |
| 769 const int16x8_t s3_lo = vmlal_s8(kCst63, kCst9, delta_lo); // 63 + 9 * a | 768 const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi); |
| 770 const int16x8_t s3_hi = vmlal_s8(kCst63, kCst9, delta_hi); // 63 + 9 * a | 769 const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo); // S + 18 * a |
| 771 const int8x8_t a1_lo = vqshrn_n_s16(s1_lo, 7); | 770 const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi); |
| 772 const int8x8_t a1_hi = vqshrn_n_s16(s1_hi, 7); | 771 const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7); // (9 * a + 63) >> 7 |
| 773 const int8x8_t a2_lo = vqshrn_n_s16(s2_lo, 7); | 772 const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7); |
| 774 const int8x8_t a2_hi = vqshrn_n_s16(s2_hi, 7); | 773 const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6); // (9 * a + 31) >> 6 |
| 775 const int8x8_t a3_lo = vqshrn_n_s16(s3_lo, 7); | 774 const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6); |
| 776 const int8x8_t a3_hi = vqshrn_n_s16(s3_hi, 7); | 775 const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7); // (27 * a + 63) >> 7 |
| | 776 const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7); |
| 777 const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi); | 777 const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi); |
| 778 const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi); | 778 const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi); |
| 779 const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi); | 779 const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi); |
| 780 | 780 |
| 781 *op0 = FlipSignBack(vqaddq_s8(p0, a1)); // clip(p0 + a1) | 781 *op0 = FlipSignBack(vqaddq_s8(p0, a1)); // clip(p0 + a1) |
| 782 *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1) | 782 *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1) |
| 783 *oq1 = FlipSignBack(vqsubq_s8(q1, a2)); // clip(q1 - a2) | 783 *oq1 = FlipSignBack(vqsubq_s8(q1, a2)); // clip(q1 - a2) |
| 784 *op1 = FlipSignBack(vqaddq_s8(p1, a2)); // clip(p1 + a2) | 784 *op1 = FlipSignBack(vqaddq_s8(p1, a2)); // clip(p1 + a2) |
| 785 *oq2 = FlipSignBack(vqsubq_s8(q2, a3)); // clip(q2 - a3) | 785 *oq2 = FlipSignBack(vqsubq_s8(q2, a3)); // clip(q2 - a3) |
| 786 *op2 = FlipSignBack(vqaddq_s8(p2, a3)); // clip(p2 + a3) | 786 *op2 = FlipSignBack(vqaddq_s8(p2, a3)); // clip(p2 + a3) |
| (...skipping 843 matching lines...) |
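
On the ApplyFilter6 change: vqrshrn_n_s16(x, n) narrows with rounding, i.e. it computes (x + (1 << (n - 1))) >> n with signed saturation to int8. With S = 9 * a - 1 as the shared sub-expression, the three taps of the header comment become (S + 64) >> 7 == (9a + 63) >> 7, (S + 32) >> 6 == (9a + 31) >> 6 == (18a + 63) >> 7, and (S + 18a + 64) >> 7 == (27a + 63) >> 7, which is why the constant 27 and the explicit 63 bias drop out of the new code. A small scalar check of those identities, assuming arithmetic right shifts on negative values and ignoring the saturation the NEON instruction also applies:

```c
#include <assert.h>

int main(void) {
  // Verify the rounding identities behind the new ApplyFilter6, for every
  // possible int8 delta 'a' (S = 9 * a - 1 is the shared sub-expression).
  int a;
  for (a = -128; a <= 127; ++a) {
    const int S = 9 * a - 1;
    assert(((9  * a + 63) >> 7) == ((S + 64) >> 7));           // a3: vqrshrn(S, 7)
    assert(((18 * a + 63) >> 7) == ((S + 32) >> 6));           // a2: vqrshrn(S, 6)
    assert(((27 * a + 63) >> 7) == ((S + 18 * a + 64) >> 7));  // a1: vqrshrn(Z, 7)
  }
  return 0;
}
```
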
| 1630 VP8PredChroma8[4] = DC8uvNoTop; | 1630 VP8PredChroma8[4] = DC8uvNoTop; |
| 1631 VP8PredChroma8[5] = DC8uvNoLeft; | 1631 VP8PredChroma8[5] = DC8uvNoLeft; |
| 1632 VP8PredChroma8[6] = DC8uvNoTopLeft; | 1632 VP8PredChroma8[6] = DC8uvNoTopLeft; |
| 1633 } | 1633 } |
| 1634 | 1634 |
| 1635 #else // !WEBP_USE_NEON | 1635 #else // !WEBP_USE_NEON |
| 1636 | 1636 |
| 1637 WEBP_DSP_INIT_STUB(VP8DspInitNEON) | 1637 WEBP_DSP_INIT_STUB(VP8DspInitNEON) |
| 1638 | 1638 |
| 1639 #endif // WEBP_USE_NEON | 1639 #endif // WEBP_USE_NEON |