Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(644)

Side by Side Diff: third_party/libwebp/dsp/dec_neon.c

Issue 2651883004: libwebp-0.6.0-rc1 (Closed)
Patch Set: Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/libwebp/dsp/dec_msa.c ('k') | third_party/libwebp/dsp/dec_sse2.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright 2012 Google Inc. All Rights Reserved. 1 // Copyright 2012 Google Inc. All Rights Reserved.
2 // 2 //
3 // Use of this source code is governed by a BSD-style license 3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source 4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found 5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may 6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree. 7 // be found in the AUTHORS file in the root of the source tree.
8 // ----------------------------------------------------------------------------- 8 // -----------------------------------------------------------------------------
9 // 9 //
10 // ARM NEON version of dsp functions and loop filtering. 10 // ARM NEON version of dsp functions and loop filtering.
11 // 11 //
12 // Authors: Somnath Banerjee (somnath@google.com) 12 // Authors: Somnath Banerjee (somnath@google.com)
13 // Johann Koenig (johannkoenig@google.com) 13 // Johann Koenig (johannkoenig@google.com)
14 14
15 #include "./dsp.h" 15 #include "./dsp.h"
16 16
17 #if defined(WEBP_USE_NEON) 17 #if defined(WEBP_USE_NEON)
18 18
19 #include "./neon.h" 19 #include "./neon.h"
20 #include "../dec/vp8i.h" 20 #include "../dec/vp8i_dec.h"
21 21
22 //------------------------------------------------------------------------------ 22 //------------------------------------------------------------------------------
23 // NxM Loading functions 23 // NxM Loading functions
24 24
25 // Load/Store vertical edge 25 // Load/Store vertical edge
26 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \ 26 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
27 "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \ 27 "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
28 "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \ 28 "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
29 "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \ 29 "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
30 "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \ 30 "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
(...skipping 628 matching lines...) Expand 10 before | Expand all | Expand 10 after
659 659
660 //------------------------------------------------------------------------------ 660 //------------------------------------------------------------------------------
661 // Complex In-loop filtering (Paragraph 15.3) 661 // Complex In-loop filtering (Paragraph 15.3)
662 662
663 static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0, 663 static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
664 const uint8x16_t q0, const uint8x16_t q1, 664 const uint8x16_t q0, const uint8x16_t q1,
665 int hev_thresh) { 665 int hev_thresh) {
666 const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh); 666 const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
667 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0) 667 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
668 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0) 668 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
669 const uint8x16_t mask1 = vcgtq_u8(a_p1_p0, hev_thresh_v); 669 const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
670 const uint8x16_t mask2 = vcgtq_u8(a_q1_q0, hev_thresh_v); 670 const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
671 const uint8x16_t mask = vorrq_u8(mask1, mask2);
672 return mask; 671 return mask;
673 } 672 }
674 673
675 static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2, 674 static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
676 const uint8x16_t p1, const uint8x16_t p0, 675 const uint8x16_t p1, const uint8x16_t p0,
677 const uint8x16_t q0, const uint8x16_t q1, 676 const uint8x16_t q0, const uint8x16_t q1,
678 const uint8x16_t q2, const uint8x16_t q3, 677 const uint8x16_t q2, const uint8x16_t q3,
679 int ithresh, int thresh) { 678 int ithresh, int thresh) {
680 const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh); 679 const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
681 const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2) 680 const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2)
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
749 } 748 }
750 749
751 // 6-points filter 750 // 6-points filter
752 751
753 static void ApplyFilter6( 752 static void ApplyFilter6(
754 const int8x16_t p2, const int8x16_t p1, const int8x16_t p0, 753 const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
755 const int8x16_t q0, const int8x16_t q1, const int8x16_t q2, 754 const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
756 const int8x16_t delta, 755 const int8x16_t delta,
757 uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0, 756 uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
758 uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) { 757 uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
759 const int16x8_t kCst63 = vdupq_n_s16(63); 758 // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
760 const int8x8_t kCst27 = vdup_n_s8(27); 759 // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
761 const int8x8_t kCst18 = vdup_n_s8(18); 760 // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
762 const int8x8_t kCst9 = vdup_n_s8(9); 761 // X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
763 const int8x8_t delta_lo = vget_low_s8(delta); 762 const int8x8_t delta_lo = vget_low_s8(delta);
764 const int8x8_t delta_hi = vget_high_s8(delta); 763 const int8x8_t delta_hi = vget_high_s8(delta);
765 const int16x8_t s1_lo = vmlal_s8(kCst63, kCst27, delta_lo); // 63 + 27 * a 764 const int8x8_t kCst9 = vdup_n_s8(9);
766 const int16x8_t s1_hi = vmlal_s8(kCst63, kCst27, delta_hi); // 63 + 27 * a 765 const int16x8_t kCstm1 = vdupq_n_s16(-1);
767 const int16x8_t s2_lo = vmlal_s8(kCst63, kCst18, delta_lo); // 63 + 18 * a 766 const int8x8_t kCst18 = vdup_n_s8(18);
768 const int16x8_t s2_hi = vmlal_s8(kCst63, kCst18, delta_hi); // 63 + 18 * a 767 const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo); // S = 9 * a - 1
769 const int16x8_t s3_lo = vmlal_s8(kCst63, kCst9, delta_lo); // 63 + 9 * a 768 const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
770 const int16x8_t s3_hi = vmlal_s8(kCst63, kCst9, delta_hi); // 63 + 9 * a 769 const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo); // S + 18 * a
771 const int8x8_t a1_lo = vqshrn_n_s16(s1_lo, 7); 770 const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
772 const int8x8_t a1_hi = vqshrn_n_s16(s1_hi, 7); 771 const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7); // (9 * a + 63) >> 7
773 const int8x8_t a2_lo = vqshrn_n_s16(s2_lo, 7); 772 const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
774 const int8x8_t a2_hi = vqshrn_n_s16(s2_hi, 7); 773 const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6); // (9 * a + 31) >> 6
775 const int8x8_t a3_lo = vqshrn_n_s16(s3_lo, 7); 774 const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
776 const int8x8_t a3_hi = vqshrn_n_s16(s3_hi, 7); 775 const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7); // (27 * a + 63) >> 7
776 const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
777 const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi); 777 const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
778 const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi); 778 const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
779 const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi); 779 const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
780 780
781 *op0 = FlipSignBack(vqaddq_s8(p0, a1)); // clip(p0 + a1) 781 *op0 = FlipSignBack(vqaddq_s8(p0, a1)); // clip(p0 + a1)
782 *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1) 782 *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1)
783 *oq1 = FlipSignBack(vqsubq_s8(q1, a2)); // clip(q1 - a2) 783 *oq1 = FlipSignBack(vqsubq_s8(q1, a2)); // clip(q1 - a2)
784 *op1 = FlipSignBack(vqaddq_s8(p1, a2)); // clip(p1 + a2) 784 *op1 = FlipSignBack(vqaddq_s8(p1, a2)); // clip(p1 + a2)
785 *oq2 = FlipSignBack(vqsubq_s8(q2, a3)); // clip(q2 - a3) 785 *oq2 = FlipSignBack(vqsubq_s8(q2, a3)); // clip(q2 - a3)
786 *op2 = FlipSignBack(vqaddq_s8(p2, a3)); // clip(p2 + a3) 786 *op2 = FlipSignBack(vqaddq_s8(p2, a3)); // clip(p2 + a3)
(...skipping 843 matching lines...) Expand 10 before | Expand all | Expand 10 after
1630 VP8PredChroma8[4] = DC8uvNoTop; 1630 VP8PredChroma8[4] = DC8uvNoTop;
1631 VP8PredChroma8[5] = DC8uvNoLeft; 1631 VP8PredChroma8[5] = DC8uvNoLeft;
1632 VP8PredChroma8[6] = DC8uvNoTopLeft; 1632 VP8PredChroma8[6] = DC8uvNoTopLeft;
1633 } 1633 }
1634 1634
1635 #else // !WEBP_USE_NEON 1635 #else // !WEBP_USE_NEON
1636 1636
1637 WEBP_DSP_INIT_STUB(VP8DspInitNEON) 1637 WEBP_DSP_INIT_STUB(VP8DspInitNEON)
1638 1638
1639 #endif // WEBP_USE_NEON 1639 #endif // WEBP_USE_NEON
OLD | NEW
« no previous file with comments | « third_party/libwebp/dsp/dec_msa.c ('k') | third_party/libwebp/dsp/dec_sse2.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698