| OLD | NEW |
| 1 // Copyright 2012 Google Inc. All Rights Reserved. | 1 // Copyright 2012 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 9 // | 9 // |
| 10 // ARM NEON version of dsp functions and loop filtering. | 10 // ARM NEON version of dsp functions and loop filtering. |
| 11 // | 11 // |
| 12 // Authors: Somnath Banerjee (somnath@google.com) | 12 // Authors: Somnath Banerjee (somnath@google.com) |
| 13 // Johann Koenig (johannkoenig@google.com) | 13 // Johann Koenig (johannkoenig@google.com) |
| 14 | 14 |
| 15 #include "./dsp.h" | 15 #include "./dsp.h" |
| 16 | 16 |
| 17 #if defined(WEBP_USE_NEON) | 17 #if defined(WEBP_USE_NEON) |
| 18 | 18 |
| 19 #include "./neon.h" | 19 #include "./neon.h" |
| 20 #include "../dec/vp8i.h" | 20 #include "../dec/vp8i_dec.h" |
| 21 | 21 |
| 22 //------------------------------------------------------------------------------ | 22 //------------------------------------------------------------------------------ |
| 23 // NxM Loading functions | 23 // NxM Loading functions |
| 24 | 24 |
| 25 // Load/Store vertical edge | 25 // Load/Store vertical edge |
| 26 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \ | 26 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \ |
| 27 "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \ | 27 "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \ |
| 28 "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \ | 28 "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \ |
| 29 "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \ | 29 "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \ |
| 30 "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \ | 30 "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \ |
| (...skipping 628 matching lines...) |
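
A note on the LOAD8x4 macro above: each `vld4.8 {c1[i], c2[i], c3[i], c4[i]}` reads four consecutive bytes from one row of the vertical edge and de-interleaves them into lane `i` of the four destination registers, so after the eight lane loads (the remaining ones are in the elided lines) c1..c4 hold the four pixel columns across eight rows. A rough scalar sketch of the same transposing gather, using a hypothetical helper name and a single row pointer instead of the macro's two alternating base registers:

```c
#include <stdint.h>

// Scalar sketch only (hypothetical helper, not part of the patch): gather a
// 4-byte-wide vertical edge spanning 8 rows into four 8-lane "columns",
// which is the net effect of the LOAD8x4 vld4.8 lane loads.
static void Load8x4_Sketch(const uint8_t* src, int stride,
                           uint8_t c1[8], uint8_t c2[8],
                           uint8_t c3[8], uint8_t c4[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    const uint8_t* const row = src + i * stride;
    c1[i] = row[0];  // lane i of the first register
    c2[i] = row[1];
    c3[i] = row[2];
    c4[i] = row[3];
  }
}
```
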
| 659 | 659 |
| 660 //------------------------------------------------------------------------------ | 660 //------------------------------------------------------------------------------ |
| 661 // Complex In-loop filtering (Paragraph 15.3) | 661 // Complex In-loop filtering (Paragraph 15.3) |
| 662 | 662 |
| 663 static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0, | 663 static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0, |
| 664 const uint8x16_t q0, const uint8x16_t q1, | 664 const uint8x16_t q0, const uint8x16_t q1, |
| 665 int hev_thresh) { | 665 int hev_thresh) { |
| 666 const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh); | 666 const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh); |
| 667 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0) | 667 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0) |
| 668 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0) | 668 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0) |
| 669 const uint8x16_t mask1 = vcgtq_u8(a_p1_p0, hev_thresh_v); | 669 const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0); |
| 670 const uint8x16_t mask2 = vcgtq_u8(a_q1_q0, hev_thresh_v); | 670 const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v); |
| 671 const uint8x16_t mask = vorrq_u8(mask1, mask2); | |
| 672 return mask; | 671 return mask; |
| 673 } | 672 } |
| 674 | 673 |
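
The rewritten NeedsHev (new lines 669-670) leans on the identity `(a > t) || (b > t) == (max(a, b) > t)`: one vmaxq_u8 plus a single vcgtq_u8 replaces the two comparisons and the vorrq_u8 of the old code. A minimal scalar check of that identity, written as a hypothetical standalone test rather than anything in the patch:

```c
#include <assert.h>

int main(void) {
  // Exhaustively verify: (a > t) || (b > t)  <=>  max(a, b) > t
  // for all uint8 values, mirroring the vmaxq_u8 + vcgtq_u8 rewrite.
  int a, b, t;
  for (a = 0; a < 256; ++a) {
    for (b = 0; b < 256; ++b) {
      for (t = 0; t < 256; ++t) {
        const int old_mask = (a > t) || (b > t);   // vcgt, vcgt, vorr
        const int max_ab = (a > b) ? a : b;        // vmaxq_u8
        const int new_mask = (max_ab > t);         // single vcgt
        assert(old_mask == new_mask);
      }
    }
  }
  return 0;
}
```
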
| 675 static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2, | 674 static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2, |
| 676 const uint8x16_t p1, const uint8x16_t p0, | 675 const uint8x16_t p1, const uint8x16_t p0, |
| 677 const uint8x16_t q0, const uint8x16_t q1, | 676 const uint8x16_t q0, const uint8x16_t q1, |
| 678 const uint8x16_t q2, const uint8x16_t q3, | 677 const uint8x16_t q2, const uint8x16_t q3, |
| 679 int ithresh, int thresh) { | 678 int ithresh, int thresh) { |
| 680 const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh); | 679 const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh); |
| 681 const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2) | 680 const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2) |
| (...skipping 67 matching lines...) |
| 749 } | 748 } |
| 750 | 749 |
| 751 // 6-point filter | 750 // 6-point filter |
| 752 | 751 |
| 753 static void ApplyFilter6( | 752 static void ApplyFilter6( |
| 754 const int8x16_t p2, const int8x16_t p1, const int8x16_t p0, | 753 const int8x16_t p2, const int8x16_t p1, const int8x16_t p0, |
| 755 const int8x16_t q0, const int8x16_t q1, const int8x16_t q2, | 754 const int8x16_t q0, const int8x16_t q1, const int8x16_t q2, |
| 756 const int8x16_t delta, | 755 const int8x16_t delta, |
| 757 uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0, | 756 uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0, |
| 758 uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) { | 757 uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) { |
| 759 const int16x8_t kCst63 = vdupq_n_s16(63); | 758 // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7 |
| 760 const int8x8_t kCst27 = vdup_n_s8(27); | 759 // Turns out, there's a common sub-expression S=9 * a - 1 that can be used |
| 761 const int8x8_t kCst18 = vdup_n_s8(18); | 760 // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction: |
| 762 const int8x8_t kCst9 = vdup_n_s8(9); | 761 // X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7 |
| 763 const int8x8_t delta_lo = vget_low_s8(delta); | 762 const int8x8_t delta_lo = vget_low_s8(delta); |
| 764 const int8x8_t delta_hi = vget_high_s8(delta); | 763 const int8x8_t delta_hi = vget_high_s8(delta); |
| 765 const int16x8_t s1_lo = vmlal_s8(kCst63, kCst27, delta_lo); // 63 + 27 * a | 764 const int8x8_t kCst9 = vdup_n_s8(9); |
| 766 const int16x8_t s1_hi = vmlal_s8(kCst63, kCst27, delta_hi); // 63 + 27 * a | 765 const int16x8_t kCstm1 = vdupq_n_s16(-1); |
| 767 const int16x8_t s2_lo = vmlal_s8(kCst63, kCst18, delta_lo); // 63 + 18 * a | 766 const int8x8_t kCst18 = vdup_n_s8(18); |
| 768 const int16x8_t s2_hi = vmlal_s8(kCst63, kCst18, delta_hi); // 63 + 18 * a | 767 const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo); // S = 9 * a - 1 |
| 769 const int16x8_t s3_lo = vmlal_s8(kCst63, kCst9, delta_lo); // 63 + 9 * a | 768 const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi); |
| 770 const int16x8_t s3_hi = vmlal_s8(kCst63, kCst9, delta_hi); // 63 + 9 * a | 769 const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo); // S + 18 * a |
| 771 const int8x8_t a1_lo = vqshrn_n_s16(s1_lo, 7); | 770 const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi); |
| 772 const int8x8_t a1_hi = vqshrn_n_s16(s1_hi, 7); | 771 const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7); // (9 * a + 63) >> 7 |
| 773 const int8x8_t a2_lo = vqshrn_n_s16(s2_lo, 7); | 772 const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7); |
| 774 const int8x8_t a2_hi = vqshrn_n_s16(s2_hi, 7); | 773 const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6); // (9 * a + 31) >> 6 |
| 775 const int8x8_t a3_lo = vqshrn_n_s16(s3_lo, 7); | 774 const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6); |
| 776 const int8x8_t a3_hi = vqshrn_n_s16(s3_hi, 7); | 775 const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7); // (27 * a + 63) >> 7 |
| | 776 const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7); |
| 777 const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi); | 777 const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi); |
| 778 const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi); | 778 const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi); |
| 779 const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi); | 779 const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi); |
| 780 | 780 |
| 781 *op0 = FlipSignBack(vqaddq_s8(p0, a1)); // clip(p0 + a1) | 781 *op0 = FlipSignBack(vqaddq_s8(p0, a1)); // clip(p0 + a1) |
| 782 *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1) | 782 *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1) |
| 783 *oq1 = FlipSignBack(vqsubq_s8(q1, a2)); // clip(q1 - a2) | 783 *oq1 = FlipSignBack(vqsubq_s8(q1, a2)); // clip(q1 - a2) |
| 784 *op1 = FlipSignBack(vqaddq_s8(p1, a2)); // clip(p1 + a2) | 784 *op1 = FlipSignBack(vqaddq_s8(p1, a2)); // clip(p1 + a2) |
| 785 *oq2 = FlipSignBack(vqsubq_s8(q2, a3)); // clip(q2 - a3) | 785 *oq2 = FlipSignBack(vqsubq_s8(q2, a3)); // clip(q2 - a3) |
| 786 *op2 = FlipSignBack(vqaddq_s8(p2, a3)); // clip(p2 + a3) | 786 *op2 = FlipSignBack(vqaddq_s8(p2, a3)); // clip(p2 + a3) |
| (...skipping 843 matching lines...) |
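
On the ApplyFilter6 change: vqrshrn_n_s16(x, n) narrows with rounding, i.e. it computes (x + (1 << (n - 1))) >> n with signed saturation to int8. With S = 9 * a - 1 as the shared sub-expression, the three taps of the header comment become (S + 64) >> 7 == (9a + 63) >> 7, (S + 32) >> 6 == (9a + 31) >> 6 == (18a + 63) >> 7, and (S + 18a + 64) >> 7 == (27a + 63) >> 7, which is why the constant 27 and the explicit 63 bias drop out of the new code. A small scalar check of those identities, assuming arithmetic right shifts on negative values and ignoring the saturation the NEON instruction also applies:

```c
#include <assert.h>

int main(void) {
  // Verify the rounding identities behind the new ApplyFilter6, for every
  // possible int8 delta 'a' (S = 9 * a - 1 is the shared sub-expression).
  int a;
  for (a = -128; a <= 127; ++a) {
    const int S = 9 * a - 1;
    assert(((9  * a + 63) >> 7) == ((S + 64) >> 7));           // a3: vqrshrn(S, 7)
    assert(((18 * a + 63) >> 7) == ((S + 32) >> 6));           // a2: vqrshrn(S, 6)
    assert(((27 * a + 63) >> 7) == ((S + 18 * a + 64) >> 7));  // a1: vqrshrn(Z, 7)
  }
  return 0;
}
```
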
| 1630 VP8PredChroma8[4] = DC8uvNoTop; | 1630 VP8PredChroma8[4] = DC8uvNoTop; |
| 1631 VP8PredChroma8[5] = DC8uvNoLeft; | 1631 VP8PredChroma8[5] = DC8uvNoLeft; |
| 1632 VP8PredChroma8[6] = DC8uvNoTopLeft; | 1632 VP8PredChroma8[6] = DC8uvNoTopLeft; |
| 1633 } | 1633 } |
| 1634 | 1634 |
| 1635 #else // !WEBP_USE_NEON | 1635 #else // !WEBP_USE_NEON |
| 1636 | 1636 |
| 1637 WEBP_DSP_INIT_STUB(VP8DspInitNEON) | 1637 WEBP_DSP_INIT_STUB(VP8DspInitNEON) |
| 1638 | 1638 |
| 1639 #endif // WEBP_USE_NEON | 1639 #endif // WEBP_USE_NEON |