third_party/libwebp/dsp/enc_neon.c - Issue 2149863002: libwebp: update to v0.5.1

Side by Side Diff: third_party/libwebp/dsp/enc_neon.c

Issue 2149863002: libwebp: update to v0.5.1 (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 Google Inc. All Rights Reserved.	1 // Copyright 2012 Google Inc. All Rights Reserved.

2 //	2 //

3 // Use of this source code is governed by a BSD-style license	3 // Use of this source code is governed by a BSD-style license

4 // that can be found in the COPYING file in the root of the source	4 // that can be found in the COPYING file in the root of the source

5 // tree. An additional intellectual property rights grant can be found	5 // tree. An additional intellectual property rights grant can be found

6 // in the file PATENTS. All contributing project authors may	6 // in the file PATENTS. All contributing project authors may

7 // be found in the AUTHORS file in the root of the source tree.	7 // be found in the AUTHORS file in the root of the source tree.

8 // -----------------------------------------------------------------------------	8 // -----------------------------------------------------------------------------

9 //	9 //

10 // ARM NEON version of speed-critical encoding functions.	10 // ARM NEON version of speed-critical encoding functions.

(...skipping 542 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
553 // a 89ab, b 89ab	553 // a 89ab, b 89ab

554 // a cdef, b cdef	554 // a cdef, b cdef

555 //	555 //

556 // transpose	556 // transpose

557 //	557 //

558 // a 048c, b 048c	558 // a 048c, b 048c

559 // a 159d, b 159d	559 // a 159d, b 159d

560 // a 26ae, b 26ae	560 // a 26ae, b 26ae

561 // a 37bf, b 37bf	561 // a 37bf, b 37bf

562 //	562 //

563 static WEBP_INLINE uint8x8x4_t DistoTranspose4x4U8(uint8x8x4_t d4_in) {

564 const uint8x8x2_t d2_tmp0 = vtrn_u8(d4_in.val[0], d4_in.val[1]);

565 const uint8x8x2_t d2_tmp1 = vtrn_u8(d4_in.val[2], d4_in.val[3]);

566 const uint16x4x2_t d2_tmp2 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[0]),

567 vreinterpret_u16_u8(d2_tmp1.val[0]));

568 const uint16x4x2_t d2_tmp3 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[1]),

569 vreinterpret_u16_u8(d2_tmp1.val[1]));

570

571 d4_in.val[0] = vreinterpret_u8_u16(d2_tmp2.val[0]);

572 d4_in.val[2] = vreinterpret_u8_u16(d2_tmp2.val[1]);

573 d4_in.val[1] = vreinterpret_u8_u16(d2_tmp3.val[0]);

574 d4_in.val[3] = vreinterpret_u8_u16(d2_tmp3.val[1]);

575 return d4_in;

576 }

577

578 static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {	563 static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {

579 const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);	564 const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);

580 const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);	565 const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);

581 const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),	566 const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),

582 vreinterpretq_s32_s16(q2_tmp1.val[0]));	567 vreinterpretq_s32_s16(q2_tmp1.val[0]));

583 const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),	568 const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),

584 vreinterpretq_s32_s16(q2_tmp1.val[1]));	569 vreinterpretq_s32_s16(q2_tmp1.val[1]));

585 q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);	570 q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);

586 q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);	571 q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);

587 q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);	572 q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);

588 q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);	573 q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);

589 return q4_in;	574 return q4_in;

590 }	575 }

591	576

592 static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const uint8x8x4_t d4_in) {	577 static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {

593 // {a0, a1} = {in[0] + in[2], in[1] + in[3]}	578 // {a0, a1} = {in[0] + in[2], in[1] + in[3]}

594 // {a3, a2} = {in[0] - in[2], in[1] - in[3]}	579 // {a3, a2} = {in[0] - in[2], in[1] - in[3]}

595 const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[0],	580 const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);

596 d4_in.val[2]));	581 const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);

597 const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[1],	582 const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);

598 d4_in.val[3]));	583 const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);

599 const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[0],

600 d4_in.val[2]));

601 const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[1],

602 d4_in.val[3]));

603 int16x8x4_t q4_out;	584 int16x8x4_t q4_out;

604 // tmp[0] = a0 + a1	585 // tmp[0] = a0 + a1

605 // tmp[1] = a3 + a2	586 // tmp[1] = a3 + a2

606 // tmp[2] = a3 - a2	587 // tmp[2] = a3 - a2

607 // tmp[3] = a0 - a1	588 // tmp[3] = a0 - a1

608 INIT_VECTOR4(q4_out,	589 INIT_VECTOR4(q4_out,

	590 vabsq_s16(vaddq_s16(q_a0, q_a1)),

	591 vabsq_s16(vaddq_s16(q_a3, q_a2)),

	592 vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));

	593 return q4_out;

	594 }

	595

	596 static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {

	597 const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],

	598 q4_in.val[2]));

	599 const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],

	600 q4_in.val[3]));

	601 const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],

	602 q4_in.val[3]));

	603 const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],

	604 q4_in.val[2]));

	605 int16x8x4_t q4_out;

	606

	607 INIT_VECTOR4(q4_out,

609 vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),	608 vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),

610 vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));	609 vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));

611 return q4_out;	610 return q4_out;

612 }	611 }

613	612

614 static WEBP_INLINE int16x8x4_t DistoVerticalPass(int16x8x4_t q4_in) {

615 const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);

616 const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);

617 const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);

618 const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);

619

620 q4_in.val[0] = vaddq_s16(q_a0, q_a1);

621 q4_in.val[1] = vaddq_s16(q_a3, q_a2);

622 q4_in.val[2] = vabdq_s16(q_a3, q_a2);

623 q4_in.val[3] = vabdq_s16(q_a0, q_a1);

624 q4_in.val[0] = vabsq_s16(q4_in.val[0]);

625 q4_in.val[1] = vabsq_s16(q4_in.val[1]);

626 return q4_in;

627 }

628

629 static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {	613 static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {

630 const uint16x8_t q_w07 = vld1q_u16(&w[0]);	614 const uint16x8_t q_w07 = vld1q_u16(&w[0]);

631 const uint16x8_t q_w8f = vld1q_u16(&w[8]);	615 const uint16x8_t q_w8f = vld1q_u16(&w[8]);

632 int16x4x4_t d4_w;	616 int16x4x4_t d4_w;

633 INIT_VECTOR4(d4_w,	617 INIT_VECTOR4(d4_w,

634 vget_low_s16(vreinterpretq_s16_u16(q_w07)),	618 vget_low_s16(vreinterpretq_s16_u16(q_w07)),

635 vget_high_s16(vreinterpretq_s16_u16(q_w07)),	619 vget_high_s16(vreinterpretq_s16_u16(q_w07)),

636 vget_low_s16(vreinterpretq_s16_u16(q_w8f)),	620 vget_low_s16(vreinterpretq_s16_u16(q_w8f)),

637 vget_high_s16(vreinterpretq_s16_u16(q_w8f)));	621 vget_high_s16(vreinterpretq_s16_u16(q_w8f)));

638 return d4_w;	622 return d4_w;

(...skipping 21 matching lines...) Expand all Loading...
660 d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));	644 d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));

661 d_sum = vpadd_s32(d_sum, d_sum);	645 d_sum = vpadd_s32(d_sum, d_sum);

662 return d_sum;	646 return d_sum;

663 }	647 }

664	648

665 #define LOAD_LANE_32b(src, VALUE, LANE) \	649 #define LOAD_LANE_32b(src, VALUE, LANE) \

666 (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))	650 (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))

667	651

668 // Hadamard transform	652 // Hadamard transform

669 // Returns the weighted sum of the absolute value of transformed coefficients.	653 // Returns the weighted sum of the absolute value of transformed coefficients.

	654 // w[] contains a row-major 4 by 4 symmetric matrix.

670 static int Disto4x4(const uint8_t* const a, const uint8_t* const b,	655 static int Disto4x4(const uint8_t* const a, const uint8_t* const b,

671 const uint16_t* const w) {	656 const uint16_t* const w) {

672 uint32x2_t d_in_ab_0123 = vdup_n_u32(0);	657 uint32x2_t d_in_ab_0123 = vdup_n_u32(0);

673 uint32x2_t d_in_ab_4567 = vdup_n_u32(0);	658 uint32x2_t d_in_ab_4567 = vdup_n_u32(0);

674 uint32x2_t d_in_ab_89ab = vdup_n_u32(0);	659 uint32x2_t d_in_ab_89ab = vdup_n_u32(0);

675 uint32x2_t d_in_ab_cdef = vdup_n_u32(0);	660 uint32x2_t d_in_ab_cdef = vdup_n_u32(0);

676 uint8x8x4_t d4_in;	661 uint8x8x4_t d4_in;

677	662

678 // load data a, b	663 // load data a, b

679 LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);	664 LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);

680 LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);	665 LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);

681 LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);	666 LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);

682 LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);	667 LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);

683 LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);	668 LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);

684 LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);	669 LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);

685 LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);	670 LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);

686 LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);	671 LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);

687 INIT_VECTOR4(d4_in,	672 INIT_VECTOR4(d4_in,

688 vreinterpret_u8_u32(d_in_ab_0123),	673 vreinterpret_u8_u32(d_in_ab_0123),

689 vreinterpret_u8_u32(d_in_ab_4567),	674 vreinterpret_u8_u32(d_in_ab_4567),

690 vreinterpret_u8_u32(d_in_ab_89ab),	675 vreinterpret_u8_u32(d_in_ab_89ab),

691 vreinterpret_u8_u32(d_in_ab_cdef));	676 vreinterpret_u8_u32(d_in_ab_cdef));

692	677

693 {	678 {

	679 // Vertical pass first to avoid a transpose (vertical and horizontal passes

	680 // are commutative because w/kWeightY is symmetric) and subsequent

	681 // transpose.

	682 const int16x8x4_t q4_v = DistoVerticalPass(d4_in);

	683 const int16x4x4_t d4_w = DistoLoadW(w);

694 // horizontal pass	684 // horizontal pass

695 const uint8x8x4_t d4_t = DistoTranspose4x4U8(d4_in);	685 const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v);

696 const int16x8x4_t q4_h = DistoHorizontalPass(d4_t);	686 const int16x8x4_t q4_h = DistoHorizontalPass(q4_t);

697 const int16x4x4_t d4_w = DistoLoadW(w);	687 int32x2_t d_sum = DistoSum(q4_h, d4_w);

698 // vertical pass

699 const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_h);

700 const int16x8x4_t q4_v = DistoVerticalPass(q4_t);

701 int32x2_t d_sum = DistoSum(q4_v, d4_w);

702	688

703 // abs(sum2 - sum1) >> 5	689 // abs(sum2 - sum1) >> 5

704 d_sum = vabs_s32(d_sum);	690 d_sum = vabs_s32(d_sum);

705 d_sum = vshr_n_s32(d_sum, 5);	691 d_sum = vshr_n_s32(d_sum, 5);

706 return vget_lane_s32(d_sum, 0);	692 return vget_lane_s32(d_sum, 0);

707 }	693 }

708 }	694 }

709 #undef LOAD_LANE_32b	695 #undef LOAD_LANE_32b

710	696

711 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,	697 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,

712 const uint16_t* const w) {	698 const uint16_t* const w) {

713 int D = 0;	699 int D = 0;

714 int x, y;	700 int x, y;

715 for (y = 0; y < 16 * BPS; y += 4 * BPS) {	701 for (y = 0; y < 16 * BPS; y += 4 * BPS) {

(...skipping 209 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
925 VP8EncQuantizeBlock = QuantizeBlock;	911 VP8EncQuantizeBlock = QuantizeBlock;

926 VP8EncQuantize2Blocks = Quantize2Blocks;	912 VP8EncQuantize2Blocks = Quantize2Blocks;

927 #endif	913 #endif

928 }	914 }

929	915

930 #else // !WEBP_USE_NEON	916 #else // !WEBP_USE_NEON

931	917

932 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)	918 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)

933	919

934 #endif // WEBP_USE_NEON	920 #endif // WEBP_USE_NEON

OLD	NEW

« no previous file with comments | « third_party/libwebp/dsp/enc_mips_dsp_r2.c ('k') | third_party/libwebp/dsp/enc_sse2.c » ('j') | no next file with comments »