Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(285)

Side by Side Diff: third_party/libwebp/dsp/enc_neon.c

Issue 2149863002: libwebp: update to v0.5.1 (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/libwebp/dsp/enc_mips_dsp_r2.c ('k') | third_party/libwebp/dsp/enc_sse2.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 Google Inc. All Rights Reserved. 1 // Copyright 2012 Google Inc. All Rights Reserved.
2 // 2 //
3 // Use of this source code is governed by a BSD-style license 3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source 4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found 5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may 6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree. 7 // be found in the AUTHORS file in the root of the source tree.
8 // ----------------------------------------------------------------------------- 8 // -----------------------------------------------------------------------------
9 // 9 //
10 // ARM NEON version of speed-critical encoding functions. 10 // ARM NEON version of speed-critical encoding functions.
(...skipping 542 matching lines...) Expand 10 before | Expand all | Expand 10 after
553 // a 89ab, b 89ab 553 // a 89ab, b 89ab
554 // a cdef, b cdef 554 // a cdef, b cdef
555 // 555 //
556 // transpose 556 // transpose
557 // 557 //
558 // a 048c, b 048c 558 // a 048c, b 048c
559 // a 159d, b 159d 559 // a 159d, b 159d
560 // a 26ae, b 26ae 560 // a 26ae, b 26ae
561 // a 37bf, b 37bf 561 // a 37bf, b 37bf
562 // 562 //
563 static WEBP_INLINE uint8x8x4_t DistoTranspose4x4U8(uint8x8x4_t d4_in) {
564 const uint8x8x2_t d2_tmp0 = vtrn_u8(d4_in.val[0], d4_in.val[1]);
565 const uint8x8x2_t d2_tmp1 = vtrn_u8(d4_in.val[2], d4_in.val[3]);
566 const uint16x4x2_t d2_tmp2 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[0]),
567 vreinterpret_u16_u8(d2_tmp1.val[0]));
568 const uint16x4x2_t d2_tmp3 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[1]),
569 vreinterpret_u16_u8(d2_tmp1.val[1]));
570
571 d4_in.val[0] = vreinterpret_u8_u16(d2_tmp2.val[0]);
572 d4_in.val[2] = vreinterpret_u8_u16(d2_tmp2.val[1]);
573 d4_in.val[1] = vreinterpret_u8_u16(d2_tmp3.val[0]);
574 d4_in.val[3] = vreinterpret_u8_u16(d2_tmp3.val[1]);
575 return d4_in;
576 }
577
578 static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) { 563 static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
579 const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]); 564 const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
580 const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]); 565 const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
581 const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]), 566 const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
582 vreinterpretq_s32_s16(q2_tmp1.val[0])); 567 vreinterpretq_s32_s16(q2_tmp1.val[0]));
583 const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]), 568 const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
584 vreinterpretq_s32_s16(q2_tmp1.val[1])); 569 vreinterpretq_s32_s16(q2_tmp1.val[1]));
585 q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]); 570 q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
586 q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]); 571 q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
587 q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]); 572 q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
588 q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]); 573 q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
589 return q4_in; 574 return q4_in;
590 } 575 }
591 576
592 static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const uint8x8x4_t d4_in) { 577 static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
593 // {a0, a1} = {in[0] + in[2], in[1] + in[3]} 578 // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
594 // {a3, a2} = {in[0] - in[2], in[1] - in[3]} 579 // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
595 const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[0], 580 const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
596 d4_in.val[2])); 581 const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
597 const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[1], 582 const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
598 d4_in.val[3])); 583 const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
599 const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[0],
600 d4_in.val[2]));
601 const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[1],
602 d4_in.val[3]));
603 int16x8x4_t q4_out; 584 int16x8x4_t q4_out;
604 // tmp[0] = a0 + a1 585 // tmp[0] = a0 + a1
605 // tmp[1] = a3 + a2 586 // tmp[1] = a3 + a2
606 // tmp[2] = a3 - a2 587 // tmp[2] = a3 - a2
607 // tmp[3] = a0 - a1 588 // tmp[3] = a0 - a1
608 INIT_VECTOR4(q4_out, 589 INIT_VECTOR4(q4_out,
590 vabsq_s16(vaddq_s16(q_a0, q_a1)),
591 vabsq_s16(vaddq_s16(q_a3, q_a2)),
592 vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
593 return q4_out;
594 }
595
596 static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
597 const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
598 q4_in.val[2]));
599 const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
600 q4_in.val[3]));
601 const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
602 q4_in.val[3]));
603 const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
604 q4_in.val[2]));
605 int16x8x4_t q4_out;
606
607 INIT_VECTOR4(q4_out,
609 vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2), 608 vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
610 vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1)); 609 vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
611 return q4_out; 610 return q4_out;
612 } 611 }
613 612
614 static WEBP_INLINE int16x8x4_t DistoVerticalPass(int16x8x4_t q4_in) {
615 const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
616 const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
617 const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
618 const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
619
620 q4_in.val[0] = vaddq_s16(q_a0, q_a1);
621 q4_in.val[1] = vaddq_s16(q_a3, q_a2);
622 q4_in.val[2] = vabdq_s16(q_a3, q_a2);
623 q4_in.val[3] = vabdq_s16(q_a0, q_a1);
624 q4_in.val[0] = vabsq_s16(q4_in.val[0]);
625 q4_in.val[1] = vabsq_s16(q4_in.val[1]);
626 return q4_in;
627 }
628
629 static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) { 613 static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
630 const uint16x8_t q_w07 = vld1q_u16(&w[0]); 614 const uint16x8_t q_w07 = vld1q_u16(&w[0]);
631 const uint16x8_t q_w8f = vld1q_u16(&w[8]); 615 const uint16x8_t q_w8f = vld1q_u16(&w[8]);
632 int16x4x4_t d4_w; 616 int16x4x4_t d4_w;
633 INIT_VECTOR4(d4_w, 617 INIT_VECTOR4(d4_w,
634 vget_low_s16(vreinterpretq_s16_u16(q_w07)), 618 vget_low_s16(vreinterpretq_s16_u16(q_w07)),
635 vget_high_s16(vreinterpretq_s16_u16(q_w07)), 619 vget_high_s16(vreinterpretq_s16_u16(q_w07)),
636 vget_low_s16(vreinterpretq_s16_u16(q_w8f)), 620 vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
637 vget_high_s16(vreinterpretq_s16_u16(q_w8f))); 621 vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
638 return d4_w; 622 return d4_w;
(...skipping 21 matching lines...) Expand all
660 d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2)); 644 d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
661 d_sum = vpadd_s32(d_sum, d_sum); 645 d_sum = vpadd_s32(d_sum, d_sum);
662 return d_sum; 646 return d_sum;
663 } 647 }
664 648
665 #define LOAD_LANE_32b(src, VALUE, LANE) \ 649 #define LOAD_LANE_32b(src, VALUE, LANE) \
666 (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE)) 650 (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
667 651
668 // Hadamard transform 652 // Hadamard transform
669 // Returns the weighted sum of the absolute value of transformed coefficients. 653 // Returns the weighted sum of the absolute value of transformed coefficients.
654 // w[] contains a row-major 4 by 4 symmetric matrix.
670 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, 655 static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
671 const uint16_t* const w) { 656 const uint16_t* const w) {
672 uint32x2_t d_in_ab_0123 = vdup_n_u32(0); 657 uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
673 uint32x2_t d_in_ab_4567 = vdup_n_u32(0); 658 uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
674 uint32x2_t d_in_ab_89ab = vdup_n_u32(0); 659 uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
675 uint32x2_t d_in_ab_cdef = vdup_n_u32(0); 660 uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
676 uint8x8x4_t d4_in; 661 uint8x8x4_t d4_in;
677 662
678 // load data a, b 663 // load data a, b
679 LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0); 664 LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
680 LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0); 665 LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
681 LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0); 666 LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
682 LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0); 667 LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
683 LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1); 668 LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
684 LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1); 669 LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
685 LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1); 670 LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
686 LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1); 671 LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
687 INIT_VECTOR4(d4_in, 672 INIT_VECTOR4(d4_in,
688 vreinterpret_u8_u32(d_in_ab_0123), 673 vreinterpret_u8_u32(d_in_ab_0123),
689 vreinterpret_u8_u32(d_in_ab_4567), 674 vreinterpret_u8_u32(d_in_ab_4567),
690 vreinterpret_u8_u32(d_in_ab_89ab), 675 vreinterpret_u8_u32(d_in_ab_89ab),
691 vreinterpret_u8_u32(d_in_ab_cdef)); 676 vreinterpret_u8_u32(d_in_ab_cdef));
692 677
693 { 678 {
679 // Vertical pass first to avoid a transpose (vertical and horizontal passes
680 // are commutative because w/kWeightY is symmetric) and subsequent
681 // transpose.
682 const int16x8x4_t q4_v = DistoVerticalPass(d4_in);
683 const int16x4x4_t d4_w = DistoLoadW(w);
694 // horizontal pass 684 // horizontal pass
695 const uint8x8x4_t d4_t = DistoTranspose4x4U8(d4_in); 685 const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v);
696 const int16x8x4_t q4_h = DistoHorizontalPass(d4_t); 686 const int16x8x4_t q4_h = DistoHorizontalPass(q4_t);
697 const int16x4x4_t d4_w = DistoLoadW(w); 687 int32x2_t d_sum = DistoSum(q4_h, d4_w);
698 // vertical pass
699 const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_h);
700 const int16x8x4_t q4_v = DistoVerticalPass(q4_t);
701 int32x2_t d_sum = DistoSum(q4_v, d4_w);
702 688
703 // abs(sum2 - sum1) >> 5 689 // abs(sum2 - sum1) >> 5
704 d_sum = vabs_s32(d_sum); 690 d_sum = vabs_s32(d_sum);
705 d_sum = vshr_n_s32(d_sum, 5); 691 d_sum = vshr_n_s32(d_sum, 5);
706 return vget_lane_s32(d_sum, 0); 692 return vget_lane_s32(d_sum, 0);
707 } 693 }
708 } 694 }
709 #undef LOAD_LANE_32b 695 #undef LOAD_LANE_32b
710 696
711 static int Disto16x16(const uint8_t* const a, const uint8_t* const b, 697 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
712 const uint16_t* const w) { 698 const uint16_t* const w) {
713 int D = 0; 699 int D = 0;
714 int x, y; 700 int x, y;
715 for (y = 0; y < 16 * BPS; y += 4 * BPS) { 701 for (y = 0; y < 16 * BPS; y += 4 * BPS) {
(...skipping 209 matching lines...) Expand 10 before | Expand all | Expand 10 after
925 VP8EncQuantizeBlock = QuantizeBlock; 911 VP8EncQuantizeBlock = QuantizeBlock;
926 VP8EncQuantize2Blocks = Quantize2Blocks; 912 VP8EncQuantize2Blocks = Quantize2Blocks;
927 #endif 913 #endif
928 } 914 }
929 915
930 #else // !WEBP_USE_NEON 916 #else // !WEBP_USE_NEON
931 917
932 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON) 918 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
933 919
934 #endif // WEBP_USE_NEON 920 #endif // WEBP_USE_NEON
OLDNEW
« no previous file with comments | « third_party/libwebp/dsp/enc_mips_dsp_r2.c ('k') | third_party/libwebp/dsp/enc_sse2.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698