OLD | NEW |
1 // Copyright 2012 Google Inc. All Rights Reserved. | 1 // Copyright 2012 Google Inc. All Rights Reserved. |
2 // | 2 // |
3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
9 // | 9 // |
10 // ARM NEON version of speed-critical encoding functions. | 10 // ARM NEON version of speed-critical encoding functions. |
(...skipping 542 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
553 // a 89ab, b 89ab | 553 // a 89ab, b 89ab |
554 // a cdef, b cdef | 554 // a cdef, b cdef |
555 // | 555 // |
556 // transpose | 556 // transpose |
557 // | 557 // |
558 // a 048c, b 048c | 558 // a 048c, b 048c |
559 // a 159d, b 159d | 559 // a 159d, b 159d |
560 // a 26ae, b 26ae | 560 // a 26ae, b 26ae |
561 // a 37bf, b 37bf | 561 // a 37bf, b 37bf |
562 // | 562 // |
563 static WEBP_INLINE uint8x8x4_t DistoTranspose4x4U8(uint8x8x4_t d4_in) { | |
564 const uint8x8x2_t d2_tmp0 = vtrn_u8(d4_in.val[0], d4_in.val[1]); | |
565 const uint8x8x2_t d2_tmp1 = vtrn_u8(d4_in.val[2], d4_in.val[3]); | |
566 const uint16x4x2_t d2_tmp2 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[0]), | |
567 vreinterpret_u16_u8(d2_tmp1.val[0])); | |
568 const uint16x4x2_t d2_tmp3 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[1]), | |
569 vreinterpret_u16_u8(d2_tmp1.val[1])); | |
570 | |
571 d4_in.val[0] = vreinterpret_u8_u16(d2_tmp2.val[0]); | |
572 d4_in.val[2] = vreinterpret_u8_u16(d2_tmp2.val[1]); | |
573 d4_in.val[1] = vreinterpret_u8_u16(d2_tmp3.val[0]); | |
574 d4_in.val[3] = vreinterpret_u8_u16(d2_tmp3.val[1]); | |
575 return d4_in; | |
576 } | |
577 | |
578 static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) { | 563 static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) { |
579 const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]); | 564 const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]); |
580 const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]); | 565 const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]); |
581 const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]), | 566 const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]), |
582 vreinterpretq_s32_s16(q2_tmp1.val[0])); | 567 vreinterpretq_s32_s16(q2_tmp1.val[0])); |
583 const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]), | 568 const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]), |
584 vreinterpretq_s32_s16(q2_tmp1.val[1])); | 569 vreinterpretq_s32_s16(q2_tmp1.val[1])); |
585 q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]); | 570 q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]); |
586 q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]); | 571 q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]); |
587 q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]); | 572 q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]); |
588 q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]); | 573 q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]); |
589 return q4_in; | 574 return q4_in; |
590 } | 575 } |
591 | 576 |
592 static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const uint8x8x4_t d4_in) { | 577 static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) { |
593 // {a0, a1} = {in[0] + in[2], in[1] + in[3]} | 578 // {a0, a1} = {in[0] + in[2], in[1] + in[3]} |
594 // {a3, a2} = {in[0] - in[2], in[1] - in[3]} | 579 // {a3, a2} = {in[0] - in[2], in[1] - in[3]} |
595 const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[0], | 580 const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]); |
596 d4_in.val[2])); | 581 const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]); |
597 const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[1], | 582 const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]); |
598 d4_in.val[3])); | 583 const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]); |
599 const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[0], | |
600 d4_in.val[2])); | |
601 const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[1], | |
602 d4_in.val[3])); | |
603 int16x8x4_t q4_out; | 584 int16x8x4_t q4_out; |
604 // tmp[0] = a0 + a1 | 585 // tmp[0] = a0 + a1 |
605 // tmp[1] = a3 + a2 | 586 // tmp[1] = a3 + a2 |
606 // tmp[2] = a3 - a2 | 587 // tmp[2] = a3 - a2 |
607 // tmp[3] = a0 - a1 | 588 // tmp[3] = a0 - a1 |
608 INIT_VECTOR4(q4_out, | 589 INIT_VECTOR4(q4_out, |
| 590 vabsq_s16(vaddq_s16(q_a0, q_a1)), |
| 591 vabsq_s16(vaddq_s16(q_a3, q_a2)), |
| 592 vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1)); |
| 593 return q4_out; |
| 594 } |
| 595 |
| 596 static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) { |
| 597 const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0], |
| 598 q4_in.val[2])); |
| 599 const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1], |
| 600 q4_in.val[3])); |
| 601 const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1], |
| 602 q4_in.val[3])); |
| 603 const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0], |
| 604 q4_in.val[2])); |
| 605 int16x8x4_t q4_out; |
| 606 |
| 607 INIT_VECTOR4(q4_out, |
609 vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2), | 608 vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2), |
610 vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1)); | 609 vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1)); |
611 return q4_out; | 610 return q4_out; |
612 } | 611 } |
613 | 612 |
614 static WEBP_INLINE int16x8x4_t DistoVerticalPass(int16x8x4_t q4_in) { | |
615 const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]); | |
616 const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]); | |
617 const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]); | |
618 const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]); | |
619 | |
620 q4_in.val[0] = vaddq_s16(q_a0, q_a1); | |
621 q4_in.val[1] = vaddq_s16(q_a3, q_a2); | |
622 q4_in.val[2] = vabdq_s16(q_a3, q_a2); | |
623 q4_in.val[3] = vabdq_s16(q_a0, q_a1); | |
624 q4_in.val[0] = vabsq_s16(q4_in.val[0]); | |
625 q4_in.val[1] = vabsq_s16(q4_in.val[1]); | |
626 return q4_in; | |
627 } | |
628 | |
629 static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) { | 613 static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) { |
630 const uint16x8_t q_w07 = vld1q_u16(&w[0]); | 614 const uint16x8_t q_w07 = vld1q_u16(&w[0]); |
631 const uint16x8_t q_w8f = vld1q_u16(&w[8]); | 615 const uint16x8_t q_w8f = vld1q_u16(&w[8]); |
632 int16x4x4_t d4_w; | 616 int16x4x4_t d4_w; |
633 INIT_VECTOR4(d4_w, | 617 INIT_VECTOR4(d4_w, |
634 vget_low_s16(vreinterpretq_s16_u16(q_w07)), | 618 vget_low_s16(vreinterpretq_s16_u16(q_w07)), |
635 vget_high_s16(vreinterpretq_s16_u16(q_w07)), | 619 vget_high_s16(vreinterpretq_s16_u16(q_w07)), |
636 vget_low_s16(vreinterpretq_s16_u16(q_w8f)), | 620 vget_low_s16(vreinterpretq_s16_u16(q_w8f)), |
637 vget_high_s16(vreinterpretq_s16_u16(q_w8f))); | 621 vget_high_s16(vreinterpretq_s16_u16(q_w8f))); |
638 return d4_w; | 622 return d4_w; |
(...skipping 21 matching lines...) Expand all Loading... |
660 d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2)); | 644 d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2)); |
661 d_sum = vpadd_s32(d_sum, d_sum); | 645 d_sum = vpadd_s32(d_sum, d_sum); |
662 return d_sum; | 646 return d_sum; |
663 } | 647 } |
664 | 648 |
665 #define LOAD_LANE_32b(src, VALUE, LANE) \ | 649 #define LOAD_LANE_32b(src, VALUE, LANE) \ |
666 (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE)) | 650 (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE)) |
667 | 651 |
668 // Hadamard transform | 652 // Hadamard transform |
669 // Returns the weighted sum of the absolute value of transformed coefficients. | 653 // Returns the weighted sum of the absolute value of transformed coefficients. |
| 654 // w[] contains a row-major 4 by 4 symmetric matrix. |
670 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, | 655 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, |
671 const uint16_t* const w) { | 656 const uint16_t* const w) { |
672 uint32x2_t d_in_ab_0123 = vdup_n_u32(0); | 657 uint32x2_t d_in_ab_0123 = vdup_n_u32(0); |
673 uint32x2_t d_in_ab_4567 = vdup_n_u32(0); | 658 uint32x2_t d_in_ab_4567 = vdup_n_u32(0); |
674 uint32x2_t d_in_ab_89ab = vdup_n_u32(0); | 659 uint32x2_t d_in_ab_89ab = vdup_n_u32(0); |
675 uint32x2_t d_in_ab_cdef = vdup_n_u32(0); | 660 uint32x2_t d_in_ab_cdef = vdup_n_u32(0); |
676 uint8x8x4_t d4_in; | 661 uint8x8x4_t d4_in; |
677 | 662 |
678 // load data a, b | 663 // load data a, b |
679 LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0); | 664 LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0); |
680 LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0); | 665 LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0); |
681 LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0); | 666 LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0); |
682 LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0); | 667 LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0); |
683 LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1); | 668 LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1); |
684 LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1); | 669 LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1); |
685 LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1); | 670 LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1); |
686 LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1); | 671 LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1); |
687 INIT_VECTOR4(d4_in, | 672 INIT_VECTOR4(d4_in, |
688 vreinterpret_u8_u32(d_in_ab_0123), | 673 vreinterpret_u8_u32(d_in_ab_0123), |
689 vreinterpret_u8_u32(d_in_ab_4567), | 674 vreinterpret_u8_u32(d_in_ab_4567), |
690 vreinterpret_u8_u32(d_in_ab_89ab), | 675 vreinterpret_u8_u32(d_in_ab_89ab), |
691 vreinterpret_u8_u32(d_in_ab_cdef)); | 676 vreinterpret_u8_u32(d_in_ab_cdef)); |
692 | 677 |
693 { | 678 { |
| 679 // Vertical pass first to avoid a transpose (vertical and horizontal passes |
| 680 // are commutative because w/kWeightY is symmetric) and subsequent |
| 681 // transpose. |
| 682 const int16x8x4_t q4_v = DistoVerticalPass(d4_in); |
| 683 const int16x4x4_t d4_w = DistoLoadW(w); |
694 // horizontal pass | 684 // horizontal pass |
695 const uint8x8x4_t d4_t = DistoTranspose4x4U8(d4_in); | 685 const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v); |
696 const int16x8x4_t q4_h = DistoHorizontalPass(d4_t); | 686 const int16x8x4_t q4_h = DistoHorizontalPass(q4_t); |
697 const int16x4x4_t d4_w = DistoLoadW(w); | 687 int32x2_t d_sum = DistoSum(q4_h, d4_w); |
698 // vertical pass | |
699 const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_h); | |
700 const int16x8x4_t q4_v = DistoVerticalPass(q4_t); | |
701 int32x2_t d_sum = DistoSum(q4_v, d4_w); | |
702 | 688 |
703 // abs(sum2 - sum1) >> 5 | 689 // abs(sum2 - sum1) >> 5 |
704 d_sum = vabs_s32(d_sum); | 690 d_sum = vabs_s32(d_sum); |
705 d_sum = vshr_n_s32(d_sum, 5); | 691 d_sum = vshr_n_s32(d_sum, 5); |
706 return vget_lane_s32(d_sum, 0); | 692 return vget_lane_s32(d_sum, 0); |
707 } | 693 } |
708 } | 694 } |
709 #undef LOAD_LANE_32b | 695 #undef LOAD_LANE_32b |
710 | 696 |
711 static int Disto16x16(const uint8_t* const a, const uint8_t* const b, | 697 static int Disto16x16(const uint8_t* const a, const uint8_t* const b, |
712 const uint16_t* const w) { | 698 const uint16_t* const w) { |
713 int D = 0; | 699 int D = 0; |
714 int x, y; | 700 int x, y; |
715 for (y = 0; y < 16 * BPS; y += 4 * BPS) { | 701 for (y = 0; y < 16 * BPS; y += 4 * BPS) { |
(...skipping 209 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
925 VP8EncQuantizeBlock = QuantizeBlock; | 911 VP8EncQuantizeBlock = QuantizeBlock; |
926 VP8EncQuantize2Blocks = Quantize2Blocks; | 912 VP8EncQuantize2Blocks = Quantize2Blocks; |
927 #endif | 913 #endif |
928 } | 914 } |
929 | 915 |
930 #else // !WEBP_USE_NEON | 916 #else // !WEBP_USE_NEON |
931 | 917 |
932 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON) | 918 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON) |
933 | 919 |
934 #endif // WEBP_USE_NEON | 920 #endif // WEBP_USE_NEON |
OLD | NEW |