Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(153)

Side by Side Diff: source/libvpx/third_party/libyuv/source/scale_neon.cc

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 25 matching lines...) Expand all
36 "vst1.8 {q1}, [%1]! \n" // store odd pixels 36 "vst1.8 {q1}, [%1]! \n" // store odd pixels
37 "bgt 1b \n" 37 "bgt 1b \n"
38 : "+r"(src_ptr), // %0 38 : "+r"(src_ptr), // %0
39 "+r"(dst), // %1 39 "+r"(dst), // %1
40 "+r"(dst_width) // %2 40 "+r"(dst_width) // %2
41 : 41 :
42 : "q0", "q1" // Clobber List 42 : "q0", "q1" // Clobber List
43 ); 43 );
44 } 44 }
45 45
46 // Read 32x1 average down and write 16x1.
// Averages each pair of horizontally adjacent bytes of the 32 input bytes
// (vpaddl widening pairwise add, then vrshrn #1 = rounded halving) and writes
// 16 output bytes.  src_stride is accepted for signature parity but is not
// referenced by the asm (single-row "linear" filter).  dst_width is
// decremented by 16 per iteration with no remainder handling, so it is
// presumably a positive multiple of 16 -- TODO(review): confirm callers
// guarantee this.
47 void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
48 uint8* dst, int dst_width) {
49 asm volatile (
50 ".p2align 2 \n"
51 "1: \n"
52 MEMACCESS(0)
53 "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc
54 "subs %2, %2, #16 \n" // 16 processed per loop
55 "vpaddl.u8 q0, q0 \n" // add adjacent
56 "vpaddl.u8 q1, q1 \n"
57 "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
58 "vrshrn.u16 d1, q1, #1 \n"
59 MEMACCESS(1)
60 "vst1.8 {q0}, [%1]! \n"
61 "bgt 1b \n"
62 : "+r"(src_ptr), // %0
63 "+r"(dst), // %1
64 "+r"(dst_width) // %2
65 :
66 : "q0", "q1" // Clobber List
67 );
68 }
69
46 // Read 32x2 average down and write 16x1. 70 // Read 32x2 average down and write 16x1.
47 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 71 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
48 uint8* dst, int dst_width) { 72 uint8* dst, int dst_width) {
49 asm volatile ( 73 asm volatile (
50 // change the stride to row 2 pointer 74 // change the stride to row 2 pointer
51 "add %1, %0 \n" 75 "add %1, %0 \n"
52 ".p2align 2 \n" 76 ".p2align 2 \n"
53 "1: \n" 77 "1: \n"
54 MEMACCESS(0) 78 MEMACCESS(0)
55 "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc 79 "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
(...skipping 454 matching lines...) Expand 10 before | Expand all | Expand 10 after
510 : "+r"(src_ptr), // %0 534 : "+r"(src_ptr), // %0
511 "+r"(dst_ptr), // %1 535 "+r"(dst_ptr), // %1
512 "+r"(dst_width), // %2 536 "+r"(dst_width), // %2
513 "+r"(src_stride) // %3 537 "+r"(src_stride) // %3
514 : "r"(&kMult38_Div6), // %4 538 : "r"(&kMult38_Div6), // %4
515 "r"(&kShuf38_2) // %5 539 "r"(&kShuf38_2) // %5
516 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" 540 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
517 ); 541 );
518 } 542 }
519 543
// Column-wise sum across rows: for each of src_width columns, adds the uint8
// values of that column over src_height rows (inner loop 2: walks down via
// src_stride, widening-accumulating into q2/q3; r12 is the row counter) and
// stores the 16-bit sums to dst_ptr.  Outer loop 1: advances 16 columns per
// iteration.  Sums are not clamped, so src_height is presumably bounded such
// that 255 * src_height fits in uint16 -- TODO(review): confirm.
// NOTE(review): q1 is listed in the clobber list although the asm never
// references it; harmless over-declaration.
544 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
545 uint16* dst_ptr, int src_width, int src_height) {
546 const uint8* src_tmp = NULL;
547 asm volatile (
548 ".p2align 2 \n"
549 "1: \n"
550 "mov %0, %1 \n"
551 "mov r12, %5 \n"
552 "veor q2, q2, q2 \n"
553 "veor q3, q3, q3 \n"
554 "2: \n"
555 // load 16 pixels into q0
556 MEMACCESS(0)
557 "vld1.8 {q0}, [%0], %3 \n"
558 "vaddw.u8 q3, q3, d1 \n"
559 "vaddw.u8 q2, q2, d0 \n"
560 "subs r12, r12, #1 \n"
561 "bgt 2b \n"
562 MEMACCESS(2)
563 "vst1.16 {q2, q3}, [%2]! \n" // store pixels
564 "add %1, %1, #16 \n"
565 "subs %4, %4, #16 \n" // 16 processed per loop
566 "bgt 1b \n"
567 : "+r"(src_tmp), // %0
568 "+r"(src_ptr), // %1
569 "+r"(dst_ptr), // %2
570 "+r"(src_stride), // %3
571 "+r"(src_width), // %4
572 "+r"(src_height) // %5
573 :
574 : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
575 );
576 }
577
578 // TODO(Yang Zhang): Investigate less load instructions for
579 // the x/dx stepping
// Loads the byte pair (src[x >> 16], src[(x >> 16) + 1]) into lane n of
// d6/d7 via vld2.8, then advances x by dx.  x (%3) and dx (%4) are 16.16
// fixed point; %5 and %6 are scratch registers.
580 #define LOAD2_DATA8_LANE(n) \
581 "lsr %5, %3, #16 \n" \
582 "add %6, %1, %5 \n" \
583 "add %3, %3, %4 \n" \
584 MEMACCESS(6) \
585 "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
586
// Horizontal bilinear filter, 8 output pixels per loop iteration.  For each
// 16.16 fixed-point position x it fetches the two neighboring source bytes
// a and b and blends by the fractional part of x:
//   out = a + (((b - a) * frac(x)) >> 16)
// (q1/q2 hold the 8 running x values; vuzp.16 splits them so q13/q10 carry
// the low 16-bit fractions used as multipliers).  dst_width is decremented
// by 8 per loop, so presumably a positive multiple of 8 -- TODO(review):
// confirm callers guarantee this.
587 void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
588 int dst_width, int x, int dx) {
589 int dx_offset[4] = {0, 1, 2, 3};
590 int* tmp = dx_offset;
591 const uint8* src_tmp = src_ptr;
592 asm volatile (
593 ".p2align 2 \n"
594 "vdup.32 q0, %3 \n" // x
595 "vdup.32 q1, %4 \n" // dx
596 "vld1.32 {q2}, [%5] \n" // 0 1 2 3
597 "vshl.i32 q3, q1, #2 \n" // 4 * dx
598 "vmul.s32 q1, q1, q2 \n"
599 // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
600 "vadd.s32 q1, q1, q0 \n"
601 // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
602 "vadd.s32 q2, q1, q3 \n"
603 "vshl.i32 q0, q3, #1 \n" // 8 * dx
604 "1: \n"
605 LOAD2_DATA8_LANE(0)
606 LOAD2_DATA8_LANE(1)
607 LOAD2_DATA8_LANE(2)
608 LOAD2_DATA8_LANE(3)
609 LOAD2_DATA8_LANE(4)
610 LOAD2_DATA8_LANE(5)
611 LOAD2_DATA8_LANE(6)
612 LOAD2_DATA8_LANE(7)
613 "vmov q10, q1 \n"
614 "vmov q11, q2 \n"
615 "vuzp.16 q10, q11 \n" // q10 = low halves (fractions)
616 "vmovl.u8 q8, d6 \n" // widen a to 16 bits
617 "vmovl.u8 q9, d7 \n" // widen b to 16 bits
618 "vsubl.s16 q11, d18, d16 \n" // b - a
619 "vsubl.s16 q12, d19, d17 \n"
620 "vmovl.u16 q13, d20 \n"
621 "vmovl.u16 q10, d21 \n"
622 "vmul.s32 q11, q11, q13 \n" // (b - a) * frac
623 "vmul.s32 q12, q12, q10 \n"
624 "vshrn.s32 d18, q11, #16 \n"
625 "vshrn.s32 d19, q12, #16 \n"
626 "vadd.s16 q8, q8, q9 \n" // a + ((b - a) * frac >> 16)
627 "vmovn.s16 d6, q8 \n"
628
629 MEMACCESS(0)
630 "vst1.8 {d6}, [%0]! \n" // store pixels
631 "vadd.s32 q1, q1, q0 \n" // advance all x by 8 * dx
632 "vadd.s32 q2, q2, q0 \n"
633 "subs %2, %2, #8 \n" // 8 processed per loop
634 "bgt 1b \n"
635 : "+r"(dst_ptr), // %0
636 "+r"(src_ptr), // %1
637 "+r"(dst_width), // %2
638 "+r"(x), // %3
639 "+r"(dx), // %4
640 "+r"(tmp), // %5
641 "+r"(src_tmp) // %6
642 :
643 : "memory", "cc", "q0", "q1", "q2", "q3",
644 "q8", "q9", "q10", "q11", "q12", "q13"
645 );
646 }
647
648 #undef LOAD2_DATA8_LANE
649
520 // 16x2 -> 16x1 650 // 16x2 -> 16x1
521 void ScaleFilterRows_NEON(uint8* dst_ptr, 651 void ScaleFilterRows_NEON(uint8* dst_ptr,
522 const uint8* src_ptr, ptrdiff_t src_stride, 652 const uint8* src_ptr, ptrdiff_t src_stride,
523 int dst_width, int source_y_fraction) { 653 int dst_width, int source_y_fraction) {
524 asm volatile ( 654 asm volatile (
525 "cmp %4, #0 \n" 655 "cmp %4, #0 \n"
526 "beq 100f \n" 656 "beq 100f \n"
527 "add %2, %1 \n" 657 "add %2, %1 \n"
528 "cmp %4, #64 \n" 658 "cmp %4, #64 \n"
529 "beq 75f \n" 659 "beq 75f \n"
(...skipping 103 matching lines...) Expand 10 before | Expand all | Expand 10 after
633 "vst1.8 {q3}, [%1]! \n" 763 "vst1.8 {q3}, [%1]! \n"
634 "bgt 1b \n" 764 "bgt 1b \n"
635 : "+r"(src_ptr), // %0 765 : "+r"(src_ptr), // %0
636 "+r"(dst), // %1 766 "+r"(dst), // %1
637 "+r"(dst_width) // %2 767 "+r"(dst_width) // %2
638 : 768 :
639 : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List 769 : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
640 ); 770 );
641 } 771 }
642 772
// Reads 16 ARGB pixels per iteration (deinterleaved by channel via vld4.8),
// averages each pair of horizontally adjacent pixels per channel (vpaddl +
// rounded narrowing shift by 1), and writes 8 ARGB pixels.  src_stride is
// accepted for signature parity but is not referenced by the asm
// (single-row "linear" filter).  dst_width is decremented by 8 per loop, so
// presumably a positive multiple of 8 -- TODO(review): confirm callers
// guarantee this.
773 void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
774 uint8* dst_argb, int dst_width) {
775 asm volatile (
776 ".p2align 2 \n"
777 "1: \n"
778 MEMACCESS(0)
779 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
780 MEMACCESS(0)
781 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
782 "subs %2, %2, #8 \n" // 8 processed per loop
783 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
784 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
785 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
786 "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
787 "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
788 "vrshrn.u16 d1, q1, #1 \n"
789 "vrshrn.u16 d2, q2, #1 \n"
790 "vrshrn.u16 d3, q3, #1 \n"
791 MEMACCESS(1)
792 "vst4.8 {d0, d1, d2, d3}, [%1]! \n"
793 "bgt 1b \n"
794 : "+r"(src_argb), // %0
795 "+r"(dst_argb), // %1
796 "+r"(dst_width) // %2
797 :
798 : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
799 );
800 }
801
643 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 802 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
644 uint8* dst, int dst_width) { 803 uint8* dst, int dst_width) {
645 asm volatile ( 804 asm volatile (
646 // change the stride to row 2 pointer 805 // change the stride to row 2 pointer
647 "add %1, %1, %0 \n" 806 "add %1, %1, %0 \n"
648 ".p2align 2 \n" 807 ".p2align 2 \n"
649 "1: \n" 808 "1: \n"
650 MEMACCESS(0) 809 MEMACCESS(0)
651 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 810 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
652 MEMACCESS(0) 811 MEMACCESS(0)
(...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after
750 "bgt 1b \n" 909 "bgt 1b \n"
751 : "+r"(src_argb), // %0 910 : "+r"(src_argb), // %0
752 "+r"(src_stride), // %1 911 "+r"(src_stride), // %1
753 "+r"(dst_argb), // %2 912 "+r"(dst_argb), // %2
754 "+r"(dst_width) // %3 913 "+r"(dst_width) // %3
755 : "r"(src_stepx) // %4 914 : "r"(src_stepx) // %4
756 : "memory", "cc", "r12", "q0", "q1", "q2", "q3" 915 : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
757 ); 916 );
758 } 917 }
759 918
919 // TODO(Yang Zhang): Investigate less load instructions for
920 // the x/dx stepping
// Loads the 32-bit ARGB pixel at src + (x >> 16) * 4 into lane n of register
// dn, then advances x by dx.  x (%3) and dx (%4) are 16.16 fixed point;
// %5 and %6 are scratch registers.
921 #define LOAD1_DATA32_LANE(dn, n) \
922 "lsr %5, %3, #16 \n" \
923 "add %6, %1, %5, lsl #2 \n" \
924 "add %3, %3, %4 \n" \
925 MEMACCESS(6) \
926 "vld1.32 {"#dn"["#n"]}, [%6] \n"
927
// Nearest-neighbor horizontal ARGB scale: gathers 8 source pixels at the
// successive 16.16 fixed-point positions x, x+dx, ... and stores them
// contiguously.  dst_width is decremented by 8 per loop, so presumably a
// positive multiple of 8 -- TODO(review): confirm callers guarantee this.
928 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
929 int dst_width, int x, int dx) {
930 int tmp = 0;
931 const uint8* src_tmp = src_argb;
932 asm volatile (
933 ".p2align 2 \n"
934 "1: \n"
935 LOAD1_DATA32_LANE(d0, 0)
936 LOAD1_DATA32_LANE(d0, 1)
937 LOAD1_DATA32_LANE(d1, 0)
938 LOAD1_DATA32_LANE(d1, 1)
939 LOAD1_DATA32_LANE(d2, 0)
940 LOAD1_DATA32_LANE(d2, 1)
941 LOAD1_DATA32_LANE(d3, 0)
942 LOAD1_DATA32_LANE(d3, 1)
943
944 MEMACCESS(0)
945 "vst1.32 {q0, q1}, [%0]! \n" // store pixels
946 "subs %2, %2, #8 \n" // 8 processed per loop
947 "bgt 1b \n"
948 : "+r"(dst_argb), // %0
949 "+r"(src_argb), // %1
950 "+r"(dst_width), // %2
951 "+r"(x), // %3
952 "+r"(dx), // %4
953 "+r"(tmp), // %5
954 "+r"(src_tmp) // %6
955 :
956 : "memory", "cc", "q0", "q1"
957 );
958 }
959
960 #undef LOAD1_DATA32_LANE
961
962 // TODO(Yang Zhang): Investigate less load instructions for
963 // the x/dx stepping
// Loads the adjacent ARGB pixel pair at src + (x >> 16) * 4 into lane n of
// dn1 (pixel a) and dn2 (pixel b) via vld2.32, then advances x by dx.
// x (%3) and dx (%4) are 16.16 fixed point; %5 and %6 are scratch registers.
964 #define LOAD2_DATA32_LANE(dn1, dn2, n) \
965 "lsr %5, %3, #16 \n" \
966 "add %6, %1, %5, lsl #2 \n" \
967 "add %3, %3, %4 \n" \
968 MEMACCESS(6) \
969 "vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n"
970
// Horizontal bilinear ARGB filter, 4 output pixels per loop iteration.  The
// 7-bit blend fraction is the top 7 bits of x's fractional part,
// f = (x >> 9) & 0x7f, and each pixel is blended per byte as
//   out = (a * (0x7f ^ f) + b * f) >> 7
// where 0x7f ^ f == 127 - f because f has been masked to 7 bits.
// dst_width is decremented by 4 per loop, so presumably a positive multiple
// of 4 -- TODO(review): confirm callers guarantee this.
971 void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
972 int dst_width, int x, int dx) {
973 int dx_offset[4] = {0, 1, 2, 3};
974 int* tmp = dx_offset;
975 const uint8* src_tmp = src_argb;
976 asm volatile (
977 ".p2align 2 \n"
978 "vdup.32 q0, %3 \n" // x
979 "vdup.32 q1, %4 \n" // dx
980 "vld1.32 {q2}, [%5] \n" // 0 1 2 3
981 "vshl.i32 q9, q1, #2 \n" // 4 * dx
982 "vmul.s32 q1, q1, q2 \n"
983 "vmov.i8 q3, #0x7f \n" // 0x7F
984 "vmov.i16 q15, #0x7f \n" // 0x7F
985 // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
986 "vadd.s32 q8, q1, q0 \n"
987 "1: \n"
988 // d0, d1: a
989 // d2, d3: b
990 LOAD2_DATA32_LANE(d0, d2, 0)
991 LOAD2_DATA32_LANE(d0, d2, 1)
992 LOAD2_DATA32_LANE(d1, d3, 0)
993 LOAD2_DATA32_LANE(d1, d3, 1)
994 "vshrn.i32 d22, q8, #9 \n" // narrow x values, keeping fraction bits
995 "vand.16 d22, d22, d30 \n" // mask to 7-bit fractions
996 "vdup.8 d24, d22[0] \n" // splat each pixel's fraction across 4 bytes
997 "vdup.8 d25, d22[2] \n"
998 "vdup.8 d26, d22[4] \n"
999 "vdup.8 d27, d22[6] \n"
1000 "vext.8 d4, d24, d25, #4 \n"
1001 "vext.8 d5, d26, d27, #4 \n" // f
1002 "veor.8 q10, q2, q3 \n" // 0x7f ^ f
1003 "vmull.u8 q11, d0, d20 \n" // a * (127 - f)
1004 "vmull.u8 q12, d1, d21 \n"
1005 "vmull.u8 q13, d2, d4 \n" // b * f
1006 "vmull.u8 q14, d3, d5 \n"
1007 "vadd.i16 q11, q11, q13 \n"
1008 "vadd.i16 q12, q12, q14 \n"
1009 "vshrn.i16 d0, q11, #7 \n"
1010 "vshrn.i16 d1, q12, #7 \n"
1011
1012 MEMACCESS(0)
1013 "vst1.32 {d0, d1}, [%0]! \n" // store pixels
1014 "vadd.s32 q8, q8, q9 \n" // advance all x by 4 * dx
1015 "subs %2, %2, #4 \n" // 4 processed per loop
1016 "bgt 1b \n"
1017 : "+r"(dst_argb), // %0
1018 "+r"(src_argb), // %1
1019 "+r"(dst_width), // %2
1020 "+r"(x), // %3
1021 "+r"(dx), // %4
1022 "+r"(tmp), // %5
1023 "+r"(src_tmp) // %6
1024 :
1025 : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
1026 "q10", "q11", "q12", "q13", "q14", "q15"
1027 );
1028 }
1029
1030 #undef LOAD2_DATA32_LANE
1031
760 #endif // defined(__ARM_NEON__) && !defined(__aarch64__) 1032 #endif // defined(__ARM_NEON__) && !defined(__aarch64__)
761 1033
762 #ifdef __cplusplus 1034 #ifdef __cplusplus
763 } // extern "C" 1035 } // extern "C"
764 } // namespace libyuv 1036 } // namespace libyuv
765 #endif 1037 #endif
OLDNEW
« no previous file with comments | « source/libvpx/third_party/libyuv/source/scale_gcc.cc ('k') | source/libvpx/third_party/libyuv/source/scale_neon64.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698