Chromium Code Reviews

Side by Side Diff: source/libvpx/third_party/libyuv/source/row_neon64.cc

Issue 554673004: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 3 months ago
1 /* 1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 806 matching lines...)
817 #endif // HAS_UYVYTOARGBROW_NEON 817 #endif // HAS_UYVYTOARGBROW_NEON
818 818
819 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v. 819 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
820 #ifdef HAS_SPLITUVROW_NEON 820 #ifdef HAS_SPLITUVROW_NEON
821 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 821 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
822 int width) { 822 int width) {
823 asm volatile ( 823 asm volatile (
824 ".p2align 2 \n" 824 ".p2align 2 \n"
825 "1: \n" 825 "1: \n"
826 MEMACCESS(0) 826 MEMACCESS(0)
827 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV 827 "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV
828 "subs %3, %3, #16 \n" // 16 processed per loop 828 "subs %3, %3, #16 \n" // 16 processed per loop
829 MEMACCESS(1) 829 MEMACCESS(1)
830 "vst1.8 {q0}, [%1]! \n" // store U 830 "st1 {v0.16b}, [%1], #16 \n" // store U
831 MEMACCESS(2) 831 MEMACCESS(2)
832 "vst1.8 {q1}, [%2]! \n" // store V 832 "st1 {v1.16b}, [%2], #16 \n" // store V
833 "bgt 1b \n" 833 "bgt 1b \n"
834 : "+r"(src_uv), // %0 834 : "+r"(src_uv), // %0
835 "+r"(dst_u), // %1 835 "+r"(dst_u), // %1
836 "+r"(dst_v), // %2 836 "+r"(dst_v), // %2
837 "+r"(width) // %3 // Output registers 837 "+r"(width) // %3 // Output registers
838 : // Input registers 838 : // Input registers
839 : "cc", "memory", "q0", "q1" // Clobber List 839 : "cc", "memory", "v0", "v1" // Clobber List
840 ); 840 );
841 } 841 }
842 #endif // HAS_SPLITUVROW_NEON 842 #endif // HAS_SPLITUVROW_NEON
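For readers less familiar with the ld2/st1 deinterleave idiom, here is a minimal scalar sketch of what SplitUVRow_NEON computes (and MergeUVRow_NEON below inverts). This is not libyuv's own C fallback; it assumes width is a multiple of 16, as the 16-pixels-per-iteration loop requires, and uses uint8_t in place of libyuv's uint8 typedef.

#include <stdint.h>

// Deinterleave packed UV bytes: even bytes go to the U plane, odd bytes to V.
static void SplitUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_u,
                              uint8_t* dst_v, int width) {
  for (int i = 0; i < width; ++i) {
    dst_u[i] = src_uv[2 * i];      // even byte of each pair -> U
    dst_v[i] = src_uv[2 * i + 1];  // odd byte of each pair  -> V
  }
}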
843 843
844 // Reads 16 U's and V's and writes out 16 pairs of UV. 844 // Reads 16 U's and V's and writes out 16 pairs of UV.
845 #ifdef HAS_MERGEUVROW_NEON 845 #ifdef HAS_MERGEUVROW_NEON
846 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 846 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
847 int width) { 847 int width) {
848 asm volatile ( 848 asm volatile (
849 ".p2align 2 \n" 849 ".p2align 2 \n"
850 "1: \n" 850 "1: \n"
851 MEMACCESS(0) 851 MEMACCESS(0)
852 "vld1.8 {q0}, [%0]! \n" // load U 852 "ld1 {v0.16b}, [%0], #16 \n" // load U
853 MEMACCESS(1) 853 MEMACCESS(1)
854 "vld1.8 {q1}, [%1]! \n" // load V 854 "ld1 {v1.16b}, [%1], #16 \n" // load V
855 "subs %3, %3, #16 \n" // 16 processed per loop 855 "subs %3, %3, #16 \n" // 16 processed per loop
856 MEMACCESS(2) 856 MEMACCESS(2)
857 "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV 857 "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV
858 "bgt 1b \n" 858 "bgt 1b \n"
859 : 859 :
860 "+r"(src_u), // %0 860 "+r"(src_u), // %0
861 "+r"(src_v), // %1 861 "+r"(src_v), // %1
862 "+r"(dst_uv), // %2 862 "+r"(dst_uv), // %2
863 "+r"(width) // %3 // Output registers 863 "+r"(width) // %3 // Output registers
864 : // Input registers 864 : // Input registers
865 : "cc", "memory", "q0", "q1" // Clobber List 865 : "cc", "memory", "v0", "v1" // Clobber List
866 ); 866 );
867 } 867 }
868 #endif // HAS_MERGEUVROW_NEON 868 #endif // HAS_MERGEUVROW_NEON
869 869
870 // Copy multiple of 32. vld4.8 allows unaligned access and is fastest on A15. 870 // Copy multiple of 32. vld4.8 allows unaligned access and is fastest on A15.
871 #ifdef HAS_COPYROW_NEON 871 #ifdef HAS_COPYROW_NEON
872 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { 872 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
873 asm volatile ( 873 asm volatile (
874 ".p2align 2 \n" 874 ".p2align 2 \n"
875 "1: \n" 875 "1: \n"
876 MEMACCESS(0) 876 MEMACCESS(0)
877 "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 877 "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32
878 "subs %2, %2, #32 \n" // 32 processed per loop 878 "subs %2, %2, #32 \n" // 32 processed per loop
879 MEMACCESS(1) 879 MEMACCESS(1)
880 "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 880 "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32
881 "bgt 1b \n" 881 "bgt 1b \n"
882 : "+r"(src), // %0 882 : "+r"(src), // %0
883 "+r"(dst), // %1 883 "+r"(dst), // %1
884 "+r"(count) // %2 // Output registers 884 "+r"(count) // %2 // Output registers
885 : // Input registers 885 : // Input registers
886 : "cc", "memory", "q0", "q1" // Clobber List 886 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
887 ); 887 );
888 } 888 }
889 #endif // HAS_COPYROW_NEON 889 #endif // HAS_COPYROW_NEON
890 890
891 // SetRow8 writes 'count' bytes using a 32 bit value repeated. 891 // SetRow8 writes 'count' bytes using a 32 bit value repeated.
892 #ifdef HAS_SETROW_NEON 892 #ifdef HAS_SETROW_NEON
893 void SetRow_NEON(uint8* dst, uint32 v32, int count) { 893 void SetRow_NEON(uint8* dst, uint32 v32, int count) {
894 asm volatile ( 894 asm volatile (
895 "vdup.u32 q0, %2 \n" // duplicate 4 ints 895 "dup v0.4s, %w2 \n" // duplicate 4 ints
896 "1: \n" 896 "1: \n"
897 "subs %1, %1, #16 \n" // 16 bytes per loop 897 "subs %1, %1, #16 \n" // 16 bytes per loop
898 MEMACCESS(0) 898 MEMACCESS(0)
899 "vst1.8 {q0}, [%0]! \n" // store 899 "st1 {v0.16b}, [%0], #16 \n" // store
900 "bgt 1b \n" 900 "bgt 1b \n"
901 : "+r"(dst), // %0 901 : "+r"(dst), // %0
902 "+r"(count) // %1 902 "+r"(count) // %1
903 : "r"(v32) // %2 903 : "r"(v32) // %2
904 : "cc", "memory", "q0" 904 : "cc", "memory", "v0"
905 ); 905 );
906 } 906 }
907 #endif // HAS_SETROW_NEON 907 #endif // HAS_SETROW_NEON
908 908
909 // TODO(fbarchard): Make fully assembler 909 // TODO(fbarchard): Make fully assembler
910 // SetRow32 writes 'count' words using a 32 bit value repeated. 910 // SetRow32 writes 'count' words using a 32 bit value repeated.
911 #ifdef HAS_ARGBSETROWS_NEON 911 #ifdef HAS_ARGBSETROWS_NEON
912 void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, 912 void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
913 int dst_stride, int height) { 913 int dst_stride, int height) {
914 for (int y = 0; y < height; ++y) { 914 for (int y = 0; y < height; ++y) {
915 SetRow_NEON(dst, v32, width << 2); 915 SetRow_NEON(dst, v32, width << 2);
916 dst += dst_stride; 916 dst += dst_stride;
917 } 917 }
918 } 918 }
919 #endif // HAS_ARGBSETROWS_NEON 919 #endif // HAS_ARGBSETROWS_NEON
920 920
921 #ifdef HAS_MIRRORROW_NEON 921 #ifdef HAS_MIRRORROW_NEON
922 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { 922 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
923 asm volatile ( 923 asm volatile (
924 // Start at end of source row. 924 // Start at end of source row.
925 "mov r3, #-16 \n"
926 "add %0, %0, %2 \n" 925 "add %0, %0, %2 \n"
927 "sub %0, #16 \n" 926 "sub %0, %0, #16 \n"
928 927
929 ".p2align 2 \n" 928 ".p2align 2 \n"
930 "1: \n" 929 "1: \n"
931 MEMACCESS(0) 930 MEMACCESS(0)
932 "vld1.8 {q0}, [%0], r3 \n" // src -= 16 931 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
933 "subs %2, #16 \n" // 16 pixels per loop. 932 "subs %2, %2, #16 \n" // 16 pixels per loop.
934 "vrev64.8 q0, q0 \n" 933 "rev64 v0.16b, v0.16b \n"
935 MEMACCESS(1) 934 MEMACCESS(1)
936 "vst1.8 {d1}, [%1]! \n" // dst += 16 935 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
937 MEMACCESS(1) 936 MEMACCESS(1)
938 "vst1.8 {d0}, [%1]! \n" 937 "st1 {v0.D}[0], [%1], #8 \n"
939 "bgt 1b \n" 938 "bgt 1b \n"
940 : "+r"(src), // %0 939 : "+r"(src), // %0
941 "+r"(dst), // %1 940 "+r"(dst), // %1
942 "+r"(width) // %2 941 "+r"(width) // %2
943 : 942 : "r"((ptrdiff_t)-16) // %3
944 : "cc", "memory", "r3", "q0" 943 : "cc", "memory", "v0"
945 ); 944 );
946 } 945 }
947 #endif // HAS_MIRRORROW_NEON 946 #endif // HAS_MIRRORROW_NEON
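The mirror loop walks the source backwards 16 bytes at a time (the -16 post-increment operand), byte-reverses each 64-bit half with rev64, and stores the high half before the low half so each 16-byte block comes out fully reversed. A scalar sketch of the net effect, assuming width is a multiple of 16:

#include <stdint.h>

// Write the source row reversed: dst[0] receives the last source byte.
static void MirrorRow_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}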
948 947
949 #ifdef HAS_MIRRORUVROW_NEON 948 #ifdef HAS_MIRRORUVROW_NEON
950 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 949 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
951 int width) { 950 int width) {
952 asm volatile ( 951 asm volatile (
953 // Start at end of source row. 952 // Start at end of source row.
954 "mov r12, #-16 \n"
955 "add %0, %0, %3, lsl #1 \n" 953 "add %0, %0, %3, lsl #1 \n"
956 "sub %0, #16 \n" 954 "sub %0, %0, #16 \n"
957 955
958 ".p2align 2 \n" 956 ".p2align 2 \n"
959 "1: \n" 957 "1: \n"
960 MEMACCESS(0) 958 MEMACCESS(0)
961 "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 959 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
962 "subs %3, #8 \n" // 8 pixels per loop. 960 "subs %3, %3, #8 \n" // 8 pixels per loop.
963 "vrev64.8 q0, q0 \n" 961 "rev64 v0.8b, v0.8b \n"
962 "rev64 v1.8b, v1.8b \n"
964 MEMACCESS(1) 963 MEMACCESS(1)
965 "vst1.8 {d0}, [%1]! \n" // dst += 8 964 "st1 {v0.8b}, [%1], #8 \n" // dst += 8
966 MEMACCESS(2) 965 MEMACCESS(2)
967 "vst1.8 {d1}, [%2]! \n" 966 "st1 {v1.8b}, [%2], #8 \n"
968 "bgt 1b \n" 967 "bgt 1b \n"
969 : "+r"(src_uv), // %0 968 : "+r"(src_uv), // %0
970 "+r"(dst_u), // %1 969 "+r"(dst_u), // %1
971 "+r"(dst_v), // %2 970 "+r"(dst_v), // %2
972 "+r"(width) // %3 971 "+r"(width) // %3
973 : 972 : "r"((ptrdiff_t)-16) // %4
974 : "cc", "memory", "r12", "q0" 973 : "cc", "memory", "v0", "v1"
975 ); 974 );
976 } 975 }
977 #endif // HAS_MIRRORUVROW_NEON 976 #endif // HAS_MIRRORUVROW_NEON
978 977
979 #ifdef HAS_ARGBMIRRORROW_NEON 978 #ifdef HAS_ARGBMIRRORROW_NEON
980 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { 979 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
981 asm volatile ( 980 asm volatile (
982 // Start at end of source row. 981 // Start at end of source row.
983 "mov r3, #-16 \n"
984 "add %0, %0, %2, lsl #2 \n" 982 "add %0, %0, %2, lsl #2 \n"
985 "sub %0, #16 \n" 983 "sub %0, %0, #16 \n"
986 984
987 ".p2align 2 \n" 985 ".p2align 2 \n"
988 "1: \n" 986 "1: \n"
989 MEMACCESS(0) 987 MEMACCESS(0)
990 "vld1.8 {q0}, [%0], r3 \n" // src -= 16 988 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
991 "subs %2, #4 \n" // 4 pixels per loop. 989 "subs %2, %2, #4 \n" // 4 pixels per loop.
992 "vrev64.32 q0, q0 \n" 990 "rev64 v0.4s, v0.4s \n"
993 MEMACCESS(1) 991 MEMACCESS(1)
994 "vst1.8 {d1}, [%1]! \n" // dst += 16 992 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
995 MEMACCESS(1) 993 MEMACCESS(1)
996 "vst1.8 {d0}, [%1]! \n" 994 "st1 {v0.D}[0], [%1], #8 \n"
997 "bgt 1b \n" 995 "bgt 1b \n"
998 : "+r"(src), // %0 996 : "+r"(src), // %0
999 "+r"(dst), // %1 997 "+r"(dst), // %1
1000 "+r"(width) // %2 998 "+r"(width) // %2
1001 : 999 : "r"((ptrdiff_t)-16) // %3
1002 : "cc", "memory", "r3", "q0" 1000 : "cc", "memory", "v0"
1003 ); 1001 );
1004 } 1002 }
1005 #endif // HAS_ARGBMIRRORROW_NEON 1003 #endif // HAS_ARGBMIRRORROW_NEON
1006 1004
1007 #ifdef HAS_RGB24TOARGBROW_NEON 1005 #ifdef HAS_RGB24TOARGBROW_NEON
1008 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { 1006 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
1009 asm volatile ( 1007 asm volatile (
1010 "vmov.u8 d4, #255 \n" // Alpha 1008 "movi v4.8b, #255 \n" // Alpha
1011 ".p2align 2 \n" 1009 ".p2align 2 \n"
1012 "1: \n" 1010 "1: \n"
1013 MEMACCESS(0) 1011 MEMACCESS(0)
1014 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. 1012 "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
1015 "subs %2, %2, #8 \n" // 8 processed per loop. 1013 "subs %2, %2, #8 \n" // 8 processed per loop.
1016 MEMACCESS(1) 1014 MEMACCESS(1)
1017 "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. 1015 "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
1018 "bgt 1b \n" 1016 "bgt 1b \n"
1019 : "+r"(src_rgb24), // %0 1017 : "+r"(src_rgb24), // %0
1020 "+r"(dst_argb), // %1 1018 "+r"(dst_argb), // %1
1021 "+r"(pix) // %2 1019 "+r"(pix) // %2
1022 : 1020 :
1023 : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List 1021 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
1024 ); 1022 );
1025 } 1023 }
1026 #endif // HAS_RGB24TOARGBROW_NEON 1024 #endif // HAS_RGB24TOARGBROW_NEON
1027 1025
1028 #ifdef HAS_RAWTOARGBROW_NEON 1026 #ifdef HAS_RAWTOARGBROW_NEON
1029 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { 1027 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
1030 asm volatile ( 1028 asm volatile (
1031 "vmov.u8 d4, #255 \n" // Alpha 1029 "movi v5.8b, #255 \n" // Alpha
1032 ".p2align 2 \n" 1030 ".p2align 2 \n"
1033 "1: \n" 1031 "1: \n"
1034 MEMACCESS(0) 1032 MEMACCESS(0)
1035 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. 1033 "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b
1036 "subs %2, %2, #8 \n" // 8 processed per loop. 1034 "subs %2, %2, #8 \n" // 8 processed per loop.
1037 "vswp.u8 d1, d3 \n" // swap R, B 1035 "mov v3.8b, v1.8b \n" // move g
1036 "mov v4.8b, v0.8b \n" // move r
1038 MEMACCESS(1) 1037 MEMACCESS(1)
1039 "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. 1038 "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a
1040 "bgt 1b \n" 1039 "bgt 1b \n"
1041 : "+r"(src_raw), // %0 1040 : "+r"(src_raw), // %0
1042 "+r"(dst_argb), // %1 1041 "+r"(dst_argb), // %1
1043 "+r"(pix) // %2 1042 "+r"(pix) // %2
1044 : 1043 :
1045 : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List 1044 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
1046 ); 1045 );
1047 } 1046 }
1048 #endif // HAS_RAWTOARGBROW_NEON 1047 #endif // HAS_RAWTOARGBROW_NEON
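Going by the register comments above ("read r g b", "store b g r a"), RAW is 3 bytes per pixel in r,g,b memory order while libyuv's ARGB is 4 bytes per pixel stored b,g,r,a. A scalar sketch of the conversion (a hypothetical helper, not libyuv's own C path):

#include <stdint.h>

static void RAWToARGBRow_Sketch(const uint8_t* src_raw, uint8_t* dst_argb,
                                int pix) {
  for (int i = 0; i < pix; ++i) {
    uint8_t r = src_raw[0];
    uint8_t g = src_raw[1];
    uint8_t b = src_raw[2];
    dst_argb[0] = b;
    dst_argb[1] = g;
    dst_argb[2] = r;
    dst_argb[3] = 255;  // opaque alpha, matching the movi #255 above
    src_raw += 3;
    dst_argb += 4;
  }
}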
1049 1048
1050 #define RGB565TOARGB \ 1049 #define RGB565TOARGB \
1051 "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ 1050 "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
1052 "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ 1051 "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
1053 "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ 1052 "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
1054 "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ 1053 "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
1055 "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ 1054 "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
(...skipping 107 matching lines...)
1163 ); 1162 );
1164 } 1163 }
1165 #endif // HAS_ARGB4444TOARGBROW_NEON 1164 #endif // HAS_ARGB4444TOARGBROW_NEON
1166 1165
1167 #ifdef HAS_ARGBTORGB24ROW_NEON 1166 #ifdef HAS_ARGBTORGB24ROW_NEON
1168 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { 1167 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
1169 asm volatile ( 1168 asm volatile (
1170 ".p2align 2 \n" 1169 ".p2align 2 \n"
1171 "1: \n" 1170 "1: \n"
1172 MEMACCESS(0) 1171 MEMACCESS(0)
1173 "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. 1172 "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
1174 "subs %2, %2, #8 \n" // 8 processed per loop. 1173 "subs %2, %2, #8 \n" // 8 processed per loop.
1175 MEMACCESS(1) 1174 MEMACCESS(1)
1176 "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. 1175 "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
1177 "bgt 1b \n" 1176 "bgt 1b \n"
1178 : "+r"(src_argb), // %0 1177 : "+r"(src_argb), // %0
1179 "+r"(dst_rgb24), // %1 1178 "+r"(dst_rgb24), // %1
1180 "+r"(pix) // %2 1179 "+r"(pix) // %2
1181 : 1180 :
1182 : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List 1181 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
1183 ); 1182 );
1184 } 1183 }
1185 #endif // HAS_ARGBTORGB24ROW_NEON 1184 #endif // HAS_ARGBTORGB24ROW_NEON
1186 1185
1187 #ifdef HAS_ARGBTORAWROW_NEON 1186 #ifdef HAS_ARGBTORAWROW_NEON
1188 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { 1187 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
1189 asm volatile ( 1188 asm volatile (
1190 ".p2align 2 \n" 1189 ".p2align 2 \n"
1191 "1: \n" 1190 "1: \n"
1192 MEMACCESS(0) 1191 MEMACCESS(0)
1193 "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. 1192 "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a
1194 "subs %2, %2, #8 \n" // 8 processed per loop. 1193 "subs %2, %2, #8 \n" // 8 processed per loop.
1195 "vswp.u8 d1, d3 \n" // swap R, B 1194 "mov v4.8b, v2.8b \n" // mov g
1195 "mov v5.8b, v1.8b \n" // mov b
1196 MEMACCESS(1) 1196 MEMACCESS(1)
1197 "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. 1197 "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b
1198 "bgt 1b \n" 1198 "bgt 1b \n"
1199 : "+r"(src_argb), // %0 1199 : "+r"(src_argb), // %0
1200 "+r"(dst_raw), // %1 1200 "+r"(dst_raw), // %1
1201 "+r"(pix) // %2 1201 "+r"(pix) // %2
1202 : 1202 :
1203 : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List 1203 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
1204 ); 1204 );
1205 } 1205 }
1206 #endif // HAS_ARGBTORAWROW_NEON 1206 #endif // HAS_ARGBTORAWROW_NEON
1207 1207
1208 #ifdef HAS_YUY2TOYROW_NEON 1208 #ifdef HAS_YUY2TOYROW_NEON
1209 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { 1209 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
1210 asm volatile ( 1210 asm volatile (
1211 ".p2align 2 \n" 1211 ".p2align 2 \n"
1212 "1: \n" 1212 "1: \n"
1213 MEMACCESS(0) 1213 MEMACCESS(0)
1214 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. 1214 "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
1215 "subs %2, %2, #16 \n" // 16 processed per loop. 1215 "subs %2, %2, #16 \n" // 16 processed per loop.
1216 MEMACCESS(1) 1216 MEMACCESS(1)
1217 "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. 1217 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
1218 "bgt 1b \n" 1218 "bgt 1b \n"
1219 : "+r"(src_yuy2), // %0 1219 : "+r"(src_yuy2), // %0
1220 "+r"(dst_y), // %1 1220 "+r"(dst_y), // %1
1221 "+r"(pix) // %2 1221 "+r"(pix) // %2
1222 : 1222 :
1223 : "cc", "memory", "q0", "q1" // Clobber List 1223 : "cc", "memory", "v0", "v1" // Clobber List
1224 ); 1224 );
1225 } 1225 }
1226 #endif // HAS_YUY2TOYROW_NEON 1226 #endif // HAS_YUY2TOYROW_NEON
1227 1227
1228 #ifdef HAS_UYVYTOYROW_NEON 1228 #ifdef HAS_UYVYTOYROW_NEON
1229 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { 1229 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
1230 asm volatile ( 1230 asm volatile (
1231 ".p2align 2 \n" 1231 ".p2align 2 \n"
1232 "1: \n" 1232 "1: \n"
1233 MEMACCESS(0) 1233 MEMACCESS(0)
1234 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. 1234 "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
1235 "subs %2, %2, #16 \n" // 16 processed per loop. 1235 "subs %2, %2, #16 \n" // 16 processed per loop.
1236 MEMACCESS(1) 1236 MEMACCESS(1)
1237 "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. 1237 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
1238 "bgt 1b \n" 1238 "bgt 1b \n"
1239 : "+r"(src_uyvy), // %0 1239 : "+r"(src_uyvy), // %0
1240 "+r"(dst_y), // %1 1240 "+r"(dst_y), // %1
1241 "+r"(pix) // %2 1241 "+r"(pix) // %2
1242 : 1242 :
1243 : "cc", "memory", "q0", "q1" // Clobber List 1243 : "cc", "memory", "v0", "v1" // Clobber List
1244 ); 1244 );
1245 } 1245 }
1246 #endif // HAS_UYVYTOYROW_NEON 1246 #endif // HAS_UYVYTOYROW_NEON
1247 1247
1248 #ifdef HAS_YUY2TOUV422ROW_NEON 1248 #ifdef HAS_YUY2TOUV422ROW_NEON
1249 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, 1249 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
1250 int pix) { 1250 int pix) {
1251 asm volatile ( 1251 asm volatile (
1252 ".p2align 2 \n" 1252 ".p2align 2 \n"
1253 "1: \n" 1253 "1: \n"
1254 MEMACCESS(0) 1254 MEMACCESS(0)
1255 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. 1255 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
1256 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. 1256 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
1257 MEMACCESS(1) 1257 MEMACCESS(1)
1258 "vst1.8 {d1}, [%1]! \n" // store 8 U. 1258 "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
1259 MEMACCESS(2) 1259 MEMACCESS(2)
1260 "vst1.8 {d3}, [%2]! \n" // store 8 V. 1260 "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
1261 "bgt 1b \n" 1261 "bgt 1b \n"
1262 : "+r"(src_yuy2), // %0 1262 : "+r"(src_yuy2), // %0
1263 "+r"(dst_u), // %1 1263 "+r"(dst_u), // %1
1264 "+r"(dst_v), // %2 1264 "+r"(dst_v), // %2
1265 "+r"(pix) // %3 1265 "+r"(pix) // %3
1266 : 1266 :
1267 : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List 1267 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1268 ); 1268 );
1269 } 1269 }
1270 #endif // HAS_YUY2TOUV422ROW_NEON 1270 #endif // HAS_YUY2TOUV422ROW_NEON
1271 1271
1272 #ifdef HAS_UYVYTOUV422ROW_NEON 1272 #ifdef HAS_UYVYTOUV422ROW_NEON
1273 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, 1273 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
1274 int pix) { 1274 int pix) {
1275 asm volatile ( 1275 asm volatile (
1276 ".p2align 2 \n" 1276 ".p2align 2 \n"
1277 "1: \n" 1277 "1: \n"
1278 MEMACCESS(0) 1278 MEMACCESS(0)
1279 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. 1279 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
1280 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. 1280 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
1281 MEMACCESS(1) 1281 MEMACCESS(1)
1282 "vst1.8 {d0}, [%1]! \n" // store 8 U. 1282 "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
1283 MEMACCESS(2) 1283 MEMACCESS(2)
1284 "vst1.8 {d2}, [%2]! \n" // store 8 V. 1284 "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
1285 "bgt 1b \n" 1285 "bgt 1b \n"
1286 : "+r"(src_uyvy), // %0 1286 : "+r"(src_uyvy), // %0
1287 "+r"(dst_u), // %1 1287 "+r"(dst_u), // %1
1288 "+r"(dst_v), // %2 1288 "+r"(dst_v), // %2
1289 "+r"(pix) // %3 1289 "+r"(pix) // %3
1290 : 1290 :
1291 : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List 1291 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1292 ); 1292 );
1293 } 1293 }
1294 #endif // HAS_UYVYTOUV422ROW_NEON 1294 #endif // HAS_UYVYTOUV422ROW_NEON
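Both of these rely on the packed 4:2:2 byte order: YUY2 stores each pixel pair as Y0,U,Y1,V and UYVY as U,Y0,V,Y1, so one U and one V are shared by two pixels. A scalar sketch for the YUY2 case (pix assumed a multiple of 16, as the NEON loop requires):

#include <stdint.h>

static void YUY2ToUV422Row_Sketch(const uint8_t* src_yuy2, uint8_t* dst_u,
                                  uint8_t* dst_v, int pix) {
  for (int i = 0; i < pix; i += 2) {  // 2 pixels per 4-byte YUY2 group
    *dst_u++ = src_yuy2[1];
    *dst_v++ = src_yuy2[3];
    src_yuy2 += 4;
  }
}

For UYVY the byte offsets are simply 0 and 2 instead of 1 and 3.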
1295 1295
1296 #ifdef HAS_YUY2TOUVROW_NEON 1296 #ifdef HAS_YUY2TOUVROW_NEON
1297 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, 1297 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
1298 uint8* dst_u, uint8* dst_v, int pix) { 1298 uint8* dst_u, uint8* dst_v, int pix) {
1299 asm volatile ( 1299 asm volatile (
1300 "add %1, %0, %1 \n" // stride + src_yuy2 1300 "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2
1301 ".p2align 2 \n" 1301 ".p2align 2 \n"
1302 "1: \n" 1302 "1: \n"
1303 MEMACCESS(0) 1303 MEMACCESS(0)
1304 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. 1304 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
1305 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. 1305 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
1306 MEMACCESS(1) 1306 MEMACCESS(1)
1307 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. 1307 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2.
1308 "vrhadd.u8 d1, d1, d5 \n" // average rows of U 1308 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
1309 "vrhadd.u8 d3, d3, d7 \n" // average rows of V 1309 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
1310 MEMACCESS(2) 1310 MEMACCESS(2)
1311 "vst1.8 {d1}, [%2]! \n" // store 8 U. 1311 "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
1312 MEMACCESS(3) 1312 MEMACCESS(3)
1313 "vst1.8 {d3}, [%3]! \n" // store 8 V. 1313 "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
1314 "bgt 1b \n" 1314 "bgt 1b \n"
1315 : "+r"(src_yuy2), // %0 1315 : "+r"(src_yuy2), // %0
1316 "+r"(stride_yuy2), // %1 1316 "+r"(stride_yuy2), // %1
1317 "+r"(dst_u), // %2 1317 "+r"(dst_u), // %2
1318 "+r"(dst_v), // %3 1318 "+r"(dst_v), // %3
1319 "+r"(pix) // %4 1319 "+r"(pix) // %4
1320 : 1320 :
1321 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber L ist 1321 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber L ist
1322 ); 1322 );
1323 } 1323 }
1324 #endif // HAS_YUY2TOUVROW_NEON 1324 #endif // HAS_YUY2TOUVROW_NEON
1325 1325
1326 #ifdef HAS_UYVYTOUVROW_NEON 1326 #ifdef HAS_UYVYTOUVROW_NEON
1327 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, 1327 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
1328 uint8* dst_u, uint8* dst_v, int pix) { 1328 uint8* dst_u, uint8* dst_v, int pix) {
1329 asm volatile ( 1329 asm volatile (
1330 "add %1, %0, %1 \n" // stride + src_uyvy 1330 "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy
1331 ".p2align 2 \n" 1331 ".p2align 2 \n"
1332 "1: \n" 1332 "1: \n"
1333 MEMACCESS(0) 1333 MEMACCESS(0)
1334 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. 1334 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
1335 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. 1335 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
1336 MEMACCESS(1) 1336 MEMACCESS(1)
1337 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. 1337 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY.
1338 "vrhadd.u8 d0, d0, d4 \n" // average rows of U 1338 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
1339 "vrhadd.u8 d2, d2, d6 \n" // average rows of V 1339 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
1340 MEMACCESS(2) 1340 MEMACCESS(2)
1341 "vst1.8 {d0}, [%2]! \n" // store 8 U. 1341 "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
1342 MEMACCESS(3) 1342 MEMACCESS(3)
1343 "vst1.8 {d2}, [%3]! \n" // store 8 V. 1343 "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
1344 "bgt 1b \n" 1344 "bgt 1b \n"
1345 : "+r"(src_uyvy), // %0 1345 : "+r"(src_uyvy), // %0
1346 "+r"(stride_uyvy), // %1 1346 "+r"(stride_uyvy), // %1
1347 "+r"(dst_u), // %2 1347 "+r"(dst_u), // %2
1348 "+r"(dst_v), // %3 1348 "+r"(dst_v), // %3
1349 "+r"(pix) // %4 1349 "+r"(pix) // %4
1350 : 1350 :
1351 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber L ist 1351 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber L ist
1352 ); 1352 );
1353 } 1353 }
1354 #endif // HAS_UYVYTOUVROW_NEON 1354 #endif // HAS_UYVYTOUVROW_NEON
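urhadd is an unsigned rounding halving add, i.e. (a + b + 1) >> 1, so these two-row variants average the chroma of the current row and the row one stride below before storing it. A scalar sketch for YUY2 (not libyuv's C path; pix assumed a multiple of 16):

#include <stdint.h>

static void YUY2ToUVRow_Sketch(const uint8_t* src_yuy2, int stride_yuy2,
                               uint8_t* dst_u, uint8_t* dst_v, int pix) {
  const uint8_t* next = src_yuy2 + stride_yuy2;  // second source row
  for (int i = 0; i < pix; i += 2) {
    *dst_u++ = (uint8_t)((src_yuy2[1] + next[1] + 1) >> 1);  // urhadd of U
    *dst_v++ = (uint8_t)((src_yuy2[3] + next[3] + 1) >> 1);  // urhadd of V
    src_yuy2 += 4;
    next += 4;
  }
}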
1355 1355
1356 #ifdef HAS_HALFROW_NEON 1356 #ifdef HAS_HALFROW_NEON
1357 void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, 1357 void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
1358 uint8* dst_uv, int pix) { 1358 uint8* dst_uv, int pix) {
1359 asm volatile ( 1359 asm volatile (
1360 // change the stride to row 2 pointer 1360 // change the stride to row 2 pointer
1361 "add %1, %0 \n" 1361 "add %x1, %x0, %w1, sxtw \n"
1362 "1: \n" 1362 "1: \n"
1363 MEMACCESS(0) 1363 MEMACCESS(0)
1364 "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. 1364 "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels.
1365 "subs %3, %3, #16 \n" // 16 processed per loop 1365 "subs %3, %3, #16 \n" // 16 processed per loop
1366 MEMACCESS(1) 1366 MEMACCESS(1)
1367 "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. 1367 "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels.
1368 "vrhadd.u8 q0, q1 \n" // average row 1 and 2 1368 "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2
1369 MEMACCESS(2) 1369 MEMACCESS(2)
1370 "vst1.8 {q0}, [%2]! \n" 1370 "st1 {v0.16b}, [%2], #16 \n"
1371 "bgt 1b \n" 1371 "bgt 1b \n"
1372 : "+r"(src_uv), // %0 1372 : "+r"(src_uv), // %0
1373 "+r"(src_uv_stride), // %1 1373 "+r"(src_uv_stride), // %1
1374 "+r"(dst_uv), // %2 1374 "+r"(dst_uv), // %2
1375 "+r"(pix) // %3 1375 "+r"(pix) // %3
1376 : 1376 :
1377 : "cc", "memory", "q0", "q1" // Clobber List 1377 : "cc", "memory", "v0", "v1" // Clobber List
1378 ); 1378 );
1379 } 1379 }
1380 #endif // HAS_HALFROW_NEON 1380 #endif // HAS_HALFROW_NEON
1381 1381
1382 // Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG 1382 // Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
1383 #ifdef HAS_ARGBTOBAYERROW_NEON 1383 #ifdef HAS_ARGBTOBAYERROW_NEON
1384 void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, 1384 void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
1385 uint32 selector, int pix) { 1385 uint32 selector, int pix) {
1386 asm volatile ( 1386 asm volatile (
1387 "vmov.u32 d6[0], %3 \n" // selector 1387 "mov v2.s[0], %w3 \n" // selector
1388 "1: \n" 1388 "1: \n"
1389 MEMACCESS(0) 1389 MEMACCESS(0)
1390 "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. 1390 "ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels.
1391 "subs %2, %2, #8 \n" // 8 processed per loop 1391 "subs %2, %2, #8 \n" // 8 processed per loop
1392 "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels 1392 "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels
1393 "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels 1393 "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels
1394 "vtrn.u32 d4, d5 \n" // combine 8 pixels 1394 "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels
1395 MEMACCESS(1) 1395 MEMACCESS(1)
1396 "vst1.8 {d4}, [%1]! \n" // store 8. 1396 "st1 {v4.8b}, [%1], #8 \n" // store 8.
1397 "bgt 1b \n" 1397 "bgt 1b \n"
1398 : "+r"(src_argb), // %0 1398 : "+r"(src_argb), // %0
1399 "+r"(dst_bayer), // %1 1399 "+r"(dst_bayer), // %1
1400 "+r"(pix) // %2 1400 "+r"(pix) // %2
1401 : "r"(selector) // %3 1401 : "r"(selector) // %3
1402 : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List 1402 : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List
1403 ); 1403 );
1404 } 1404 }
1405 #endif // HAS_ARGBTOBAYERROW_NEON 1405 #endif // HAS_ARGBTOBAYERROW_NEON
1406 1406
1407 // Select G channels from ARGB. e.g. GGGGGGGG 1407 // Select G channels from ARGB. e.g. GGGGGGGG
1408 #ifdef HAS_ARGBTOBAYERGGROW_NEON 1408 #ifdef HAS_ARGBTOBAYERGGROW_NEON
1409 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, 1409 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
1410 uint32 /*selector*/, int pix) { 1410 uint32 /*selector*/, int pix) {
1411 asm volatile ( 1411 asm volatile (
1412 "1: \n" 1412 "1: \n"
1413 MEMACCESS(0) 1413 MEMACCESS(0)
1414 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. 1414 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels.
1415 "subs %2, %2, #8 \n" // 8 processed per loop 1415 "subs %2, %2, #8 \n" // 8 processed per loop
1416 MEMACCESS(1) 1416 MEMACCESS(1)
1417 "vst1.8 {d1}, [%1]! \n" // store 8 G's. 1417 "st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
1418 "bgt 1b \n" 1418 "bgt 1b \n"
1419 : "+r"(src_argb), // %0 1419 : "+r"(src_argb), // %0
1420 "+r"(dst_bayer), // %1 1420 "+r"(dst_bayer), // %1
1421 "+r"(pix) // %2 1421 "+r"(pix) // %2
1422 : 1422 :
1423 : "cc", "memory", "q0", "q1" // Clobber List 1423 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1424 ); 1424 );
1425 } 1425 }
1426 #endif // HAS_ARGBTOBAYERGGROW_NEON 1426 #endif // HAS_ARGBTOBAYERGGROW_NEON
1427 1427
1428 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 1428 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
1429 #ifdef HAS_ARGBSHUFFLEROW_NEON 1429 #ifdef HAS_ARGBSHUFFLEROW_NEON
1430 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, 1430 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
1431 const uint8* shuffler, int pix) { 1431 const uint8* shuffler, int pix) {
1432 asm volatile ( 1432 asm volatile (
1433 MEMACCESS(3) 1433 MEMACCESS(3)
1434 "vld1.8 {q2}, [%3] \n" // shuffler 1434 "ld1 {v2.16b}, [%3] \n" // shuffler
1435 "1: \n" 1435 "1: \n"
1436 MEMACCESS(0) 1436 MEMACCESS(0)
1437 "vld1.8 {q0}, [%0]! \n" // load 4 pixels. 1437 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
1438 "subs %2, %2, #4 \n" // 4 processed per loop 1438 "subs %2, %2, #4 \n" // 4 processed per loop
1439 "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels 1439 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
1440 "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
1441 MEMACCESS(1) 1440 MEMACCESS(1)
1442 "vst1.8 {q1}, [%1]! \n" // store 4. 1441 "st1 {v1.16b}, [%1], #16 \n" // store 4.
1443 "bgt 1b \n" 1442 "bgt 1b \n"
1444 : "+r"(src_argb), // %0 1443 : "+r"(src_argb), // %0
1445 "+r"(dst_argb), // %1 1444 "+r"(dst_argb), // %1
1446 "+r"(pix) // %2 1445 "+r"(pix) // %2
1447 : "r"(shuffler) // %3 1446 : "r"(shuffler) // %3
1448 : "cc", "memory", "q0", "q1", "q2" // Clobber List 1447 : "cc", "memory", "v0", "v1", "v2" // Clobber List
1449 ); 1448 );
1450 } 1449 }
1451 #endif // HAS_ARGBSHUFFLEROW_NEON 1450 #endif // HAS_ARGBSHUFFLEROW_NEON
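tbl is a byte-wise table lookup: within each 16-byte (4-pixel) group, output byte j becomes input byte shuffler[j]. A scalar sketch, assuming every shuffler index is in [0, 15] and pix is a multiple of 4:

#include <stdint.h>

static void ARGBShuffleRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                  const uint8_t* shuffler, int pix) {
  for (int i = 0; i < pix; i += 4) {   // 4 ARGB pixels = 16 bytes per group
    for (int j = 0; j < 16; ++j) {
      dst_argb[j] = src_argb[shuffler[j]];
    }
    src_argb += 16;
    dst_argb += 16;
  }
}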
1452 1451
1453 #ifdef HAS_I422TOYUY2ROW_NEON 1452 #ifdef HAS_I422TOYUY2ROW_NEON
1454 void I422ToYUY2Row_NEON(const uint8* src_y, 1453 void I422ToYUY2Row_NEON(const uint8* src_y,
1455 const uint8* src_u, 1454 const uint8* src_u,
1456 const uint8* src_v, 1455 const uint8* src_v,
1457 uint8* dst_yuy2, int width) { 1456 uint8* dst_yuy2, int width) {
1458 asm volatile ( 1457 asm volatile (
1459 ".p2align 2 \n" 1458 ".p2align 2 \n"
1460 "1: \n" 1459 "1: \n"
1461 MEMACCESS(0) 1460 MEMACCESS(0)
1462 "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys 1461 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
1462 "mov v2.8b, v1.8b \n"
1463 MEMACCESS(1) 1463 MEMACCESS(1)
1464 "vld1.8 {d1}, [%1]! \n" // load 8 Us 1464 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
1465 MEMACCESS(2) 1465 MEMACCESS(2)
1466 "vld1.8 {d3}, [%2]! \n" // load 8 Vs 1466 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
1467 "subs %4, %4, #16 \n" // 16 pixels 1467 "subs %4, %4, #16 \n" // 16 pixels
1468 MEMACCESS(3) 1468 MEMACCESS(3)
1469 "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. 1469 "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels.
1470 "bgt 1b \n" 1470 "bgt 1b \n"
1471 : "+r"(src_y), // %0 1471 : "+r"(src_y), // %0
1472 "+r"(src_u), // %1 1472 "+r"(src_u), // %1
1473 "+r"(src_v), // %2 1473 "+r"(src_v), // %2
1474 "+r"(dst_yuy2), // %3 1474 "+r"(dst_yuy2), // %3
1475 "+r"(width) // %4 1475 "+r"(width) // %4
1476 : 1476 :
1477 : "cc", "memory", "d0", "d1", "d2", "d3" 1477 : "cc", "memory", "v0", "v1", "v2", "v3"
1478 ); 1478 );
1479 } 1479 }
1480 #endif // HAS_I422TOYUY2ROW_NEON 1480 #endif // HAS_I422TOYUY2ROW_NEON
1481 1481
1482 #ifdef HAS_I422TOUYVYROW_NEON 1482 #ifdef HAS_I422TOUYVYROW_NEON
1483 void I422ToUYVYRow_NEON(const uint8* src_y, 1483 void I422ToUYVYRow_NEON(const uint8* src_y,
1484 const uint8* src_u, 1484 const uint8* src_u,
1485 const uint8* src_v, 1485 const uint8* src_v,
1486 uint8* dst_uyvy, int width) { 1486 uint8* dst_uyvy, int width) {
1487 asm volatile ( 1487 asm volatile (
1488 ".p2align 2 \n" 1488 ".p2align 2 \n"
1489 "1: \n" 1489 "1: \n"
1490 MEMACCESS(0) 1490 MEMACCESS(0)
1491 "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys 1491 "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys
1492 "mov v3.8b, v2.8b \n"
1492 MEMACCESS(1) 1493 MEMACCESS(1)
1493 "vld1.8 {d0}, [%1]! \n" // load 8 Us 1494 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
1494 MEMACCESS(2) 1495 MEMACCESS(2)
1495 "vld1.8 {d2}, [%2]! \n" // load 8 Vs 1496 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
1496 "subs %4, %4, #16 \n" // 16 pixels 1497 "subs %4, %4, #16 \n" // 16 pixels
1497 MEMACCESS(3) 1498 MEMACCESS(3)
1498 "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. 1499 "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels.
1499 "bgt 1b \n" 1500 "bgt 1b \n"
1500 : "+r"(src_y), // %0 1501 : "+r"(src_y), // %0
1501 "+r"(src_u), // %1 1502 "+r"(src_u), // %1
1502 "+r"(src_v), // %2 1503 "+r"(src_v), // %2
1503 "+r"(dst_uyvy), // %3 1504 "+r"(dst_uyvy), // %3
1504 "+r"(width) // %4 1505 "+r"(width) // %4
1505 : 1506 :
1506 : "cc", "memory", "d0", "d1", "d2", "d3" 1507 : "cc", "memory", "v0", "v1", "v2", "v3"
1507 ); 1508 );
1508 } 1509 }
1509 #endif // HAS_I422TOUYVYROW_NEON 1510 #endif // HAS_I422TOUYVYROW_NEON
1510 1511
1511 #ifdef HAS_ARGBTORGB565ROW_NEON 1512 #ifdef HAS_ARGBTORGB565ROW_NEON
1512 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { 1513 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
1513 asm volatile ( 1514 asm volatile (
1514 ".p2align 2 \n" 1515 ".p2align 2 \n"
1515 "1: \n" 1516 "1: \n"
1516 MEMACCESS(0) 1517 MEMACCESS(0)
(...skipping 53 matching lines...)
1570 "+r"(pix) // %2 1571 "+r"(pix) // %2
1571 : 1572 :
1572 : "cc", "memory", "q0", "q8", "q9", "q10", "q11" 1573 : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
1573 ); 1574 );
1574 } 1575 }
1575 #endif // HAS_ARGBTOARGB4444ROW_NEON 1576 #endif // HAS_ARGBTOARGB4444ROW_NEON
1576 1577
1577 #ifdef HAS_ARGBTOYROW_NEON 1578 #ifdef HAS_ARGBTOYROW_NEON
1578 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { 1579 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
1579 asm volatile ( 1580 asm volatile (
1580 "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient 1581 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
1581 "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient 1582 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
1582 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient 1583 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
1583 "vmov.u8 d27, #16 \n" // Add 16 constant 1584 "movi v7.8b, #16 \n" // Add 16 constant
1584 ".p2align 2 \n" 1585 ".p2align 2 \n"
1585 "1: \n" 1586 "1: \n"
1586 MEMACCESS(0) 1587 MEMACCESS(0)
1587 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 1588 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
1588 "subs %2, %2, #8 \n" // 8 processed per loop. 1589 "subs %2, %2, #8 \n" // 8 processed per loop.
1589 "vmull.u8 q2, d0, d24 \n" // B 1590 "umull v3.8h, v0.8b, v4.8b \n" // B
1590 "vmlal.u8 q2, d1, d25 \n" // G 1591 "umlal v3.8h, v1.8b, v5.8b \n" // G
1591 "vmlal.u8 q2, d2, d26 \n" // R 1592 "umlal v3.8h, v2.8b, v6.8b \n" // R
1592 "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y 1593 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
1593 "vqadd.u8 d0, d27 \n" 1594 "uqadd v0.8b, v0.8b, v7.8b \n"
1594 MEMACCESS(1) 1595 MEMACCESS(1)
1595 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 1596 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1596 "bgt 1b \n" 1597 "bgt 1b \n"
1597 : "+r"(src_argb), // %0 1598 : "+r"(src_argb), // %0
1598 "+r"(dst_y), // %1 1599 "+r"(dst_y), // %1
1599 "+r"(pix) // %2 1600 "+r"(pix) // %2
1600 : 1601 :
1601 : "cc", "memory", "q0", "q1", "q2", "q12", "q13" 1602 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1602 ); 1603 );
1603 } 1604 }
1604 #endif // HAS_ARGBTOYROW_NEON 1605 #endif // HAS_ARGBTOYROW_NEON
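Reading the coefficient comments (13, 65, 33 and the +16 constant), the per-pixel luma works out to a BT.601 studio-swing approximation: Y = ((13*B + 65*G + 33*R + 64) >> 7) + 16, where the +64 is the rounding contributed by sqrshrun #7 and the +16 is applied with a saturating add. A scalar sketch of one pixel:

#include <stdint.h>

static uint8_t ARGBPixelToY_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  int y = (13 * b + 65 * g + 33 * r + 64) >> 7;  // rounding shift, as in sqrshrun #7
  y += 16;                                       // +16 offset, as in uqadd
  return (uint8_t)(y > 255 ? 255 : y);           // clamp to 8 bits
}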
1605 1606
1606 #ifdef HAS_ARGBTOYJROW_NEON 1607 #ifdef HAS_ARGBTOYJROW_NEON
1607 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { 1608 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
1608 asm volatile ( 1609 asm volatile (
1609 "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient 1610 "movi v4.8b, #15 \n" // B * 0.11400 coefficient
1610 "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient 1611 "movi v5.8b, #75 \n" // G * 0.58700 coefficient
1611 "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient 1612 "movi v6.8b, #38 \n" // R * 0.29900 coefficient
1612 ".p2align 2 \n" 1613 ".p2align 2 \n"
1613 "1: \n" 1614 "1: \n"
1614 MEMACCESS(0) 1615 MEMACCESS(0)
1615 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 1616 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
1616 "subs %2, %2, #8 \n" // 8 processed per loop. 1617 "subs %2, %2, #8 \n" // 8 processed per loop.
1617 "vmull.u8 q2, d0, d24 \n" // B 1618 "umull v3.8h, v0.8b, v4.8b \n" // B
1618 "vmlal.u8 q2, d1, d25 \n" // G 1619 "umlal v3.8h, v1.8b, v5.8b \n" // G
1619 "vmlal.u8 q2, d2, d26 \n" // R 1620 "umlal v3.8h, v2.8b, v6.8b \n" // R
1620 "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y 1621 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
1621 MEMACCESS(1) 1622 MEMACCESS(1)
1622 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 1623 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1623 "bgt 1b \n" 1624 "bgt 1b \n"
1624 : "+r"(src_argb), // %0 1625 : "+r"(src_argb), // %0
1625 "+r"(dst_y), // %1 1626 "+r"(dst_y), // %1
1626 "+r"(pix) // %2 1627 "+r"(pix) // %2
1627 : 1628 :
1628 : "cc", "memory", "q0", "q1", "q2", "q12", "q13" 1629 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
1629 ); 1630 );
1630 } 1631 }
1631 #endif // HAS_ARGBTOYJROW_NEON 1632 #endif // HAS_ARGBTOYJROW_NEON
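The J (JPEG, full-range) variant uses coefficients that sum to 128 and drops the +16 offset, so the corresponding sketch reduces to:

#include <stdint.h>

static uint8_t ARGBPixelToYJ_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((15 * b + 75 * g + 38 * r + 64) >> 7);  // rounds, max 255
}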
1632 1633
1633 // 8x1 pixels. 1634 // 8x1 pixels.
1634 #ifdef HAS_ARGBTOUV444ROW_NEON 1635 #ifdef HAS_ARGBTOUV444ROW_NEON
1635 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1636 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1636 int pix) { 1637 int pix) {
1637 asm volatile ( 1638 asm volatile (
1638 "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient 1639 "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient
(...skipping 1402 matching lines...)
3041 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. 3042 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
3042 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 3043 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
3043 #ifdef HAS_ARGBMULTIPLYROW_NEON 3044 #ifdef HAS_ARGBMULTIPLYROW_NEON
3044 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 3045 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
3045 uint8* dst_argb, int width) { 3046 uint8* dst_argb, int width) {
3046 asm volatile ( 3047 asm volatile (
3047 // 8 pixel loop. 3048 // 8 pixel loop.
3048 ".p2align 2 \n" 3049 ".p2align 2 \n"
3049 "1: \n" 3050 "1: \n"
3050 MEMACCESS(0) 3051 MEMACCESS(0)
3051 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 3052 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
3052 MEMACCESS(1) 3053 MEMACCESS(1)
3053 "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. 3054 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
3054 "subs %3, %3, #8 \n" // 8 processed per loop. 3055 "subs %3, %3, #8 \n" // 8 processed per loop.
3055 "vmull.u8 q0, d0, d1 \n" // multiply B 3056 "umull v0.8h, v0.8b, v4.8b \n" // multiply B
3056 "vmull.u8 q1, d2, d3 \n" // multiply G 3057 "umull v1.8h, v1.8b, v5.8b \n" // multiply G
3057 "vmull.u8 q2, d4, d5 \n" // multiply R 3058 "umull v2.8h, v2.8b, v6.8b \n" // multiply R
3058 "vmull.u8 q3, d6, d7 \n" // multiply A 3059 "umull v3.8h, v3.8b, v7.8b \n" // multiply A
3059 "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B 3060 "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
3060 "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G 3061 "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
3061 "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R 3062 "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
3062 "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A 3063 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
3063 MEMACCESS(2) 3064 MEMACCESS(2)
3064 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 3065 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
3065 "bgt 1b \n" 3066 "bgt 1b \n"
3066 3067
3067 : "+r"(src_argb0), // %0 3068 : "+r"(src_argb0), // %0
3068 "+r"(src_argb1), // %1 3069 "+r"(src_argb1), // %1
3069 "+r"(dst_argb), // %2 3070 "+r"(dst_argb), // %2
3070 "+r"(width) // %3 3071 "+r"(width) // %3
3071 : 3072 :
3072 : "cc", "memory", "q0", "q1", "q2", "q3" 3073 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
3073 ); 3074 );
3074 } 3075 }
3075 #endif // HAS_ARGBMULTIPLYROW_NEON 3076 #endif // HAS_ARGBMULTIPLYROW_NEON
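umull widens each channel product to 16 bits and rshrn #8 narrows it back with rounding, so each output channel is (a * b + 128) >> 8. A one-channel scalar sketch:

#include <stdint.h>

static uint8_t MulChannel_Sketch(uint8_t a, uint8_t b) {
  return (uint8_t)((a * b + 128) >> 8);  // rounded product, fits in 8 bits (max 254)
}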
3076 3077
3077 // Add 2 rows of ARGB pixels together, 8 pixels at a time. 3078 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
3078 #ifdef HAS_ARGBADDROW_NEON 3079 #ifdef HAS_ARGBADDROW_NEON
3079 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 3080 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
3080 uint8* dst_argb, int width) { 3081 uint8* dst_argb, int width) {
3081 asm volatile ( 3082 asm volatile (
3082 // 8 pixel loop. 3083 // 8 pixel loop.
3083 ".p2align 2 \n" 3084 ".p2align 2 \n"
3084 "1: \n" 3085 "1: \n"
3085 MEMACCESS(0) 3086 MEMACCESS(0)
3086 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 3087 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
3087 MEMACCESS(1) 3088 MEMACCESS(1)
3088 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. 3089 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
3089 "subs %3, %3, #8 \n" // 8 processed per loop. 3090 "subs %3, %3, #8 \n" // 8 processed per loop.
3090 "vqadd.u8 q0, q0, q2 \n" // add B, G 3091 "uqadd v0.8b, v0.8b, v4.8b \n"
3091 "vqadd.u8 q1, q1, q3 \n" // add R, A 3092 "uqadd v1.8b, v1.8b, v5.8b \n"
3093 "uqadd v2.8b, v2.8b, v6.8b \n"
3094 "uqadd v3.8b, v3.8b, v7.8b \n"
3092 MEMACCESS(2) 3095 MEMACCESS(2)
3093 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 3096 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
3094 "bgt 1b \n" 3097 "bgt 1b \n"
3095 3098
3096 : "+r"(src_argb0), // %0 3099 : "+r"(src_argb0), // %0
3097 "+r"(src_argb1), // %1 3100 "+r"(src_argb1), // %1
3098 "+r"(dst_argb), // %2 3101 "+r"(dst_argb), // %2
3099 "+r"(width) // %3 3102 "+r"(width) // %3
3100 : 3103 :
3101 : "cc", "memory", "q0", "q1", "q2", "q3" 3104 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
3102 ); 3105 );
3103 } 3106 }
3104 #endif // HAS_ARGBADDROW_NEON 3107 #endif // HAS_ARGBADDROW_NEON
3105 3108
3106 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. 3109 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
3107 #ifdef HAS_ARGBSUBTRACTROW_NEON 3110 #ifdef HAS_ARGBSUBTRACTROW_NEON
3108 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 3111 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
3109 uint8* dst_argb, int width) { 3112 uint8* dst_argb, int width) {
3110 asm volatile ( 3113 asm volatile (
3111 // 8 pixel loop. 3114 // 8 pixel loop.
3112 ".p2align 2 \n" 3115 ".p2align 2 \n"
3113 "1: \n" 3116 "1: \n"
3114 MEMACCESS(0) 3117 MEMACCESS(0)
3115 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 3118 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
3116 MEMACCESS(1) 3119 MEMACCESS(1)
3117 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. 3120 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
3118 "subs %3, %3, #8 \n" // 8 processed per loop. 3121 "subs %3, %3, #8 \n" // 8 processed per loop.
3119 "vqsub.u8 q0, q0, q2 \n" // subtract B, G 3122 "uqsub v0.8b, v0.8b, v4.8b \n"
3120 "vqsub.u8 q1, q1, q3 \n" // subtract R, A 3123 "uqsub v1.8b, v1.8b, v5.8b \n"
3124 "uqsub v2.8b, v2.8b, v6.8b \n"
3125 "uqsub v3.8b, v3.8b, v7.8b \n"
3121 MEMACCESS(2) 3126 MEMACCESS(2)
3122 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 3127 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
3123 "bgt 1b \n" 3128 "bgt 1b \n"
3124 3129
3125 : "+r"(src_argb0), // %0 3130 : "+r"(src_argb0), // %0
3126 "+r"(src_argb1), // %1 3131 "+r"(src_argb1), // %1
3127 "+r"(dst_argb), // %2 3132 "+r"(dst_argb), // %2
3128 "+r"(width) // %3 3133 "+r"(width) // %3
3129 : 3134 :
3130 : "cc", "memory", "q0", "q1", "q2", "q3" 3135 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
3131 ); 3136 );
3132 } 3137 }
3133 #endif // HAS_ARGBSUBTRACTROW_NEON 3138 #endif // HAS_ARGBSUBTRACTROW_NEON
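uqadd and uqsub are saturating, so ARGBAddRow_NEON and ARGBSubtractRow_NEON clamp each channel to [0, 255] instead of wrapping. Per-channel scalar sketches:

#include <stdint.h>

static uint8_t AddChannelSat_Sketch(uint8_t a, uint8_t b) {
  int s = a + b;
  return (uint8_t)(s > 255 ? 255 : s);  // uqadd behaviour
}

static uint8_t SubChannelSat_Sketch(uint8_t a, uint8_t b) {
  int d = a - b;
  return (uint8_t)(d < 0 ? 0 : d);      // uqsub behaviour
}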
3134 3139
3135 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. 3140 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
3136 // A = 255 3141 // A = 255
3137 // R = Sobel 3142 // R = Sobel
3138 // G = Sobel 3143 // G = Sobel
3139 // B = Sobel 3144 // B = Sobel
3140 #ifdef HAS_SOBELROW_NEON 3145 #ifdef HAS_SOBELROW_NEON
3141 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 3146 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
3142 uint8* dst_argb, int width) { 3147 uint8* dst_argb, int width) {
3143 asm volatile ( 3148 asm volatile (
3144 "vmov.u8 d3, #255 \n" // alpha 3149 "movi v3.8b, #255 \n" // alpha
3145 // 8 pixel loop. 3150 // 8 pixel loop.
3146 ".p2align 2 \n" 3151 ".p2align 2 \n"
3147 "1: \n" 3152 "1: \n"
3148 MEMACCESS(0) 3153 MEMACCESS(0)
3149 "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. 3154 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
3150 MEMACCESS(1) 3155 MEMACCESS(1)
3151 "vld1.8 {d1}, [%1]! \n" // load 8 sobely. 3156 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
3152 "subs %3, %3, #8 \n" // 8 processed per loop. 3157 "subs %3, %3, #8 \n" // 8 processed per loop.
3153 "vqadd.u8 d0, d0, d1 \n" // add 3158 "uqadd v0.8b, v0.8b, v1.8b \n" // add
3154 "vmov.u8 d1, d0 \n" 3159 "mov v1.8b, v0.8b \n"
3155 "vmov.u8 d2, d0 \n" 3160 "mov v2.8b, v0.8b \n"
3156 MEMACCESS(2) 3161 MEMACCESS(2)
3157 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 3162 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
3158 "bgt 1b \n" 3163 "bgt 1b \n"
3159 : "+r"(src_sobelx), // %0 3164 : "+r"(src_sobelx), // %0
3160 "+r"(src_sobely), // %1 3165 "+r"(src_sobely), // %1
3161 "+r"(dst_argb), // %2 3166 "+r"(dst_argb), // %2
3162 "+r"(width) // %3 3167 "+r"(width) // %3
3163 : 3168 :
3164 : "cc", "memory", "q0", "q1" 3169 : "cc", "memory", "v0", "v1", "v2", "v3"
3165 ); 3170 );
3166 } 3171 }
3167 #endif // HAS_SOBELROW_NEON 3172 #endif // HAS_SOBELROW_NEON
3168 3173
3169 // Adds Sobel X and Sobel Y and stores Sobel into plane. 3174 // Adds Sobel X and Sobel Y and stores Sobel into plane.
3170 #ifdef HAS_SOBELTOPLANEROW_NEON 3175 #ifdef HAS_SOBELTOPLANEROW_NEON
3171 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 3176 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
3172 uint8* dst_y, int width) { 3177 uint8* dst_y, int width) {
3173 asm volatile ( 3178 asm volatile (
3174 // 16 pixel loop. 3179 // 16 pixel loop.
3175 ".p2align 2 \n" 3180 ".p2align 2 \n"
3176 "1: \n" 3181 "1: \n"
3177 MEMACCESS(0) 3182 MEMACCESS(0)
3178 "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. 3183 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
3179 MEMACCESS(1) 3184 MEMACCESS(1)
3180 "vld1.8 {q1}, [%1]! \n" // load 16 sobely. 3185 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
3181 "subs %3, %3, #16 \n" // 16 processed per loop. 3186 "subs %3, %3, #16 \n" // 16 processed per loop.
3182 "vqadd.u8 q0, q0, q1 \n" // add 3187 "uqadd v0.16b, v0.16b, v1.16b \n" // add
3183 MEMACCESS(2) 3188 MEMACCESS(2)
3184 "vst1.8 {q0}, [%2]! \n" // store 16 pixels. 3189 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
3185 "bgt 1b \n" 3190 "bgt 1b \n"
3186 : "+r"(src_sobelx), // %0 3191 : "+r"(src_sobelx), // %0
3187 "+r"(src_sobely), // %1 3192 "+r"(src_sobely), // %1
3188 "+r"(dst_y), // %2 3193 "+r"(dst_y), // %2
3189 "+r"(width) // %3 3194 "+r"(width) // %3
3190 : 3195 :
3191 : "cc", "memory", "q0", "q1" 3196 : "cc", "memory", "v0", "v1"
3192 ); 3197 );
3193 } 3198 }
3194 #endif // HAS_SOBELTOPLANEROW_NEON 3199 #endif // HAS_SOBELTOPLANEROW_NEON
3195 3200
3196 // Mixes Sobel X, Sobel Y and Sobel into ARGB. 3201 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
3197 // A = 255 3202 // A = 255
3198 // R = Sobel X 3203 // R = Sobel X
3199 // G = Sobel 3204 // G = Sobel
3200 // B = Sobel Y 3205 // B = Sobel Y
3201 #ifdef HAS_SOBELXYROW_NEON 3206 #ifdef HAS_SOBELXYROW_NEON
3202 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 3207 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
3203 uint8* dst_argb, int width) { 3208 uint8* dst_argb, int width) {
3204 asm volatile ( 3209 asm volatile (
3205 "vmov.u8 d3, #255 \n" // alpha 3210 "movi v3.8b, #255 \n" // alpha
3206 // 8 pixel loop. 3211 // 8 pixel loop.
3207 ".p2align 2 \n" 3212 ".p2align 2 \n"
3208 "1: \n" 3213 "1: \n"
3209 MEMACCESS(0) 3214 MEMACCESS(0)
3210 "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. 3215 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
3211 MEMACCESS(1) 3216 MEMACCESS(1)
3212 "vld1.8 {d0}, [%1]! \n" // load 8 sobely. 3217 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
3213 "subs %3, %3, #8 \n" // 8 processed per loop. 3218 "subs %3, %3, #8 \n" // 8 processed per loop.
3214 "vqadd.u8 d1, d0, d2 \n" // add 3219 "uqadd v1.8b, v0.8b, v2.8b \n" // add
3215 MEMACCESS(2) 3220 MEMACCESS(2)
3216 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 3221 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
3217 "bgt 1b \n" 3222 "bgt 1b \n"
3218 : "+r"(src_sobelx), // %0 3223 : "+r"(src_sobelx), // %0
3219 "+r"(src_sobely), // %1 3224 "+r"(src_sobely), // %1
3220 "+r"(dst_argb), // %2 3225 "+r"(dst_argb), // %2
3221 "+r"(width) // %3 3226 "+r"(width) // %3
3222 : 3227 :
3223 : "cc", "memory", "q0", "q1" 3228 : "cc", "memory", "v0", "v1", "v2", "v3"
3224 ); 3229 );
3225 } 3230 }
3226 #endif // HAS_SOBELXYROW_NEON 3231 #endif // HAS_SOBELXYROW_NEON
3227 3232
3228 // SobelX as a matrix is 3233 // SobelX as a matrix is
3229 // -1 0 1 3234 // -1 0 1
3230 // -2 0 2 3235 // -2 0 2
3231 // -1 0 1 3236 // -1 0 1
3232 #ifdef HAS_SOBELXROW_NEON 3237 #ifdef HAS_SOBELXROW_NEON
3233 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, 3238 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
3234 const uint8* src_y2, uint8* dst_sobelx, int width) { 3239 const uint8* src_y2, uint8* dst_sobelx, int width) {
3235 asm volatile ( 3240 asm volatile (
3236 ".p2align 2 \n" 3241 ".p2align 2 \n"
3237 "1: \n" 3242 "1: \n"
3238 MEMACCESS(0) 3243 MEMACCESS(0)
3239 "vld1.8 {d0}, [%0],%5 \n" // top 3244 "ld1 {v0.8b}, [%0],%5 \n" // top
3240 MEMACCESS(0) 3245 MEMACCESS(0)
3241 "vld1.8 {d1}, [%0],%6 \n" 3246 "ld1 {v1.8b}, [%0],%6 \n"
3242 "vsubl.u8 q0, d0, d1 \n" 3247 "usubl v0.8h, v0.8b, v1.8b \n"
3243 MEMACCESS(1) 3248 MEMACCESS(1)
3244 "vld1.8 {d2}, [%1],%5 \n" // center * 2 3249 "ld1 {v2.8b}, [%1],%5 \n" // center * 2
3245 MEMACCESS(1) 3250 MEMACCESS(1)
3246 "vld1.8 {d3}, [%1],%6 \n" 3251 "ld1 {v3.8b}, [%1],%6 \n"
3247 "vsubl.u8 q1, d2, d3 \n" 3252 "usubl v1.8h, v2.8b, v3.8b \n"
3248 "vadd.s16 q0, q0, q1 \n" 3253 "add v0.8h, v0.8h, v1.8h \n"
3249 "vadd.s16 q0, q0, q1 \n" 3254 "add v0.8h, v0.8h, v1.8h \n"
3250 MEMACCESS(2) 3255 MEMACCESS(2)
3251 "vld1.8 {d2}, [%2],%5 \n" // bottom 3256 "ld1 {v2.8b}, [%2],%5 \n" // bottom
3252 MEMACCESS(2) 3257 MEMACCESS(2)
3253 "vld1.8 {d3}, [%2],%6 \n" 3258 "ld1 {v3.8b}, [%2],%6 \n"
3254 "subs %4, %4, #8 \n" // 8 pixels 3259 "subs %4, %4, #8 \n" // 8 pixels
3255 "vsubl.u8 q1, d2, d3 \n" 3260 "usubl v1.8h, v2.8b, v3.8b \n"
3256 "vadd.s16 q0, q0, q1 \n" 3261 "add v0.8h, v0.8h, v1.8h \n"
3257 "vabs.s16 q0, q0 \n" 3262 "abs v0.8h, v0.8h \n"
3258 "vqmovn.u16 d0, q0 \n" 3263 "uqxtn v0.8b, v0.8h \n"
3259 MEMACCESS(3) 3264 MEMACCESS(3)
3260 "vst1.8 {d0}, [%3]! \n" // store 8 sobelx 3265 "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
3261 "bgt 1b \n" 3266 "bgt 1b \n"
3262 : "+r"(src_y0), // %0 3267 : "+r"(src_y0), // %0
3263 "+r"(src_y1), // %1 3268 "+r"(src_y1), // %1
3264 "+r"(src_y2), // %2 3269 "+r"(src_y2), // %2
3265 "+r"(dst_sobelx), // %3 3270 "+r"(dst_sobelx), // %3
3266 "+r"(width) // %4 3271 "+r"(width) // %4
3267 : "r"(2), // %5 3272 : "r"(2), // %5
3268 "r"(6) // %6 3273 "r"(6) // %6
3269 : "cc", "memory", "q0", "q1" // Clobber List 3274 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
3270 ); 3275 );
3271 } 3276 }
3272 #endif // HAS_SOBELXROW_NEON 3277 #endif // HAS_SOBELXROW_NEON
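Reading the post-increment operands (%5 = 2, %6 = 6), each iteration taps columns i and i+2 of the top, center and bottom rows, doubles the center-row difference, then takes the saturated absolute value. A scalar sketch (not libyuv's own C path; like the NEON loop, it reads two bytes past width in each row):

#include <stdint.h>
#include <stdlib.h>

static void SobelXRow_Sketch(const uint8_t* src_y0, const uint8_t* src_y1,
                             const uint8_t* src_y2, uint8_t* dst_sobelx,
                             int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];  // top row
    int b = src_y1[i] - src_y1[i + 2];  // center row, weighted x2 below
    int c = src_y2[i] - src_y2[i + 2];  // bottom row
    int sobel = abs(a + b + b + c);
    dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);  // uqxtn saturation
  }
}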
3273 3278
3274 // SobelY as a matrix is 3279 // SobelY as a matrix is
3275 // -1 -2 -1 3280 // -1 -2 -1
3276 // 0 0 0 3281 // 0 0 0
3277 // 1 2 1 3282 // 1 2 1
3278 #ifdef HAS_SOBELYROW_NEON 3283 #ifdef HAS_SOBELYROW_NEON
3279 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, 3284 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
3280 uint8* dst_sobely, int width) { 3285 uint8* dst_sobely, int width) {
3281 asm volatile ( 3286 asm volatile (
3282 ".p2align 2 \n" 3287 ".p2align 2 \n"
3283 "1: \n" 3288 "1: \n"
3284 MEMACCESS(0) 3289 MEMACCESS(0)
3285 "vld1.8 {d0}, [%0],%4 \n" // left 3290 "ld1 {v0.8b}, [%0],%4 \n" // left
3286 MEMACCESS(1) 3291 MEMACCESS(1)
3287 "vld1.8 {d1}, [%1],%4 \n" 3292 "ld1 {v1.8b}, [%1],%4 \n"
3288 "vsubl.u8 q0, d0, d1 \n" 3293 "usubl v0.8h, v0.8b, v1.8b \n"
3289 MEMACCESS(0) 3294 MEMACCESS(0)
3290 "vld1.8 {d2}, [%0],%4 \n" // center * 2 3295 "ld1 {v2.8b}, [%0],%4 \n" // center * 2
3291 MEMACCESS(1) 3296 MEMACCESS(1)
3292 "vld1.8 {d3}, [%1],%4 \n" 3297 "ld1 {v3.8b}, [%1],%4 \n"
3293 "vsubl.u8 q1, d2, d3 \n" 3298 "usubl v1.8h, v2.8b, v3.8b \n"
3294 "vadd.s16 q0, q0, q1 \n" 3299 "add v0.8h, v0.8h, v1.8h \n"
3295 "vadd.s16 q0, q0, q1 \n" 3300 "add v0.8h, v0.8h, v1.8h \n"
3296 MEMACCESS(0) 3301 MEMACCESS(0)
3297 "vld1.8 {d2}, [%0],%5 \n" // right 3302 "ld1 {v2.8b}, [%0],%5 \n" // right
3298 MEMACCESS(1) 3303 MEMACCESS(1)
3299 "vld1.8 {d3}, [%1],%5 \n" 3304 "ld1 {v3.8b}, [%1],%5 \n"
3300 "subs %3, %3, #8 \n" // 8 pixels 3305 "subs %3, %3, #8 \n" // 8 pixels
3301 "vsubl.u8 q1, d2, d3 \n" 3306 "usubl v1.8h, v2.8b, v3.8b \n"
3302 "vadd.s16 q0, q0, q1 \n" 3307 "add v0.8h, v0.8h, v1.8h \n"
3303 "vabs.s16 q0, q0 \n" 3308 "abs v0.8h, v0.8h \n"
3304 "vqmovn.u16 d0, q0 \n" 3309 "uqxtn v0.8b, v0.8h \n"
3305 MEMACCESS(2) 3310 MEMACCESS(2)
3306 "vst1.8 {d0}, [%2]! \n" // store 8 sobely 3311 "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
3307 "bgt 1b \n" 3312 "bgt 1b \n"
3308 : "+r"(src_y0), // %0 3313 : "+r"(src_y0), // %0
3309 "+r"(src_y1), // %1 3314 "+r"(src_y1), // %1
3310 "+r"(dst_sobely), // %2 3315 "+r"(dst_sobely), // %2
3311 "+r"(width) // %3 3316 "+r"(width) // %3
3312 : "r"(1), // %4 3317 : "r"(1), // %4
3313 "r"(6) // %5 3318 "r"(6) // %5
3314 : "cc", "memory", "q0", "q1" // Clobber List 3319 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
3315 ); 3320 );
3316 } 3321 }
3317 #endif // HAS_SOBELYROW_NEON 3322 #endif // HAS_SOBELYROW_NEON
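SobelY follows the same pattern with the roles transposed: the two operands are the rows above and below, and the taps (post-increments %4 = 1, %5 = 6) are columns i, i+1, i+2 with weights 1, 2, 1. A scalar sketch under the same assumptions as the SobelX sketch above:

#include <stdint.h>
#include <stdlib.h>

static void SobelYRow_Sketch(const uint8_t* src_y0, const uint8_t* src_y1,
                             uint8_t* dst_sobely, int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i]     - src_y1[i];      // left column
    int b = src_y0[i + 1] - src_y1[i + 1];  // center column, weighted x2
    int c = src_y0[i + 2] - src_y1[i + 2];  // right column
    int sobel = abs(a + b + b + c);
    dst_sobely[i] = (uint8_t)(sobel > 255 ? 255 : sobel);  // uqxtn saturation
  }
}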
3318 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 3323 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
3319 3324
3320 #ifdef __cplusplus 3325 #ifdef __cplusplus
3321 } // extern "C" 3326 } // extern "C"
3322 } // namespace libyuv 3327 } // namespace libyuv
3323 #endif 3328 #endif