OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 806 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
817 #endif // HAS_UYVYTOARGBROW_NEON | 817 #endif // HAS_UYVYTOARGBROW_NEON |
818 | 818 |
819 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. | 819 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. |
820 #ifdef HAS_SPLITUVROW_NEON | 820 #ifdef HAS_SPLITUVROW_NEON |
821 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 821 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |
822 int width) { | 822 int width) { |
823 asm volatile ( | 823 asm volatile ( |
824 ".p2align 2 \n" | 824 ".p2align 2 \n" |
825 "1: \n" | 825 "1: \n" |
826 MEMACCESS(0) | 826 MEMACCESS(0) |
827 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV | 827 "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV |
828 "subs %3, %3, #16 \n" // 16 processed per loop | 828 "subs %3, %3, #16 \n" // 16 processed per loop |
829 MEMACCESS(1) | 829 MEMACCESS(1) |
830 "vst1.8 {q0}, [%1]! \n" // store U | 830 "st1 {v0.16b}, [%1], #16 \n" // store U |
831 MEMACCESS(2) | 831 MEMACCESS(2) |
832 "vst1.8 {q1}, [%2]! \n" // store V | 832 "st1 {v1.16b}, [%2], #16 \n" // store V |
833 "bgt 1b \n" | 833 "bgt 1b \n" |
834 : "+r"(src_uv), // %0 | 834 : "+r"(src_uv), // %0 |
835 "+r"(dst_u), // %1 | 835 "+r"(dst_u), // %1 |
836 "+r"(dst_v), // %2 | 836 "+r"(dst_v), // %2 |
837 "+r"(width) // %3 // Output registers | 837 "+r"(width) // %3 // Output registers |
838 : // Input registers | 838 : // Input registers |
839 : "cc", "memory", "q0", "q1" // Clobber List | 839 : "cc", "memory", "v0", "v1" // Clobber List |
840 ); | 840 ); |
841 } | 841 } |
842 #endif // HAS_SPLITUVROW_NEON | 842 #endif // HAS_SPLITUVROW_NEON |
843 | 843 |
844 // Reads 16 U's and V's and writes out 16 pairs of UV. | 844 // Reads 16 U's and V's and writes out 16 pairs of UV. |
845 #ifdef HAS_MERGEUVROW_NEON | 845 #ifdef HAS_MERGEUVROW_NEON |
846 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 846 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
847 int width) { | 847 int width) { |
848 asm volatile ( | 848 asm volatile ( |
849 ".p2align 2 \n" | 849 ".p2align 2 \n" |
850 "1: \n" | 850 "1: \n" |
851 MEMACCESS(0) | 851 MEMACCESS(0) |
852 "vld1.8 {q0}, [%0]! \n" // load U | 852 "ld1 {v0.16b}, [%0], #16 \n" // load U |
853 MEMACCESS(1) | 853 MEMACCESS(1) |
854 "vld1.8 {q1}, [%1]! \n" // load V | 854 "ld1 {v1.16b}, [%1], #16 \n" // load V |
855 "subs %3, %3, #16 \n" // 16 processed per loop | 855 "subs %3, %3, #16 \n" // 16 processed per loop |
856 MEMACCESS(2) | 856 MEMACCESS(2) |
857 "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV | 857 "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV |
858 "bgt 1b \n" | 858 "bgt 1b \n" |
859 : | 859 : |
860 "+r"(src_u), // %0 | 860 "+r"(src_u), // %0 |
861 "+r"(src_v), // %1 | 861 "+r"(src_v), // %1 |
862 "+r"(dst_uv), // %2 | 862 "+r"(dst_uv), // %2 |
863 "+r"(width) // %3 // Output registers | 863 "+r"(width) // %3 // Output registers |
864 : // Input registers | 864 : // Input registers |
865 : "cc", "memory", "q0", "q1" // Clobber List | 865 : "cc", "memory", "v0", "v1" // Clobber List |
866 ); | 866 ); |
867 } | 867 } |
868 #endif // HAS_MERGEUVROW_NEON | 868 #endif // HAS_MERGEUVROW_NEON |
869 | 869 |
870 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. | 870 // Copy multiple of 32. ld1/st1 allow unaligned and is fastest on a15. |
871 #ifdef HAS_COPYROW_NEON | 871 #ifdef HAS_COPYROW_NEON |
872 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { | 872 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { |
873 asm volatile ( | 873 asm volatile ( |
874 ".p2align 2 \n" | 874 ".p2align 2 \n" |
875 "1: \n" | 875 "1: \n" |
876 MEMACCESS(0) | 876 MEMACCESS(0) |
877 "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 | 877 "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32 |
878 "subs %2, %2, #32 \n" // 32 processed per loop | 878 "subs %2, %2, #32 \n" // 32 processed per loop |
879 MEMACCESS(1) | 879 MEMACCESS(1) |
880 "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 | 880 "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32 |
881 "bgt 1b \n" | 881 "bgt 1b \n" |
882 : "+r"(src), // %0 | 882 : "+r"(src), // %0 |
883 "+r"(dst), // %1 | 883 "+r"(dst), // %1 |
884 "+r"(count) // %2 // Output registers | 884 "+r"(count) // %2 // Output registers |
885 : // Input registers | 885 : // Input registers |
886 : "cc", "memory", "q0", "q1" // Clobber List | 886 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
887 ); | 887 ); |
888 } | 888 } |
889 #endif // HAS_COPYROW_NEON | 889 #endif // HAS_COPYROW_NEON |
890 | 890 |
891 // SetRow8 writes 'count' bytes using a 32 bit value repeated. | 891 // SetRow8 writes 'count' bytes using a 32 bit value repeated. |
892 #ifdef HAS_SETROW_NEON | 892 #ifdef HAS_SETROW_NEON |
893 void SetRow_NEON(uint8* dst, uint32 v32, int count) { | 893 void SetRow_NEON(uint8* dst, uint32 v32, int count) { |
894 asm volatile ( | 894 asm volatile ( |
895 "vdup.u32 q0, %2 \n" // duplicate 4 ints | 895 "dup v0.4s, %w2 \n" // duplicate 4 ints |
896 "1: \n" | 896 "1: \n" |
897 "subs %1, %1, #16 \n" // 16 bytes per loop | 897 "subs %1, %1, #16 \n" // 16 bytes per loop |
898 MEMACCESS(0) | 898 MEMACCESS(0) |
899 "vst1.8 {q0}, [%0]! \n" // store | 899 "st1 {v0.16b}, [%0], #16 \n" // store |
900 "bgt 1b \n" | 900 "bgt 1b \n" |
901 : "+r"(dst), // %0 | 901 : "+r"(dst), // %0 |
902 "+r"(count) // %1 | 902 "+r"(count) // %1 |
903 : "r"(v32) // %2 | 903 : "r"(v32) // %2 |
904 : "cc", "memory", "q0" | 904 : "cc", "memory", "v0" |
905 ); | 905 ); |
906 } | 906 } |
907 #endif // HAS_SETROW_NEON | 907 #endif // HAS_SETROW_NEON |
908 | 908 |
909 // TODO(fbarchard): Make fully assembler | 909 // TODO(fbarchard): Make fully assembler |
910 // SetRow32 writes 'count' words using a 32 bit value repeated. | 910 // SetRow32 writes 'count' words using a 32 bit value repeated. |
911 #ifdef HAS_ARGBSETROWS_NEON | 911 #ifdef HAS_ARGBSETROWS_NEON |
912 void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, | 912 void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, |
913 int dst_stride, int height) { | 913 int dst_stride, int height) { |
914 for (int y = 0; y < height; ++y) { | 914 for (int y = 0; y < height; ++y) { |
915 SetRow_NEON(dst, v32, width << 2); | 915 SetRow_NEON(dst, v32, width << 2); |
916 dst += dst_stride; | 916 dst += dst_stride; |
917 } | 917 } |
918 } | 918 } |
919 #endif // HAS_ARGBSETROWS_NEON | 919 #endif // HAS_ARGBSETROWS_NEON |
920 | 920 |
921 #ifdef HAS_MIRRORROW_NEON | 921 #ifdef HAS_MIRRORROW_NEON |
922 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { | 922 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { |
923 asm volatile ( | 923 asm volatile ( |
924 // Start at end of source row. | 924 // Start at end of source row. |
925 "mov r3, #-16 \n" | |
926 "add %0, %0, %2 \n" | 925 "add %0, %0, %2 \n" |
927 "sub %0, #16 \n" | 926 "sub %0, %0, #16 \n" |
928 | 927 |
929 ".p2align 2 \n" | 928 ".p2align 2 \n" |
930 "1: \n" | 929 "1: \n" |
931 MEMACCESS(0) | 930 MEMACCESS(0) |
932 "vld1.8 {q0}, [%0], r3 \n" // src -= 16 | 931 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 |
933 "subs %2, #16 \n" // 16 pixels per loop. | 932 "subs %2, %2, #16 \n" // 16 pixels per loop. |
934 "vrev64.8 q0, q0 \n" | 933 "rev64 v0.16b, v0.16b \n" |
935 MEMACCESS(1) | 934 MEMACCESS(1) |
936 "vst1.8 {d1}, [%1]! \n" // dst += 16 | 935 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 |
937 MEMACCESS(1) | 936 MEMACCESS(1) |
938 "vst1.8 {d0}, [%1]! \n" | 937 "st1 {v0.D}[0], [%1], #8 \n" |
939 "bgt 1b \n" | 938 "bgt 1b \n" |
940 : "+r"(src), // %0 | 939 : "+r"(src), // %0 |
941 "+r"(dst), // %1 | 940 "+r"(dst), // %1 |
942 "+r"(width) // %2 | 941 "+r"(width) // %2 |
943 : | 942 : "r"((ptrdiff_t)-16) // %3 |
944 : "cc", "memory", "r3", "q0" | 943 : "cc", "memory", "v0" |
945 ); | 944 ); |
946 } | 945 } |
947 #endif // HAS_MIRRORROW_NEON | 946 #endif // HAS_MIRRORROW_NEON |
948 | 947 |
949 #ifdef HAS_MIRRORUVROW_NEON | 948 #ifdef HAS_MIRRORUVROW_NEON |
950 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 949 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |
951 int width) { | 950 int width) { |
952 asm volatile ( | 951 asm volatile ( |
953 // Start at end of source row. | 952 // Start at end of source row. |
954 "mov r12, #-16 \n" | |
955 "add %0, %0, %3, lsl #1 \n" | 953 "add %0, %0, %3, lsl #1 \n" |
956 "sub %0, #16 \n" | 954 "sub %0, %0, #16 \n" |
957 | 955 |
958 ".p2align 2 \n" | 956 ".p2align 2 \n" |
959 "1: \n" | 957 "1: \n" |
960 MEMACCESS(0) | 958 MEMACCESS(0) |
961 "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 | 959 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 |
962 "subs %3, #8 \n" // 8 pixels per loop. | 960 "subs %3, %3, #8 \n" // 8 pixels per loop. |
963 "vrev64.8 q0, q0 \n" | 961 "rev64 v0.8b, v0.8b \n" |
| 962 "rev64 v1.8b, v1.8b \n" |
964 MEMACCESS(1) | 963 MEMACCESS(1) |
965 "vst1.8 {d0}, [%1]! \n" // dst += 8 | 964 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 |
966 MEMACCESS(2) | 965 MEMACCESS(2) |
967 "vst1.8 {d1}, [%2]! \n" | 966 "st1 {v1.8b}, [%2], #8 \n" |
968 "bgt 1b \n" | 967 "bgt 1b \n" |
969 : "+r"(src_uv), // %0 | 968 : "+r"(src_uv), // %0 |
970 "+r"(dst_u), // %1 | 969 "+r"(dst_u), // %1 |
971 "+r"(dst_v), // %2 | 970 "+r"(dst_v), // %2 |
972 "+r"(width) // %3 | 971 "+r"(width) // %3 |
973 : | 972 : "r"((ptrdiff_t)-16) // %4 |
974 : "cc", "memory", "r12", "q0" | 973 : "cc", "memory", "v0", "v1" |
975 ); | 974 ); |
976 } | 975 } |
977 #endif // HAS_MIRRORUVROW_NEON | 976 #endif // HAS_MIRRORUVROW_NEON |
978 | 977 |
979 #ifdef HAS_ARGBMIRRORROW_NEON | 978 #ifdef HAS_ARGBMIRRORROW_NEON |
980 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { | 979 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { |
981 asm volatile ( | 980 asm volatile ( |
982 // Start at end of source row. | 981 // Start at end of source row. |
983 "mov r3, #-16 \n" | |
984 "add %0, %0, %2, lsl #2 \n" | 982 "add %0, %0, %2, lsl #2 \n" |
985 "sub %0, #16 \n" | 983 "sub %0, %0, #16 \n" |
986 | 984 |
987 ".p2align 2 \n" | 985 ".p2align 2 \n" |
988 "1: \n" | 986 "1: \n" |
989 MEMACCESS(0) | 987 MEMACCESS(0) |
990 "vld1.8 {q0}, [%0], r3 \n" // src -= 16 | 988 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 |
991 "subs %2, #4 \n" // 4 pixels per loop. | 989 "subs %2, %2, #4 \n" // 4 pixels per loop. |
992 "vrev64.32 q0, q0 \n" | 990 "rev64 v0.4s, v0.4s \n" |
993 MEMACCESS(1) | 991 MEMACCESS(1) |
994 "vst1.8 {d1}, [%1]! \n" // dst += 16 | 992 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 |
995 MEMACCESS(1) | 993 MEMACCESS(1) |
996 "vst1.8 {d0}, [%1]! \n" | 994 "st1 {v0.D}[0], [%1], #8 \n" |
997 "bgt 1b \n" | 995 "bgt 1b \n" |
998 : "+r"(src), // %0 | 996 : "+r"(src), // %0 |
999 "+r"(dst), // %1 | 997 "+r"(dst), // %1 |
1000 "+r"(width) // %2 | 998 "+r"(width) // %2 |
1001 : | 999 : "r"((ptrdiff_t)-16) // %3 |
1002 : "cc", "memory", "r3", "q0" | 1000 : "cc", "memory", "v0" |
1003 ); | 1001 ); |
1004 } | 1002 } |
1005 #endif // HAS_ARGBMIRRORROW_NEON | 1003 #endif // HAS_ARGBMIRRORROW_NEON |
1006 | 1004 |
1007 #ifdef HAS_RGB24TOARGBROW_NEON | 1005 #ifdef HAS_RGB24TOARGBROW_NEON |
1008 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { | 1006 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { |
1009 asm volatile ( | 1007 asm volatile ( |
1010 "vmov.u8 d4, #255 \n" // Alpha | 1008 "movi v4.8b, #255 \n" // Alpha |
1011 ".p2align 2 \n" | 1009 ".p2align 2 \n" |
1012 "1: \n" | 1010 "1: \n" |
1013 MEMACCESS(0) | 1011 MEMACCESS(0) |
1014 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. | 1012 "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. |
1015 "subs %2, %2, #8 \n" // 8 processed per loop. | 1013 "subs %2, %2, #8 \n" // 8 processed per loop. |
1016 MEMACCESS(1) | 1014 MEMACCESS(1) |
1017 "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. | 1015 "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB. |
1018 "bgt 1b \n" | 1016 "bgt 1b \n" |
1019 : "+r"(src_rgb24), // %0 | 1017 : "+r"(src_rgb24), // %0 |
1020 "+r"(dst_argb), // %1 | 1018 "+r"(dst_argb), // %1 |
1021 "+r"(pix) // %2 | 1019 "+r"(pix) // %2 |
1022 : | 1020 : |
1023 : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List | 1021 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List |
1024 ); | 1022 ); |
1025 } | 1023 } |
1026 #endif // HAS_RGB24TOARGBROW_NEON | 1024 #endif // HAS_RGB24TOARGBROW_NEON |
1027 | 1025 |
1028 #ifdef HAS_RAWTOARGBROW_NEON | 1026 #ifdef HAS_RAWTOARGBROW_NEON |
1029 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { | 1027 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { |
1030 asm volatile ( | 1028 asm volatile ( |
1031 "vmov.u8 d4, #255 \n" // Alpha | 1029 "movi v5.8b, #255 \n" // Alpha |
1032 ".p2align 2 \n" | 1030 ".p2align 2 \n" |
1033 "1: \n" | 1031 "1: \n" |
1034 MEMACCESS(0) | 1032 MEMACCESS(0) |
1035 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. | 1033 "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b |
1036 "subs %2, %2, #8 \n" // 8 processed per loop. | 1034 "subs %2, %2, #8 \n" // 8 processed per loop. |
1037 "vswp.u8 d1, d3 \n" // swap R, B | 1035 "mov v3.8b, v1.8b \n" // move g |
| 1036 "mov v4.8b, v0.8b \n" // move r |
1038 MEMACCESS(1) | 1037 MEMACCESS(1) |
1039 "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. | 1038 "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a |
1040 "bgt 1b \n" | 1039 "bgt 1b \n" |
1041 : "+r"(src_raw), // %0 | 1040 : "+r"(src_raw), // %0 |
1042 "+r"(dst_argb), // %1 | 1041 "+r"(dst_argb), // %1 |
1043 "+r"(pix) // %2 | 1042 "+r"(pix) // %2 |
1044 : | 1043 : |
1045 : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List | 1044 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List |
1046 ); | 1045 ); |
1047 } | 1046 } |
1048 #endif // HAS_RAWTOARGBROW_NEON | 1047 #endif // HAS_RAWTOARGBROW_NEON |
1049 | 1048 |
1050 #define RGB565TOARGB \ | 1049 #define RGB565TOARGB \ |
1051 "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ | 1050 "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ |
1052 "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ | 1051 "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ |
1053 "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ | 1052 "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ |
1054 "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ | 1053 "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ |
1055 "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ | 1054 "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ |
(...skipping 107 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1163 ); | 1162 ); |
1164 } | 1163 } |
1165 #endif // HAS_ARGB4444TOARGBROW_NEON | 1164 #endif // HAS_ARGB4444TOARGBROW_NEON |
1166 | 1165 |
1167 #ifdef HAS_ARGBTORGB24ROW_NEON | 1166 #ifdef HAS_ARGBTORGB24ROW_NEON |
1168 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { | 1167 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { |
1169 asm volatile ( | 1168 asm volatile ( |
1170 ".p2align 2 \n" | 1169 ".p2align 2 \n" |
1171 "1: \n" | 1170 "1: \n" |
1172 MEMACCESS(0) | 1171 MEMACCESS(0) |
1173 "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. | 1172 "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB. |
1174 "subs %2, %2, #8 \n" // 8 processed per loop. | 1173 "subs %2, %2, #8 \n" // 8 processed per loop. |
1175 MEMACCESS(1) | 1174 MEMACCESS(1) |
1176 "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. | 1175 "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. |
1177 "bgt 1b \n" | 1176 "bgt 1b \n" |
1178 : "+r"(src_argb), // %0 | 1177 : "+r"(src_argb), // %0 |
1179 "+r"(dst_rgb24), // %1 | 1178 "+r"(dst_rgb24), // %1 |
1180 "+r"(pix) // %2 | 1179 "+r"(pix) // %2 |
1181 : | 1180 : |
1182 : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List | 1181 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List |
1183 ); | 1182 ); |
1184 } | 1183 } |
1185 #endif // HAS_ARGBTORGB24ROW_NEON | 1184 #endif // HAS_ARGBTORGB24ROW_NEON |
1186 | 1185 |
1187 #ifdef HAS_ARGBTORAWROW_NEON | 1186 #ifdef HAS_ARGBTORAWROW_NEON |
1188 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { | 1187 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { |
1189 asm volatile ( | 1188 asm volatile ( |
1190 ".p2align 2 \n" | 1189 ".p2align 2 \n" |
1191 "1: \n" | 1190 "1: \n" |
1192 MEMACCESS(0) | 1191 MEMACCESS(0) |
1193 "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. | 1192 "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a |
1194 "subs %2, %2, #8 \n" // 8 processed per loop. | 1193 "subs %2, %2, #8 \n" // 8 processed per loop. |
1195 "vswp.u8 d1, d3 \n" // swap R, B | 1194 "mov v4.8b, v2.8b \n" // mov g |
| 1195 "mov v5.8b, v1.8b \n" // mov b |
1196 MEMACCESS(1) | 1196 MEMACCESS(1) |
1197 "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. | 1197 "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b |
1198 "bgt 1b \n" | 1198 "bgt 1b \n" |
1199 : "+r"(src_argb), // %0 | 1199 : "+r"(src_argb), // %0 |
1200 "+r"(dst_raw), // %1 | 1200 "+r"(dst_raw), // %1 |
1201 "+r"(pix) // %2 | 1201 "+r"(pix) // %2 |
1202 : | 1202 : |
1203 : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List | 1203 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List |
1204 ); | 1204 ); |
1205 } | 1205 } |
1206 #endif // HAS_ARGBTORAWROW_NEON | 1206 #endif // HAS_ARGBTORAWROW_NEON |
1207 | 1207 |
1208 #ifdef HAS_YUY2TOYROW_NEON | 1208 #ifdef HAS_YUY2TOYROW_NEON |
1209 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { | 1209 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { |
1210 asm volatile ( | 1210 asm volatile ( |
1211 ".p2align 2 \n" | 1211 ".p2align 2 \n" |
1212 "1: \n" | 1212 "1: \n" |
1213 MEMACCESS(0) | 1213 MEMACCESS(0) |
1214 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. | 1214 "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. |
1215 "subs %2, %2, #16 \n" // 16 processed per loop. | 1215 "subs %2, %2, #16 \n" // 16 processed per loop. |
1216 MEMACCESS(1) | 1216 MEMACCESS(1) |
1217 "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. | 1217 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. |
1218 "bgt 1b \n" | 1218 "bgt 1b \n" |
1219 : "+r"(src_yuy2), // %0 | 1219 : "+r"(src_yuy2), // %0 |
1220 "+r"(dst_y), // %1 | 1220 "+r"(dst_y), // %1 |
1221 "+r"(pix) // %2 | 1221 "+r"(pix) // %2 |
1222 : | 1222 : |
1223 : "cc", "memory", "q0", "q1" // Clobber List | 1223 : "cc", "memory", "v0", "v1" // Clobber List |
1224 ); | 1224 ); |
1225 } | 1225 } |
1226 #endif // HAS_YUY2TOYROW_NEON | 1226 #endif // HAS_YUY2TOYROW_NEON |
1227 | 1227 |
1228 #ifdef HAS_UYVYTOYROW_NEON | 1228 #ifdef HAS_UYVYTOYROW_NEON |
1229 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { | 1229 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { |
1230 asm volatile ( | 1230 asm volatile ( |
1231 ".p2align 2 \n" | 1231 ".p2align 2 \n" |
1232 "1: \n" | 1232 "1: \n" |
1233 MEMACCESS(0) | 1233 MEMACCESS(0) |
1234 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. | 1234 "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. |
1235 "subs %2, %2, #16 \n" // 16 processed per loop. | 1235 "subs %2, %2, #16 \n" // 16 processed per loop. |
1236 MEMACCESS(1) | 1236 MEMACCESS(1) |
1237 "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. | 1237 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. |
1238 "bgt 1b \n" | 1238 "bgt 1b \n" |
1239 : "+r"(src_uyvy), // %0 | 1239 : "+r"(src_uyvy), // %0 |
1240 "+r"(dst_y), // %1 | 1240 "+r"(dst_y), // %1 |
1241 "+r"(pix) // %2 | 1241 "+r"(pix) // %2 |
1242 : | 1242 : |
1243 : "cc", "memory", "q0", "q1" // Clobber List | 1243 : "cc", "memory", "v0", "v1" // Clobber List |
1244 ); | 1244 ); |
1245 } | 1245 } |
1246 #endif // HAS_UYVYTOYROW_NEON | 1246 #endif // HAS_UYVYTOYROW_NEON |
1247 | 1247 |
1248 #ifdef HAS_YUY2TOUV422ROW_NEON | 1248 #ifdef HAS_YUY2TOUV422ROW_NEON |
1249 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, | 1249 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, |
1250 int pix) { | 1250 int pix) { |
1251 asm volatile ( | 1251 asm volatile ( |
1252 ".p2align 2 \n" | 1252 ".p2align 2 \n" |
1253 "1: \n" | 1253 "1: \n" |
1254 MEMACCESS(0) | 1254 MEMACCESS(0) |
1255 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. | 1255 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. |
1256 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. | 1256 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. |
1257 MEMACCESS(1) | 1257 MEMACCESS(1) |
1258 "vst1.8 {d1}, [%1]! \n" // store 8 U. | 1258 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. |
1259 MEMACCESS(2) | 1259 MEMACCESS(2) |
1260 "vst1.8 {d3}, [%2]! \n" // store 8 V. | 1260 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. |
1261 "bgt 1b \n" | 1261 "bgt 1b \n" |
1262 : "+r"(src_yuy2), // %0 | 1262 : "+r"(src_yuy2), // %0 |
1263 "+r"(dst_u), // %1 | 1263 "+r"(dst_u), // %1 |
1264 "+r"(dst_v), // %2 | 1264 "+r"(dst_v), // %2 |
1265 "+r"(pix) // %3 | 1265 "+r"(pix) // %3 |
1266 : | 1266 : |
1267 : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List | 1267 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
1268 ); | 1268 ); |
1269 } | 1269 } |
1270 #endif // HAS_YUY2TOUV422ROW_NEON | 1270 #endif // HAS_YUY2TOUV422ROW_NEON |
1271 | 1271 |
1272 #ifdef HAS_UYVYTOUV422ROW_NEON | 1272 #ifdef HAS_UYVYTOUV422ROW_NEON |
1273 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, | 1273 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, |
1274 int pix) { | 1274 int pix) { |
1275 asm volatile ( | 1275 asm volatile ( |
1276 ".p2align 2 \n" | 1276 ".p2align 2 \n" |
1277 "1: \n" | 1277 "1: \n" |
1278 MEMACCESS(0) | 1278 MEMACCESS(0) |
1279 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. | 1279 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. |
1280 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. | 1280 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. |
1281 MEMACCESS(1) | 1281 MEMACCESS(1) |
1282 "vst1.8 {d0}, [%1]! \n" // store 8 U. | 1282 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. |
1283 MEMACCESS(2) | 1283 MEMACCESS(2) |
1284 "vst1.8 {d2}, [%2]! \n" // store 8 V. | 1284 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. |
1285 "bgt 1b \n" | 1285 "bgt 1b \n" |
1286 : "+r"(src_uyvy), // %0 | 1286 : "+r"(src_uyvy), // %0 |
1287 "+r"(dst_u), // %1 | 1287 "+r"(dst_u), // %1 |
1288 "+r"(dst_v), // %2 | 1288 "+r"(dst_v), // %2 |
1289 "+r"(pix) // %3 | 1289 "+r"(pix) // %3 |
1290 : | 1290 : |
1291 : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List | 1291 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
1292 ); | 1292 ); |
1293 } | 1293 } |
1294 #endif // HAS_UYVYTOUV422ROW_NEON | 1294 #endif // HAS_UYVYTOUV422ROW_NEON |
1295 | 1295 |
1296 #ifdef HAS_YUY2TOUVROW_NEON | 1296 #ifdef HAS_YUY2TOUVROW_NEON |
1297 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, | 1297 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, |
1298 uint8* dst_u, uint8* dst_v, int pix) { | 1298 uint8* dst_u, uint8* dst_v, int pix) { |
1299 asm volatile ( | 1299 asm volatile ( |
1300 "add %1, %0, %1 \n" // stride + src_yuy2 | 1300 "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2 |
1301 ".p2align 2 \n" | 1301 ".p2align 2 \n" |
1302 "1: \n" | 1302 "1: \n" |
1303 MEMACCESS(0) | 1303 MEMACCESS(0) |
1304 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. | 1304 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. |
1305 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. | 1305 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. |
1306 MEMACCESS(1) | 1306 MEMACCESS(1) |
1307 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. | 1307 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2. |
1308 "vrhadd.u8 d1, d1, d5 \n" // average rows of U | 1308 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U |
1309 "vrhadd.u8 d3, d3, d7 \n" // average rows of V | 1309 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V |
1310 MEMACCESS(2) | 1310 MEMACCESS(2) |
1311 "vst1.8 {d1}, [%2]! \n" // store 8 U. | 1311 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. |
1312 MEMACCESS(3) | 1312 MEMACCESS(3) |
1313 "vst1.8 {d3}, [%3]! \n" // store 8 V. | 1313 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. |
1314 "bgt 1b \n" | 1314 "bgt 1b \n" |
1315 : "+r"(src_yuy2), // %0 | 1315 : "+r"(src_yuy2), // %0 |
1316 "+r"(stride_yuy2), // %1 | 1316 "+r"(stride_yuy2), // %1 |
1317 "+r"(dst_u), // %2 | 1317 "+r"(dst_u), // %2 |
1318 "+r"(dst_v), // %3 | 1318 "+r"(dst_v), // %3 |
1319 "+r"(pix) // %4 | 1319 "+r"(pix) // %4 |
1320 : | 1320 : |
1321 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber L
ist | 1321 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber L
ist |
1322 ); | 1322 ); |
1323 } | 1323 } |
1324 #endif // HAS_YUY2TOUVROW_NEON | 1324 #endif // HAS_YUY2TOUVROW_NEON |
1325 | 1325 |
1326 #ifdef HAS_UYVYTOUVROW_NEON | 1326 #ifdef HAS_UYVYTOUVROW_NEON |
1327 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, | 1327 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, |
1328 uint8* dst_u, uint8* dst_v, int pix) { | 1328 uint8* dst_u, uint8* dst_v, int pix) { |
1329 asm volatile ( | 1329 asm volatile ( |
1330 "add %1, %0, %1 \n" // stride + src_uyvy | 1330 "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy |
1331 ".p2align 2 \n" | 1331 ".p2align 2 \n" |
1332 "1: \n" | 1332 "1: \n" |
1333 MEMACCESS(0) | 1333 MEMACCESS(0) |
1334 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. | 1334 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. |
1335 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. | 1335 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. |
1336 MEMACCESS(1) | 1336 MEMACCESS(1) |
1337 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. | 1337 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY. |
1338 "vrhadd.u8 d0, d0, d4 \n" // average rows of U | 1338 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U |
1339 "vrhadd.u8 d2, d2, d6 \n" // average rows of V | 1339 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V |
1340 MEMACCESS(2) | 1340 MEMACCESS(2) |
1341 "vst1.8 {d0}, [%2]! \n" // store 8 U. | 1341 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. |
1342 MEMACCESS(3) | 1342 MEMACCESS(3) |
1343 "vst1.8 {d2}, [%3]! \n" // store 8 V. | 1343 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. |
1344 "bgt 1b \n" | 1344 "bgt 1b \n" |
1345 : "+r"(src_uyvy), // %0 | 1345 : "+r"(src_uyvy), // %0 |
1346 "+r"(stride_uyvy), // %1 | 1346 "+r"(stride_uyvy), // %1 |
1347 "+r"(dst_u), // %2 | 1347 "+r"(dst_u), // %2 |
1348 "+r"(dst_v), // %3 | 1348 "+r"(dst_v), // %3 |
1349 "+r"(pix) // %4 | 1349 "+r"(pix) // %4 |
1350 : | 1350 : |
1351 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber L
ist | 1351 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber L
ist |
1352 ); | 1352 ); |
1353 } | 1353 } |
1354 #endif // HAS_UYVYTOUVROW_NEON | 1354 #endif // HAS_UYVYTOUVROW_NEON |
1355 | 1355 |
1356 #ifdef HAS_HALFROW_NEON | 1356 #ifdef HAS_HALFROW_NEON |
1357 void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, | 1357 void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, |
1358 uint8* dst_uv, int pix) { | 1358 uint8* dst_uv, int pix) { |
1359 asm volatile ( | 1359 asm volatile ( |
1360 // change the stride to row 2 pointer | 1360 // change the stride to row 2 pointer |
1361 "add %1, %0 \n" | 1361 "add %x1, %x0, %w1, sxtw \n" |
1362 "1: \n" | 1362 "1: \n" |
1363 MEMACCESS(0) | 1363 MEMACCESS(0) |
1364 "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. | 1364 "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels. |
1365 "subs %3, %3, #16 \n" // 16 processed per loop | 1365 "subs %3, %3, #16 \n" // 16 processed per loop |
1366 MEMACCESS(1) | 1366 MEMACCESS(1) |
1367 "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. | 1367 "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels. |
1368 "vrhadd.u8 q0, q1 \n" // average row 1 and 2 | 1368 "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2 |
1369 MEMACCESS(2) | 1369 MEMACCESS(2) |
1370 "vst1.8 {q0}, [%2]! \n" | 1370 "st1 {v0.16b}, [%2], #16 \n" |
1371 "bgt 1b \n" | 1371 "bgt 1b \n" |
1372 : "+r"(src_uv), // %0 | 1372 : "+r"(src_uv), // %0 |
1373 "+r"(src_uv_stride), // %1 | 1373 "+r"(src_uv_stride), // %1 |
1374 "+r"(dst_uv), // %2 | 1374 "+r"(dst_uv), // %2 |
1375 "+r"(pix) // %3 | 1375 "+r"(pix) // %3 |
1376 : | 1376 : |
1377 : "cc", "memory", "q0", "q1" // Clobber List | 1377 : "cc", "memory", "v0", "v1" // Clobber List |
1378 ); | 1378 ); |
1379 } | 1379 } |
1380 #endif // HAS_HALFROW_NEON | 1380 #endif // HAS_HALFROW_NEON |
1381 | 1381 |
1382 // Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG | 1382 // Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG |
1383 #ifdef HAS_ARGBTOBAYERROW_NEON | 1383 #ifdef HAS_ARGBTOBAYERROW_NEON |
1384 void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, | 1384 void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, |
1385 uint32 selector, int pix) { | 1385 uint32 selector, int pix) { |
1386 asm volatile ( | 1386 asm volatile ( |
1387 "vmov.u32 d6[0], %3 \n" // selector | 1387 "mov v2.s[0], %w3 \n" // selector |
1388 "1: \n" | 1388 "1: \n" |
1389 MEMACCESS(0) | 1389 MEMACCESS(0) |
1390 "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. | 1390 "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 8 pixels. |
1391 "subs %2, %2, #8 \n" // 8 processed per loop | 1391 "subs %2, %2, #8 \n" // 8 processed per loop |
1392 "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels | 1392 "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels |
1393 "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels | 1393 "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels |
1394 "vtrn.u32 d4, d5 \n" // combine 8 pixels | 1394 "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels |
1395 MEMACCESS(1) | 1395 MEMACCESS(1) |
1396 "vst1.8 {d4}, [%1]! \n" // store 8. | 1396 "st1 {v4.8b}, [%1], #8 \n" // store 8. |
1397 "bgt 1b \n" | 1397 "bgt 1b \n" |
1398 : "+r"(src_argb), // %0 | 1398 : "+r"(src_argb), // %0 |
1399 "+r"(dst_bayer), // %1 | 1399 "+r"(dst_bayer), // %1 |
1400 "+r"(pix) // %2 | 1400 "+r"(pix) // %2 |
1401 : "r"(selector) // %3 | 1401 : "r"(selector) // %3 |
1402 : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List | 1402 : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List |
1403 ); | 1403 ); |
1404 } | 1404 } |
1405 #endif // HAS_ARGBTOBAYERROW_NEON | 1405 #endif // HAS_ARGBTOBAYERROW_NEON |
1406 | 1406 |
1407 // Select G channels from ARGB. e.g. GGGGGGGG | 1407 // Select G channels from ARGB. e.g. GGGGGGGG |
1408 #ifdef HAS_ARGBTOBAYERGGROW_NEON | 1408 #ifdef HAS_ARGBTOBAYERGGROW_NEON |
1409 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, | 1409 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, |
1410 uint32 /*selector*/, int pix) { | 1410 uint32 /*selector*/, int pix) { |
1411 asm volatile ( | 1411 asm volatile ( |
1412 "1: \n" | 1412 "1: \n" |
1413 MEMACCESS(0) | 1413 MEMACCESS(0) |
1414 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. | 1414 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels. |
1415 "subs %2, %2, #8 \n" // 8 processed per loop | 1415 "subs %2, %2, #8 \n" // 8 processed per loop |
1416 MEMACCESS(1) | 1416 MEMACCESS(1) |
1417 "vst1.8 {d1}, [%1]! \n" // store 8 G's. | 1417 "st1 {v1.8b}, [%1], #8 \n" // store 8 G's. |
1418 "bgt 1b \n" | 1418 "bgt 1b \n" |
1419 : "+r"(src_argb), // %0 | 1419 : "+r"(src_argb), // %0 |
1420 "+r"(dst_bayer), // %1 | 1420 "+r"(dst_bayer), // %1 |
1421 "+r"(pix) // %2 | 1421 "+r"(pix) // %2 |
1422 : | 1422 : |
1423 : "cc", "memory", "q0", "q1" // Clobber List | 1423 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
1424 ); | 1424 ); |
1425 } | 1425 } |
1426 #endif // HAS_ARGBTOBAYERGGROW_NEON | 1426 #endif // HAS_ARGBTOBAYERGGROW_NEON |
1427 | 1427 |
1428 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 1428 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
1429 #ifdef HAS_ARGBSHUFFLEROW_NEON | 1429 #ifdef HAS_ARGBSHUFFLEROW_NEON |
1430 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, | 1430 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, |
1431 const uint8* shuffler, int pix) { | 1431 const uint8* shuffler, int pix) { |
1432 asm volatile ( | 1432 asm volatile ( |
1433 MEMACCESS(3) | 1433 MEMACCESS(3) |
1434 "vld1.8 {q2}, [%3] \n" // shuffler | 1434 "ld1 {v2.16b}, [%3] \n" // shuffler |
1435 "1: \n" | 1435 "1: \n" |
1436 MEMACCESS(0) | 1436 MEMACCESS(0) |
1437 "vld1.8 {q0}, [%0]! \n" // load 4 pixels. | 1437 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. |
1438 "subs %2, %2, #4 \n" // 4 processed per loop | 1438 "subs %2, %2, #4 \n" // 4 processed per loop |
1439 "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels | 1439 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels |
1440 "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels | |
1441 MEMACCESS(1) | 1440 MEMACCESS(1) |
1442 "vst1.8 {q1}, [%1]! \n" // store 4. | 1441 "st1 {v1.16b}, [%1], #16 \n" // store 4. |
1443 "bgt 1b \n" | 1442 "bgt 1b \n" |
1444 : "+r"(src_argb), // %0 | 1443 : "+r"(src_argb), // %0 |
1445 "+r"(dst_argb), // %1 | 1444 "+r"(dst_argb), // %1 |
1446 "+r"(pix) // %2 | 1445 "+r"(pix) // %2 |
1447 : "r"(shuffler) // %3 | 1446 : "r"(shuffler) // %3 |
1448 : "cc", "memory", "q0", "q1", "q2" // Clobber List | 1447 : "cc", "memory", "v0", "v1", "v2" // Clobber List |
1449 ); | 1448 ); |
1450 } | 1449 } |
1451 #endif // HAS_ARGBSHUFFLEROW_NEON | 1450 #endif // HAS_ARGBSHUFFLEROW_NEON |
1452 | 1451 |
1453 #ifdef HAS_I422TOYUY2ROW_NEON | 1452 #ifdef HAS_I422TOYUY2ROW_NEON |
1454 void I422ToYUY2Row_NEON(const uint8* src_y, | 1453 void I422ToYUY2Row_NEON(const uint8* src_y, |
1455 const uint8* src_u, | 1454 const uint8* src_u, |
1456 const uint8* src_v, | 1455 const uint8* src_v, |
1457 uint8* dst_yuy2, int width) { | 1456 uint8* dst_yuy2, int width) { |
1458 asm volatile ( | 1457 asm volatile ( |
1459 ".p2align 2 \n" | 1458 ".p2align 2 \n" |
1460 "1: \n" | 1459 "1: \n" |
1461 MEMACCESS(0) | 1460 MEMACCESS(0) |
1462 "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys | 1461 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys |
| 1462 "mov v2.8b, v1.8b \n" |
1463 MEMACCESS(1) | 1463 MEMACCESS(1) |
1464 "vld1.8 {d1}, [%1]! \n" // load 8 Us | 1464 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us |
1465 MEMACCESS(2) | 1465 MEMACCESS(2) |
1466 "vld1.8 {d3}, [%2]! \n" // load 8 Vs | 1466 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs |
1467 "subs %4, %4, #16 \n" // 16 pixels | 1467 "subs %4, %4, #16 \n" // 16 pixels |
1468 MEMACCESS(3) | 1468 MEMACCESS(3) |
1469 "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. | 1469 "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels. |
1470 "bgt 1b \n" | 1470 "bgt 1b \n" |
1471 : "+r"(src_y), // %0 | 1471 : "+r"(src_y), // %0 |
1472 "+r"(src_u), // %1 | 1472 "+r"(src_u), // %1 |
1473 "+r"(src_v), // %2 | 1473 "+r"(src_v), // %2 |
1474 "+r"(dst_yuy2), // %3 | 1474 "+r"(dst_yuy2), // %3 |
1475 "+r"(width) // %4 | 1475 "+r"(width) // %4 |
1476 : | 1476 : |
1477 : "cc", "memory", "d0", "d1", "d2", "d3" | 1477 : "cc", "memory", "v0", "v1", "v2", "v3" |
1478 ); | 1478 ); |
1479 } | 1479 } |
1480 #endif // HAS_I422TOYUY2ROW_NEON | 1480 #endif // HAS_I422TOYUY2ROW_NEON |
1481 | 1481 |
1482 #ifdef HAS_I422TOUYVYROW_NEON | 1482 #ifdef HAS_I422TOUYVYROW_NEON |
1483 void I422ToUYVYRow_NEON(const uint8* src_y, | 1483 void I422ToUYVYRow_NEON(const uint8* src_y, |
1484 const uint8* src_u, | 1484 const uint8* src_u, |
1485 const uint8* src_v, | 1485 const uint8* src_v, |
1486 uint8* dst_uyvy, int width) { | 1486 uint8* dst_uyvy, int width) { |
1487 asm volatile ( | 1487 asm volatile ( |
1488 ".p2align 2 \n" | 1488 ".p2align 2 \n" |
1489 "1: \n" | 1489 "1: \n" |
1490 MEMACCESS(0) | 1490 MEMACCESS(0) |
1491 "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys | 1491 "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys |
| 1492 "mov v3.8b, v2.8b \n" |
1492 MEMACCESS(1) | 1493 MEMACCESS(1) |
1493 "vld1.8 {d0}, [%1]! \n" // load 8 Us | 1494 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us |
1494 MEMACCESS(2) | 1495 MEMACCESS(2) |
1495 "vld1.8 {d2}, [%2]! \n" // load 8 Vs | 1496 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs |
1496 "subs %4, %4, #16 \n" // 16 pixels | 1497 "subs %4, %4, #16 \n" // 16 pixels |
1497 MEMACCESS(3) | 1498 MEMACCESS(3) |
1498 "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. | 1499 "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels. |
1499 "bgt 1b \n" | 1500 "bgt 1b \n" |
1500 : "+r"(src_y), // %0 | 1501 : "+r"(src_y), // %0 |
1501 "+r"(src_u), // %1 | 1502 "+r"(src_u), // %1 |
1502 "+r"(src_v), // %2 | 1503 "+r"(src_v), // %2 |
1503 "+r"(dst_uyvy), // %3 | 1504 "+r"(dst_uyvy), // %3 |
1504 "+r"(width) // %4 | 1505 "+r"(width) // %4 |
1505 : | 1506 : |
1506 : "cc", "memory", "d0", "d1", "d2", "d3" | 1507 : "cc", "memory", "v0", "v1", "v2", "v3" |
1507 ); | 1508 ); |
1508 } | 1509 } |
1509 #endif // HAS_I422TOUYVYROW_NEON | 1510 #endif // HAS_I422TOUYVYROW_NEON |
1510 | 1511 |
1511 #ifdef HAS_ARGBTORGB565ROW_NEON | 1512 #ifdef HAS_ARGBTORGB565ROW_NEON |
1512 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { | 1513 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { |
1513 asm volatile ( | 1514 asm volatile ( |
1514 ".p2align 2 \n" | 1515 ".p2align 2 \n" |
1515 "1: \n" | 1516 "1: \n" |
1516 MEMACCESS(0) | 1517 MEMACCESS(0) |
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1570 "+r"(pix) // %2 | 1571 "+r"(pix) // %2 |
1571 : | 1572 : |
1572 : "cc", "memory", "q0", "q8", "q9", "q10", "q11" | 1573 : "cc", "memory", "q0", "q8", "q9", "q10", "q11" |
1573 ); | 1574 ); |
1574 } | 1575 } |
1575 #endif // HAS_ARGBTOARGB4444ROW_NEON | 1576 #endif // HAS_ARGBTOARGB4444ROW_NEON |
1576 | 1577 |
1577 #ifdef HAS_ARGBTOYROW_NEON | 1578 #ifdef HAS_ARGBTOYROW_NEON |
1578 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { | 1579 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { |
1579 asm volatile ( | 1580 asm volatile ( |
1580 "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient | 1581 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
1581 "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient | 1582 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
1582 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient | 1583 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
1583 "vmov.u8 d27, #16 \n" // Add 16 constant | 1584 "movi v7.8b, #16 \n" // Add 16 constant |
1584 ".p2align 2 \n" | 1585 ".p2align 2 \n" |
1585 "1: \n" | 1586 "1: \n" |
1586 MEMACCESS(0) | 1587 MEMACCESS(0) |
1587 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. | 1588 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
1588 "subs %2, %2, #8 \n" // 8 processed per loop. | 1589 "subs %2, %2, #8 \n" // 8 processed per loop. |
1589 "vmull.u8 q2, d0, d24 \n" // B | 1590 "umull v3.8h, v0.8b, v4.8b \n" // B |
1590 "vmlal.u8 q2, d1, d25 \n" // G | 1591 "umlal v3.8h, v1.8b, v5.8b \n" // G |
1591 "vmlal.u8 q2, d2, d26 \n" // R | 1592 "umlal v3.8h, v2.8b, v6.8b \n" // R |
1592 "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y | 1593 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
1593 "vqadd.u8 d0, d27 \n" | 1594 "uqadd v0.8b, v0.8b, v7.8b \n" |
1594 MEMACCESS(1) | 1595 MEMACCESS(1) |
1595 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. | 1596 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
1596 "bgt 1b \n" | 1597 "bgt 1b \n" |
1597 : "+r"(src_argb), // %0 | 1598 : "+r"(src_argb), // %0 |
1598 "+r"(dst_y), // %1 | 1599 "+r"(dst_y), // %1 |
1599 "+r"(pix) // %2 | 1600 "+r"(pix) // %2 |
1600 : | 1601 : |
1601 : "cc", "memory", "q0", "q1", "q2", "q12", "q13" | 1602 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
1602 ); | 1603 ); |
1603 } | 1604 } |
1604 #endif // HAS_ARGBTOYROW_NEON | 1605 #endif // HAS_ARGBTOYROW_NEON |
1605 | 1606 |
1606 #ifdef HAS_ARGBTOYJROW_NEON | 1607 #ifdef HAS_ARGBTOYJROW_NEON |
1607 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { | 1608 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { |
1608 asm volatile ( | 1609 asm volatile ( |
1609 "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient | 1610 "movi v4.8b, #15 \n" // B * 0.11400 coefficient |
1610 "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient | 1611 "movi v5.8b, #75 \n" // G * 0.58700 coefficient |
1611 "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient | 1612 "movi v6.8b, #38 \n" // R * 0.29900 coefficient |
1612 ".p2align 2 \n" | 1613 ".p2align 2 \n" |
1613 "1: \n" | 1614 "1: \n" |
1614 MEMACCESS(0) | 1615 MEMACCESS(0) |
1615 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. | 1616 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
1616 "subs %2, %2, #8 \n" // 8 processed per loop. | 1617 "subs %2, %2, #8 \n" // 8 processed per loop. |
1617 "vmull.u8 q2, d0, d24 \n" // B | 1618 "umull v3.8h, v0.8b, v4.8b \n" // B |
1618 "vmlal.u8 q2, d1, d25 \n" // G | 1619 "umlal v3.8h, v1.8b, v5.8b \n" // G |
1619 "vmlal.u8 q2, d2, d26 \n" // R | 1620 "umlal v3.8h, v2.8b, v6.8b \n" // R |
1620 "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y | 1621 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y |
1621 MEMACCESS(1) | 1622 MEMACCESS(1) |
1622 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. | 1623 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
1623 "bgt 1b \n" | 1624 "bgt 1b \n" |
1624 : "+r"(src_argb), // %0 | 1625 : "+r"(src_argb), // %0 |
1625 "+r"(dst_y), // %1 | 1626 "+r"(dst_y), // %1 |
1626 "+r"(pix) // %2 | 1627 "+r"(pix) // %2 |
1627 : | 1628 : |
1628 : "cc", "memory", "q0", "q1", "q2", "q12", "q13" | 1629 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" |
1629 ); | 1630 ); |
1630 } | 1631 } |
1631 #endif // HAS_ARGBTOYJROW_NEON | 1632 #endif // HAS_ARGBTOYJROW_NEON |
1632 | 1633 |
1633 // 8x1 pixels. | 1634 // 8x1 pixels. |
1634 #ifdef HAS_ARGBTOUV444ROW_NEON | 1635 #ifdef HAS_ARGBTOUV444ROW_NEON |
1635 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | 1636 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |
1636 int pix) { | 1637 int pix) { |
1637 asm volatile ( | 1638 asm volatile ( |
1638 "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient | 1639 "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient |
(...skipping 1402 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3041 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. | 3042 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. |
3042 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. | 3043 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. |
3043 #ifdef HAS_ARGBMULTIPLYROW_NEON | 3044 #ifdef HAS_ARGBMULTIPLYROW_NEON |
3044 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 3045 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
3045 uint8* dst_argb, int width) { | 3046 uint8* dst_argb, int width) { |
3046 asm volatile ( | 3047 asm volatile ( |
3047 // 8 pixel loop. | 3048 // 8 pixel loop. |
3048 ".p2align 2 \n" | 3049 ".p2align 2 \n" |
3049 "1: \n" | 3050 "1: \n" |
3050 MEMACCESS(0) | 3051 MEMACCESS(0) |
3051 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. | 3052 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
3052 MEMACCESS(1) | 3053 MEMACCESS(1) |
3053 "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. | 3054 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. |
3054 "subs %3, %3, #8 \n" // 8 processed per loop. | 3055 "subs %3, %3, #8 \n" // 8 processed per loop. |
3055 "vmull.u8 q0, d0, d1 \n" // multiply B | 3056 "umull v0.8h, v0.8b, v4.8b \n" // multiply B |
3056 "vmull.u8 q1, d2, d3 \n" // multiply G | 3057 "umull v1.8h, v1.8b, v5.8b \n" // multiply G |
3057 "vmull.u8 q2, d4, d5 \n" // multiply R | 3058 "umull v2.8h, v2.8b, v6.8b \n" // multiply R |
3058 "vmull.u8 q3, d6, d7 \n" // multiply A | 3059 "umull v3.8h, v3.8b, v7.8b \n" // multiply A |
3059 "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B | 3060 "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B |
3060 "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G | 3061 "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G |
3061 "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R | 3062 "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R |
3062 "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A | 3063 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A |
3063 MEMACCESS(2) | 3064 MEMACCESS(2) |
3064 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. | 3065 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. |
3065 "bgt 1b \n" | 3066 "bgt 1b \n" |
3066 | 3067 |
3067 : "+r"(src_argb0), // %0 | 3068 : "+r"(src_argb0), // %0 |
3068 "+r"(src_argb1), // %1 | 3069 "+r"(src_argb1), // %1 |
3069 "+r"(dst_argb), // %2 | 3070 "+r"(dst_argb), // %2 |
3070 "+r"(width) // %3 | 3071 "+r"(width) // %3 |
3071 : | 3072 : |
3072 : "cc", "memory", "q0", "q1", "q2", "q3" | 3073 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
3073 ); | 3074 ); |
3074 } | 3075 } |
3075 #endif // HAS_ARGBMULTIPLYROW_NEON | 3076 #endif // HAS_ARGBMULTIPLYROW_NEON |
3076 | 3077 |
3077 // Add 2 rows of ARGB pixels together, 8 pixels at a time. | 3078 // Add 2 rows of ARGB pixels together, 8 pixels at a time. |
3078 #ifdef HAS_ARGBADDROW_NEON | 3079 #ifdef HAS_ARGBADDROW_NEON |
3079 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 3080 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
3080 uint8* dst_argb, int width) { | 3081 uint8* dst_argb, int width) { |
3081 asm volatile ( | 3082 asm volatile ( |
3082 // 8 pixel loop. | 3083 // 8 pixel loop. |
3083 ".p2align 2 \n" | 3084 ".p2align 2 \n" |
3084 "1: \n" | 3085 "1: \n" |
3085 MEMACCESS(0) | 3086 MEMACCESS(0) |
3086 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. | 3087 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
3087 MEMACCESS(1) | 3088 MEMACCESS(1) |
3088 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. | 3089 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. |
3089 "subs %3, %3, #8 \n" // 8 processed per loop. | 3090 "subs %3, %3, #8 \n" // 8 processed per loop. |
3090 "vqadd.u8 q0, q0, q2 \n" // add B, G | 3091 "uqadd v0.8b, v0.8b, v4.8b \n" |
3091 "vqadd.u8 q1, q1, q3 \n" // add R, A | 3092 "uqadd v1.8b, v1.8b, v5.8b \n" |
| 3093 "uqadd v2.8b, v2.8b, v6.8b \n" |
| 3094 "uqadd v3.8b, v3.8b, v7.8b \n" |
3092 MEMACCESS(2) | 3095 MEMACCESS(2) |
3093 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. | 3096 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. |
3094 "bgt 1b \n" | 3097 "bgt 1b \n" |
3095 | 3098 |
3096 : "+r"(src_argb0), // %0 | 3099 : "+r"(src_argb0), // %0 |
3097 "+r"(src_argb1), // %1 | 3100 "+r"(src_argb1), // %1 |
3098 "+r"(dst_argb), // %2 | 3101 "+r"(dst_argb), // %2 |
3099 "+r"(width) // %3 | 3102 "+r"(width) // %3 |
3100 : | 3103 : |
3101 : "cc", "memory", "q0", "q1", "q2", "q3" | 3104 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
3102 ); | 3105 ); |
3103 } | 3106 } |
3104 #endif // HAS_ARGBADDROW_NEON | 3107 #endif // HAS_ARGBADDROW_NEON |
3105 | 3108 |
3106 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. | 3109 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. |
3107 #ifdef HAS_ARGBSUBTRACTROW_NEON | 3110 #ifdef HAS_ARGBSUBTRACTROW_NEON |
3108 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 3111 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
3109 uint8* dst_argb, int width) { | 3112 uint8* dst_argb, int width) { |
3110 asm volatile ( | 3113 asm volatile ( |
3111 // 8 pixel loop. | 3114 // 8 pixel loop. |
3112 ".p2align 2 \n" | 3115 ".p2align 2 \n" |
3113 "1: \n" | 3116 "1: \n" |
3114 MEMACCESS(0) | 3117 MEMACCESS(0) |
3115 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. | 3118 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
3116 MEMACCESS(1) | 3119 MEMACCESS(1) |
3117 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. | 3120 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. |
3118 "subs %3, %3, #8 \n" // 8 processed per loop. | 3121 "subs %3, %3, #8 \n" // 8 processed per loop. |
3119 "vqsub.u8 q0, q0, q2 \n" // subtract B, G | 3122 "uqsub v0.8b, v0.8b, v4.8b \n" |
3120 "vqsub.u8 q1, q1, q3 \n" // subtract R, A | 3123 "uqsub v1.8b, v1.8b, v5.8b \n" |
| 3124 "uqsub v2.8b, v2.8b, v6.8b \n" |
| 3125 "uqsub v3.8b, v3.8b, v7.8b \n" |
3121 MEMACCESS(2) | 3126 MEMACCESS(2) |
3122 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. | 3127 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. |
3123 "bgt 1b \n" | 3128 "bgt 1b \n" |
3124 | 3129 |
3125 : "+r"(src_argb0), // %0 | 3130 : "+r"(src_argb0), // %0 |
3126 "+r"(src_argb1), // %1 | 3131 "+r"(src_argb1), // %1 |
3127 "+r"(dst_argb), // %2 | 3132 "+r"(dst_argb), // %2 |
3128 "+r"(width) // %3 | 3133 "+r"(width) // %3 |
3129 : | 3134 : |
3130 : "cc", "memory", "q0", "q1", "q2", "q3" | 3135 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
3131 ); | 3136 ); |
3132 } | 3137 } |
3133 #endif // HAS_ARGBSUBTRACTROW_NEON | 3138 #endif // HAS_ARGBSUBTRACTROW_NEON |
3134 | 3139 |
3135 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. | 3140 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. |
3136 // A = 255 | 3141 // A = 255 |
3137 // R = Sobel | 3142 // R = Sobel |
3138 // G = Sobel | 3143 // G = Sobel |
3139 // B = Sobel | 3144 // B = Sobel |
3140 #ifdef HAS_SOBELROW_NEON | 3145 #ifdef HAS_SOBELROW_NEON |
3141 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, | 3146 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, |
3142 uint8* dst_argb, int width) { | 3147 uint8* dst_argb, int width) { |
3143 asm volatile ( | 3148 asm volatile ( |
3144 "vmov.u8 d3, #255 \n" // alpha | 3149 "movi v3.8b, #255 \n" // alpha |
3145 // 8 pixel loop. | 3150 // 8 pixel loop. |
3146 ".p2align 2 \n" | 3151 ".p2align 2 \n" |
3147 "1: \n" | 3152 "1: \n" |
3148 MEMACCESS(0) | 3153 MEMACCESS(0) |
3149 "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. | 3154 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. |
3150 MEMACCESS(1) | 3155 MEMACCESS(1) |
3151 "vld1.8 {d1}, [%1]! \n" // load 8 sobely. | 3156 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. |
3152 "subs %3, %3, #8 \n" // 8 processed per loop. | 3157 "subs %3, %3, #8 \n" // 8 processed per loop. |
3153 "vqadd.u8 d0, d0, d1 \n" // add | 3158 "uqadd v0.8b, v0.8b, v1.8b \n" // add |
3154 "vmov.u8 d1, d0 \n" | 3159 "mov v1.8b, v0.8b \n" |
3155 "vmov.u8 d2, d0 \n" | 3160 "mov v2.8b, v0.8b \n" |
3156 MEMACCESS(2) | 3161 MEMACCESS(2) |
3157 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. | 3162 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. |
3158 "bgt 1b \n" | 3163 "bgt 1b \n" |
3159 : "+r"(src_sobelx), // %0 | 3164 : "+r"(src_sobelx), // %0 |
3160 "+r"(src_sobely), // %1 | 3165 "+r"(src_sobely), // %1 |
3161 "+r"(dst_argb), // %2 | 3166 "+r"(dst_argb), // %2 |
3162 "+r"(width) // %3 | 3167 "+r"(width) // %3 |
3163 : | 3168 : |
3164 : "cc", "memory", "q0", "q1" | 3169 : "cc", "memory", "v0", "v1", "v2", "v3" |
3165 ); | 3170 ); |
3166 } | 3171 } |
3167 #endif // HAS_SOBELROW_NEON | 3172 #endif // HAS_SOBELROW_NEON |
3168 | 3173 |
3169 // Adds Sobel X and Sobel Y and stores Sobel into plane. | 3174 // Adds Sobel X and Sobel Y and stores Sobel into plane. |
3170 #ifdef HAS_SOBELTOPLANEROW_NEON | 3175 #ifdef HAS_SOBELTOPLANEROW_NEON |
3171 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, | 3176 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, |
3172 uint8* dst_y, int width) { | 3177 uint8* dst_y, int width) { |
3173 asm volatile ( | 3178 asm volatile ( |
3174 // 16 pixel loop. | 3179 // 16 pixel loop. |
3175 ".p2align 2 \n" | 3180 ".p2align 2 \n" |
3176 "1: \n" | 3181 "1: \n" |
3177 MEMACCESS(0) | 3182 MEMACCESS(0) |
3178 "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. | 3183 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. |
3179 MEMACCESS(1) | 3184 MEMACCESS(1) |
3180 "vld1.8 {q1}, [%1]! \n" // load 16 sobely. | 3185 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. |
3181 "subs %3, %3, #16 \n" // 16 processed per loop. | 3186 "subs %3, %3, #16 \n" // 16 processed per loop. |
3182 "vqadd.u8 q0, q0, q1 \n" // add | 3187 "uqadd v0.16b, v0.16b, v1.16b \n" // add |
3183 MEMACCESS(2) | 3188 MEMACCESS(2) |
3184 "vst1.8 {q0}, [%2]! \n" // store 16 pixels. | 3189 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. |
3185 "bgt 1b \n" | 3190 "bgt 1b \n" |
3186 : "+r"(src_sobelx), // %0 | 3191 : "+r"(src_sobelx), // %0 |
3187 "+r"(src_sobely), // %1 | 3192 "+r"(src_sobely), // %1 |
3188 "+r"(dst_y), // %2 | 3193 "+r"(dst_y), // %2 |
3189 "+r"(width) // %3 | 3194 "+r"(width) // %3 |
3190 : | 3195 : |
3191 : "cc", "memory", "q0", "q1" | 3196 : "cc", "memory", "v0", "v1" |
3192 ); | 3197 ); |
3193 } | 3198 } |
3194 #endif // HAS_SOBELTOPLANEROW_NEON | 3199 #endif // HAS_SOBELTOPLANEROW_NEON |
3195 | 3200 |
3196 // Mixes Sobel X, Sobel Y and Sobel into ARGB. | 3201 // Mixes Sobel X, Sobel Y and Sobel into ARGB. |
3197 // A = 255 | 3202 // A = 255 |
3198 // R = Sobel X | 3203 // R = Sobel X |
3199 // G = Sobel | 3204 // G = Sobel |
3200 // B = Sobel Y | 3205 // B = Sobel Y |
3201 #ifdef HAS_SOBELXYROW_NEON | 3206 #ifdef HAS_SOBELXYROW_NEON |
3202 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, | 3207 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, |
3203 uint8* dst_argb, int width) { | 3208 uint8* dst_argb, int width) { |
3204 asm volatile ( | 3209 asm volatile ( |
3205 "vmov.u8 d3, #255 \n" // alpha | 3210 "movi v3.8b, #255 \n" // alpha |
3206 // 8 pixel loop. | 3211 // 8 pixel loop. |
3207 ".p2align 2 \n" | 3212 ".p2align 2 \n" |
3208 "1: \n" | 3213 "1: \n" |
3209 MEMACCESS(0) | 3214 MEMACCESS(0) |
3210 "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. | 3215 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. |
3211 MEMACCESS(1) | 3216 MEMACCESS(1) |
3212 "vld1.8 {d0}, [%1]! \n" // load 8 sobely. | 3217 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. |
3213 "subs %3, %3, #8 \n" // 8 processed per loop. | 3218 "subs %3, %3, #8 \n" // 8 processed per loop. |
3214 "vqadd.u8 d1, d0, d2 \n" // add | 3219 "uqadd v1.8b, v0.8b, v2.8b \n" // add |
3215 MEMACCESS(2) | 3220 MEMACCESS(2) |
3216 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. | 3221 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. |
3217 "bgt 1b \n" | 3222 "bgt 1b \n" |
3218 : "+r"(src_sobelx), // %0 | 3223 : "+r"(src_sobelx), // %0 |
3219 "+r"(src_sobely), // %1 | 3224 "+r"(src_sobely), // %1 |
3220 "+r"(dst_argb), // %2 | 3225 "+r"(dst_argb), // %2 |
3221 "+r"(width) // %3 | 3226 "+r"(width) // %3 |
3222 : | 3227 : |
3223 : "cc", "memory", "q0", "q1" | 3228 : "cc", "memory", "v0", "v1", "v2", "v3" |
3224 ); | 3229 ); |
3225 } | 3230 } |
3226 #endif // HAS_SOBELXYROW_NEON | 3231 #endif // HAS_SOBELXYROW_NEON |
3227 | 3232 |
3228 // SobelX as a matrix is | 3233 // SobelX as a matrix is |
3229 // -1 0 1 | 3234 // -1 0 1 |
3230 // -2 0 2 | 3235 // -2 0 2 |
3231 // -1 0 1 | 3236 // -1 0 1 |
3232 #ifdef HAS_SOBELXROW_NEON | 3237 #ifdef HAS_SOBELXROW_NEON |
3233 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, | 3238 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, |
3234 const uint8* src_y2, uint8* dst_sobelx, int width) { | 3239 const uint8* src_y2, uint8* dst_sobelx, int width) { |
3235 asm volatile ( | 3240 asm volatile ( |
3236 ".p2align 2 \n" | 3241 ".p2align 2 \n" |
3237 "1: \n" | 3242 "1: \n" |
3238 MEMACCESS(0) | 3243 MEMACCESS(0) |
3239 "vld1.8 {d0}, [%0],%5 \n" // top | 3244 "ld1 {v0.8b}, [%0],%5 \n" // top |
3240 MEMACCESS(0) | 3245 MEMACCESS(0) |
3241 "vld1.8 {d1}, [%0],%6 \n" | 3246 "ld1 {v1.8b}, [%0],%6 \n" |
3242 "vsubl.u8 q0, d0, d1 \n" | 3247 "usubl v0.8h, v0.8b, v1.8b \n" |
3243 MEMACCESS(1) | 3248 MEMACCESS(1) |
3244 "vld1.8 {d2}, [%1],%5 \n" // center * 2 | 3249 "ld1 {v2.8b}, [%1],%5 \n" // center * 2 |
3245 MEMACCESS(1) | 3250 MEMACCESS(1) |
3246 "vld1.8 {d3}, [%1],%6 \n" | 3251 "ld1 {v3.8b}, [%1],%6 \n" |
3247 "vsubl.u8 q1, d2, d3 \n" | 3252 "usubl v1.8h, v2.8b, v3.8b \n" |
3248 "vadd.s16 q0, q0, q1 \n" | 3253 "add v0.8h, v0.8h, v1.8h \n" |
3249 "vadd.s16 q0, q0, q1 \n" | 3254 "add v0.8h, v0.8h, v1.8h \n" |
3250 MEMACCESS(2) | 3255 MEMACCESS(2) |
3251 "vld1.8 {d2}, [%2],%5 \n" // bottom | 3256 "ld1 {v2.8b}, [%2],%5 \n" // bottom |
3252 MEMACCESS(2) | 3257 MEMACCESS(2) |
3253 "vld1.8 {d3}, [%2],%6 \n" | 3258 "ld1 {v3.8b}, [%2],%6 \n" |
3254 "subs %4, %4, #8 \n" // 8 pixels | 3259 "subs %4, %4, #8 \n" // 8 pixels |
3255 "vsubl.u8 q1, d2, d3 \n" | 3260 "usubl v1.8h, v2.8b, v3.8b \n" |
3256 "vadd.s16 q0, q0, q1 \n" | 3261 "add v0.8h, v0.8h, v1.8h \n" |
3257 "vabs.s16 q0, q0 \n" | 3262 "abs v0.8h, v0.8h \n" |
3258 "vqmovn.u16 d0, q0 \n" | 3263 "uqxtn v0.8b, v0.8h \n" |
3259 MEMACCESS(3) | 3264 MEMACCESS(3) |
3260 "vst1.8 {d0}, [%3]! \n" // store 8 sobelx | 3265 "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx |
3261 "bgt 1b \n" | 3266 "bgt 1b \n" |
3262 : "+r"(src_y0), // %0 | 3267 : "+r"(src_y0), // %0 |
3263 "+r"(src_y1), // %1 | 3268 "+r"(src_y1), // %1 |
3264 "+r"(src_y2), // %2 | 3269 "+r"(src_y2), // %2 |
3265 "+r"(dst_sobelx), // %3 | 3270 "+r"(dst_sobelx), // %3 |
3266 "+r"(width) // %4 | 3271 "+r"(width) // %4 |
3267 : "r"(2), // %5 | 3272 : "r"((intptr_t)2), // %5 |
3268 "r"(6) // %6 | 3273 "r"((intptr_t)6) // %6 |
3269 : "cc", "memory", "q0", "q1" // Clobber List | 3274 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
3270 ); | 3275 ); |
3271 } | 3276 } |
3272 #endif // HAS_SOBELXROW_NEON | 3277 #endif // HAS_SOBELXROW_NEON |
3273 | 3278 |
3274 // SobelY as a matrix is | 3279 // SobelY as a matrix is |
3275 // -1 -2 -1 | 3280 // -1 -2 -1 |
3276 // 0 0 0 | 3281 // 0 0 0 |
3277 // 1 2 1 | 3282 // 1 2 1 |
3278 #ifdef HAS_SOBELYROW_NEON | 3283 #ifdef HAS_SOBELYROW_NEON |
3279 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, | 3284 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, |
3280 uint8* dst_sobely, int width) { | 3285 uint8* dst_sobely, int width) { |
3281 asm volatile ( | 3286 asm volatile ( |
3282 ".p2align 2 \n" | 3287 ".p2align 2 \n" |
3283 "1: \n" | 3288 "1: \n" |
3284 MEMACCESS(0) | 3289 MEMACCESS(0) |
3285 "vld1.8 {d0}, [%0],%4 \n" // left | 3290 "ld1 {v0.8b}, [%0],%4 \n" // left |
3286 MEMACCESS(1) | 3291 MEMACCESS(1) |
3287 "vld1.8 {d1}, [%1],%4 \n" | 3292 "ld1 {v1.8b}, [%1],%4 \n" |
3288 "vsubl.u8 q0, d0, d1 \n" | 3293 "usubl v0.8h, v0.8b, v1.8b \n" |
3289 MEMACCESS(0) | 3294 MEMACCESS(0) |
3290 "vld1.8 {d2}, [%0],%4 \n" // center * 2 | 3295 "ld1 {v2.8b}, [%0],%4 \n" // center * 2 |
3291 MEMACCESS(1) | 3296 MEMACCESS(1) |
3292 "vld1.8 {d3}, [%1],%4 \n" | 3297 "ld1 {v3.8b}, [%1],%4 \n" |
3293 "vsubl.u8 q1, d2, d3 \n" | 3298 "usubl v1.8h, v2.8b, v3.8b \n" |
3294 "vadd.s16 q0, q0, q1 \n" | 3299 "add v0.8h, v0.8h, v1.8h \n" |
3295 "vadd.s16 q0, q0, q1 \n" | 3300 "add v0.8h, v0.8h, v1.8h \n" |
3296 MEMACCESS(0) | 3301 MEMACCESS(0) |
3297 "vld1.8 {d2}, [%0],%5 \n" // right | 3302 "ld1 {v2.8b}, [%0],%5 \n" // right |
3298 MEMACCESS(1) | 3303 MEMACCESS(1) |
3299 "vld1.8 {d3}, [%1],%5 \n" | 3304 "ld1 {v3.8b}, [%1],%5 \n" |
3300 "subs %3, %3, #8 \n" // 8 pixels | 3305 "subs %3, %3, #8 \n" // 8 pixels |
3301 "vsubl.u8 q1, d2, d3 \n" | 3306 "usubl v1.8h, v2.8b, v3.8b \n" |
3302 "vadd.s16 q0, q0, q1 \n" | 3307 "add v0.8h, v0.8h, v1.8h \n" |
3303 "vabs.s16 q0, q0 \n" | 3308 "abs v0.8h, v0.8h \n" |
3304 "vqmovn.u16 d0, q0 \n" | 3309 "uqxtn v0.8b, v0.8h \n" |
3305 MEMACCESS(2) | 3310 MEMACCESS(2) |
3306 "vst1.8 {d0}, [%2]! \n" // store 8 sobely | 3311 "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely |
3307 "bgt 1b \n" | 3312 "bgt 1b \n" |
3308 : "+r"(src_y0), // %0 | 3313 : "+r"(src_y0), // %0 |
3309 "+r"(src_y1), // %1 | 3314 "+r"(src_y1), // %1 |
3310 "+r"(dst_sobely), // %2 | 3315 "+r"(dst_sobely), // %2 |
3311 "+r"(width) // %3 | 3316 "+r"(width) // %3 |
3312 : "r"(1), // %4 | 3317 : "r"((intptr_t)1), // %4 |
3313 "r"(6) // %5 | 3318 "r"((intptr_t)6) // %5 |
3314 : "cc", "memory", "q0", "q1" // Clobber List | 3319 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
3315 ); | 3320 ); |
3316 } | 3321 } |
3317 #endif // HAS_SOBELYROW_NEON | 3322 #endif // HAS_SOBELYROW_NEON |
3318 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 3323 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
3319 | 3324 |
3320 #ifdef __cplusplus | 3325 #ifdef __cplusplus |
3321 } // extern "C" | 3326 } // extern "C" |
3322 } // namespace libyuv | 3327 } // namespace libyuv |
3323 #endif | 3328 #endif |
OLD | NEW |