OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 884 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
895 : "+r"(src), // %0 | 895 : "+r"(src), // %0 |
896 "+r"(dst), // %1 | 896 "+r"(dst), // %1 |
897 "+r"(width64) // %2 | 897 "+r"(width64) // %2 |
898 : "r"((ptrdiff_t)-16) // %3 | 898 : "r"((ptrdiff_t)-16) // %3 |
899 : "cc", "memory", "v0" | 899 : "cc", "memory", "v0" |
900 ); | 900 ); |
901 } | 901 } |
902 #endif // HAS_ARGBMIRRORROW_NEON | 902 #endif // HAS_ARGBMIRRORROW_NEON |
903 | 903 |
904 #ifdef HAS_RGB24TOARGBROW_NEON | 904 #ifdef HAS_RGB24TOARGBROW_NEON |
// RGB24ToARGBRow_NEON: expands packed 3-byte pixels to 4-byte pixels by
// appending a constant 255 alpha byte to each one.
//   src_rgb24: source row; 24 bytes (8 pixels) are consumed per pass.
//   dst_argb:  destination row; 32 bytes (8 pixels) are written per pass.
//   pix/width: pixel count (the argument is named pix in OLD, width in NEW).
// ld3 de-interleaves the three source bytes into v1..v3 and st4 re-interleaves
// them followed by v4 (alpha), so the color byte order is left unchanged.
// The loop body runs before the count is tested and steps by 8, so the count
// is effectively rounded up to a multiple of 8 (minimum 8). subs sets the
// flags that b.gt tests; the store in between does not alter them.
905 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { | 905 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { |
906 asm volatile ( | 906 asm volatile ( |
907 "movi v4.8b, #255 \n" // Alpha | 907 "movi v4.8b, #255 \n" // Alpha |
908 "1: \n" | 908 "1: \n" |
909 MEMACCESS(0) | 909 MEMACCESS(0) |
910 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. | 910 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. |
911 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 911 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
912 MEMACCESS(1) | 912 MEMACCESS(1) |
913 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels | 913 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels |
914 "b.gt 1b \n" | 914 "b.gt 1b \n" |
915 : "+r"(src_rgb24), // %0 | 915 : "+r"(src_rgb24), // %0 |
916 "+r"(dst_argb), // %1 | 916 "+r"(dst_argb), // %1 |
917 "+r"(pix) // %2 | 917 "+r"(width) // %2 |
918 : | 918 : |
919 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List | 919 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List |
920 ); | 920 ); |
921 } | 921 } |
922 #endif // HAS_RGB24TOARGBROW_NEON | 922 #endif // HAS_RGB24TOARGBROW_NEON |
923 | 923 |
924 #ifdef HAS_RAWTOARGBROW_NEON | 924 #ifdef HAS_RAWTOARGBROW_NEON |
// RAWToARGBRow_NEON: converts packed 3-byte pixels to 4-byte pixels, reversing
// the order of the three color bytes and appending a constant 255 alpha byte.
//   src_raw:   source row; 24 bytes (8 pixels) are consumed per pass.
//   dst_argb:  destination row; 32 bytes (8 pixels) are written per pass.
//   pix/width: pixel count (the argument is named pix in OLD, width in NEW).
// ld3 loads the three source bytes into v0,v1,v2. st4 needs four consecutive
// registers, so v1 and v0 are copied (orr with itself is a register move) into
// v3 and v4, and v2,v3,v4,v5 are stored: source byte 2, byte 1, byte 0, alpha.
// The loop body runs before the count is tested and steps by 8, so the count
// is effectively rounded up to a multiple of 8 (minimum 8).
925 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { | 925 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { |
926 asm volatile ( | 926 asm volatile ( |
927 "movi v5.8b, #255 \n" // Alpha | 927 "movi v5.8b, #255 \n" // Alpha |
928 "1: \n" | 928 "1: \n" |
929 MEMACCESS(0) | 929 MEMACCESS(0) |
930 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b | 930 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b |
931 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 931 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
932 "orr v3.8b, v1.8b, v1.8b \n" // move g | 932 "orr v3.8b, v1.8b, v1.8b \n" // move g |
933 "orr v4.8b, v0.8b, v0.8b \n" // move r | 933 "orr v4.8b, v0.8b, v0.8b \n" // move r |
934 MEMACCESS(1) | 934 MEMACCESS(1) |
935 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a | 935 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a |
936 "b.gt 1b \n" | 936 "b.gt 1b \n" |
937 : "+r"(src_raw), // %0 | 937 : "+r"(src_raw), // %0 |
938 "+r"(dst_argb), // %1 | 938 "+r"(dst_argb), // %1 |
939 "+r"(pix) // %2 | 939 "+r"(width) // %2 |
940 : | 940 : |
941 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List | 941 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List |
942 ); | 942 ); |
943 } | 943 } |
944 #endif // HAS_RAWTOARGBROW_NEON | 944 #endif // HAS_RAWTOARGBROW_NEON |
945 | 945 |
// RGB565TOARGB: asm fragment that unpacks 8 RGB565 pixels held as halfwords in
// v0.8h into 8-bit channels: B in v0 (low 8 bytes), G in v1, R in v2.
// Each channel is widened by placing its bits at the top of the byte and
// OR-ing the most significant bits back into the vacated low bits (6-bit G
// gets its top 2 bits repeated, 5-bit B and R get their top 3 bits repeated).
// B and R are processed together in one 16-byte register (B in the low half,
// R in the high half) and the final dup moves the R half down into v2.
// Scratch registers: v4 and v6. v3 is not touched, so the caller can keep a
// constant alpha there.
946 #define RGB565TOARGB \ | 946 #define RGB565TOARGB \ |
947 "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ | 947 "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ |
948 "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ | 948 "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ |
949 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ | 949 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ |
950 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ | 950 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ |
951 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ | 951 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ |
952 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ | 952 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ |
953 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ | 953 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ |
954 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ | 954 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ |
955 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ | 955 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ |
956 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ | 956 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ |
957 "dup v2.2D, v0.D[1] \n" /* R */ | 957 "dup v2.2D, v0.D[1] \n" /* R */ |
958 | 958 |
959 #ifdef HAS_RGB565TOARGBROW_NEON | 959 #ifdef HAS_RGB565TOARGBROW_NEON |
// RGB565ToARGBRow_NEON: converts a row of 16-bit RGB565 pixels to 4-byte ARGB
// with a constant 255 alpha.
//   src_rgb565: source row; 16 bytes (8 pixels) are consumed per pass.
//   dst_argb:   destination row; 32 bytes (8 pixels) are written per pass.
//   pix/width:  pixel count (the argument is named pix in OLD, width in NEW).
// The RGB565TOARGB fragment unpacks v0 into B,G,R in v0,v1,v2 and leaves v3
// alone, so alpha is set once before the loop; st4 then interleaves
// v0..v3 into memory. v4 and v6 are in the clobber list because the fragment
// uses them as scratch.
// The loop body runs before the count is tested and steps by 8, so the count
// is effectively rounded up to a multiple of 8 (minimum 8).
960 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { | 960 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { |
961 asm volatile ( | 961 asm volatile ( |
962 "movi v3.8b, #255 \n" // Alpha | 962 "movi v3.8b, #255 \n" // Alpha |
963 "1: \n" | 963 "1: \n" |
964 MEMACCESS(0) | 964 MEMACCESS(0) |
965 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. | 965 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. |
966 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 966 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
967 RGB565TOARGB | 967 RGB565TOARGB |
968 MEMACCESS(1) | 968 MEMACCESS(1) |
969 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 969 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
970 "b.gt 1b \n" | 970 "b.gt 1b \n" |
971 : "+r"(src_rgb565), // %0 | 971 : "+r"(src_rgb565), // %0 |
972 "+r"(dst_argb), // %1 | 972 "+r"(dst_argb), // %1 |
973 "+r"(pix) // %2 | 973 "+r"(width) // %2 |
974 : | 974 : |
975 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List | 975 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List |
976 ); | 976 ); |
977 } | 977 } |
978 #endif // HAS_RGB565TOARGBROW_NEON | 978 #endif // HAS_RGB565TOARGBROW_NEON |
979 | 979 |
980 #define ARGB1555TOARGB \ | 980 #define ARGB1555TOARGB \ |
981 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ | 981 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ |
982 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ | 982 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ |
983 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ | 983 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ |
(...skipping 25 matching lines...) Expand all Loading... |
1009 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ | 1009 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ |
1010 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ | 1010 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ |
1011 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ | 1011 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ |
1012 \ | 1012 \ |
1013 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ | 1013 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ |
1014 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ | 1014 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ |
1015 "dup v1.2D, v0.D[1] \n" /* G */ \ | 1015 "dup v1.2D, v0.D[1] \n" /* G */ \ |
1016 | 1016 |
1017 #ifdef HAS_ARGB1555TOARGBROW_NEON | 1017 #ifdef HAS_ARGB1555TOARGBROW_NEON |
// ARGB1555ToARGBRow_NEON: converts a row of 16-bit ARGB1555 pixels to 4-byte
// ARGB.
//   src_argb1555: source row; 16 bytes (8 pixels) are consumed per pass.
//   dst_argb:     destination row; 32 bytes (8 pixels) are written per pass.
//   pix/width:    pixel count (the argument is named pix in OLD, width in NEW).
// The ARGB1555TOARGB fragment unpacks v0 into the registers that st4 then
// interleaves (v0..v3).
// NOTE(review): the fragment itself writes v3 (its visible "xtn v3.8b" line),
// so whether the 255 loaded into v3 before the loop survives to the store
// depends on fragment lines not visible in this view -- confirm.
// The loop body runs before the count is tested and steps by 8, so the count
// is effectively rounded up to a multiple of 8 (minimum 8).
1018 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, | 1018 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, |
1019 int pix) { | 1019 int width) { |
1020 asm volatile ( | 1020 asm volatile ( |
1021 "movi v3.8b, #255 \n" // Alpha | 1021 "movi v3.8b, #255 \n" // Alpha |
1022 "1: \n" | 1022 "1: \n" |
1023 MEMACCESS(0) | 1023 MEMACCESS(0) |
1024 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. | 1024 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. |
1025 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1025 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1026 ARGB1555TOARGB | 1026 ARGB1555TOARGB |
1027 MEMACCESS(1) | 1027 MEMACCESS(1) |
1028 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 1028 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
1029 "b.gt 1b \n" | 1029 "b.gt 1b \n" |
1030 : "+r"(src_argb1555), // %0 | 1030 : "+r"(src_argb1555), // %0 |
1031 "+r"(dst_argb), // %1 | 1031 "+r"(dst_argb), // %1 |
1032 "+r"(pix) // %2 | 1032 "+r"(width) // %2 |
1033 : | 1033 : |
1034 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 1034 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
1035 ); | 1035 ); |
1036 } | 1036 } |
1037 #endif // HAS_ARGB1555TOARGBROW_NEON | 1037 #endif // HAS_ARGB1555TOARGBROW_NEON |
1038 | 1038 |
// ARGB4444TOARGB: asm fragment that unpacks 8 ARGB4444 pixels held as
// halfwords in v0.8h into 8-bit channels: B in v0, G in v1, R in v2 and A in
// v3 (low 8 bytes of each).
// The high bytes (A,R nibbles) and low bytes (G,B nibbles) of the halfwords
// are first gathered into the two halves of v1. Each nibble is then widened
// to a byte by OR-ing it with itself shifted by 4, so nibble n becomes 0xnn.
// After the two orr instructions v2 holds R (low half) / B (high half) and v3
// holds A (low half) / G (high half); the dup pair copies the high halves
// down into v0 (B) and v1 (G). No registers other than v0..v3 are used.
1039 #define ARGB4444TOARGB \ | 1039 #define ARGB4444TOARGB \ |
1040 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ | 1040 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ |
1041 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ | 1041 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ |
1042 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ | 1042 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ |
1043 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ | 1043 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ |
1044 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ | 1044 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ |
1045 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ | 1045 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ |
1046 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ | 1046 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ |
1047 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ | 1047 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ |
1048 "dup v0.2D, v2.D[1] \n" \ | 1048 "dup v0.2D, v2.D[1] \n" \ |
1049 "dup v1.2D, v3.D[1] \n" | 1049 "dup v1.2D, v3.D[1] \n" |
1050 | 1050 |
1051 #ifdef HAS_ARGB4444TOARGBROW_NEON | 1051 #ifdef HAS_ARGB4444TOARGBROW_NEON |
// ARGB4444ToARGBRow_NEON: converts a row of 16-bit ARGB4444 pixels to 4-byte
// ARGB, widening every 4-bit channel (alpha included) to 8 bits.
//   src_argb4444: source row; 16 bytes (8 pixels) are consumed per pass.
//   dst_argb:     destination row; 32 bytes (8 pixels) are written per pass.
//   pix/width:    pixel count (the argument is named pix in OLD, width in NEW).
// The ARGB4444TOARGB fragment leaves B,G,R,A in v0..v3, which st4 interleaves
// into memory. Alpha comes from the source, so no constant is preloaded.
// v4 is listed as clobbered although neither this body nor the fragment
// writes it; that only costs the compiler a register and is otherwise harmless.
// The loop body runs before the count is tested and steps by 8, so the count
// is effectively rounded up to a multiple of 8 (minimum 8).
1052 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, | 1052 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, |
1053 int pix) { | 1053 int width) { |
1054 asm volatile ( | 1054 asm volatile ( |
1055 "1: \n" | 1055 "1: \n" |
1056 MEMACCESS(0) | 1056 MEMACCESS(0) |
1057 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. | 1057 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. |
1058 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1058 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1059 ARGB4444TOARGB | 1059 ARGB4444TOARGB |
1060 MEMACCESS(1) | 1060 MEMACCESS(1) |
1061 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 1061 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
1062 "b.gt 1b \n" | 1062 "b.gt 1b \n" |
1063 : "+r"(src_argb4444), // %0 | 1063 : "+r"(src_argb4444), // %0 |
1064 "+r"(dst_argb), // %1 | 1064 "+r"(dst_argb), // %1 |
1065 "+r"(pix) // %2 | 1065 "+r"(width) // %2 |
1066 : | 1066 : |
1067 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List | 1067 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List |
1068 ); | 1068 ); |
1069 } | 1069 } |
1070 #endif // HAS_ARGB4444TOARGBROW_NEON | 1070 #endif // HAS_ARGB4444TOARGBROW_NEON |
1071 | 1071 |
1072 #ifdef HAS_ARGBTORGB24ROW_NEON | 1072 #ifdef HAS_ARGBTORGB24ROW_NEON |
// ARGBToRGB24Row_NEON: packs 4-byte ARGB pixels into 3-byte pixels by dropping
// the fourth (alpha) byte of each pixel.
//   src_argb:  source row; 32 bytes (8 pixels) are consumed per pass.
//   dst_rgb24: destination row; 24 bytes (8 pixels) are written per pass.
//   pix/width: pixel count (the argument is named pix in OLD, width in NEW).
// ld4 de-interleaves the four source bytes into v1..v4; st3 writes back only
// v1..v3, so the color byte order is unchanged and v4 is discarded.
// The loop body runs before the count is tested and steps by 8, so the count
// is effectively rounded up to a multiple of 8 (minimum 8).
1073 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { | 1073 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { |
1074 asm volatile ( | 1074 asm volatile ( |
1075 "1: \n" | 1075 "1: \n" |
1076 MEMACCESS(0) | 1076 MEMACCESS(0) |
1077 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels | 1077 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels |
1078 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1078 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1079 MEMACCESS(1) | 1079 MEMACCESS(1) |
1080 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. | 1080 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. |
1081 "b.gt 1b \n" | 1081 "b.gt 1b \n" |
1082 : "+r"(src_argb), // %0 | 1082 : "+r"(src_argb), // %0 |
1083 "+r"(dst_rgb24), // %1 | 1083 "+r"(dst_rgb24), // %1 |
1084 "+r"(pix) // %2 | 1084 "+r"(width) // %2 |
1085 : | 1085 : |
1086 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List | 1086 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List |
1087 ); | 1087 ); |
1088 } | 1088 } |
1089 #endif // HAS_ARGBTORGB24ROW_NEON | 1089 #endif // HAS_ARGBTORGB24ROW_NEON |
1090 | 1090 |
1091 #ifdef HAS_ARGBTORAWROW_NEON | 1091 #ifdef HAS_ARGBTORAWROW_NEON |
// ARGBToRAWRow_NEON: packs 4-byte ARGB pixels into 3-byte pixels, dropping the
// fourth (alpha) byte and reversing the order of the three color bytes.
//   src_argb:  source row; 32 bytes (8 pixels) are consumed per pass.
//   dst_raw:   destination row; 24 bytes (8 pixels) are written per pass.
//   pix/width: pixel count (the argument is named pix in OLD, width in NEW).
// ld4 loads the four source bytes into v1..v4. st3 needs three consecutive
// registers, so v2 and v1 are copied (orr with itself is a register move)
// into v4 and v5 and v3,v4,v5 are stored: source byte 2, byte 1, byte 0.
// The copy into v4 overwrites the loaded alpha, which is not needed.
// The loop body runs before the count is tested and steps by 8, so the count
// is effectively rounded up to a multiple of 8 (minimum 8).
1092 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { | 1092 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { |
1093 asm volatile ( | 1093 asm volatile ( |
1094 "1: \n" | 1094 "1: \n" |
1095 MEMACCESS(0) | 1095 MEMACCESS(0) |
1096 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a | 1096 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a |
1097 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1097 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1098 "orr v4.8b, v2.8b, v2.8b \n" // mov g | 1098 "orr v4.8b, v2.8b, v2.8b \n" // mov g |
1099 "orr v5.8b, v1.8b, v1.8b \n" // mov b | 1099 "orr v5.8b, v1.8b, v1.8b \n" // mov b |
1100 MEMACCESS(1) | 1100 MEMACCESS(1) |
1101 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b | 1101 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b |
1102 "b.gt 1b \n" | 1102 "b.gt 1b \n" |
1103 : "+r"(src_argb), // %0 | 1103 : "+r"(src_argb), // %0 |
1104 "+r"(dst_raw), // %1 | 1104 "+r"(dst_raw), // %1 |
1105 "+r"(pix) // %2 | 1105 "+r"(width) // %2 |
1106 : | 1106 : |
1107 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List | 1107 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List |
1108 ); | 1108 ); |
1109 } | 1109 } |
1110 #endif // HAS_ARGBTORAWROW_NEON | 1110 #endif // HAS_ARGBTORAWROW_NEON |
1111 | 1111 |
1112 #ifdef HAS_YUY2TOYROW_NEON | 1112 #ifdef HAS_YUY2TOYROW_NEON |
// YUY2ToYRow_NEON: extracts the Y plane from a row of packed YUY2 by keeping
// every even-indexed source byte.
//   src_yuy2:  source row; 32 bytes (16 pixels) are consumed per pass.
//   dst_y:     destination row; 16 bytes are written per pass.
//   pix/width: pixel count (the argument is named pix in OLD, width in NEW).
// ld2 de-interleaves byte pairs: even bytes land in v0, odd bytes in v1.
// Only v0 is stored; v1 (the chroma bytes) is discarded.
// The loop body runs before the count is tested and steps by 16, so the count
// is effectively rounded up to a multiple of 16 (minimum 16).
1113 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { | 1113 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { |
1114 asm volatile ( | 1114 asm volatile ( |
1115 "1: \n" | 1115 "1: \n" |
1116 MEMACCESS(0) | 1116 MEMACCESS(0) |
1117 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. | 1117 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. |
1118 "subs %w2, %w2, #16 \n" // 16 processed per loop. | 1118 "subs %w2, %w2, #16 \n" // 16 processed per loop. |
1119 MEMACCESS(1) | 1119 MEMACCESS(1) |
1120 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. | 1120 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. |
1121 "b.gt 1b \n" | 1121 "b.gt 1b \n" |
1122 : "+r"(src_yuy2), // %0 | 1122 : "+r"(src_yuy2), // %0 |
1123 "+r"(dst_y), // %1 | 1123 "+r"(dst_y), // %1 |
1124 "+r"(pix) // %2 | 1124 "+r"(width) // %2 |
1125 : | 1125 : |
1126 : "cc", "memory", "v0", "v1" // Clobber List | 1126 : "cc", "memory", "v0", "v1" // Clobber List |
1127 ); | 1127 ); |
1128 } | 1128 } |
1129 #endif // HAS_YUY2TOYROW_NEON | 1129 #endif // HAS_YUY2TOYROW_NEON |
1130 | 1130 |
1131 #ifdef HAS_UYVYTOYROW_NEON | 1131 #ifdef HAS_UYVYTOYROW_NEON |
// UYVYToYRow_NEON: extracts the Y plane from a row of packed UYVY by keeping
// every odd-indexed source byte.
//   src_uyvy:  source row; 32 bytes (16 pixels) are consumed per pass.
//   dst_y:     destination row; 16 bytes are written per pass.
//   pix/width: pixel count (the argument is named pix in OLD, width in NEW).
// ld2 de-interleaves byte pairs: even bytes land in v0, odd bytes in v1.
// Only v1 is stored; v0 (the chroma bytes) is discarded. This is the same
// loop as YUY2ToYRow_NEON with the other register stored.
// The loop body runs before the count is tested and steps by 16, so the count
// is effectively rounded up to a multiple of 16 (minimum 16).
1132 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { | 1132 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { |
1133 asm volatile ( | 1133 asm volatile ( |
1134 "1: \n" | 1134 "1: \n" |
1135 MEMACCESS(0) | 1135 MEMACCESS(0) |
1136 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. | 1136 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. |
1137 "subs %w2, %w2, #16 \n" // 16 processed per loop. | 1137 "subs %w2, %w2, #16 \n" // 16 processed per loop. |
1138 MEMACCESS(1) | 1138 MEMACCESS(1) |
1139 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. | 1139 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. |
1140 "b.gt 1b \n" | 1140 "b.gt 1b \n" |
1141 : "+r"(src_uyvy), // %0 | 1141 : "+r"(src_uyvy), // %0 |
1142 "+r"(dst_y), // %1 | 1142 "+r"(dst_y), // %1 |
1143 "+r"(pix) // %2 | 1143 "+r"(width) // %2 |
1144 : | 1144 : |
1145 : "cc", "memory", "v0", "v1" // Clobber List | 1145 : "cc", "memory", "v0", "v1" // Clobber List |
1146 ); | 1146 ); |
1147 } | 1147 } |
1148 #endif // HAS_UYVYTOYROW_NEON | 1148 #endif // HAS_UYVYTOYROW_NEON |
1149 | 1149 |
1150 #ifdef HAS_YUY2TOUV422ROW_NEON | 1150 #ifdef HAS_YUY2TOUV422ROW_NEON |
// YUY2ToUV422Row_NEON: splits the chroma of one packed YUY2 row into separate
// U and V rows (one U and one V sample per two pixels, no vertical averaging).
//   src_yuy2:  source row; 32 bytes (16 pixels) are consumed per pass.
//   dst_u:     destination U row; 8 bytes are written per pass.
//   dst_v:     destination V row; 8 bytes are written per pass.
//   pix/width: pixel count (the argument is named pix in OLD, width in NEW).
// ld4 de-interleaves each 4-byte group into v0..v3 (bytes 0..3). Byte 1 (v1)
// is stored as U and byte 3 (v3) as V; v0 and v2 (luma) are discarded.
// The loop body runs before the count is tested and steps by 16, so the count
// is effectively rounded up to a multiple of 16 (minimum 16).
1151 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, | 1151 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, |
1152 int pix) { | 1152 int width) { |
1153 asm volatile ( | 1153 asm volatile ( |
1154 "1: \n" | 1154 "1: \n" |
1155 MEMACCESS(0) | 1155 MEMACCESS(0) |
1156 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels | 1156 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels |
1157 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. | 1157 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. |
1158 MEMACCESS(1) | 1158 MEMACCESS(1) |
1159 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. | 1159 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. |
1160 MEMACCESS(2) | 1160 MEMACCESS(2) |
1161 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. | 1161 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. |
1162 "b.gt 1b \n" | 1162 "b.gt 1b \n" |
1163 : "+r"(src_yuy2), // %0 | 1163 : "+r"(src_yuy2), // %0 |
1164 "+r"(dst_u), // %1 | 1164 "+r"(dst_u), // %1 |
1165 "+r"(dst_v), // %2 | 1165 "+r"(dst_v), // %2 |
1166 "+r"(pix) // %3 | 1166 "+r"(width) // %3 |
1167 : | 1167 : |
1168 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 1168 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
1169 ); | 1169 ); |
1170 } | 1170 } |
1171 #endif // HAS_YUY2TOUV422ROW_NEON | 1171 #endif // HAS_YUY2TOUV422ROW_NEON |
1172 | 1172 |
1173 #ifdef HAS_UYVYTOUV422ROW_NEON | 1173 #ifdef HAS_UYVYTOUV422ROW_NEON |
// UYVYToUV422Row_NEON: splits the chroma of one packed UYVY row into separate
// U and V rows (one U and one V sample per two pixels, no vertical averaging).
//   src_uyvy:  source row; 32 bytes (16 pixels) are consumed per pass.
//   dst_u:     destination U row; 8 bytes are written per pass.
//   dst_v:     destination V row; 8 bytes are written per pass.
//   pix/width: pixel count (the argument is named pix in OLD, width in NEW).
// ld4 de-interleaves each 4-byte group into v0..v3 (bytes 0..3). Byte 0 (v0)
// is stored as U and byte 2 (v2) as V; v1 and v3 (luma) are discarded.
// The loop body runs before the count is tested and steps by 16, so the count
// is effectively rounded up to a multiple of 16 (minimum 16).
1174 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, | 1174 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, |
1175 int pix) { | 1175 int width) { |
1176 asm volatile ( | 1176 asm volatile ( |
1177 "1: \n" | 1177 "1: \n" |
1178 MEMACCESS(0) | 1178 MEMACCESS(0) |
1179 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels | 1179 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels |
1180 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. | 1180 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. |
1181 MEMACCESS(1) | 1181 MEMACCESS(1) |
1182 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. | 1182 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. |
1183 MEMACCESS(2) | 1183 MEMACCESS(2) |
1184 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. | 1184 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. |
1185 "b.gt 1b \n" | 1185 "b.gt 1b \n" |
1186 : "+r"(src_uyvy), // %0 | 1186 : "+r"(src_uyvy), // %0 |
1187 "+r"(dst_u), // %1 | 1187 "+r"(dst_u), // %1 |
1188 "+r"(dst_v), // %2 | 1188 "+r"(dst_v), // %2 |
1189 "+r"(pix) // %3 | 1189 "+r"(width) // %3 |
1190 : | 1190 : |
1191 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 1191 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
1192 ); | 1192 ); |
1193 } | 1193 } |
1194 #endif // HAS_UYVYTOUV422ROW_NEON | 1194 #endif // HAS_UYVYTOUV422ROW_NEON |
1195 | 1195 |
1196 #ifdef HAS_YUY2TOUVROW_NEON | 1196 #ifdef HAS_YUY2TOUVROW_NEON |
// YUY2ToUVRow_NEON: produces one U row and one V row from two adjacent packed
// YUY2 rows by averaging the chroma of the two rows.
//   src_yuy2:    first source row; 32 bytes (16 pixels) consumed per pass.
//   stride_yuy2: byte offset from src_yuy2 to the second source row.
//   dst_u:       destination U row; 8 bytes are written per pass.
//   dst_v:       destination V row; 8 bytes are written per pass.
//   pix/width:   pixel count (the argument is named pix in OLD, width in NEW).
// Both rows are de-interleaved with ld4 (first row into v0..v3, second into
// v4..v7). Byte 1 of each 4-byte group is U and byte 3 is V; urhadd combines
// the two rows as a rounding halving add, i.e. (a + b + 1) >> 1 per byte.
// The loop body runs before the count is tested and steps by 16, so the count
// is effectively rounded up to a multiple of 16 (minimum 16).
1197 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, | 1197 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, |
1198 uint8* dst_u, uint8* dst_v, int pix) { | 1198 uint8* dst_u, uint8* dst_v, int width) { |
1199 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; | 1199 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; |
1200 asm volatile ( | 1200 asm volatile ( |
1201 "1: \n" | 1201 "1: \n" |
1202 MEMACCESS(0) | 1202 MEMACCESS(0) |
1203 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels | 1203 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels |
1204 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. | 1204 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. |
1205 MEMACCESS(1) | 1205 MEMACCESS(1) |
1206 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row | 1206 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row |
1207 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U | 1207 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U |
1208 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V | 1208 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V |
1209 MEMACCESS(2) | 1209 MEMACCESS(2) |
1210 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. | 1210 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. |
1211 MEMACCESS(3) | 1211 MEMACCESS(3) |
1212 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. | 1212 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. |
1213 "b.gt 1b \n" | 1213 "b.gt 1b \n" |
1214 : "+r"(src_yuy2), // %0 | 1214 : "+r"(src_yuy2), // %0 |
1215 "+r"(src_yuy2b), // %1 | 1215 "+r"(src_yuy2b), // %1 |
1216 "+r"(dst_u), // %2 | 1216 "+r"(dst_u), // %2 |
1217 "+r"(dst_v), // %3 | 1217 "+r"(dst_v), // %3 |
1218 "+r"(pix) // %4 | 1218 "+r"(width) // %4 |
1219 : | 1219 : |
1220 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", | 1220 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", |
1221 "v5", "v6", "v7" // Clobber List | 1221 "v5", "v6", "v7" // Clobber List |
1222 ); | 1222 ); |
1223 } | 1223 } |
1224 #endif // HAS_YUY2TOUVROW_NEON | 1224 #endif // HAS_YUY2TOUVROW_NEON |
1225 | 1225 |
1226 #ifdef HAS_UYVYTOUVROW_NEON | 1226 #ifdef HAS_UYVYTOUVROW_NEON |
// UYVYToUVRow_NEON: produces one U row and one V row from two adjacent packed
// UYVY rows by averaging the chroma of the two rows.
//   src_uyvy:    first source row; 32 bytes (16 pixels) consumed per pass.
//   stride_uyvy: byte offset from src_uyvy to the second source row.
//   dst_u:       destination U row; 8 bytes are written per pass.
//   dst_v:       destination V row; 8 bytes are written per pass.
//   pix/width:   pixel count (the argument is named pix in OLD, width in NEW).
// Both rows are de-interleaved with ld4 (first row into v0..v3, second into
// v4..v7). Byte 0 of each 4-byte group is U and byte 2 is V; urhadd combines
// the two rows as a rounding halving add, i.e. (a + b + 1) >> 1 per byte.
// The loop body runs before the count is tested and steps by 16, so the count
// is effectively rounded up to a multiple of 16 (minimum 16).
1227 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, | 1227 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, |
1228 uint8* dst_u, uint8* dst_v, int pix) { | 1228 uint8* dst_u, uint8* dst_v, int width) { |
1229 const uint8* src_uyvyb = src_uyvy + stride_uyvy; | 1229 const uint8* src_uyvyb = src_uyvy + stride_uyvy; |
1230 asm volatile ( | 1230 asm volatile ( |
1231 "1: \n" | 1231 "1: \n" |
1232 MEMACCESS(0) | 1232 MEMACCESS(0) |
1233 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels | 1233 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels |
1234 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. | 1234 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. |
1235 MEMACCESS(1) | 1235 MEMACCESS(1) |
1236 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row | 1236 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row |
1237 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U | 1237 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U |
1238 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V | 1238 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V |
1239 MEMACCESS(2) | 1239 MEMACCESS(2) |
1240 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. | 1240 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. |
1241 MEMACCESS(3) | 1241 MEMACCESS(3) |
1242 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. | 1242 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. |
1243 "b.gt 1b \n" | 1243 "b.gt 1b \n" |
1244 : "+r"(src_uyvy), // %0 | 1244 : "+r"(src_uyvy), // %0 |
1245 "+r"(src_uyvyb), // %1 | 1245 "+r"(src_uyvyb), // %1 |
1246 "+r"(dst_u), // %2 | 1246 "+r"(dst_u), // %2 |
1247 "+r"(dst_v), // %3 | 1247 "+r"(dst_v), // %3 |
1248 "+r"(pix) // %4 | 1248 "+r"(width) // %4 |
1249 : | 1249 : |
1250 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", | 1250 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", |
1251 "v5", "v6", "v7" // Clobber List | 1251 "v5", "v6", "v7" // Clobber List |
1252 ); | 1252 ); |
1253 } | 1253 } |
1254 #endif // HAS_UYVYTOUVROW_NEON | 1254 #endif // HAS_UYVYTOUVROW_NEON |
1255 | 1255 |
1256 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 1256 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
1257 #ifdef HAS_ARGBSHUFFLEROW_NEON | 1257 #ifdef HAS_ARGBSHUFFLEROW_NEON |
// ARGBShuffleRow_NEON: reorders the bytes of a row of 4-byte pixels according
// to a 16-entry byte index table.
//   src_argb:  source row; 16 bytes (4 pixels) are consumed per pass.
//   dst_argb:  destination row; 16 bytes (4 pixels) are written per pass.
//   shuffler:  16 bytes are read from here once, before the loop; output byte
//              i of each 16-byte group is source byte shuffler[i] of the same
//              group (tbl yields 0 for an index outside 0..15).
//   pix/width: pixel count (the argument is named pix in OLD, width in NEW).
// shuffler is an input-only operand (%3): it is read but never advanced.
// The loop body runs before the count is tested and steps by 4, so the count
// is effectively rounded up to a multiple of 4 (minimum 4).
1258 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, | 1258 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, |
1259 const uint8* shuffler, int pix) { | 1259 const uint8* shuffler, int width) { |
1260 asm volatile ( | 1260 asm volatile ( |
1261 MEMACCESS(3) | 1261 MEMACCESS(3) |
1262 "ld1 {v2.16b}, [%3] \n" // shuffler | 1262 "ld1 {v2.16b}, [%3] \n" // shuffler |
1263 "1: \n" | 1263 "1: \n" |
1264 MEMACCESS(0) | 1264 MEMACCESS(0) |
1265 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. | 1265 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. |
1266 "subs %w2, %w2, #4 \n" // 4 processed per loop | 1266 "subs %w2, %w2, #4 \n" // 4 processed per loop |
1267 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels | 1267 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels |
1268 MEMACCESS(1) | 1268 MEMACCESS(1) |
1269 "st1 {v1.16b}, [%1], #16 \n" // store 4. | 1269 "st1 {v1.16b}, [%1], #16 \n" // store 4. |
1270 "b.gt 1b \n" | 1270 "b.gt 1b \n" |
1271 : "+r"(src_argb), // %0 | 1271 : "+r"(src_argb), // %0 |
1272 "+r"(dst_argb), // %1 | 1272 "+r"(dst_argb), // %1 |
1273 "+r"(pix) // %2 | 1273 "+r"(width) // %2 |
1274 : "r"(shuffler) // %3 | 1274 : "r"(shuffler) // %3 |
1275 : "cc", "memory", "v0", "v1", "v2" // Clobber List | 1275 : "cc", "memory", "v0", "v1", "v2" // Clobber List |
1276 ); | 1276 ); |
1277 } | 1277 } |
1278 #endif // HAS_ARGBSHUFFLEROW_NEON | 1278 #endif // HAS_ARGBSHUFFLEROW_NEON |
1279 | 1279 |
1280 #ifdef HAS_I422TOYUY2ROW_NEON | 1280 #ifdef HAS_I422TOYUY2ROW_NEON |
1281 void I422ToYUY2Row_NEON(const uint8* src_y, | 1281 void I422ToYUY2Row_NEON(const uint8* src_y, |
1282 const uint8* src_u, | 1282 const uint8* src_u, |
1283 const uint8* src_v, | 1283 const uint8* src_v, |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1329 "+r"(src_v), // %2 | 1329 "+r"(src_v), // %2 |
1330 "+r"(dst_uyvy), // %3 | 1330 "+r"(dst_uyvy), // %3 |
1331 "+r"(width) // %4 | 1331 "+r"(width) // %4 |
1332 : | 1332 : |
1333 : "cc", "memory", "v0", "v1", "v2", "v3" | 1333 : "cc", "memory", "v0", "v1", "v2", "v3" |
1334 ); | 1334 ); |
1335 } | 1335 } |
1336 #endif // HAS_I422TOUYVYROW_NEON | 1336 #endif // HAS_I422TOUYVYROW_NEON |
1337 | 1337 |
1338 #ifdef HAS_ARGBTORGB565ROW_NEON | 1338 #ifdef HAS_ARGBTORGB565ROW_NEON |
// ARGBToRGB565Row_NEON: packs a row of 4-byte ARGB pixels into 16-bit RGB565.
//   src_argb:   source row; 32 bytes (8 pixels) are consumed per pass.
//   dst_rgb565: destination row; 16 bytes (8 pixels) are written per pass.
//   pix/width:  pixel count (the argument is named pix in OLD, width in NEW).
// ld4 de-interleaves the four source bytes into v20..v23; the ARGBTORGB565
// fragment (defined outside this view) is expected to pack them into v0,
// which is then stored -- its exact register use is not visible here.
// The loop body runs before the count is tested and steps by 8, so the count
// is effectively rounded up to a multiple of 8 (minimum 8).
1339 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { | 1339 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { |
1340 asm volatile ( | 1340 asm volatile ( |
1341 "1: \n" | 1341 "1: \n" |
1342 MEMACCESS(0) | 1342 MEMACCESS(0) |
1343 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels | 1343 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels |
1344 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1344 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1345 ARGBTORGB565 | 1345 ARGBTORGB565 |
1346 MEMACCESS(1) | 1346 MEMACCESS(1) |
1347 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. | 1347 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. |
1348 "b.gt 1b \n" | 1348 "b.gt 1b \n" |
1349 : "+r"(src_argb), // %0 | 1349 : "+r"(src_argb), // %0 |
1350 "+r"(dst_rgb565), // %1 | 1350 "+r"(dst_rgb565), // %1 |
1351 "+r"(pix) // %2 | 1351 "+r"(width) // %2 |
1352 : | 1352 : |
1353 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" | 1353 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" |
1354 ); | 1354 ); |
1355 } | 1355 } |
1356 #endif // HAS_ARGBTORGB565ROW_NEON | 1356 #endif // HAS_ARGBTORGB565ROW_NEON |
1357 | 1357 |
1358 #ifdef HAS_ARGBTORGB565DITHERROW_NEON | 1358 #ifdef HAS_ARGBTORGB565DITHERROW_NEON |
1359 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, | 1359 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, |
1360 const uint32 dither4, int width) { | 1360 const uint32 dither4, int width) { |
1361 asm volatile ( | 1361 asm volatile ( |
(...skipping 13 matching lines...) Expand all Loading... |
1375 : "r"(src_argb), // %1 | 1375 : "r"(src_argb), // %1 |
1376 "r"(dither4), // %2 | 1376 "r"(dither4), // %2 |
1377 "r"(width) // %3 | 1377 "r"(width) // %3 |
1378 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" | 1378 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" |
1379 ); | 1379 ); |
1380 } | 1380 } |
1381 #endif // HAS_ARGBTORGB565DITHERROW_NEON | 1381 #endif // HAS_ARGBTORGB565DITHERROW_NEON |
1382 | 1382 |
1383 #ifdef HAS_ARGBTOARGB1555ROW_NEON | 1383 #ifdef HAS_ARGBTOARGB1555ROW_NEON |
// ARGBToARGB1555Row_NEON: packs a row of 4-byte ARGB pixels into 16-bit
// ARGB1555.
//   src_argb:     source row; 32 bytes (8 pixels) are consumed per pass.
//   dst_argb1555: destination row; 16 bytes (8 pixels) are written per pass.
//   pix/width:    pixel count (the argument is named pix in OLD, width in NEW).
// ld4 de-interleaves the four source bytes into v20..v23; the ARGBTOARGB1555
// fragment (defined outside this view) is expected to pack them into v0,
// which is then stored -- its exact register use is not visible here.
// The loop body runs before the count is tested and steps by 8, so the count
// is effectively rounded up to a multiple of 8 (minimum 8).
1384 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, | 1384 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, |
1385 int pix) { | 1385 int width) { |
1386 asm volatile ( | 1386 asm volatile ( |
1387 "1: \n" | 1387 "1: \n" |
1388 MEMACCESS(0) | 1388 MEMACCESS(0) |
1389 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels | 1389 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels |
1390 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1390 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1391 ARGBTOARGB1555 | 1391 ARGBTOARGB1555 |
1392 MEMACCESS(1) | 1392 MEMACCESS(1) |
1393 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. | 1393 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. |
1394 "b.gt 1b \n" | 1394 "b.gt 1b \n" |
1395 : "+r"(src_argb), // %0 | 1395 : "+r"(src_argb), // %0 |
1396 "+r"(dst_argb1555), // %1 | 1396 "+r"(dst_argb1555), // %1 |
1397 "+r"(pix) // %2 | 1397 "+r"(width) // %2 |
1398 : | 1398 : |
1399 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" | 1399 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" |
1400 ); | 1400 ); |
1401 } | 1401 } |
1402 #endif // HAS_ARGBTOARGB1555ROW_NEON | 1402 #endif // HAS_ARGBTOARGB1555ROW_NEON |
1403 | 1403 |
1404 #ifdef HAS_ARGBTOARGB4444ROW_NEON | 1404 #ifdef HAS_ARGBTOARGB4444ROW_NEON |
// ARGBToARGB4444Row_NEON: packs a row of 4-byte ARGB pixels into 16-bit
// ARGB4444.
//   src_argb:     source row; 32 bytes (8 pixels) are consumed per pass.
//   dst_argb4444: destination row; 16 bytes (8 pixels) are written per pass.
//   pix/width:    pixel count (the argument is named pix in OLD, width in NEW).
// v4 is filled with 0x0f in every byte once, before the loop, as a nibble
// mask for the ARGBTOARGB4444 fragment. That fragment is defined outside this
// view; it is expected to pack v20..v23 into v0 (v1 is listed as clobbered,
// presumably as its scratch register -- confirm against the macro).
// The loop body runs before the count is tested and steps by 8, so the count
// is effectively rounded up to a multiple of 8 (minimum 8).
1405 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, | 1405 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, |
1406 int pix) { | 1406 int width) { |
1407 asm volatile ( | 1407 asm volatile ( |
1408 "movi v4.16b, #0x0f \n" // bits to clear with vbic. | 1408 "movi v4.16b, #0x0f \n" // bits to clear with vbic. |
1409 "1: \n" | 1409 "1: \n" |
1410 MEMACCESS(0) | 1410 MEMACCESS(0) |
1411 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels | 1411 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels |
1412 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1412 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1413 ARGBTOARGB4444 | 1413 ARGBTOARGB4444 |
1414 MEMACCESS(1) | 1414 MEMACCESS(1) |
1415 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. | 1415 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. |
1416 "b.gt 1b \n" | 1416 "b.gt 1b \n" |
1417 : "+r"(src_argb), // %0 | 1417 : "+r"(src_argb), // %0 |
1418 "+r"(dst_argb4444), // %1 | 1418 "+r"(dst_argb4444), // %1 |
1419 "+r"(pix) // %2 | 1419 "+r"(width) // %2 |
1420 : | 1420 : |
1421 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" | 1421 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" |
1422 ); | 1422 ); |
1423 } | 1423 } |
1424 #endif // HAS_ARGBTOARGB4444ROW_NEON | 1424 #endif // HAS_ARGBTOARGB4444ROW_NEON |
1425 | 1425 |
1426 #ifdef HAS_ARGBTOYROW_NEON | 1426 #ifdef HAS_ARGBTOYROW_NEON |
// ARGBToYRow_NEON: computes one luma byte per 4-byte ARGB pixel.
//   src_argb:  source row; 32 bytes (8 pixels) are consumed per pass.
//   dst_y:     destination row; 8 bytes are written per pass.
//   pix/width: pixel count (the argument is named pix in OLD, width in NEW).
// Per pixel: Y = sat8(((13*B + 65*G + 33*R) + 64) >> 7) + 16, with the final
// add saturating at 255. The weights are 7-bit fixed point (value / 128), and
// the +64 is the rounding performed by sqrshrun. The 16-bit accumulator cannot
// overflow: 255 * (13 + 65 + 33) = 28305.
// v3 receives the loaded alpha byte but is immediately reused as the 16-bit
// accumulator, so alpha does not contribute to the result.
// The four constants are loaded once, before the loop.
// The loop body runs before the count is tested and steps by 8, so the count
// is effectively rounded up to a multiple of 8 (minimum 8).
1427 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { | 1427 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { |
1428 asm volatile ( | 1428 asm volatile ( |
1429 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 1429 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
1430 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 1430 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
1431 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 1431 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
1432 "movi v7.8b, #16 \n" // Add 16 constant | 1432 "movi v7.8b, #16 \n" // Add 16 constant |
1433 "1: \n" | 1433 "1: \n" |
1434 MEMACCESS(0) | 1434 MEMACCESS(0) |
1435 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 1435 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
1436 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1436 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1437 "umull v3.8h, v0.8b, v4.8b \n" // B | 1437 "umull v3.8h, v0.8b, v4.8b \n" // B |
1438 "umlal v3.8h, v1.8b, v5.8b \n" // G | 1438 "umlal v3.8h, v1.8b, v5.8b \n" // G |
1439 "umlal v3.8h, v2.8b, v6.8b \n" // R | 1439 "umlal v3.8h, v2.8b, v6.8b \n" // R |
1440 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 1440 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
1441 "uqadd v0.8b, v0.8b, v7.8b \n" | 1441 "uqadd v0.8b, v0.8b, v7.8b \n" |
1442 MEMACCESS(1) | 1442 MEMACCESS(1) |
1443 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 1443 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
1444 "b.gt 1b \n" | 1444 "b.gt 1b \n" |
1445 : "+r"(src_argb), // %0 | 1445 : "+r"(src_argb), // %0 |
1446 "+r"(dst_y), // %1 | 1446 "+r"(dst_y), // %1 |
1447 "+r"(pix) // %2 | 1447 "+r"(width) // %2 |
1448 : | 1448 : |
1449 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 1449 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
1450 ); | 1450 ); |
1451 } | 1451 } |
1452 #endif // HAS_ARGBTOYROW_NEON | 1452 #endif // HAS_ARGBTOYROW_NEON |
1453 | 1453 |
1454 #ifdef HAS_ARGBTOYJROW_NEON | 1454 #ifdef HAS_ARGBTOYJROW_NEON |
// Computes one row of 8-bit luma from 32-bit ARGB pixels with AArch64 NEON,
// using the "J" weights and no +16 offset (compare ARGBToYRow_NEON).
//   src_argb     source row; 32 bytes (8 pixels) are read per loop iteration.
//   dst_y        destination row; 8 bytes (8 Y samples) are written per iteration.
//   pix / width  number of pixels to convert (the last parameter).
// Per pixel: Y = round((15*B + 75*G + 38*R) / 128). The alpha byte is loaded
// but not used.
// The loop body executes before the remaining count is tested, so at least one
// group of 8 pixels is always processed and a count that is not a multiple of 8
// is effectively rounded up.
// NOTE(review): callers presumably pass a positive multiple of 8 (or padded
// buffers) - confirm against the call sites.
1455 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { | 1455 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { |
1456 asm volatile ( | 1456 asm volatile ( |
// Loop-invariant weights, scaled by 128 (matching the shift by 7 below);
// they sum to exactly 128.
1457 "movi v4.8b, #15 \n" // B * 0.11400 coefficient | 1457 "movi v4.8b, #15 \n" // B * 0.11400 coefficient |
1458 "movi v5.8b, #75 \n" // G * 0.58700 coefficient | 1458 "movi v5.8b, #75 \n" // G * 0.58700 coefficient |
1459 "movi v6.8b, #38 \n" // R * 0.29900 coefficient | 1459 "movi v6.8b, #38 \n" // R * 0.29900 coefficient |
1460 "1: \n" | 1460 "1: \n" |
1461 MEMACCESS(0) | 1461 MEMACCESS(0) |
// ld4 de-interleaves the pixels: v0 = B bytes, v1 = G bytes, v2 = R bytes,
// v3 = A bytes; %0 is post-incremented by 32.
1462 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 1462 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
// subs sets the condition flags that the b.gt at the bottom of the loop tests.
1463 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1463 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
// v3 (the alpha bytes just loaded) is overwritten and reused as the 16-bit
// accumulator. The weighted sum is at most 255 * 128 = 32640, so it fits in
// 15 bits and the signed-input narrowing below never sees a negative value.
1464 "umull v3.8h, v0.8b, v4.8b \n" // B | 1464 "umull v3.8h, v0.8b, v4.8b \n" // B |
1465 "umlal v3.8h, v1.8b, v5.8b \n" // G | 1465 "umlal v3.8h, v1.8b, v5.8b \n" // G |
1466 "umlal v3.8h, v2.8b, v6.8b \n" // R | 1466 "umlal v3.8h, v2.8b, v6.8b \n" // R |
// Rounding shift right by 7 (divide by 128) with narrowing to unsigned 8-bit.
1467 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y | 1467 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y |
1468 MEMACCESS(1) | 1468 MEMACCESS(1) |
1469 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 1469 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
// Repeat while the remaining pixel count is still greater than zero.
1470 "b.gt 1b \n" | 1470 "b.gt 1b \n" |
// All three operands are read-write ("+r"): both pointers are advanced and the
// pixel count is consumed by the asm.
1471 : "+r"(src_argb), // %0 | 1471 : "+r"(src_argb), // %0 |
1472 "+r"(dst_y), // %1 | 1472 "+r"(dst_y), // %1 |
1473 "+r"(pix) // %2 | 1473 "+r"(width) // %2 |
1474 : | 1474 : |
1475 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" | 1475 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" |
1476 ); | 1476 ); |
1477 } | 1477 } |
1478 #endif // HAS_ARGBTOYJROW_NEON | 1478 #endif // HAS_ARGBTOYJROW_NEON |
1479 | 1479 |
1480 // 8x1 pixels. | 1480 // 8x1 pixels. |
1481 #ifdef HAS_ARGBTOUV444ROW_NEON | 1481 #ifdef HAS_ARGBTOUV444ROW_NEON |
1482 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | 1482 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |
1483 int pix) { | 1483 int width) { |
1484 asm volatile ( | 1484 asm volatile ( |
1485 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient | 1485 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient |
1486 "movi v25.8b, #74 \n" // UG -0.5781 coefficient | 1486 "movi v25.8b, #74 \n" // UG -0.5781 coefficient |
1487 "movi v26.8b, #38 \n" // UR -0.2969 coefficient | 1487 "movi v26.8b, #38 \n" // UR -0.2969 coefficient |
1488 "movi v27.8b, #18 \n" // VB -0.1406 coefficient | 1488 "movi v27.8b, #18 \n" // VB -0.1406 coefficient |
1489 "movi v28.8b, #94 \n" // VG -0.7344 coefficient | 1489 "movi v28.8b, #94 \n" // VG -0.7344 coefficient |
1490 "movi v29.16b,#0x80 \n" // 128.5 | 1490 "movi v29.16b,#0x80 \n" // 128.5 |
1491 "1: \n" | 1491 "1: \n" |
1492 MEMACCESS(0) | 1492 MEMACCESS(0) |
1493 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 1493 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
(...skipping 12 matching lines...) Expand all Loading... |
1506 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V | 1506 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V |
1507 | 1507 |
1508 MEMACCESS(1) | 1508 MEMACCESS(1) |
1509 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. | 1509 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. |
1510 MEMACCESS(2) | 1510 MEMACCESS(2) |
1511 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. | 1511 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. |
1512 "b.gt 1b \n" | 1512 "b.gt 1b \n" |
1513 : "+r"(src_argb), // %0 | 1513 : "+r"(src_argb), // %0 |
1514 "+r"(dst_u), // %1 | 1514 "+r"(dst_u), // %1 |
1515 "+r"(dst_v), // %2 | 1515 "+r"(dst_v), // %2 |
1516 "+r"(pix) // %3 | 1516 "+r"(width) // %3 |
1517 : | 1517 : |
1518 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", | 1518 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", |
1519 "v24", "v25", "v26", "v27", "v28", "v29" | 1519 "v24", "v25", "v26", "v27", "v28", "v29" |
1520 ); | 1520 ); |
1521 } | 1521 } |
1522 #endif // HAS_ARGBTOUV444ROW_NEON | 1522 #endif // HAS_ARGBTOUV444ROW_NEON |
1523 | 1523 |
1524 // 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. | 1524 // 16x1 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
1525 #ifdef HAS_ARGBTOUV422ROW_NEON | 1525 #ifdef HAS_ARGBTOUV422ROW_NEON |
1526 void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | 1526 void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |
1527 int pix) { | 1527 int width) { |
1528 asm volatile ( | 1528 asm volatile ( |
1529 RGBTOUV_SETUP_REG | 1529 RGBTOUV_SETUP_REG |
1530 "1: \n" | 1530 "1: \n" |
1531 MEMACCESS(0) | 1531 MEMACCESS(0) |
1532 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1532 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
1533 | 1533 |
1534 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. | 1534 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. |
1535 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1535 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
1536 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. | 1536 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. |
1537 | 1537 |
(...skipping 12 matching lines...) Expand all Loading... |
1550 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V | 1550 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V |
1551 | 1551 |
1552 MEMACCESS(1) | 1552 MEMACCESS(1) |
1553 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. | 1553 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. |
1554 MEMACCESS(2) | 1554 MEMACCESS(2) |
1555 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. | 1555 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. |
1556 "b.gt 1b \n" | 1556 "b.gt 1b \n" |
1557 : "+r"(src_argb), // %0 | 1557 : "+r"(src_argb), // %0 |
1558 "+r"(dst_u), // %1 | 1558 "+r"(dst_u), // %1 |
1559 "+r"(dst_v), // %2 | 1559 "+r"(dst_v), // %2 |
1560 "+r"(pix) // %3 | 1560 "+r"(width) // %3 |
1561 : | 1561 : |
1562 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1562 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1563 "v20", "v21", "v22", "v23", "v24", "v25" | 1563 "v20", "v21", "v22", "v23", "v24", "v25" |
1564 ); | 1564 ); |
1565 } | 1565 } |
1566 #endif // HAS_ARGBTOUV422ROW_NEON | 1566 #endif // HAS_ARGBTOUV422ROW_NEON |
1567 | 1567 |
1568 // 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. | 1568 // 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. |
1569 #ifdef HAS_ARGBTOUV411ROW_NEON | 1569 #ifdef HAS_ARGBTOUV411ROW_NEON |
1570 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | 1570 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |
1571 int pix) { | 1571 int width) { |
1572 asm volatile ( | 1572 asm volatile ( |
1573 RGBTOUV_SETUP_REG | 1573 RGBTOUV_SETUP_REG |
1574 "1: \n" | 1574 "1: \n" |
1575 MEMACCESS(0) | 1575 MEMACCESS(0) |
1576 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1576 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
1577 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. | 1577 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. |
1578 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1578 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
1579 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. | 1579 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. |
1580 MEMACCESS(0) | 1580 MEMACCESS(0) |
1581 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16. | 1581 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16. |
(...skipping 21 matching lines...) Expand all Loading... |
1603 "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U | 1603 "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U |
1604 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V | 1604 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V |
1605 MEMACCESS(1) | 1605 MEMACCESS(1) |
1606 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. | 1606 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. |
1607 MEMACCESS(2) | 1607 MEMACCESS(2) |
1608 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. | 1608 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. |
1609 "b.gt 1b \n" | 1609 "b.gt 1b \n" |
1610 : "+r"(src_argb), // %0 | 1610 : "+r"(src_argb), // %0 |
1611 "+r"(dst_u), // %1 | 1611 "+r"(dst_u), // %1 |
1612 "+r"(dst_v), // %2 | 1612 "+r"(dst_v), // %2 |
1613 "+r"(pix) // %3 | 1613 "+r"(width) // %3 |
1614 : | 1614 : |
1615 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1615 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1616 "v20", "v21", "v22", "v23", "v24", "v25" | 1616 "v20", "v21", "v22", "v23", "v24", "v25" |
1617 ); | 1617 ); |
1618 } | 1618 } |
1619 #endif // HAS_ARGBTOUV411ROW_NEON | 1619 #endif // HAS_ARGBTOUV411ROW_NEON |
1620 | 1620 |
1621 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. | 1621 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
// RGBTOUV(QB, QG, QR): asm-text fragment that converts three vectors of 16-bit
// B, G and R values into 8 U bytes (left in v0) and 8 V bytes (left in v1).
// The arguments are stringized with #, so each must be a register operand
// written as asm text; callers in this file pass e.g. v0.8h, v1.8h, v2.8h.
// Per 16-bit lane:
//   v3 = QB*v20 - QG*v21 - QR*v22 + v25;  v0 = unsigned saturating narrow of (v3 >> 8)
//   v4 = QR*v20 - QG*v24 - QB*v23 + v25;  v1 = unsigned saturating narrow of (v4 >> 8)
// v20-v25 must already hold the coefficients and bias when the fragment runs.
// ARGBToUVJRow_NEON loads them with explicit movi instructions (v20 = UB/VR,
// v21 = UG, v22 = UR, v23 = VB, v24 = VG, v25 = 0x8080 bias); the other callers
// use RGBTOUV_SETUP_REG, which is defined outside this chunk.
// NOTE(review): RGBTOUV_SETUP_REG presumably fills the same registers with the
// same layout - confirm against its definition.
// v3 and v4 are overwritten as scratch, so they must not be passed as arguments.
// v0 and v1 are written only after the last read of QB/QG/QR, so the arguments
// may be v0/v1 themselves.
1622 #define RGBTOUV(QB, QG, QR) \ | 1622 #define RGBTOUV(QB, QG, QR) \ |
1623 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ | 1623 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ |
1624 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ | 1624 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ |
1625 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ | 1625 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ |
1626 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ | 1626 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ |
1627 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ | 1627 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ |
1628 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ | 1628 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ |
1629 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ | 1629 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ |
1630 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ | 1630 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ |
1631 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ | 1631 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ |
1632 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ | 1632 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ |
1633 | 1633 |
1634 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. | 1634 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. |
1635 // TODO(fbarchard): consider ptrdiff_t for all strides. | 1635 // TODO(fbarchard): consider ptrdiff_t for all strides. |
1636 | 1636 |
1637 #ifdef HAS_ARGBTOUVROW_NEON | 1637 #ifdef HAS_ARGBTOUVROW_NEON |
1638 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, | 1638 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, |
1639 uint8* dst_u, uint8* dst_v, int pix) { | 1639 uint8* dst_u, uint8* dst_v, int width) { |
1640 const uint8* src_argb_1 = src_argb + src_stride_argb; | 1640 const uint8* src_argb_1 = src_argb + src_stride_argb; |
1641 asm volatile ( | 1641 asm volatile ( |
1642 RGBTOUV_SETUP_REG | 1642 RGBTOUV_SETUP_REG |
1643 "1: \n" | 1643 "1: \n" |
1644 MEMACCESS(0) | 1644 MEMACCESS(0) |
1645 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1645 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
1646 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. | 1646 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. |
1647 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1647 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
1648 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. | 1648 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. |
1649 | 1649 |
(...skipping 11 matching lines...) Expand all Loading... |
1661 RGBTOUV(v0.8h, v1.8h, v2.8h) | 1661 RGBTOUV(v0.8h, v1.8h, v2.8h) |
1662 MEMACCESS(2) | 1662 MEMACCESS(2) |
1663 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1663 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
1664 MEMACCESS(3) | 1664 MEMACCESS(3) |
1665 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1665 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
1666 "b.gt 1b \n" | 1666 "b.gt 1b \n" |
1667 : "+r"(src_argb), // %0 | 1667 : "+r"(src_argb), // %0 |
1668 "+r"(src_argb_1), // %1 | 1668 "+r"(src_argb_1), // %1 |
1669 "+r"(dst_u), // %2 | 1669 "+r"(dst_u), // %2 |
1670 "+r"(dst_v), // %3 | 1670 "+r"(dst_v), // %3 |
1671 "+r"(pix) // %4 | 1671 "+r"(width) // %4 |
1672 : | 1672 : |
1673 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1673 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1674 "v20", "v21", "v22", "v23", "v24", "v25" | 1674 "v20", "v21", "v22", "v23", "v24", "v25" |
1675 ); | 1675 ); |
1676 } | 1676 } |
1677 #endif // HAS_ARGBTOUVROW_NEON | 1677 #endif // HAS_ARGBTOUVROW_NEON |
1678 | 1678 |
1679 // TODO(fbarchard): Subsample match C code. | 1679 // TODO(fbarchard): Subsample match C code. |
1680 #ifdef HAS_ARGBTOUVJROW_NEON | 1680 #ifdef HAS_ARGBTOUVJROW_NEON |
1681 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, | 1681 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, |
1682 uint8* dst_u, uint8* dst_v, int pix) { | 1682 uint8* dst_u, uint8* dst_v, int width) { |
1683 const uint8* src_argb_1 = src_argb + src_stride_argb; | 1683 const uint8* src_argb_1 = src_argb + src_stride_argb; |
1684 asm volatile ( | 1684 asm volatile ( |
1685 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 | 1685 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 |
1686 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 | 1686 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 |
1687 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 | 1687 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 |
1688 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 | 1688 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 |
1689 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 | 1689 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 |
1690 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) | 1690 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) |
1691 "1: \n" | 1691 "1: \n" |
1692 MEMACCESS(0) | 1692 MEMACCESS(0) |
(...skipping 15 matching lines...) Expand all Loading... |
1708 RGBTOUV(v0.8h, v1.8h, v2.8h) | 1708 RGBTOUV(v0.8h, v1.8h, v2.8h) |
1709 MEMACCESS(2) | 1709 MEMACCESS(2) |
1710 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1710 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
1711 MEMACCESS(3) | 1711 MEMACCESS(3) |
1712 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1712 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
1713 "b.gt 1b \n" | 1713 "b.gt 1b \n" |
1714 : "+r"(src_argb), // %0 | 1714 : "+r"(src_argb), // %0 |
1715 "+r"(src_argb_1), // %1 | 1715 "+r"(src_argb_1), // %1 |
1716 "+r"(dst_u), // %2 | 1716 "+r"(dst_u), // %2 |
1717 "+r"(dst_v), // %3 | 1717 "+r"(dst_v), // %3 |
1718 "+r"(pix) // %4 | 1718 "+r"(width) // %4 |
1719 : | 1719 : |
1720 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1720 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1721 "v20", "v21", "v22", "v23", "v24", "v25" | 1721 "v20", "v21", "v22", "v23", "v24", "v25" |
1722 ); | 1722 ); |
1723 } | 1723 } |
1724 #endif // HAS_ARGBTOUVJROW_NEON | 1724 #endif // HAS_ARGBTOUVJROW_NEON |
1725 | 1725 |
1726 #ifdef HAS_BGRATOUVROW_NEON | 1726 #ifdef HAS_BGRATOUVROW_NEON |
1727 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, | 1727 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, |
1728 uint8* dst_u, uint8* dst_v, int pix) { | 1728 uint8* dst_u, uint8* dst_v, int width) { |
1729 const uint8* src_bgra_1 = src_bgra + src_stride_bgra; | 1729 const uint8* src_bgra_1 = src_bgra + src_stride_bgra; |
1730 asm volatile ( | 1730 asm volatile ( |
1731 RGBTOUV_SETUP_REG | 1731 RGBTOUV_SETUP_REG |
1732 "1: \n" | 1732 "1: \n" |
1733 MEMACCESS(0) | 1733 MEMACCESS(0) |
1734 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1734 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
1735 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. | 1735 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. |
1736 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. | 1736 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. |
1737 "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. | 1737 "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. |
1738 MEMACCESS(1) | 1738 MEMACCESS(1) |
(...skipping 10 matching lines...) Expand all Loading... |
1749 RGBTOUV(v0.8h, v1.8h, v2.8h) | 1749 RGBTOUV(v0.8h, v1.8h, v2.8h) |
1750 MEMACCESS(2) | 1750 MEMACCESS(2) |
1751 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1751 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
1752 MEMACCESS(3) | 1752 MEMACCESS(3) |
1753 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1753 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
1754 "b.gt 1b \n" | 1754 "b.gt 1b \n" |
1755 : "+r"(src_bgra), // %0 | 1755 : "+r"(src_bgra), // %0 |
1756 "+r"(src_bgra_1), // %1 | 1756 "+r"(src_bgra_1), // %1 |
1757 "+r"(dst_u), // %2 | 1757 "+r"(dst_u), // %2 |
1758 "+r"(dst_v), // %3 | 1758 "+r"(dst_v), // %3 |
1759 "+r"(pix) // %4 | 1759 "+r"(width) // %4 |
1760 : | 1760 : |
1761 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1761 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1762 "v20", "v21", "v22", "v23", "v24", "v25" | 1762 "v20", "v21", "v22", "v23", "v24", "v25" |
1763 ); | 1763 ); |
1764 } | 1764 } |
1765 #endif // HAS_BGRATOUVROW_NEON | 1765 #endif // HAS_BGRATOUVROW_NEON |
1766 | 1766 |
1767 #ifdef HAS_ABGRTOUVROW_NEON | 1767 #ifdef HAS_ABGRTOUVROW_NEON |
1768 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, | 1768 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, |
1769 uint8* dst_u, uint8* dst_v, int pix) { | 1769 uint8* dst_u, uint8* dst_v, int width) { |
1770 const uint8* src_abgr_1 = src_abgr + src_stride_abgr; | 1770 const uint8* src_abgr_1 = src_abgr + src_stride_abgr; |
1771 asm volatile ( | 1771 asm volatile ( |
1772 RGBTOUV_SETUP_REG | 1772 RGBTOUV_SETUP_REG |
1773 "1: \n" | 1773 "1: \n" |
1774 MEMACCESS(0) | 1774 MEMACCESS(0) |
1775 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1775 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
1776 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. | 1776 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. |
1777 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1777 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
1778 "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. | 1778 "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. |
1779 MEMACCESS(1) | 1779 MEMACCESS(1) |
(...skipping 10 matching lines...) Expand all Loading... |
1790 RGBTOUV(v0.8h, v2.8h, v1.8h) | 1790 RGBTOUV(v0.8h, v2.8h, v1.8h) |
1791 MEMACCESS(2) | 1791 MEMACCESS(2) |
1792 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1792 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
1793 MEMACCESS(3) | 1793 MEMACCESS(3) |
1794 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1794 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
1795 "b.gt 1b \n" | 1795 "b.gt 1b \n" |
1796 : "+r"(src_abgr), // %0 | 1796 : "+r"(src_abgr), // %0 |
1797 "+r"(src_abgr_1), // %1 | 1797 "+r"(src_abgr_1), // %1 |
1798 "+r"(dst_u), // %2 | 1798 "+r"(dst_u), // %2 |
1799 "+r"(dst_v), // %3 | 1799 "+r"(dst_v), // %3 |
1800 "+r"(pix) // %4 | 1800 "+r"(width) // %4 |
1801 : | 1801 : |
1802 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1802 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1803 "v20", "v21", "v22", "v23", "v24", "v25" | 1803 "v20", "v21", "v22", "v23", "v24", "v25" |
1804 ); | 1804 ); |
1805 } | 1805 } |
1806 #endif // HAS_ABGRTOUVROW_NEON | 1806 #endif // HAS_ABGRTOUVROW_NEON |
1807 | 1807 |
1808 #ifdef HAS_RGBATOUVROW_NEON | 1808 #ifdef HAS_RGBATOUVROW_NEON |
1809 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, | 1809 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, |
1810 uint8* dst_u, uint8* dst_v, int pix) { | 1810 uint8* dst_u, uint8* dst_v, int width) { |
1811 const uint8* src_rgba_1 = src_rgba + src_stride_rgba; | 1811 const uint8* src_rgba_1 = src_rgba + src_stride_rgba; |
1812 asm volatile ( | 1812 asm volatile ( |
1813 RGBTOUV_SETUP_REG | 1813 RGBTOUV_SETUP_REG |
1814 "1: \n" | 1814 "1: \n" |
1815 MEMACCESS(0) | 1815 MEMACCESS(0) |
1816 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1816 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
1817 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. | 1817 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. |
1818 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. | 1818 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. |
1819 "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. | 1819 "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. |
1820 MEMACCESS(1) | 1820 MEMACCESS(1) |
(...skipping 10 matching lines...) Expand all Loading... |
1831 RGBTOUV(v0.8h, v1.8h, v2.8h) | 1831 RGBTOUV(v0.8h, v1.8h, v2.8h) |
1832 MEMACCESS(2) | 1832 MEMACCESS(2) |
1833 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1833 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
1834 MEMACCESS(3) | 1834 MEMACCESS(3) |
1835 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1835 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
1836 "b.gt 1b \n" | 1836 "b.gt 1b \n" |
1837 : "+r"(src_rgba), // %0 | 1837 : "+r"(src_rgba), // %0 |
1838 "+r"(src_rgba_1), // %1 | 1838 "+r"(src_rgba_1), // %1 |
1839 "+r"(dst_u), // %2 | 1839 "+r"(dst_u), // %2 |
1840 "+r"(dst_v), // %3 | 1840 "+r"(dst_v), // %3 |
1841 "+r"(pix) // %4 | 1841 "+r"(width) // %4 |
1842 : | 1842 : |
1843 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1843 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1844 "v20", "v21", "v22", "v23", "v24", "v25" | 1844 "v20", "v21", "v22", "v23", "v24", "v25" |
1845 ); | 1845 ); |
1846 } | 1846 } |
1847 #endif // HAS_RGBATOUVROW_NEON | 1847 #endif // HAS_RGBATOUVROW_NEON |
1848 | 1848 |
1849 #ifdef HAS_RGB24TOUVROW_NEON | 1849 #ifdef HAS_RGB24TOUVROW_NEON |
1850 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, | 1850 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, |
1851 uint8* dst_u, uint8* dst_v, int pix) { | 1851 uint8* dst_u, uint8* dst_v, int width) { |
1852 const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; | 1852 const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; |
1853 asm volatile ( | 1853 asm volatile ( |
1854 RGBTOUV_SETUP_REG | 1854 RGBTOUV_SETUP_REG |
1855 "1: \n" | 1855 "1: \n" |
1856 MEMACCESS(0) | 1856 MEMACCESS(0) |
1857 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. | 1857 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. |
1858 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. | 1858 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. |
1859 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1859 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
1860 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. | 1860 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. |
1861 MEMACCESS(1) | 1861 MEMACCESS(1) |
(...skipping 10 matching lines...) Expand all Loading... |
1872 RGBTOUV(v0.8h, v1.8h, v2.8h) | 1872 RGBTOUV(v0.8h, v1.8h, v2.8h) |
1873 MEMACCESS(2) | 1873 MEMACCESS(2) |
1874 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1874 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
1875 MEMACCESS(3) | 1875 MEMACCESS(3) |
1876 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1876 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
1877 "b.gt 1b \n" | 1877 "b.gt 1b \n" |
1878 : "+r"(src_rgb24), // %0 | 1878 : "+r"(src_rgb24), // %0 |
1879 "+r"(src_rgb24_1), // %1 | 1879 "+r"(src_rgb24_1), // %1 |
1880 "+r"(dst_u), // %2 | 1880 "+r"(dst_u), // %2 |
1881 "+r"(dst_v), // %3 | 1881 "+r"(dst_v), // %3 |
1882 "+r"(pix) // %4 | 1882 "+r"(width) // %4 |
1883 : | 1883 : |
1884 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1884 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1885 "v20", "v21", "v22", "v23", "v24", "v25" | 1885 "v20", "v21", "v22", "v23", "v24", "v25" |
1886 ); | 1886 ); |
1887 } | 1887 } |
1888 #endif // HAS_RGB24TOUVROW_NEON | 1888 #endif // HAS_RGB24TOUVROW_NEON |
1889 | 1889 |
1890 #ifdef HAS_RAWTOUVROW_NEON | 1890 #ifdef HAS_RAWTOUVROW_NEON |
1891 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, | 1891 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, |
1892 uint8* dst_u, uint8* dst_v, int pix) { | 1892 uint8* dst_u, uint8* dst_v, int width) { |
1893 const uint8* src_raw_1 = src_raw + src_stride_raw; | 1893 const uint8* src_raw_1 = src_raw + src_stride_raw; |
1894 asm volatile ( | 1894 asm volatile ( |
1895 RGBTOUV_SETUP_REG | 1895 RGBTOUV_SETUP_REG |
1896 "1: \n" | 1896 "1: \n" |
1897 MEMACCESS(0) | 1897 MEMACCESS(0) |
1898 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. | 1898 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. |
1899 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. | 1899 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. |
1900 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1900 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
1901 "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. | 1901 "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. |
1902 MEMACCESS(1) | 1902 MEMACCESS(1) |
(...skipping 10 matching lines...) Expand all Loading... |
1913 RGBTOUV(v2.8h, v1.8h, v0.8h) | 1913 RGBTOUV(v2.8h, v1.8h, v0.8h) |
1914 MEMACCESS(2) | 1914 MEMACCESS(2) |
1915 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1915 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
1916 MEMACCESS(3) | 1916 MEMACCESS(3) |
1917 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1917 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
1918 "b.gt 1b \n" | 1918 "b.gt 1b \n" |
1919 : "+r"(src_raw), // %0 | 1919 : "+r"(src_raw), // %0 |
1920 "+r"(src_raw_1), // %1 | 1920 "+r"(src_raw_1), // %1 |
1921 "+r"(dst_u), // %2 | 1921 "+r"(dst_u), // %2 |
1922 "+r"(dst_v), // %3 | 1922 "+r"(dst_v), // %3 |
1923 "+r"(pix) // %4 | 1923 "+r"(width) // %4 |
1924 : | 1924 : |
1925 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1925 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1926 "v20", "v21", "v22", "v23", "v24", "v25" | 1926 "v20", "v21", "v22", "v23", "v24", "v25" |
1927 ); | 1927 ); |
1928 } | 1928 } |
1929 #endif // HAS_RAWTOUVROW_NEON | 1929 #endif // HAS_RAWTOUVROW_NEON |
1930 | 1930 |
1931 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. | 1931 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
1932 #ifdef HAS_RGB565TOUVROW_NEON | 1932 #ifdef HAS_RGB565TOUVROW_NEON |
1933 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, | 1933 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, |
1934 uint8* dst_u, uint8* dst_v, int pix) { | 1934 uint8* dst_u, uint8* dst_v, int width) { |
1935 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; | 1935 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; |
1936 asm volatile ( | 1936 asm volatile ( |
1937 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 | 1937 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 |
1938 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 | 1938 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 |
1939 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 | 1939 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 |
1940 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 | 1940 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 |
1941 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 | 1941 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 |
1942 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) | 1942 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) |
1943 "1: \n" | 1943 "1: \n" |
1944 MEMACCESS(0) | 1944 MEMACCESS(0) |
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1988 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V | 1988 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V |
1989 MEMACCESS(2) | 1989 MEMACCESS(2) |
1990 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1990 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
1991 MEMACCESS(3) | 1991 MEMACCESS(3) |
1992 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1992 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
1993 "b.gt 1b \n" | 1993 "b.gt 1b \n" |
1994 : "+r"(src_rgb565), // %0 | 1994 : "+r"(src_rgb565), // %0 |
1995 "+r"(src_rgb565_1), // %1 | 1995 "+r"(src_rgb565_1), // %1 |
1996 "+r"(dst_u), // %2 | 1996 "+r"(dst_u), // %2 |
1997 "+r"(dst_v), // %3 | 1997 "+r"(dst_v), // %3 |
1998 "+r"(pix) // %4 | 1998 "+r"(width) // %4 |
1999 : | 1999 : |
2000 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 2000 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
2001 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", | 2001 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", |
2002 "v25", "v26", "v27" | 2002 "v25", "v26", "v27" |
2003 ); | 2003 ); |
2004 } | 2004 } |
2005 #endif // HAS_RGB565TOUVROW_NEON | 2005 #endif // HAS_RGB565TOUVROW_NEON |
2006 | 2006 |
2007 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. | 2007 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
2008 #ifdef HAS_ARGB1555TOUVROW_NEON | 2008 #ifdef HAS_ARGB1555TOUVROW_NEON |
2009 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, | 2009 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, |
2010 uint8* dst_u, uint8* dst_v, int pix) { | 2010 uint8* dst_u, uint8* dst_v, int width) { |
2011 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; | 2011 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; |
2012 asm volatile ( | 2012 asm volatile ( |
2013 RGBTOUV_SETUP_REG | 2013 RGBTOUV_SETUP_REG |
2014 "1: \n" | 2014 "1: \n" |
2015 MEMACCESS(0) | 2015 MEMACCESS(0) |
2016 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. | 2016 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. |
2017 RGB555TOARGB | 2017 RGB555TOARGB |
2018 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. | 2018 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. |
2019 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. | 2019 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. |
2020 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. | 2020 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2059 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V | 2059 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V |
2060 MEMACCESS(2) | 2060 MEMACCESS(2) |
2061 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 2061 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
2062 MEMACCESS(3) | 2062 MEMACCESS(3) |
2063 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 2063 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
2064 "b.gt 1b \n" | 2064 "b.gt 1b \n" |
2065 : "+r"(src_argb1555), // %0 | 2065 : "+r"(src_argb1555), // %0 |
2066 "+r"(src_argb1555_1), // %1 | 2066 "+r"(src_argb1555_1), // %1 |
2067 "+r"(dst_u), // %2 | 2067 "+r"(dst_u), // %2 |
2068 "+r"(dst_v), // %3 | 2068 "+r"(dst_v), // %3 |
2069 "+r"(pix) // %4 | 2069 "+r"(width) // %4 |
2070 : | 2070 : |
2071 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", | 2071 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", |
2072 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", | 2072 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", |
2073 "v26", "v27", "v28" | 2073 "v26", "v27", "v28" |
2074 ); | 2074 ); |
2075 } | 2075 } |
2076 #endif // HAS_ARGB1555TOUVROW_NEON | 2076 #endif // HAS_ARGB1555TOUVROW_NEON |
2077 | 2077 |
2078 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. | 2078 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
2079 #ifdef HAS_ARGB4444TOUVROW_NEON | 2079 #ifdef HAS_ARGB4444TOUVROW_NEON |
2080 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, | 2080 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, |
2081 uint8* dst_u, uint8* dst_v, int pix) { | 2081 uint8* dst_u, uint8* dst_v, int width) { |
2082 const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; | 2082 const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; |
2083 asm volatile ( | 2083 asm volatile ( |
2084 RGBTOUV_SETUP_REG | 2084 RGBTOUV_SETUP_REG |
2085 "1: \n" | 2085 "1: \n" |
2086 MEMACCESS(0) | 2086 MEMACCESS(0) |
2087 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. | 2087 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. |
2088 ARGB4444TOARGB | 2088 ARGB4444TOARGB |
2089 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. | 2089 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. |
2090 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. | 2090 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. |
2091 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. | 2091 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2130 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V | 2130 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V |
2131 MEMACCESS(2) | 2131 MEMACCESS(2) |
2132 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 2132 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
2133 MEMACCESS(3) | 2133 MEMACCESS(3) |
2134 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 2134 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
2135 "b.gt 1b \n" | 2135 "b.gt 1b \n" |
2136 : "+r"(src_argb4444), // %0 | 2136 : "+r"(src_argb4444), // %0 |
2137 "+r"(src_argb4444_1), // %1 | 2137 "+r"(src_argb4444_1), // %1 |
2138 "+r"(dst_u), // %2 | 2138 "+r"(dst_u), // %2 |
2139 "+r"(dst_v), // %3 | 2139 "+r"(dst_v), // %3 |
2140 "+r"(pix) // %4 | 2140 "+r"(width) // %4 |
2141 : | 2141 : |
2142 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", | 2142 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", |
2143 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", | 2143 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", |
2144 "v26", "v27", "v28" | 2144 "v26", "v27", "v28" |
2145 | 2145 |
2146 ); | 2146 ); |
2147 } | 2147 } |
2148 #endif // HAS_ARGB4444TOUVROW_NEON | 2148 #endif // HAS_ARGB4444TOUVROW_NEON |
2149 | 2149 |
2150 #ifdef HAS_RGB565TOYROW_NEON | 2150 #ifdef HAS_RGB565TOYROW_NEON |
// RGB565ToYRow_NEON: converts one row of RGB565 pixels to 8-bit luma (Y).
// Each loop pass loads 16 bytes (8 pixels) from src_rgb565, expands them with
// the RGB565TOARGB macro (defined outside this view; per the comments on the
// multiply rows it leaves B/G/R in v0/v1/v2) and stores 8 Y bytes to dst_y.
// Y = narrow8((13*B + 65*G + 33*R) >> 7, rounded) + 16, using saturating ops.
// The pixel count (pix in the OLD column, width in the NEW column) drops by 8
// per pass and is only tested by the trailing b.gt, so the body always runs at
// least once and there is no tail handling for counts that are not 8-aligned.
// NOTE(review): callers presumably pass a positive multiple of 8 -- confirm.
// NOTE(review): v4 and v6 are clobbered but not referenced in the visible asm;
// presumably they are scratch registers of RGB565TOARGB -- confirm.
2151 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { | 2151 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { |
2152 asm volatile ( | 2152 asm volatile ( |
2153 "movi v24.8b, #13 \n" // B * 0.1016 coefficient | 2153 "movi v24.8b, #13 \n" // B * 0.1016 coefficient |
2154 "movi v25.8b, #65 \n" // G * 0.5078 coefficient | 2154 "movi v25.8b, #65 \n" // G * 0.5078 coefficient |
2155 "movi v26.8b, #33 \n" // R * 0.2578 coefficient | 2155 "movi v26.8b, #33 \n" // R * 0.2578 coefficient |
2156 "movi v27.8b, #16 \n" // Add 16 constant | 2156 "movi v27.8b, #16 \n" // Add 16 constant |
2157 "1: \n" | 2157 "1: \n" |
2158 MEMACCESS(0) | 2158 MEMACCESS(0) |
2159 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. | 2159 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. |
2160 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2160 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2161 RGB565TOARGB | 2161 RGB565TOARGB |
2162 "umull v3.8h, v0.8b, v24.8b \n" // B | 2162 "umull v3.8h, v0.8b, v24.8b \n" // B |
2163 "umlal v3.8h, v1.8b, v25.8b \n" // G | 2163 "umlal v3.8h, v1.8b, v25.8b \n" // G |
2164 "umlal v3.8h, v2.8b, v26.8b \n" // R | 2164 "umlal v3.8h, v2.8b, v26.8b \n" // R |
2165 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 2165 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
2166 "uqadd v0.8b, v0.8b, v27.8b \n" | 2166 "uqadd v0.8b, v0.8b, v27.8b \n" |
2167 MEMACCESS(1) | 2167 MEMACCESS(1) |
2168 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2168 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2169 "b.gt 1b \n" | 2169 "b.gt 1b \n" |
2170 : "+r"(src_rgb565), // %0 | 2170 : "+r"(src_rgb565), // %0 |
2171 "+r"(dst_y), // %1 | 2171 "+r"(dst_y), // %1 |
2172 "+r"(pix) // %2 | 2172 "+r"(width) // %2 |
2173 : | 2173 : |
2174 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", | 2174 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", |
2175 "v24", "v25", "v26", "v27" | 2175 "v24", "v25", "v26", "v27" |
2176 ); | 2176 ); |
2177 } | 2177 } |
2178 #endif // HAS_RGB565TOYROW_NEON | 2178 #endif // HAS_RGB565TOYROW_NEON |
2179 | 2179 |
2180 #ifdef HAS_ARGB1555TOYROW_NEON | 2180 #ifdef HAS_ARGB1555TOYROW_NEON |
// ARGB1555ToYRow_NEON: converts one row of ARGB1555 pixels to 8-bit luma (Y).
// Each loop pass loads 16 bytes (8 pixels) from src_argb1555, expands them
// with the ARGB1555TOARGB macro (defined outside this view; per the comments
// on the multiply rows it leaves B/G/R in v0/v1/v2) and stores 8 Y bytes.
// Y = narrow8((13*B + 65*G + 33*R) >> 7, rounded) + 16, using saturating ops.
// The pixel count (pix in the OLD column, width in the NEW column) drops by 8
// per pass and is only tested by the trailing b.gt, so the body always runs at
// least once and there is no tail handling for counts that are not 8-aligned.
// NOTE(review): the coefficients stay in v4-v7 across the macro, so the macro
// presumably does not write those registers -- confirm against its definition.
2181 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { | 2181 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { |
2182 asm volatile ( | 2182 asm volatile ( |
2183 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 2183 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
2184 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2184 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2185 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 2185 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
2186 "movi v7.8b, #16 \n" // Add 16 constant | 2186 "movi v7.8b, #16 \n" // Add 16 constant |
2187 "1: \n" | 2187 "1: \n" |
2188 MEMACCESS(0) | 2188 MEMACCESS(0) |
2189 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. | 2189 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. |
2190 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2190 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2191 ARGB1555TOARGB | 2191 ARGB1555TOARGB |
2192 "umull v3.8h, v0.8b, v4.8b \n" // B | 2192 "umull v3.8h, v0.8b, v4.8b \n" // B |
2193 "umlal v3.8h, v1.8b, v5.8b \n" // G | 2193 "umlal v3.8h, v1.8b, v5.8b \n" // G |
2194 "umlal v3.8h, v2.8b, v6.8b \n" // R | 2194 "umlal v3.8h, v2.8b, v6.8b \n" // R |
2195 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 2195 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
2196 "uqadd v0.8b, v0.8b, v7.8b \n" | 2196 "uqadd v0.8b, v0.8b, v7.8b \n" |
2197 MEMACCESS(1) | 2197 MEMACCESS(1) |
2198 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2198 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2199 "b.gt 1b \n" | 2199 "b.gt 1b \n" |
2200 : "+r"(src_argb1555), // %0 | 2200 : "+r"(src_argb1555), // %0 |
2201 "+r"(dst_y), // %1 | 2201 "+r"(dst_y), // %1 |
2202 "+r"(pix) // %2 | 2202 "+r"(width) // %2 |
2203 : | 2203 : |
2204 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 2204 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
2205 ); | 2205 ); |
2206 } | 2206 } |
2207 #endif // HAS_ARGB1555TOYROW_NEON | 2207 #endif // HAS_ARGB1555TOYROW_NEON |
2208 | 2208 |
2209 #ifdef HAS_ARGB4444TOYROW_NEON | 2209 #ifdef HAS_ARGB4444TOYROW_NEON |
// ARGB4444ToYRow_NEON: converts one row of ARGB4444 pixels to 8-bit luma (Y).
// Each loop pass loads 16 bytes (8 pixels) from src_argb4444, expands them
// with the ARGB4444TOARGB macro (defined outside this view; per the comments
// on the multiply rows it leaves B/G/R in v0/v1/v2) and stores 8 Y bytes.
// Y = narrow8((13*B + 65*G + 33*R) >> 7, rounded) + 16, using saturating ops.
// The pixel count (pix in the OLD column, width in the NEW column) drops by 8
// per pass and is only tested by the trailing b.gt, so the body always runs at
// least once and there is no tail handling for counts that are not 8-aligned.
// NOTE(review): callers presumably pass a positive multiple of 8 -- confirm.
2210 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { | 2210 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { |
2211 asm volatile ( | 2211 asm volatile ( |
2212 "movi v24.8b, #13 \n" // B * 0.1016 coefficient | 2212 "movi v24.8b, #13 \n" // B * 0.1016 coefficient |
2213 "movi v25.8b, #65 \n" // G * 0.5078 coefficient | 2213 "movi v25.8b, #65 \n" // G * 0.5078 coefficient |
2214 "movi v26.8b, #33 \n" // R * 0.2578 coefficient | 2214 "movi v26.8b, #33 \n" // R * 0.2578 coefficient |
2215 "movi v27.8b, #16 \n" // Add 16 constant | 2215 "movi v27.8b, #16 \n" // Add 16 constant |
2216 "1: \n" | 2216 "1: \n" |
2217 MEMACCESS(0) | 2217 MEMACCESS(0) |
2218 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. | 2218 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. |
2219 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2219 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2220 ARGB4444TOARGB | 2220 ARGB4444TOARGB |
2221 "umull v3.8h, v0.8b, v24.8b \n" // B | 2221 "umull v3.8h, v0.8b, v24.8b \n" // B |
2222 "umlal v3.8h, v1.8b, v25.8b \n" // G | 2222 "umlal v3.8h, v1.8b, v25.8b \n" // G |
2223 "umlal v3.8h, v2.8b, v26.8b \n" // R | 2223 "umlal v3.8h, v2.8b, v26.8b \n" // R |
2224 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 2224 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
2225 "uqadd v0.8b, v0.8b, v27.8b \n" | 2225 "uqadd v0.8b, v0.8b, v27.8b \n" |
2226 MEMACCESS(1) | 2226 MEMACCESS(1) |
2227 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2227 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2228 "b.gt 1b \n" | 2228 "b.gt 1b \n" |
2229 : "+r"(src_argb4444), // %0 | 2229 : "+r"(src_argb4444), // %0 |
2230 "+r"(dst_y), // %1 | 2230 "+r"(dst_y), // %1 |
2231 "+r"(pix) // %2 | 2231 "+r"(width) // %2 |
2232 : | 2232 : |
2233 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" | 2233 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" |
2234 ); | 2234 ); |
2235 } | 2235 } |
2236 #endif // HAS_ARGB4444TOYROW_NEON | 2236 #endif // HAS_ARGB4444TOYROW_NEON |
2237 | 2237 |
2238 #ifdef HAS_BGRATOYROW_NEON | 2238 #ifdef HAS_BGRATOYROW_NEON |
// BGRAToYRow_NEON: converts one row of 4-byte BGRA pixels to 8-bit luma (Y).
// Each loop pass uses ld4 to load 32 bytes (8 pixels) from src_bgra, split by
// byte position into v0..v3; the multiplies use v1 as R, v2 as G and v3 as B,
// and v0 is not used (presumably alpha -- confirm against the format spec).
// Y = narrow8((33*R + 65*G + 13*B) >> 7, rounded) + 16, using saturating ops.
// The pixel count (pix in the OLD column, width in the NEW column) drops by 8
// per pass and is only tested by the trailing b.gt, so the body always runs at
// least once and there is no tail handling for counts that are not 8-aligned.
// NOTE(review): callers presumably pass a positive multiple of 8 -- confirm.
2239 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { | 2239 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { |
2240 asm volatile ( | 2240 asm volatile ( |
2241 "movi v4.8b, #33 \n" // R * 0.2578 coefficient | 2241 "movi v4.8b, #33 \n" // R * 0.2578 coefficient |
2242 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2242 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2243 "movi v6.8b, #13 \n" // B * 0.1016 coefficient | 2243 "movi v6.8b, #13 \n" // B * 0.1016 coefficient |
2244 "movi v7.8b, #16 \n" // Add 16 constant | 2244 "movi v7.8b, #16 \n" // Add 16 constant |
2245 "1: \n" | 2245 "1: \n" |
2246 MEMACCESS(0) | 2246 MEMACCESS(0) |
2247 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. | 2247 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. |
2248 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2248 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2249 "umull v16.8h, v1.8b, v4.8b \n" // R | 2249 "umull v16.8h, v1.8b, v4.8b \n" // R |
2250 "umlal v16.8h, v2.8b, v5.8b \n" // G | 2250 "umlal v16.8h, v2.8b, v5.8b \n" // G |
2251 "umlal v16.8h, v3.8b, v6.8b \n" // B | 2251 "umlal v16.8h, v3.8b, v6.8b \n" // B |
2252 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2252 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
2253 "uqadd v0.8b, v0.8b, v7.8b \n" | 2253 "uqadd v0.8b, v0.8b, v7.8b \n" |
2254 MEMACCESS(1) | 2254 MEMACCESS(1) |
2255 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2255 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2256 "b.gt 1b \n" | 2256 "b.gt 1b \n" |
2257 : "+r"(src_bgra), // %0 | 2257 : "+r"(src_bgra), // %0 |
2258 "+r"(dst_y), // %1 | 2258 "+r"(dst_y), // %1 |
2259 "+r"(pix) // %2 | 2259 "+r"(width) // %2 |
2260 : | 2260 : |
2261 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2261 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
2262 ); | 2262 ); |
2263 } | 2263 } |
2264 #endif // HAS_BGRATOYROW_NEON | 2264 #endif // HAS_BGRATOYROW_NEON |
2265 | 2265 |
2266 #ifdef HAS_ABGRTOYROW_NEON | 2266 #ifdef HAS_ABGRTOYROW_NEON |
// ABGRToYRow_NEON: converts one row of 4-byte ABGR pixels to 8-bit luma (Y).
// Each loop pass uses ld4 to load 32 bytes (8 pixels) from src_abgr, split by
// byte position into v0..v3; the multiplies use v0 as R, v1 as G and v2 as B,
// and v3 is not used (presumably alpha -- confirm against the format spec).
// Y = narrow8((33*R + 65*G + 13*B) >> 7, rounded) + 16, using saturating ops.
// The pixel count (pix in the OLD column, width in the NEW column) drops by 8
// per pass and is only tested by the trailing b.gt, so the body always runs at
// least once and there is no tail handling for counts that are not 8-aligned.
// NOTE(review): callers presumably pass a positive multiple of 8 -- confirm.
2267 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { | 2267 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { |
2268 asm volatile ( | 2268 asm volatile ( |
2269 "movi v4.8b, #33 \n" // R * 0.2578 coefficient | 2269 "movi v4.8b, #33 \n" // R * 0.2578 coefficient |
2270 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2270 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2271 "movi v6.8b, #13 \n" // B * 0.1016 coefficient | 2271 "movi v6.8b, #13 \n" // B * 0.1016 coefficient |
2272 "movi v7.8b, #16 \n" // Add 16 constant | 2272 "movi v7.8b, #16 \n" // Add 16 constant |
2273 "1: \n" | 2273 "1: \n" |
2274 MEMACCESS(0) | 2274 MEMACCESS(0) |
2275 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. | 2275 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. |
2276 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2276 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2277 "umull v16.8h, v0.8b, v4.8b \n" // R | 2277 "umull v16.8h, v0.8b, v4.8b \n" // R |
2278 "umlal v16.8h, v1.8b, v5.8b \n" // G | 2278 "umlal v16.8h, v1.8b, v5.8b \n" // G |
2279 "umlal v16.8h, v2.8b, v6.8b \n" // B | 2279 "umlal v16.8h, v2.8b, v6.8b \n" // B |
2280 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2280 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
2281 "uqadd v0.8b, v0.8b, v7.8b \n" | 2281 "uqadd v0.8b, v0.8b, v7.8b \n" |
2282 MEMACCESS(1) | 2282 MEMACCESS(1) |
2283 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2283 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2284 "b.gt 1b \n" | 2284 "b.gt 1b \n" |
2285 : "+r"(src_abgr), // %0 | 2285 : "+r"(src_abgr), // %0 |
2286 "+r"(dst_y), // %1 | 2286 "+r"(dst_y), // %1 |
2287 "+r"(pix) // %2 | 2287 "+r"(width) // %2 |
2288 : | 2288 : |
2289 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2289 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
2290 ); | 2290 ); |
2291 } | 2291 } |
2292 #endif // HAS_ABGRTOYROW_NEON | 2292 #endif // HAS_ABGRTOYROW_NEON |
2293 | 2293 |
2294 #ifdef HAS_RGBATOYROW_NEON | 2294 #ifdef HAS_RGBATOYROW_NEON |
// RGBAToYRow_NEON: converts one row of 4-byte RGBA pixels to 8-bit luma (Y).
// Each loop pass uses ld4 to load 32 bytes (8 pixels) from src_rgba, split by
// byte position into v0..v3; the multiplies use v1 as B, v2 as G and v3 as R,
// and v0 is not used (presumably alpha -- confirm against the format spec).
// Y = narrow8((13*B + 65*G + 33*R) >> 7, rounded) + 16, using saturating ops.
// The pixel count (pix in the OLD column, width in the NEW column) drops by 8
// per pass and is only tested by the trailing b.gt, so the body always runs at
// least once and there is no tail handling for counts that are not 8-aligned.
// NOTE(review): callers presumably pass a positive multiple of 8 -- confirm.
2295 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { | 2295 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { |
2296 asm volatile ( | 2296 asm volatile ( |
2297 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 2297 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
2298 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2298 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2299 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 2299 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
2300 "movi v7.8b, #16 \n" // Add 16 constant | 2300 "movi v7.8b, #16 \n" // Add 16 constant |
2301 "1: \n" | 2301 "1: \n" |
2302 MEMACCESS(0) | 2302 MEMACCESS(0) |
2303 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. | 2303 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. |
2304 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2304 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2305 "umull v16.8h, v1.8b, v4.8b \n" // B | 2305 "umull v16.8h, v1.8b, v4.8b \n" // B |
2306 "umlal v16.8h, v2.8b, v5.8b \n" // G | 2306 "umlal v16.8h, v2.8b, v5.8b \n" // G |
2307 "umlal v16.8h, v3.8b, v6.8b \n" // R | 2307 "umlal v16.8h, v3.8b, v6.8b \n" // R |
2308 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2308 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
2309 "uqadd v0.8b, v0.8b, v7.8b \n" | 2309 "uqadd v0.8b, v0.8b, v7.8b \n" |
2310 MEMACCESS(1) | 2310 MEMACCESS(1) |
2311 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2311 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2312 "b.gt 1b \n" | 2312 "b.gt 1b \n" |
2313 : "+r"(src_rgba), // %0 | 2313 : "+r"(src_rgba), // %0 |
2314 "+r"(dst_y), // %1 | 2314 "+r"(dst_y), // %1 |
2315 "+r"(pix) // %2 | 2315 "+r"(width) // %2 |
2316 : | 2316 : |
2317 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2317 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
2318 ); | 2318 ); |
2319 } | 2319 } |
2320 #endif // HAS_RGBATOYROW_NEON | 2320 #endif // HAS_RGBATOYROW_NEON |
2321 | 2321 |
2322 #ifdef HAS_RGB24TOYROW_NEON | 2322 #ifdef HAS_RGB24TOYROW_NEON |
// RGB24ToYRow_NEON: converts one row of 3-byte RGB24 pixels to 8-bit luma (Y).
// Each loop pass uses ld3 to load 24 bytes (8 pixels) from src_rgb24, split by
// byte position into v0..v2; the multiplies use v0 as B, v1 as G and v2 as R.
// Y = narrow8((13*B + 65*G + 33*R) >> 7, rounded) + 16, using saturating ops.
// The pixel count (pix in the OLD column, width in the NEW column) drops by 8
// per pass and is only tested by the trailing b.gt, so the body always runs at
// least once and there is no tail handling for counts that are not 8-aligned.
// NOTE(review): callers presumably pass a positive multiple of 8 -- confirm.
// NOTE(review): v3 is in the clobber list although the visible asm never
// writes it; harmless, but looks like a leftover from the 4-channel variants.
2323 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { | 2323 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { |
2324 asm volatile ( | 2324 asm volatile ( |
2325 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 2325 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
2326 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2326 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2327 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 2327 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
2328 "movi v7.8b, #16 \n" // Add 16 constant | 2328 "movi v7.8b, #16 \n" // Add 16 constant |
2329 "1: \n" | 2329 "1: \n" |
2330 MEMACCESS(0) | 2330 MEMACCESS(0) |
2331 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. | 2331 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. |
2332 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2332 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2333 "umull v16.8h, v0.8b, v4.8b \n" // B | 2333 "umull v16.8h, v0.8b, v4.8b \n" // B |
2334 "umlal v16.8h, v1.8b, v5.8b \n" // G | 2334 "umlal v16.8h, v1.8b, v5.8b \n" // G |
2335 "umlal v16.8h, v2.8b, v6.8b \n" // R | 2335 "umlal v16.8h, v2.8b, v6.8b \n" // R |
2336 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2336 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
2337 "uqadd v0.8b, v0.8b, v7.8b \n" | 2337 "uqadd v0.8b, v0.8b, v7.8b \n" |
2338 MEMACCESS(1) | 2338 MEMACCESS(1) |
2339 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2339 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2340 "b.gt 1b \n" | 2340 "b.gt 1b \n" |
2341 : "+r"(src_rgb24), // %0 | 2341 : "+r"(src_rgb24), // %0 |
2342 "+r"(dst_y), // %1 | 2342 "+r"(dst_y), // %1 |
2343 "+r"(pix) // %2 | 2343 "+r"(width) // %2 |
2344 : | 2344 : |
2345 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2345 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
2346 ); | 2346 ); |
2347 } | 2347 } |
2348 #endif // HAS_RGB24TOYROW_NEON | 2348 #endif // HAS_RGB24TOYROW_NEON |
2349 | 2349 |
2350 #ifdef HAS_RAWTOYROW_NEON | 2350 #ifdef HAS_RAWTOYROW_NEON |
// RAWToYRow_NEON: converts one row of 3-byte RAW pixels to 8-bit luma (Y).
// Each loop pass uses ld3 to load 24 bytes (8 pixels) from src_raw, split by
// byte position into v0..v2. v0/v1/v2 are multiplied by the R/G/B coefficients
// (33/65/13), i.e. the channel order is the reverse of RGB24ToYRow_NEON.
// Y = narrow8((33*R + 65*G + 13*B) >> 7, rounded) + 16, using saturating ops.
// The pixel count (pix in the OLD column, width in the NEW column) drops by 8
// per pass and is only tested by the trailing b.gt, so the body always runs at
// least once and there is no tail handling for counts that are not 8-aligned.
// NOTE(review): callers presumably pass a positive multiple of 8 -- confirm.
// NOTE(review): v3 is in the clobber list although the visible asm never
// writes it; harmless, but looks like a leftover from the 4-channel variants.
2351 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { | 2351 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { |
2352 asm volatile ( | 2352 asm volatile ( |
2353 "movi v4.8b, #33 \n" // R * 0.2578 coefficient | 2353 "movi v4.8b, #33 \n" // R * 0.2578 coefficient |
2354 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2354 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2355 "movi v6.8b, #13 \n" // B * 0.1016 coefficient | 2355 "movi v6.8b, #13 \n" // B * 0.1016 coefficient |
2356 "movi v7.8b, #16 \n" // Add 16 constant | 2356 "movi v7.8b, #16 \n" // Add 16 constant |
2357 "1: \n" | 2357 "1: \n" |
2358 MEMACCESS(0) | 2358 MEMACCESS(0) |
2359 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. | 2359 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. |
2360 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2360 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2361 "umull v16.8h, v0.8b, v4.8b \n" // R (v4 holds the R coefficient) | 2361 "umull v16.8h, v0.8b, v4.8b \n" // R |
2362 "umlal v16.8h, v1.8b, v5.8b \n" // G | 2362 "umlal v16.8h, v1.8b, v5.8b \n" // G |
2363 "umlal v16.8h, v2.8b, v6.8b \n" // B (v6 holds the B coefficient) | 2363 "umlal v16.8h, v2.8b, v6.8b \n" // B |
2364 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2364 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
2365 "uqadd v0.8b, v0.8b, v7.8b \n" | 2365 "uqadd v0.8b, v0.8b, v7.8b \n" |
2366 MEMACCESS(1) | 2366 MEMACCESS(1) |
2367 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2367 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2368 "b.gt 1b \n" | 2368 "b.gt 1b \n" |
2369 : "+r"(src_raw), // %0 | 2369 : "+r"(src_raw), // %0 |
2370 "+r"(dst_y), // %1 | 2370 "+r"(dst_y), // %1 |
2371 "+r"(pix) // %2 | 2371 "+r"(width) // %2 |
2372 : | 2372 : |
2373 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2373 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
2374 ); | 2374 ); |
2375 } | 2375 } |
2376 #endif // HAS_RAWTOYROW_NEON | 2376 #endif // HAS_RAWTOYROW_NEON |
2377 | 2377 |
2378 // Bilinear filter 16x2 -> 16x1 | 2378 // Bilinear filter 16x2 -> 16x1 |
2379 #ifdef HAS_INTERPOLATEROW_NEON | 2379 #ifdef HAS_INTERPOLATEROW_NEON |
2380 void InterpolateRow_NEON(uint8* dst_ptr, | 2380 void InterpolateRow_NEON(uint8* dst_ptr, |
2381 const uint8* src_ptr, ptrdiff_t src_stride, | 2381 const uint8* src_ptr, ptrdiff_t src_stride, |
(...skipping 690 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3072 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 3072 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
3073 ); | 3073 ); |
3074 } | 3074 } |
3075 #endif // HAS_SOBELYROW_NEON | 3075 #endif // HAS_SOBELYROW_NEON |
3076 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 3076 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
3077 | 3077 |
3078 #ifdef __cplusplus | 3078 #ifdef __cplusplus |
3079 } // extern "C" | 3079 } // extern "C" |
3080 } // namespace libyuv | 3080 } // namespace libyuv |
3081 #endif | 3081 #endif |
OLD | NEW |