Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(271)

Side by Side Diff: source/row_neon64.cc

Issue 1398633002: change all pix parameters to width for consistency (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/row_neon.cc ('k') | source/row_win.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD NEW
1 /* 1 /*
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 884 matching lines...) Expand 10 before | Expand all | Expand 10 after
895 : "+r"(src), // %0 895 : "+r"(src), // %0
896 "+r"(dst), // %1 896 "+r"(dst), // %1
897 "+r"(width64) // %2 897 "+r"(width64) // %2
898 : "r"((ptrdiff_t)-16) // %3 898 : "r"((ptrdiff_t)-16) // %3
899 : "cc", "memory", "v0" 899 : "cc", "memory", "v0"
900 ); 900 );
901 } 901 }
902 #endif // HAS_ARGBMIRRORROW_NEON 902 #endif // HAS_ARGBMIRRORROW_NEON
903 903
904 #ifdef HAS_RGB24TOARGBROW_NEON 904 #ifdef HAS_RGB24TOARGBROW_NEON
905 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { 905 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
906 asm volatile ( 906 asm volatile (
907 "movi v4.8b, #255 \n" // Alpha 907 "movi v4.8b, #255 \n" // Alpha
908 "1: \n" 908 "1: \n"
909 MEMACCESS(0) 909 MEMACCESS(0)
910 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. 910 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
911 "subs %w2, %w2, #8 \n" // 8 processed per loop. 911 "subs %w2, %w2, #8 \n" // 8 processed per loop.
912 MEMACCESS(1) 912 MEMACCESS(1)
913 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels 913 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels
914 "b.gt 1b \n" 914 "b.gt 1b \n"
915 : "+r"(src_rgb24), // %0 915 : "+r"(src_rgb24), // %0
916 "+r"(dst_argb), // %1 916 "+r"(dst_argb), // %1
917 "+r"(pix) // %2 917 "+r"(width) // %2
918 : 918 :
919 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 919 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
920 ); 920 );
921 } 921 }
922 #endif // HAS_RGB24TOARGBROW_NEON 922 #endif // HAS_RGB24TOARGBROW_NEON
923 923
924 #ifdef HAS_RAWTOARGBROW_NEON 924 #ifdef HAS_RAWTOARGBROW_NEON
925 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { 925 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
926 asm volatile ( 926 asm volatile (
927 "movi v5.8b, #255 \n" // Alpha 927 "movi v5.8b, #255 \n" // Alpha
928 "1: \n" 928 "1: \n"
929 MEMACCESS(0) 929 MEMACCESS(0)
930 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 930 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
931 "subs %w2, %w2, #8 \n" // 8 processed per loop. 931 "subs %w2, %w2, #8 \n" // 8 processed per loop.
932 "orr v3.8b, v1.8b, v1.8b \n" // move g 932 "orr v3.8b, v1.8b, v1.8b \n" // move g
933 "orr v4.8b, v0.8b, v0.8b \n" // move r 933 "orr v4.8b, v0.8b, v0.8b \n" // move r
934 MEMACCESS(1) 934 MEMACCESS(1)
935 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a 935 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
936 "b.gt 1b \n" 936 "b.gt 1b \n"
937 : "+r"(src_raw), // %0 937 : "+r"(src_raw), // %0
938 "+r"(dst_argb), // %1 938 "+r"(dst_argb), // %1
939 "+r"(pix) // %2 939 "+r"(width) // %2
940 : 940 :
941 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List 941 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
942 ); 942 );
943 } 943 }
944 #endif // HAS_RAWTOARGBROW_NEON 944 #endif // HAS_RAWTOARGBROW_NEON
945 945
946 #define RGB565TOARGB \ 946 #define RGB565TOARGB \
947 "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ 947 "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
948 "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ 948 "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
949 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ 949 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
950 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ 950 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
951 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ 951 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
952 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ 952 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
953 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ 953 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
954 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ 954 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
955 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ 955 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
956 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ 956 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
957 "dup v2.2D, v0.D[1] \n" /* R */ 957 "dup v2.2D, v0.D[1] \n" /* R */
958 958
959 #ifdef HAS_RGB565TOARGBROW_NEON 959 #ifdef HAS_RGB565TOARGBROW_NEON
960 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { 960 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
961 asm volatile ( 961 asm volatile (
962 "movi v3.8b, #255 \n" // Alpha 962 "movi v3.8b, #255 \n" // Alpha
963 "1: \n" 963 "1: \n"
964 MEMACCESS(0) 964 MEMACCESS(0)
965 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 965 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
966 "subs %w2, %w2, #8 \n" // 8 processed per loop. 966 "subs %w2, %w2, #8 \n" // 8 processed per loop.
967 RGB565TOARGB 967 RGB565TOARGB
968 MEMACCESS(1) 968 MEMACCESS(1)
969 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 969 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
970 "b.gt 1b \n" 970 "b.gt 1b \n"
971 : "+r"(src_rgb565), // %0 971 : "+r"(src_rgb565), // %0
972 "+r"(dst_argb), // %1 972 "+r"(dst_argb), // %1
973 "+r"(pix) // %2 973 "+r"(width) // %2
974 : 974 :
975 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List 975 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
976 ); 976 );
977 } 977 }
978 #endif // HAS_RGB565TOARGBROW_NEON 978 #endif // HAS_RGB565TOARGBROW_NEON
979 979
980 #define ARGB1555TOARGB \ 980 #define ARGB1555TOARGB \
981 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ 981 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
982 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ 982 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
983 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ 983 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
(...skipping 25 matching lines...) Expand all
1009 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ 1009 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
1010 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ 1010 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
1011 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ 1011 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
1012 \ 1012 \
1013 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ 1013 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
1014 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ 1014 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
1015 "dup v1.2D, v0.D[1] \n" /* G */ \ 1015 "dup v1.2D, v0.D[1] \n" /* G */ \
1016 1016
1017 #ifdef HAS_ARGB1555TOARGBROW_NEON 1017 #ifdef HAS_ARGB1555TOARGBROW_NEON
1018 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, 1018 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
1019 int pix) { 1019 int width) {
1020 asm volatile ( 1020 asm volatile (
1021 "movi v3.8b, #255 \n" // Alpha 1021 "movi v3.8b, #255 \n" // Alpha
1022 "1: \n" 1022 "1: \n"
1023 MEMACCESS(0) 1023 MEMACCESS(0)
1024 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 1024 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
1025 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1025 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1026 ARGB1555TOARGB 1026 ARGB1555TOARGB
1027 MEMACCESS(1) 1027 MEMACCESS(1)
1028 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 1028 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
1029 "b.gt 1b \n" 1029 "b.gt 1b \n"
1030 : "+r"(src_argb1555), // %0 1030 : "+r"(src_argb1555), // %0
1031 "+r"(dst_argb), // %1 1031 "+r"(dst_argb), // %1
1032 "+r"(pix) // %2 1032 "+r"(width) // %2
1033 : 1033 :
1034 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1034 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1035 ); 1035 );
1036 } 1036 }
1037 #endif // HAS_ARGB1555TOARGBROW_NEON 1037 #endif // HAS_ARGB1555TOARGBROW_NEON
1038 1038
1039 #define ARGB4444TOARGB \ 1039 #define ARGB4444TOARGB \
1040 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ 1040 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
1041 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ 1041 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
1042 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ 1042 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
1043 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ 1043 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
1044 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ 1044 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
1045 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ 1045 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
1046 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ 1046 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
1047 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ 1047 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
1048 "dup v0.2D, v2.D[1] \n" \ 1048 "dup v0.2D, v2.D[1] \n" \
1049 "dup v1.2D, v3.D[1] \n" 1049 "dup v1.2D, v3.D[1] \n"
1050 1050
1051 #ifdef HAS_ARGB4444TOARGBROW_NEON 1051 #ifdef HAS_ARGB4444TOARGBROW_NEON
1052 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, 1052 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
1053 int pix) { 1053 int width) {
1054 asm volatile ( 1054 asm volatile (
1055 "1: \n" 1055 "1: \n"
1056 MEMACCESS(0) 1056 MEMACCESS(0)
1057 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 1057 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
1058 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1058 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1059 ARGB4444TOARGB 1059 ARGB4444TOARGB
1060 MEMACCESS(1) 1060 MEMACCESS(1)
1061 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 1061 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
1062 "b.gt 1b \n" 1062 "b.gt 1b \n"
1063 : "+r"(src_argb4444), // %0 1063 : "+r"(src_argb4444), // %0
1064 "+r"(dst_argb), // %1 1064 "+r"(dst_argb), // %1
1065 "+r"(pix) // %2 1065 "+r"(width) // %2
1066 : 1066 :
1067 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List 1067 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
1068 ); 1068 );
1069 } 1069 }
1070 #endif // HAS_ARGB4444TOARGBROW_NEON 1070 #endif // HAS_ARGB4444TOARGBROW_NEON
1071 1071
1072 #ifdef HAS_ARGBTORGB24ROW_NEON 1072 #ifdef HAS_ARGBTORGB24ROW_NEON
1073 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { 1073 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
1074 asm volatile ( 1074 asm volatile (
1075 "1: \n" 1075 "1: \n"
1076 MEMACCESS(0) 1076 MEMACCESS(0)
1077 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels 1077 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
1078 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1078 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1079 MEMACCESS(1) 1079 MEMACCESS(1)
1080 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. 1080 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
1081 "b.gt 1b \n" 1081 "b.gt 1b \n"
1082 : "+r"(src_argb), // %0 1082 : "+r"(src_argb), // %0
1083 "+r"(dst_rgb24), // %1 1083 "+r"(dst_rgb24), // %1
1084 "+r"(pix) // %2 1084 "+r"(width) // %2
1085 : 1085 :
1086 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 1086 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
1087 ); 1087 );
1088 } 1088 }
1089 #endif // HAS_ARGBTORGB24ROW_NEON 1089 #endif // HAS_ARGBTORGB24ROW_NEON
1090 1090
1091 #ifdef HAS_ARGBTORAWROW_NEON 1091 #ifdef HAS_ARGBTORAWROW_NEON
1092 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { 1092 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
1093 asm volatile ( 1093 asm volatile (
1094 "1: \n" 1094 "1: \n"
1095 MEMACCESS(0) 1095 MEMACCESS(0)
1096 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a 1096 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
1097 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1097 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1098 "orr v4.8b, v2.8b, v2.8b \n" // mov g 1098 "orr v4.8b, v2.8b, v2.8b \n" // mov g
1099 "orr v5.8b, v1.8b, v1.8b \n" // mov b 1099 "orr v5.8b, v1.8b, v1.8b \n" // mov b
1100 MEMACCESS(1) 1100 MEMACCESS(1)
1101 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b 1101 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
1102 "b.gt 1b \n" 1102 "b.gt 1b \n"
1103 : "+r"(src_argb), // %0 1103 : "+r"(src_argb), // %0
1104 "+r"(dst_raw), // %1 1104 "+r"(dst_raw), // %1
1105 "+r"(pix) // %2 1105 "+r"(width) // %2
1106 : 1106 :
1107 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List 1107 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
1108 ); 1108 );
1109 } 1109 }
1110 #endif // HAS_ARGBTORAWROW_NEON 1110 #endif // HAS_ARGBTORAWROW_NEON
1111 1111
1112 #ifdef HAS_YUY2TOYROW_NEON 1112 #ifdef HAS_YUY2TOYROW_NEON
1113 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { 1113 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
1114 asm volatile ( 1114 asm volatile (
1115 "1: \n" 1115 "1: \n"
1116 MEMACCESS(0) 1116 MEMACCESS(0)
1117 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. 1117 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
1118 "subs %w2, %w2, #16 \n" // 16 processed per loop. 1118 "subs %w2, %w2, #16 \n" // 16 processed per loop.
1119 MEMACCESS(1) 1119 MEMACCESS(1)
1120 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. 1120 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
1121 "b.gt 1b \n" 1121 "b.gt 1b \n"
1122 : "+r"(src_yuy2), // %0 1122 : "+r"(src_yuy2), // %0
1123 "+r"(dst_y), // %1 1123 "+r"(dst_y), // %1
1124 "+r"(pix) // %2 1124 "+r"(width) // %2
1125 : 1125 :
1126 : "cc", "memory", "v0", "v1" // Clobber List 1126 : "cc", "memory", "v0", "v1" // Clobber List
1127 ); 1127 );
1128 } 1128 }
1129 #endif // HAS_YUY2TOYROW_NEON 1129 #endif // HAS_YUY2TOYROW_NEON
1130 1130
1131 #ifdef HAS_UYVYTOYROW_NEON 1131 #ifdef HAS_UYVYTOYROW_NEON
1132 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { 1132 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
1133 asm volatile ( 1133 asm volatile (
1134 "1: \n" 1134 "1: \n"
1135 MEMACCESS(0) 1135 MEMACCESS(0)
1136 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. 1136 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
1137 "subs %w2, %w2, #16 \n" // 16 processed per loop. 1137 "subs %w2, %w2, #16 \n" // 16 processed per loop.
1138 MEMACCESS(1) 1138 MEMACCESS(1)
1139 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. 1139 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
1140 "b.gt 1b \n" 1140 "b.gt 1b \n"
1141 : "+r"(src_uyvy), // %0 1141 : "+r"(src_uyvy), // %0
1142 "+r"(dst_y), // %1 1142 "+r"(dst_y), // %1
1143 "+r"(pix) // %2 1143 "+r"(width) // %2
1144 : 1144 :
1145 : "cc", "memory", "v0", "v1" // Clobber List 1145 : "cc", "memory", "v0", "v1" // Clobber List
1146 ); 1146 );
1147 } 1147 }
1148 #endif // HAS_UYVYTOYROW_NEON 1148 #endif // HAS_UYVYTOYROW_NEON
1149 1149
1150 #ifdef HAS_YUY2TOUV422ROW_NEON 1150 #ifdef HAS_YUY2TOUV422ROW_NEON
1151 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, 1151 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
1152 int pix) { 1152 int width) {
1153 asm volatile ( 1153 asm volatile (
1154 "1: \n" 1154 "1: \n"
1155 MEMACCESS(0) 1155 MEMACCESS(0)
1156 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels 1156 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
1157 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 1157 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
1158 MEMACCESS(1) 1158 MEMACCESS(1)
1159 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. 1159 "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
1160 MEMACCESS(2) 1160 MEMACCESS(2)
1161 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. 1161 "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
1162 "b.gt 1b \n" 1162 "b.gt 1b \n"
1163 : "+r"(src_yuy2), // %0 1163 : "+r"(src_yuy2), // %0
1164 "+r"(dst_u), // %1 1164 "+r"(dst_u), // %1
1165 "+r"(dst_v), // %2 1165 "+r"(dst_v), // %2
1166 "+r"(pix) // %3 1166 "+r"(width) // %3
1167 : 1167 :
1168 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1168 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1169 ); 1169 );
1170 } 1170 }
1171 #endif // HAS_YUY2TOUV422ROW_NEON 1171 #endif // HAS_YUY2TOUV422ROW_NEON
1172 1172
1173 #ifdef HAS_UYVYTOUV422ROW_NEON 1173 #ifdef HAS_UYVYTOUV422ROW_NEON
1174 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, 1174 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
1175 int pix) { 1175 int width) {
1176 asm volatile ( 1176 asm volatile (
1177 "1: \n" 1177 "1: \n"
1178 MEMACCESS(0) 1178 MEMACCESS(0)
1179 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels 1179 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
1180 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 1180 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
1181 MEMACCESS(1) 1181 MEMACCESS(1)
1182 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. 1182 "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
1183 MEMACCESS(2) 1183 MEMACCESS(2)
1184 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. 1184 "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
1185 "b.gt 1b \n" 1185 "b.gt 1b \n"
1186 : "+r"(src_uyvy), // %0 1186 : "+r"(src_uyvy), // %0
1187 "+r"(dst_u), // %1 1187 "+r"(dst_u), // %1
1188 "+r"(dst_v), // %2 1188 "+r"(dst_v), // %2
1189 "+r"(pix) // %3 1189 "+r"(width) // %3
1190 : 1190 :
1191 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1191 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1192 ); 1192 );
1193 } 1193 }
1194 #endif // HAS_UYVYTOUV422ROW_NEON 1194 #endif // HAS_UYVYTOUV422ROW_NEON
1195 1195
1196 #ifdef HAS_YUY2TOUVROW_NEON 1196 #ifdef HAS_YUY2TOUVROW_NEON
1197 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, 1197 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
1198 uint8* dst_u, uint8* dst_v, int pix) { 1198 uint8* dst_u, uint8* dst_v, int width) {
1199 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; 1199 const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
1200 asm volatile ( 1200 asm volatile (
1201 "1: \n" 1201 "1: \n"
1202 MEMACCESS(0) 1202 MEMACCESS(0)
1203 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 1203 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1204 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. 1204 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
1205 MEMACCESS(1) 1205 MEMACCESS(1)
1206 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row 1206 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1207 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U 1207 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
1208 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V 1208 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
1209 MEMACCESS(2) 1209 MEMACCESS(2)
1210 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. 1210 "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
1211 MEMACCESS(3) 1211 MEMACCESS(3)
1212 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. 1212 "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
1213 "b.gt 1b \n" 1213 "b.gt 1b \n"
1214 : "+r"(src_yuy2), // %0 1214 : "+r"(src_yuy2), // %0
1215 "+r"(src_yuy2b), // %1 1215 "+r"(src_yuy2b), // %1
1216 "+r"(dst_u), // %2 1216 "+r"(dst_u), // %2
1217 "+r"(dst_v), // %3 1217 "+r"(dst_v), // %3
1218 "+r"(pix) // %4 1218 "+r"(width) // %4
1219 : 1219 :
1220 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", 1220 : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1221 "v5", "v6", "v7" // Clobber List 1221 "v5", "v6", "v7" // Clobber List
1222 ); 1222 );
1223 } 1223 }
1224 #endif // HAS_YUY2TOUVROW_NEON 1224 #endif // HAS_YUY2TOUVROW_NEON
1225 1225
1226 #ifdef HAS_UYVYTOUVROW_NEON 1226 #ifdef HAS_UYVYTOUVROW_NEON
1227 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, 1227 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
1228 uint8* dst_u, uint8* dst_v, int pix) { 1228 uint8* dst_u, uint8* dst_v, int width) {
1229 const uint8* src_uyvyb = src_uyvy + stride_uyvy; 1229 const uint8* src_uyvyb = src_uyvy + stride_uyvy;
1230 asm volatile ( 1230 asm volatile (
1231 "1: \n" 1231 "1: \n"
1232 MEMACCESS(0) 1232 MEMACCESS(0)
1233 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 1233 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1234 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. 1234 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
1235 MEMACCESS(1) 1235 MEMACCESS(1)
1236 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row 1236 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1237 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U 1237 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
1238 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V 1238 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
1239 MEMACCESS(2) 1239 MEMACCESS(2)
1240 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. 1240 "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
1241 MEMACCESS(3) 1241 MEMACCESS(3)
1242 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. 1242 "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
1243 "b.gt 1b \n" 1243 "b.gt 1b \n"
1244 : "+r"(src_uyvy), // %0 1244 : "+r"(src_uyvy), // %0
1245 "+r"(src_uyvyb), // %1 1245 "+r"(src_uyvyb), // %1
1246 "+r"(dst_u), // %2 1246 "+r"(dst_u), // %2
1247 "+r"(dst_v), // %3 1247 "+r"(dst_v), // %3
1248 "+r"(pix) // %4 1248 "+r"(width) // %4
1249 : 1249 :
1250 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", 1250 : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1251 "v5", "v6", "v7" // Clobber List 1251 "v5", "v6", "v7" // Clobber List
1252 ); 1252 );
1253 } 1253 }
1254 #endif // HAS_UYVYTOUVROW_NEON 1254 #endif // HAS_UYVYTOUVROW_NEON
1255 1255
1256 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 1256 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
1257 #ifdef HAS_ARGBSHUFFLEROW_NEON 1257 #ifdef HAS_ARGBSHUFFLEROW_NEON
1258 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, 1258 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
1259 const uint8* shuffler, int pix) { 1259 const uint8* shuffler, int width) {
1260 asm volatile ( 1260 asm volatile (
1261 MEMACCESS(3) 1261 MEMACCESS(3)
1262 "ld1 {v2.16b}, [%3] \n" // shuffler 1262 "ld1 {v2.16b}, [%3] \n" // shuffler
1263 "1: \n" 1263 "1: \n"
1264 MEMACCESS(0) 1264 MEMACCESS(0)
1265 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. 1265 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
1266 "subs %w2, %w2, #4 \n" // 4 processed per loop 1266 "subs %w2, %w2, #4 \n" // 4 processed per loop
1267 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels 1267 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
1268 MEMACCESS(1) 1268 MEMACCESS(1)
1269 "st1 {v1.16b}, [%1], #16 \n" // store 4. 1269 "st1 {v1.16b}, [%1], #16 \n" // store 4.
1270 "b.gt 1b \n" 1270 "b.gt 1b \n"
1271 : "+r"(src_argb), // %0 1271 : "+r"(src_argb), // %0
1272 "+r"(dst_argb), // %1 1272 "+r"(dst_argb), // %1
1273 "+r"(pix) // %2 1273 "+r"(width) // %2
1274 : "r"(shuffler) // %3 1274 : "r"(shuffler) // %3
1275 : "cc", "memory", "v0", "v1", "v2" // Clobber List 1275 : "cc", "memory", "v0", "v1", "v2" // Clobber List
1276 ); 1276 );
1277 } 1277 }
1278 #endif // HAS_ARGBSHUFFLEROW_NEON 1278 #endif // HAS_ARGBSHUFFLEROW_NEON
1279 1279
1280 #ifdef HAS_I422TOYUY2ROW_NEON 1280 #ifdef HAS_I422TOYUY2ROW_NEON
1281 void I422ToYUY2Row_NEON(const uint8* src_y, 1281 void I422ToYUY2Row_NEON(const uint8* src_y,
1282 const uint8* src_u, 1282 const uint8* src_u,
1283 const uint8* src_v, 1283 const uint8* src_v,
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
1329 "+r"(src_v), // %2 1329 "+r"(src_v), // %2
1330 "+r"(dst_uyvy), // %3 1330 "+r"(dst_uyvy), // %3
1331 "+r"(width) // %4 1331 "+r"(width) // %4
1332 : 1332 :
1333 : "cc", "memory", "v0", "v1", "v2", "v3" 1333 : "cc", "memory", "v0", "v1", "v2", "v3"
1334 ); 1334 );
1335 } 1335 }
1336 #endif // HAS_I422TOUYVYROW_NEON 1336 #endif // HAS_I422TOUYVYROW_NEON
1337 1337
1338 #ifdef HAS_ARGBTORGB565ROW_NEON 1338 #ifdef HAS_ARGBTORGB565ROW_NEON
1339 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { 1339 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
1340 asm volatile ( 1340 asm volatile (
1341 "1: \n" 1341 "1: \n"
1342 MEMACCESS(0) 1342 MEMACCESS(0)
1343 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1343 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1344 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1344 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1345 ARGBTORGB565 1345 ARGBTORGB565
1346 MEMACCESS(1) 1346 MEMACCESS(1)
1347 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. 1347 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
1348 "b.gt 1b \n" 1348 "b.gt 1b \n"
1349 : "+r"(src_argb), // %0 1349 : "+r"(src_argb), // %0
1350 "+r"(dst_rgb565), // %1 1350 "+r"(dst_rgb565), // %1
1351 "+r"(pix) // %2 1351 "+r"(width) // %2
1352 : 1352 :
1353 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" 1353 : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1354 ); 1354 );
1355 } 1355 }
1356 #endif // HAS_ARGBTORGB565ROW_NEON 1356 #endif // HAS_ARGBTORGB565ROW_NEON
1357 1357
1358 #ifdef HAS_ARGBTORGB565DITHERROW_NEON 1358 #ifdef HAS_ARGBTORGB565DITHERROW_NEON
1359 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, 1359 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
1360 const uint32 dither4, int width) { 1360 const uint32 dither4, int width) {
1361 asm volatile ( 1361 asm volatile (
(...skipping 13 matching lines...) Expand all
1375 : "r"(src_argb), // %1 1375 : "r"(src_argb), // %1
1376 "r"(dither4), // %2 1376 "r"(dither4), // %2
1377 "r"(width) // %3 1377 "r"(width) // %3
1378 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" 1378 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
1379 ); 1379 );
1380 } 1380 }
1381 #endif // HAS_ARGBTORGB565DITHERROW_NEON 1381 #endif // HAS_ARGBTORGB565DITHERROW_NEON
1382 1382
1383 #ifdef HAS_ARGBTOARGB1555ROW_NEON 1383 #ifdef HAS_ARGBTOARGB1555ROW_NEON
1384 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, 1384 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
1385 int pix) { 1385 int width) {
1386 asm volatile ( 1386 asm volatile (
1387 "1: \n" 1387 "1: \n"
1388 MEMACCESS(0) 1388 MEMACCESS(0)
1389 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1389 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1390 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1390 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1391 ARGBTOARGB1555 1391 ARGBTOARGB1555
1392 MEMACCESS(1) 1392 MEMACCESS(1)
1393 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. 1393 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
1394 "b.gt 1b \n" 1394 "b.gt 1b \n"
1395 : "+r"(src_argb), // %0 1395 : "+r"(src_argb), // %0
1396 "+r"(dst_argb1555), // %1 1396 "+r"(dst_argb1555), // %1
1397 "+r"(pix) // %2 1397 "+r"(width) // %2
1398 : 1398 :
1399 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" 1399 : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1400 ); 1400 );
1401 } 1401 }
1402 #endif // HAS_ARGBTOARGB1555ROW_NEON 1402 #endif // HAS_ARGBTOARGB1555ROW_NEON
1403 1403
1404 #ifdef HAS_ARGBTOARGB4444ROW_NEON 1404 #ifdef HAS_ARGBTOARGB4444ROW_NEON
1405 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, 1405 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
1406 int pix) { 1406 int width) {
1407 asm volatile ( 1407 asm volatile (
1408 "movi v4.16b, #0x0f \n" // bits to clear with bic. 1408 "movi v4.16b, #0x0f \n" // bits to clear with bic.
1409 "1: \n" 1409 "1: \n"
1410 MEMACCESS(0) 1410 MEMACCESS(0)
1411 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1411 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1412 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1412 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1413 ARGBTOARGB4444 1413 ARGBTOARGB4444
1414 MEMACCESS(1) 1414 MEMACCESS(1)
1415 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. 1415 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
1416 "b.gt 1b \n" 1416 "b.gt 1b \n"
1417 : "+r"(src_argb), // %0 1417 : "+r"(src_argb), // %0
1418 "+r"(dst_argb4444), // %1 1418 "+r"(dst_argb4444), // %1
1419 "+r"(pix) // %2 1419 "+r"(width) // %2
1420 : 1420 :
1421 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" 1421 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
1422 ); 1422 );
1423 } 1423 }
1424 #endif // HAS_ARGBTOARGB4444ROW_NEON 1424 #endif // HAS_ARGBTOARGB4444ROW_NEON
1425 1425
1426 #ifdef HAS_ARGBTOYROW_NEON 1426 #ifdef HAS_ARGBTOYROW_NEON
1427 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { 1427 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1428 asm volatile ( 1428 asm volatile (
1429 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 1429 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
1430 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1430 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
1431 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1431 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
1432 "movi v7.8b, #16 \n" // Add 16 constant 1432 "movi v7.8b, #16 \n" // Add 16 constant
1433 "1: \n" 1433 "1: \n"
1434 MEMACCESS(0) 1434 MEMACCESS(0)
1435 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1435 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
1436 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1436 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1437 "umull v3.8h, v0.8b, v4.8b \n" // B 1437 "umull v3.8h, v0.8b, v4.8b \n" // B
1438 "umlal v3.8h, v1.8b, v5.8b \n" // G 1438 "umlal v3.8h, v1.8b, v5.8b \n" // G
1439 "umlal v3.8h, v2.8b, v6.8b \n" // R 1439 "umlal v3.8h, v2.8b, v6.8b \n" // R
1440 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 1440 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
1441 "uqadd v0.8b, v0.8b, v7.8b \n" 1441 "uqadd v0.8b, v0.8b, v7.8b \n"
1442 MEMACCESS(1) 1442 MEMACCESS(1)
1443 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1443 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1444 "b.gt 1b \n" 1444 "b.gt 1b \n"
1445 : "+r"(src_argb), // %0 1445 : "+r"(src_argb), // %0
1446 "+r"(dst_y), // %1 1446 "+r"(dst_y), // %1
1447 "+r"(pix) // %2 1447 "+r"(width) // %2
1448 : 1448 :
1449 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 1449 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1450 ); 1450 );
1451 } 1451 }
1452 #endif // HAS_ARGBTOYROW_NEON 1452 #endif // HAS_ARGBTOYROW_NEON
1453 1453
1454 #ifdef HAS_ARGBTOYJROW_NEON 1454 #ifdef HAS_ARGBTOYJROW_NEON
1455 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { 1455 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1456 asm volatile ( 1456 asm volatile (
1457 "movi v4.8b, #15 \n" // B * 0.11400 coefficient 1457 "movi v4.8b, #15 \n" // B * 0.11400 coefficient
1458 "movi v5.8b, #75 \n" // G * 0.58700 coefficient 1458 "movi v5.8b, #75 \n" // G * 0.58700 coefficient
1459 "movi v6.8b, #38 \n" // R * 0.29900 coefficient 1459 "movi v6.8b, #38 \n" // R * 0.29900 coefficient
1460 "1: \n" 1460 "1: \n"
1461 MEMACCESS(0) 1461 MEMACCESS(0)
1462 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1462 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
1463 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1463 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1464 "umull v3.8h, v0.8b, v4.8b \n" // B 1464 "umull v3.8h, v0.8b, v4.8b \n" // B
1465 "umlal v3.8h, v1.8b, v5.8b \n" // G 1465 "umlal v3.8h, v1.8b, v5.8b \n" // G
1466 "umlal v3.8h, v2.8b, v6.8b \n" // R 1466 "umlal v3.8h, v2.8b, v6.8b \n" // R
1467 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y 1467 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
1468 MEMACCESS(1) 1468 MEMACCESS(1)
1469 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1469 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1470 "b.gt 1b \n" 1470 "b.gt 1b \n"
1471 : "+r"(src_argb), // %0 1471 : "+r"(src_argb), // %0
1472 "+r"(dst_y), // %1 1472 "+r"(dst_y), // %1
1473 "+r"(pix) // %2 1473 "+r"(width) // %2
1474 : 1474 :
1475 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" 1475 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
1476 ); 1476 );
1477 } 1477 }
1478 #endif // HAS_ARGBTOYJROW_NEON 1478 #endif // HAS_ARGBTOYJROW_NEON
1479 1479
1480 // 8x1 pixels. 1480 // 8x1 pixels.
1481 #ifdef HAS_ARGBTOUV444ROW_NEON 1481 #ifdef HAS_ARGBTOUV444ROW_NEON
1482 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1482 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1483 int pix) { 1483 int width) {
1484 asm volatile ( 1484 asm volatile (
1485 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient 1485 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
1486 "movi v25.8b, #74 \n" // UG -0.5781 coefficient 1486 "movi v25.8b, #74 \n" // UG -0.5781 coefficient
1487 "movi v26.8b, #38 \n" // UR -0.2969 coefficient 1487 "movi v26.8b, #38 \n" // UR -0.2969 coefficient
1488 "movi v27.8b, #18 \n" // VB -0.1406 coefficient 1488 "movi v27.8b, #18 \n" // VB -0.1406 coefficient
1489 "movi v28.8b, #94 \n" // VG -0.7344 coefficient 1489 "movi v28.8b, #94 \n" // VG -0.7344 coefficient
1490 "movi v29.16b,#0x80 \n" // 128.5 1490 "movi v29.16b,#0x80 \n" // 128.5
1491 "1: \n" 1491 "1: \n"
1492 MEMACCESS(0) 1492 MEMACCESS(0)
1493 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1493 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
(...skipping 12 matching lines...) Expand all
1506 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V 1506 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
1507 1507
1508 MEMACCESS(1) 1508 MEMACCESS(1)
1509 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. 1509 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
1510 MEMACCESS(2) 1510 MEMACCESS(2)
1511 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. 1511 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
1512 "b.gt 1b \n" 1512 "b.gt 1b \n"
1513 : "+r"(src_argb), // %0 1513 : "+r"(src_argb), // %0
1514 "+r"(dst_u), // %1 1514 "+r"(dst_u), // %1
1515 "+r"(dst_v), // %2 1515 "+r"(dst_v), // %2
1516 "+r"(pix) // %3 1516 "+r"(width) // %3
1517 : 1517 :
1518 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", 1518 : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1519 "v24", "v25", "v26", "v27", "v28", "v29" 1519 "v24", "v25", "v26", "v27", "v28", "v29"
1520 ); 1520 );
1521 } 1521 }
1522 #endif // HAS_ARGBTOUV444ROW_NEON 1522 #endif // HAS_ARGBTOUV444ROW_NEON
1523 1523
1524 // 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 1524 // 16x1 pixels -> 8x1. width is number of argb pixels. e.g. 16.
1525 #ifdef HAS_ARGBTOUV422ROW_NEON 1525 #ifdef HAS_ARGBTOUV422ROW_NEON
1526 void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1526 void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1527 int pix) { 1527 int width) {
1528 asm volatile ( 1528 asm volatile (
1529 RGBTOUV_SETUP_REG 1529 RGBTOUV_SETUP_REG
1530 "1: \n" 1530 "1: \n"
1531 MEMACCESS(0) 1531 MEMACCESS(0)
1532 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1532 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1533 1533
1534 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 1534 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1535 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1535 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1536 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 1536 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1537 1537
(...skipping 12 matching lines...) Expand all
1550 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V 1550 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
1551 1551
1552 MEMACCESS(1) 1552 MEMACCESS(1)
1553 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. 1553 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
1554 MEMACCESS(2) 1554 MEMACCESS(2)
1555 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. 1555 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
1556 "b.gt 1b \n" 1556 "b.gt 1b \n"
1557 : "+r"(src_argb), // %0 1557 : "+r"(src_argb), // %0
1558 "+r"(dst_u), // %1 1558 "+r"(dst_u), // %1
1559 "+r"(dst_v), // %2 1559 "+r"(dst_v), // %2
1560 "+r"(pix) // %3 1560 "+r"(width) // %3
1561 : 1561 :
1562 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1562 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1563 "v20", "v21", "v22", "v23", "v24", "v25" 1563 "v20", "v21", "v22", "v23", "v24", "v25"
1564 ); 1564 );
1565 } 1565 }
1566 #endif // HAS_ARGBTOUV422ROW_NEON 1566 #endif // HAS_ARGBTOUV422ROW_NEON
1567 1567
1568 // 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. 1568 // 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32.
1569 #ifdef HAS_ARGBTOUV411ROW_NEON 1569 #ifdef HAS_ARGBTOUV411ROW_NEON
1570 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1570 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1571 int pix) { 1571 int width) {
1572 asm volatile ( 1572 asm volatile (
1573 RGBTOUV_SETUP_REG 1573 RGBTOUV_SETUP_REG
1574 "1: \n" 1574 "1: \n"
1575 MEMACCESS(0) 1575 MEMACCESS(0)
1576 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1576 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1577 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 1577 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1578 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1578 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1579 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 1579 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1580 MEMACCESS(0) 1580 MEMACCESS(0)
1581 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16. 1581 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16.
(...skipping 21 matching lines...) Expand all
1603 "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U 1603 "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
1604 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V 1604 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
1605 MEMACCESS(1) 1605 MEMACCESS(1)
1606 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. 1606 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
1607 MEMACCESS(2) 1607 MEMACCESS(2)
1608 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. 1608 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
1609 "b.gt 1b \n" 1609 "b.gt 1b \n"
1610 : "+r"(src_argb), // %0 1610 : "+r"(src_argb), // %0
1611 "+r"(dst_u), // %1 1611 "+r"(dst_u), // %1
1612 "+r"(dst_v), // %2 1612 "+r"(dst_v), // %2
1613 "+r"(pix) // %3 1613 "+r"(width) // %3
1614 : 1614 :
1615 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1615 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1616 "v20", "v21", "v22", "v23", "v24", "v25" 1616 "v20", "v21", "v22", "v23", "v24", "v25"
1617 ); 1617 );
1618 } 1618 }
1619 #endif // HAS_ARGBTOUV411ROW_NEON 1619 #endif // HAS_ARGBTOUV411ROW_NEON
1620 1620
1621 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 1621 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
1622 #define RGBTOUV(QB, QG, QR) \ 1622 #define RGBTOUV(QB, QG, QR) \
1623 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ 1623 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
1624 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ 1624 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
1625 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ 1625 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
1626 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ 1626 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
1627 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ 1627 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
1628 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ 1628 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
1629 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ 1629 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
1630 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ 1630 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
1631 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ 1631 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
1632 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ 1632 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
1633 1633
1634 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. 1634 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1635 // TODO(fbarchard): consider ptrdiff_t for all strides. 1635 // TODO(fbarchard): consider ptrdiff_t for all strides.
1636 1636
1637 #ifdef HAS_ARGBTOUVROW_NEON 1637 #ifdef HAS_ARGBTOUVROW_NEON
1638 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, 1638 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
1639 uint8* dst_u, uint8* dst_v, int pix) { 1639 uint8* dst_u, uint8* dst_v, int width) {
1640 const uint8* src_argb_1 = src_argb + src_stride_argb; 1640 const uint8* src_argb_1 = src_argb + src_stride_argb;
1641 asm volatile ( 1641 asm volatile (
1642 RGBTOUV_SETUP_REG 1642 RGBTOUV_SETUP_REG
1643 "1: \n" 1643 "1: \n"
1644 MEMACCESS(0) 1644 MEMACCESS(0)
1645 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1645 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1646 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 1646 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1647 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1647 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1648 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 1648 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1649 1649
(...skipping 11 matching lines...) Expand all
1661 RGBTOUV(v0.8h, v1.8h, v2.8h) 1661 RGBTOUV(v0.8h, v1.8h, v2.8h)
1662 MEMACCESS(2) 1662 MEMACCESS(2)
1663 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1663 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1664 MEMACCESS(3) 1664 MEMACCESS(3)
1665 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1665 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1666 "b.gt 1b \n" 1666 "b.gt 1b \n"
1667 : "+r"(src_argb), // %0 1667 : "+r"(src_argb), // %0
1668 "+r"(src_argb_1), // %1 1668 "+r"(src_argb_1), // %1
1669 "+r"(dst_u), // %2 1669 "+r"(dst_u), // %2
1670 "+r"(dst_v), // %3 1670 "+r"(dst_v), // %3
1671 "+r"(pix) // %4 1671 "+r"(width) // %4
1672 : 1672 :
1673 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1673 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1674 "v20", "v21", "v22", "v23", "v24", "v25" 1674 "v20", "v21", "v22", "v23", "v24", "v25"
1675 ); 1675 );
1676 } 1676 }
1677 #endif // HAS_ARGBTOUVROW_NEON 1677 #endif // HAS_ARGBTOUVROW_NEON
1678 1678
1679 // TODO(fbarchard): Subsample match C code. 1679 // TODO(fbarchard): Subsample match C code.
1680 #ifdef HAS_ARGBTOUVJROW_NEON 1680 #ifdef HAS_ARGBTOUVJROW_NEON
1681 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, 1681 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
1682 uint8* dst_u, uint8* dst_v, int pix) { 1682 uint8* dst_u, uint8* dst_v, int width) {
1683 const uint8* src_argb_1 = src_argb + src_stride_argb; 1683 const uint8* src_argb_1 = src_argb + src_stride_argb;
1684 asm volatile ( 1684 asm volatile (
1685 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 1685 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
1686 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 1686 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
1687 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 1687 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
1688 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 1688 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
1689 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 1689 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
1690 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) 1690 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
1691 "1: \n" 1691 "1: \n"
1692 MEMACCESS(0) 1692 MEMACCESS(0)
(...skipping 15 matching lines...) Expand all
1708 RGBTOUV(v0.8h, v1.8h, v2.8h) 1708 RGBTOUV(v0.8h, v1.8h, v2.8h)
1709 MEMACCESS(2) 1709 MEMACCESS(2)
1710 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1710 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1711 MEMACCESS(3) 1711 MEMACCESS(3)
1712 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1712 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1713 "b.gt 1b \n" 1713 "b.gt 1b \n"
1714 : "+r"(src_argb), // %0 1714 : "+r"(src_argb), // %0
1715 "+r"(src_argb_1), // %1 1715 "+r"(src_argb_1), // %1
1716 "+r"(dst_u), // %2 1716 "+r"(dst_u), // %2
1717 "+r"(dst_v), // %3 1717 "+r"(dst_v), // %3
1718 "+r"(pix) // %4 1718 "+r"(width) // %4
1719 : 1719 :
1720 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1720 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1721 "v20", "v21", "v22", "v23", "v24", "v25" 1721 "v20", "v21", "v22", "v23", "v24", "v25"
1722 ); 1722 );
1723 } 1723 }
1724 #endif // HAS_ARGBTOUVJROW_NEON 1724 #endif // HAS_ARGBTOUVJROW_NEON
1725 1725
1726 #ifdef HAS_BGRATOUVROW_NEON 1726 #ifdef HAS_BGRATOUVROW_NEON
1727 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, 1727 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
1728 uint8* dst_u, uint8* dst_v, int pix) { 1728 uint8* dst_u, uint8* dst_v, int width) {
1729 const uint8* src_bgra_1 = src_bgra + src_stride_bgra; 1729 const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
1730 asm volatile ( 1730 asm volatile (
1731 RGBTOUV_SETUP_REG 1731 RGBTOUV_SETUP_REG
1732 "1: \n" 1732 "1: \n"
1733 MEMACCESS(0) 1733 MEMACCESS(0)
1734 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1734 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1735 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. 1735 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
1736 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. 1736 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
1737 "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. 1737 "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
1738 MEMACCESS(1) 1738 MEMACCESS(1)
(...skipping 10 matching lines...) Expand all
1749 RGBTOUV(v0.8h, v1.8h, v2.8h) 1749 RGBTOUV(v0.8h, v1.8h, v2.8h)
1750 MEMACCESS(2) 1750 MEMACCESS(2)
1751 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1751 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1752 MEMACCESS(3) 1752 MEMACCESS(3)
1753 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1753 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1754 "b.gt 1b \n" 1754 "b.gt 1b \n"
1755 : "+r"(src_bgra), // %0 1755 : "+r"(src_bgra), // %0
1756 "+r"(src_bgra_1), // %1 1756 "+r"(src_bgra_1), // %1
1757 "+r"(dst_u), // %2 1757 "+r"(dst_u), // %2
1758 "+r"(dst_v), // %3 1758 "+r"(dst_v), // %3
1759 "+r"(pix) // %4 1759 "+r"(width) // %4
1760 : 1760 :
1761 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1761 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1762 "v20", "v21", "v22", "v23", "v24", "v25" 1762 "v20", "v21", "v22", "v23", "v24", "v25"
1763 ); 1763 );
1764 } 1764 }
1765 #endif // HAS_BGRATOUVROW_NEON 1765 #endif // HAS_BGRATOUVROW_NEON
1766 1766
1767 #ifdef HAS_ABGRTOUVROW_NEON 1767 #ifdef HAS_ABGRTOUVROW_NEON
1768 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, 1768 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
1769 uint8* dst_u, uint8* dst_v, int pix) { 1769 uint8* dst_u, uint8* dst_v, int width) {
1770 const uint8* src_abgr_1 = src_abgr + src_stride_abgr; 1770 const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
1771 asm volatile ( 1771 asm volatile (
1772 RGBTOUV_SETUP_REG 1772 RGBTOUV_SETUP_REG
1773 "1: \n" 1773 "1: \n"
1774 MEMACCESS(0) 1774 MEMACCESS(0)
1775 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1775 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1776 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. 1776 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
1777 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1777 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1778 "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. 1778 "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
1779 MEMACCESS(1) 1779 MEMACCESS(1)
(...skipping 10 matching lines...) Expand all
1790 RGBTOUV(v0.8h, v2.8h, v1.8h) 1790 RGBTOUV(v0.8h, v2.8h, v1.8h)
1791 MEMACCESS(2) 1791 MEMACCESS(2)
1792 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1792 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1793 MEMACCESS(3) 1793 MEMACCESS(3)
1794 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1794 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1795 "b.gt 1b \n" 1795 "b.gt 1b \n"
1796 : "+r"(src_abgr), // %0 1796 : "+r"(src_abgr), // %0
1797 "+r"(src_abgr_1), // %1 1797 "+r"(src_abgr_1), // %1
1798 "+r"(dst_u), // %2 1798 "+r"(dst_u), // %2
1799 "+r"(dst_v), // %3 1799 "+r"(dst_v), // %3
1800 "+r"(pix) // %4 1800 "+r"(width) // %4
1801 : 1801 :
1802 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1802 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1803 "v20", "v21", "v22", "v23", "v24", "v25" 1803 "v20", "v21", "v22", "v23", "v24", "v25"
1804 ); 1804 );
1805 } 1805 }
1806 #endif // HAS_ABGRTOUVROW_NEON 1806 #endif // HAS_ABGRTOUVROW_NEON
1807 1807
1808 #ifdef HAS_RGBATOUVROW_NEON 1808 #ifdef HAS_RGBATOUVROW_NEON
1809 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, 1809 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
1810 uint8* dst_u, uint8* dst_v, int pix) { 1810 uint8* dst_u, uint8* dst_v, int width) {
1811 const uint8* src_rgba_1 = src_rgba + src_stride_rgba; 1811 const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
1812 asm volatile ( 1812 asm volatile (
1813 RGBTOUV_SETUP_REG 1813 RGBTOUV_SETUP_REG
1814 "1: \n" 1814 "1: \n"
1815 MEMACCESS(0) 1815 MEMACCESS(0)
1816 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1816 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1817 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. 1817 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
1818 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. 1818 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
1819 "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. 1819 "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
1820 MEMACCESS(1) 1820 MEMACCESS(1)
(...skipping 10 matching lines...) Expand all
1831 RGBTOUV(v0.8h, v1.8h, v2.8h) 1831 RGBTOUV(v0.8h, v1.8h, v2.8h)
1832 MEMACCESS(2) 1832 MEMACCESS(2)
1833 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1833 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1834 MEMACCESS(3) 1834 MEMACCESS(3)
1835 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1835 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1836 "b.gt 1b \n" 1836 "b.gt 1b \n"
1837 : "+r"(src_rgba), // %0 1837 : "+r"(src_rgba), // %0
1838 "+r"(src_rgba_1), // %1 1838 "+r"(src_rgba_1), // %1
1839 "+r"(dst_u), // %2 1839 "+r"(dst_u), // %2
1840 "+r"(dst_v), // %3 1840 "+r"(dst_v), // %3
1841 "+r"(pix) // %4 1841 "+r"(width) // %4
1842 : 1842 :
1843 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1843 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1844 "v20", "v21", "v22", "v23", "v24", "v25" 1844 "v20", "v21", "v22", "v23", "v24", "v25"
1845 ); 1845 );
1846 } 1846 }
1847 #endif // HAS_RGBATOUVROW_NEON 1847 #endif // HAS_RGBATOUVROW_NEON
1848 1848
1849 #ifdef HAS_RGB24TOUVROW_NEON 1849 #ifdef HAS_RGB24TOUVROW_NEON
1850 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, 1850 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
1851 uint8* dst_u, uint8* dst_v, int pix) { 1851 uint8* dst_u, uint8* dst_v, int width) {
1852 const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; 1852 const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
1853 asm volatile ( 1853 asm volatile (
1854 RGBTOUV_SETUP_REG 1854 RGBTOUV_SETUP_REG
1855 "1: \n" 1855 "1: \n"
1856 MEMACCESS(0) 1856 MEMACCESS(0)
1857 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. 1857 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
1858 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 1858 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1859 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1859 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1860 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 1860 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1861 MEMACCESS(1) 1861 MEMACCESS(1)
(...skipping 10 matching lines...) Expand all
1872 RGBTOUV(v0.8h, v1.8h, v2.8h) 1872 RGBTOUV(v0.8h, v1.8h, v2.8h)
1873 MEMACCESS(2) 1873 MEMACCESS(2)
1874 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1874 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1875 MEMACCESS(3) 1875 MEMACCESS(3)
1876 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1876 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1877 "b.gt 1b \n" 1877 "b.gt 1b \n"
1878 : "+r"(src_rgb24), // %0 1878 : "+r"(src_rgb24), // %0
1879 "+r"(src_rgb24_1), // %1 1879 "+r"(src_rgb24_1), // %1
1880 "+r"(dst_u), // %2 1880 "+r"(dst_u), // %2
1881 "+r"(dst_v), // %3 1881 "+r"(dst_v), // %3
1882 "+r"(pix) // %4 1882 "+r"(width) // %4
1883 : 1883 :
1884 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1884 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1885 "v20", "v21", "v22", "v23", "v24", "v25" 1885 "v20", "v21", "v22", "v23", "v24", "v25"
1886 ); 1886 );
1887 } 1887 }
1888 #endif // HAS_RGB24TOUVROW_NEON 1888 #endif // HAS_RGB24TOUVROW_NEON
1889 1889
1890 #ifdef HAS_RAWTOUVROW_NEON 1890 #ifdef HAS_RAWTOUVROW_NEON
1891 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, 1891 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
1892 uint8* dst_u, uint8* dst_v, int pix) { 1892 uint8* dst_u, uint8* dst_v, int width) {
1893 const uint8* src_raw_1 = src_raw + src_stride_raw; 1893 const uint8* src_raw_1 = src_raw + src_stride_raw;
1894 asm volatile ( 1894 asm volatile (
1895 RGBTOUV_SETUP_REG 1895 RGBTOUV_SETUP_REG
1896 "1: \n" 1896 "1: \n"
1897 MEMACCESS(0) 1897 MEMACCESS(0)
1898 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels. 1898 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels.
1899 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. 1899 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
1900 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1900 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1901 "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. 1901 "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
1902 MEMACCESS(1) 1902 MEMACCESS(1)
(...skipping 10 matching lines...) Expand all
1913 RGBTOUV(v2.8h, v1.8h, v0.8h) 1913 RGBTOUV(v2.8h, v1.8h, v0.8h)
1914 MEMACCESS(2) 1914 MEMACCESS(2)
1915 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1915 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1916 MEMACCESS(3) 1916 MEMACCESS(3)
1917 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1917 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1918 "b.gt 1b \n" 1918 "b.gt 1b \n"
1919 : "+r"(src_raw), // %0 1919 : "+r"(src_raw), // %0
1920 "+r"(src_raw_1), // %1 1920 "+r"(src_raw_1), // %1
1921 "+r"(dst_u), // %2 1921 "+r"(dst_u), // %2
1922 "+r"(dst_v), // %3 1922 "+r"(dst_v), // %3
1923 "+r"(pix) // %4 1923 "+r"(width) // %4
1924 : 1924 :
1925 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1925 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1926 "v20", "v21", "v22", "v23", "v24", "v25" 1926 "v20", "v21", "v22", "v23", "v24", "v25"
1927 ); 1927 );
1928 } 1928 }
1929 #endif // HAS_RAWTOUVROW_NEON 1929 #endif // HAS_RAWTOUVROW_NEON
1930 1930
1931 // 16x2 pixels -> 8x1. pix is number of RGB565 pixels. e.g. 16. 1931 // 16x2 pixels -> 8x1. width is number of RGB565 pixels. e.g. 16.
1932 #ifdef HAS_RGB565TOUVROW_NEON 1932 #ifdef HAS_RGB565TOUVROW_NEON
1933 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, 1933 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
1934 uint8* dst_u, uint8* dst_v, int pix) { 1934 uint8* dst_u, uint8* dst_v, int width) {
1935 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; 1935 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
1936 asm volatile ( 1936 asm volatile (
1937 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 1937 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2
1938 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 1938 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
1939 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 1939 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
1940 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 1940 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
1941 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 1941 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
1942 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) 1942 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
1943 "1: \n" 1943 "1: \n"
1944 MEMACCESS(0) 1944 MEMACCESS(0)
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
1988 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V 1988 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
1989 MEMACCESS(2) 1989 MEMACCESS(2)
1990 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1990 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1991 MEMACCESS(3) 1991 MEMACCESS(3)
1992 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1992 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1993 "b.gt 1b \n" 1993 "b.gt 1b \n"
1994 : "+r"(src_rgb565), // %0 1994 : "+r"(src_rgb565), // %0
1995 "+r"(src_rgb565_1), // %1 1995 "+r"(src_rgb565_1), // %1
1996 "+r"(dst_u), // %2 1996 "+r"(dst_u), // %2
1997 "+r"(dst_v), // %3 1997 "+r"(dst_v), // %3
1998 "+r"(pix) // %4 1998 "+r"(width) // %4
1999 : 1999 :
2000 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 2000 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2001 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", 2001 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
2002 "v25", "v26", "v27" 2002 "v25", "v26", "v27"
2003 ); 2003 );
2004 } 2004 }
2005 #endif // HAS_RGB565TOUVROW_NEON 2005 #endif // HAS_RGB565TOUVROW_NEON
2006 2006
2007 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 2007 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
2008 #ifdef HAS_ARGB1555TOUVROW_NEON 2008 #ifdef HAS_ARGB1555TOUVROW_NEON
2009 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, 2009 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
2010 uint8* dst_u, uint8* dst_v, int pix) { 2010 uint8* dst_u, uint8* dst_v, int width) {
2011 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; 2011 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
2012 asm volatile ( 2012 asm volatile (
2013 RGBTOUV_SETUP_REG 2013 RGBTOUV_SETUP_REG
2014 "1: \n" 2014 "1: \n"
2015 MEMACCESS(0) 2015 MEMACCESS(0)
2016 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 2016 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
2017 RGB555TOARGB 2017 RGB555TOARGB
2018 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 2018 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2019 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 2019 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2020 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 2020 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
2059 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V 2059 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
2060 MEMACCESS(2) 2060 MEMACCESS(2)
2061 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 2061 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2062 MEMACCESS(3) 2062 MEMACCESS(3)
2063 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 2063 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2064 "b.gt 1b \n" 2064 "b.gt 1b \n"
2065 : "+r"(src_argb1555), // %0 2065 : "+r"(src_argb1555), // %0
2066 "+r"(src_argb1555_1), // %1 2066 "+r"(src_argb1555_1), // %1
2067 "+r"(dst_u), // %2 2067 "+r"(dst_u), // %2
2068 "+r"(dst_v), // %3 2068 "+r"(dst_v), // %3
2069 "+r"(pix) // %4 2069 "+r"(width) // %4
2070 : 2070 :
2071 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", 2071 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
2072 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", 2072 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
2073 "v26", "v27", "v28" 2073 "v26", "v27", "v28"
2074 ); 2074 );
2075 } 2075 }
2076 #endif // HAS_ARGB1555TOUVROW_NEON 2076 #endif // HAS_ARGB1555TOUVROW_NEON
2077 2077
2078 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 2078 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
2079 #ifdef HAS_ARGB4444TOUVROW_NEON 2079 #ifdef HAS_ARGB4444TOUVROW_NEON
2080 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, 2080 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
2081 uint8* dst_u, uint8* dst_v, int pix) { 2081 uint8* dst_u, uint8* dst_v, int width) {
2082 const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; 2082 const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
2083 asm volatile ( 2083 asm volatile (
2084 RGBTOUV_SETUP_REG 2084 RGBTOUV_SETUP_REG
2085 "1: \n" 2085 "1: \n"
2086 MEMACCESS(0) 2086 MEMACCESS(0)
2087 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 2087 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
2088 ARGB4444TOARGB 2088 ARGB4444TOARGB
2089 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 2089 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2090 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 2090 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2091 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 2091 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
2130 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V 2130 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
2131 MEMACCESS(2) 2131 MEMACCESS(2)
2132 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 2132 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2133 MEMACCESS(3) 2133 MEMACCESS(3)
2134 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 2134 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2135 "b.gt 1b \n" 2135 "b.gt 1b \n"
2136 : "+r"(src_argb4444), // %0 2136 : "+r"(src_argb4444), // %0
2137 "+r"(src_argb4444_1), // %1 2137 "+r"(src_argb4444_1), // %1
2138 "+r"(dst_u), // %2 2138 "+r"(dst_u), // %2
2139 "+r"(dst_v), // %3 2139 "+r"(dst_v), // %3
2140 "+r"(pix) // %4 2140 "+r"(width) // %4
2141 : 2141 :
2142 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", 2142 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
2143 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", 2143 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
2144 "v26", "v27", "v28" 2144 "v26", "v27", "v28"
2145 2145
2146 ); 2146 );
2147 } 2147 }
2148 #endif // HAS_ARGB4444TOUVROW_NEON 2148 #endif // HAS_ARGB4444TOUVROW_NEON
2149 2149
2150 #ifdef HAS_RGB565TOYROW_NEON 2150 #ifdef HAS_RGB565TOYROW_NEON
2151 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { 2151 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
2152 asm volatile ( 2152 asm volatile (
2153 "movi v24.8b, #13 \n" // B * 0.1016 coefficient 2153 "movi v24.8b, #13 \n" // B * 0.1016 coefficient
2154 "movi v25.8b, #65 \n" // G * 0.5078 coefficient 2154 "movi v25.8b, #65 \n" // G * 0.5078 coefficient
2155 "movi v26.8b, #33 \n" // R * 0.2578 coefficient 2155 "movi v26.8b, #33 \n" // R * 0.2578 coefficient
2156 "movi v27.8b, #16 \n" // Add 16 constant 2156 "movi v27.8b, #16 \n" // Add 16 constant
2157 "1: \n" 2157 "1: \n"
2158 MEMACCESS(0) 2158 MEMACCESS(0)
2159 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 2159 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
2160 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2160 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2161 RGB565TOARGB 2161 RGB565TOARGB
2162 "umull v3.8h, v0.8b, v24.8b \n" // B 2162 "umull v3.8h, v0.8b, v24.8b \n" // B
2163 "umlal v3.8h, v1.8b, v25.8b \n" // G 2163 "umlal v3.8h, v1.8b, v25.8b \n" // G
2164 "umlal v3.8h, v2.8b, v26.8b \n" // R 2164 "umlal v3.8h, v2.8b, v26.8b \n" // R
2165 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 2165 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
2166 "uqadd v0.8b, v0.8b, v27.8b \n" 2166 "uqadd v0.8b, v0.8b, v27.8b \n"
2167 MEMACCESS(1) 2167 MEMACCESS(1)
2168 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2168 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2169 "b.gt 1b \n" 2169 "b.gt 1b \n"
2170 : "+r"(src_rgb565), // %0 2170 : "+r"(src_rgb565), // %0
2171 "+r"(dst_y), // %1 2171 "+r"(dst_y), // %1
2172 "+r"(pix) // %2 2172 "+r"(width) // %2
2173 : 2173 :
2174 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", 2174 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
2175 "v24", "v25", "v26", "v27" 2175 "v24", "v25", "v26", "v27"
2176 ); 2176 );
2177 } 2177 }
2178 #endif // HAS_RGB565TOYROW_NEON 2178 #endif // HAS_RGB565TOYROW_NEON
2179 2179
2180 #ifdef HAS_ARGB1555TOYROW_NEON 2180 #ifdef HAS_ARGB1555TOYROW_NEON
2181 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { 2181 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
2182 asm volatile ( 2182 asm volatile (
2183 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 2183 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
2184 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2184 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2185 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 2185 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
2186 "movi v7.8b, #16 \n" // Add 16 constant 2186 "movi v7.8b, #16 \n" // Add 16 constant
2187 "1: \n" 2187 "1: \n"
2188 MEMACCESS(0) 2188 MEMACCESS(0)
2189 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 2189 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
2190 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2190 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2191 ARGB1555TOARGB 2191 ARGB1555TOARGB
2192 "umull v3.8h, v0.8b, v4.8b \n" // B 2192 "umull v3.8h, v0.8b, v4.8b \n" // B
2193 "umlal v3.8h, v1.8b, v5.8b \n" // G 2193 "umlal v3.8h, v1.8b, v5.8b \n" // G
2194 "umlal v3.8h, v2.8b, v6.8b \n" // R 2194 "umlal v3.8h, v2.8b, v6.8b \n" // R
2195 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 2195 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
2196 "uqadd v0.8b, v0.8b, v7.8b \n" 2196 "uqadd v0.8b, v0.8b, v7.8b \n"
2197 MEMACCESS(1) 2197 MEMACCESS(1)
2198 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2198 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2199 "b.gt 1b \n" 2199 "b.gt 1b \n"
2200 : "+r"(src_argb1555), // %0 2200 : "+r"(src_argb1555), // %0
2201 "+r"(dst_y), // %1 2201 "+r"(dst_y), // %1
2202 "+r"(pix) // %2 2202 "+r"(width) // %2
2203 : 2203 :
2204 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 2204 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2205 ); 2205 );
2206 } 2206 }
2207 #endif // HAS_ARGB1555TOYROW_NEON 2207 #endif // HAS_ARGB1555TOYROW_NEON
2208 2208
2209 #ifdef HAS_ARGB4444TOYROW_NEON 2209 #ifdef HAS_ARGB4444TOYROW_NEON
2210 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { 2210 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
2211 asm volatile ( 2211 asm volatile (
2212 "movi v24.8b, #13 \n" // B * 0.1016 coefficient 2212 "movi v24.8b, #13 \n" // B * 0.1016 coefficient
2213 "movi v25.8b, #65 \n" // G * 0.5078 coefficient 2213 "movi v25.8b, #65 \n" // G * 0.5078 coefficient
2214 "movi v26.8b, #33 \n" // R * 0.2578 coefficient 2214 "movi v26.8b, #33 \n" // R * 0.2578 coefficient
2215 "movi v27.8b, #16 \n" // Add 16 constant 2215 "movi v27.8b, #16 \n" // Add 16 constant
2216 "1: \n" 2216 "1: \n"
2217 MEMACCESS(0) 2217 MEMACCESS(0)
2218 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 2218 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
2219 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2219 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2220 ARGB4444TOARGB 2220 ARGB4444TOARGB
2221 "umull v3.8h, v0.8b, v24.8b \n" // B 2221 "umull v3.8h, v0.8b, v24.8b \n" // B
2222 "umlal v3.8h, v1.8b, v25.8b \n" // G 2222 "umlal v3.8h, v1.8b, v25.8b \n" // G
2223 "umlal v3.8h, v2.8b, v26.8b \n" // R 2223 "umlal v3.8h, v2.8b, v26.8b \n" // R
2224 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 2224 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
2225 "uqadd v0.8b, v0.8b, v27.8b \n" 2225 "uqadd v0.8b, v0.8b, v27.8b \n"
2226 MEMACCESS(1) 2226 MEMACCESS(1)
2227 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2227 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2228 "b.gt 1b \n" 2228 "b.gt 1b \n"
2229 : "+r"(src_argb4444), // %0 2229 : "+r"(src_argb4444), // %0
2230 "+r"(dst_y), // %1 2230 "+r"(dst_y), // %1
2231 "+r"(pix) // %2 2231 "+r"(width) // %2
2232 : 2232 :
2233 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" 2233 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
2234 ); 2234 );
2235 } 2235 }
2236 #endif // HAS_ARGB4444TOYROW_NEON 2236 #endif // HAS_ARGB4444TOYROW_NEON
2237 2237
2238 #ifdef HAS_BGRATOYROW_NEON 2238 #ifdef HAS_BGRATOYROW_NEON
2239 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { 2239 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
2240 asm volatile ( 2240 asm volatile (
2241 "movi v4.8b, #33 \n" // R * 0.2578 coefficient 2241 "movi v4.8b, #33 \n" // R * 0.2578 coefficient
2242 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2242 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2243 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 2243 "movi v6.8b, #13 \n" // B * 0.1016 coefficient
2244 "movi v7.8b, #16 \n" // Add 16 constant 2244 "movi v7.8b, #16 \n" // Add 16 constant
2245 "1: \n" 2245 "1: \n"
2246 MEMACCESS(0) 2246 MEMACCESS(0)
2247 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 2247 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
2248 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2248 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2249 "umull v16.8h, v1.8b, v4.8b \n" // R 2249 "umull v16.8h, v1.8b, v4.8b \n" // R
2250 "umlal v16.8h, v2.8b, v5.8b \n" // G 2250 "umlal v16.8h, v2.8b, v5.8b \n" // G
2251 "umlal v16.8h, v3.8b, v6.8b \n" // B 2251 "umlal v16.8h, v3.8b, v6.8b \n" // B
2252 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2252 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2253 "uqadd v0.8b, v0.8b, v7.8b \n" 2253 "uqadd v0.8b, v0.8b, v7.8b \n"
2254 MEMACCESS(1) 2254 MEMACCESS(1)
2255 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2255 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2256 "b.gt 1b \n" 2256 "b.gt 1b \n"
2257 : "+r"(src_bgra), // %0 2257 : "+r"(src_bgra), // %0
2258 "+r"(dst_y), // %1 2258 "+r"(dst_y), // %1
2259 "+r"(pix) // %2 2259 "+r"(width) // %2
2260 : 2260 :
2261 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2261 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2262 ); 2262 );
2263 } 2263 }
2264 #endif // HAS_BGRATOYROW_NEON 2264 #endif // HAS_BGRATOYROW_NEON
2265 2265
2266 #ifdef HAS_ABGRTOYROW_NEON 2266 #ifdef HAS_ABGRTOYROW_NEON
2267 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { 2267 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
2268 asm volatile ( 2268 asm volatile (
2269 "movi v4.8b, #33 \n" // R * 0.2578 coefficient 2269 "movi v4.8b, #33 \n" // R * 0.2578 coefficient
2270 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2270 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2271 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 2271 "movi v6.8b, #13 \n" // B * 0.1016 coefficient
2272 "movi v7.8b, #16 \n" // Add 16 constant 2272 "movi v7.8b, #16 \n" // Add 16 constant
2273 "1: \n" 2273 "1: \n"
2274 MEMACCESS(0) 2274 MEMACCESS(0)
2275 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 2275 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
2276 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2276 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2277 "umull v16.8h, v0.8b, v4.8b \n" // R 2277 "umull v16.8h, v0.8b, v4.8b \n" // R
2278 "umlal v16.8h, v1.8b, v5.8b \n" // G 2278 "umlal v16.8h, v1.8b, v5.8b \n" // G
2279 "umlal v16.8h, v2.8b, v6.8b \n" // B 2279 "umlal v16.8h, v2.8b, v6.8b \n" // B
2280 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2280 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2281 "uqadd v0.8b, v0.8b, v7.8b \n" 2281 "uqadd v0.8b, v0.8b, v7.8b \n"
2282 MEMACCESS(1) 2282 MEMACCESS(1)
2283 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2283 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2284 "b.gt 1b \n" 2284 "b.gt 1b \n"
2285 : "+r"(src_abgr), // %0 2285 : "+r"(src_abgr), // %0
2286 "+r"(dst_y), // %1 2286 "+r"(dst_y), // %1
2287 "+r"(pix) // %2 2287 "+r"(width) // %2
2288 : 2288 :
2289 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2289 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2290 ); 2290 );
2291 } 2291 }
2292 #endif // HAS_ABGRTOYROW_NEON 2292 #endif // HAS_ABGRTOYROW_NEON
2293 2293
2294 #ifdef HAS_RGBATOYROW_NEON 2294 #ifdef HAS_RGBATOYROW_NEON
2295 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { 2295 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
2296 asm volatile ( 2296 asm volatile (
2297 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 2297 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
2298 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2298 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2299 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 2299 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
2300 "movi v7.8b, #16 \n" // Add 16 constant 2300 "movi v7.8b, #16 \n" // Add 16 constant
2301 "1: \n" 2301 "1: \n"
2302 MEMACCESS(0) 2302 MEMACCESS(0)
2303 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 2303 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
2304 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2304 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2305 "umull v16.8h, v1.8b, v4.8b \n" // B 2305 "umull v16.8h, v1.8b, v4.8b \n" // B
2306 "umlal v16.8h, v2.8b, v5.8b \n" // G 2306 "umlal v16.8h, v2.8b, v5.8b \n" // G
2307 "umlal v16.8h, v3.8b, v6.8b \n" // R 2307 "umlal v16.8h, v3.8b, v6.8b \n" // R
2308 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2308 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2309 "uqadd v0.8b, v0.8b, v7.8b \n" 2309 "uqadd v0.8b, v0.8b, v7.8b \n"
2310 MEMACCESS(1) 2310 MEMACCESS(1)
2311 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2311 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2312 "b.gt 1b \n" 2312 "b.gt 1b \n"
2313 : "+r"(src_rgba), // %0 2313 : "+r"(src_rgba), // %0
2314 "+r"(dst_y), // %1 2314 "+r"(dst_y), // %1
2315 "+r"(pix) // %2 2315 "+r"(width) // %2
2316 : 2316 :
2317 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2317 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2318 ); 2318 );
2319 } 2319 }
2320 #endif // HAS_RGBATOYROW_NEON 2320 #endif // HAS_RGBATOYROW_NEON
2321 2321
2322 #ifdef HAS_RGB24TOYROW_NEON 2322 #ifdef HAS_RGB24TOYROW_NEON
2323 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { 2323 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
2324 asm volatile ( 2324 asm volatile (
2325 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 2325 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
2326 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2326 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2327 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 2327 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
2328 "movi v7.8b, #16 \n" // Add 16 constant 2328 "movi v7.8b, #16 \n" // Add 16 constant
2329 "1: \n" 2329 "1: \n"
2330 MEMACCESS(0) 2330 MEMACCESS(0)
2331 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. 2331 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
2332 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2332 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2333 "umull v16.8h, v0.8b, v4.8b \n" // B 2333 "umull v16.8h, v0.8b, v4.8b \n" // B
2334 "umlal v16.8h, v1.8b, v5.8b \n" // G 2334 "umlal v16.8h, v1.8b, v5.8b \n" // G
2335 "umlal v16.8h, v2.8b, v6.8b \n" // R 2335 "umlal v16.8h, v2.8b, v6.8b \n" // R
2336 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2336 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2337 "uqadd v0.8b, v0.8b, v7.8b \n" 2337 "uqadd v0.8b, v0.8b, v7.8b \n"
2338 MEMACCESS(1) 2338 MEMACCESS(1)
2339 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2339 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2340 "b.gt 1b \n" 2340 "b.gt 1b \n"
2341 : "+r"(src_rgb24), // %0 2341 : "+r"(src_rgb24), // %0
2342 "+r"(dst_y), // %1 2342 "+r"(dst_y), // %1
2343 "+r"(pix) // %2 2343 "+r"(width) // %2
2344 : 2344 :
2345 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2345 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2346 ); 2346 );
2347 } 2347 }
2348 #endif // HAS_RGB24TOYROW_NEON 2348 #endif // HAS_RGB24TOYROW_NEON
2349 2349
2350 #ifdef HAS_RAWTOYROW_NEON 2350 #ifdef HAS_RAWTOYROW_NEON
2351 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { 2351 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
2352 asm volatile ( 2352 asm volatile (
2353 "movi v4.8b, #33 \n" // R * 0.2578 coefficient 2353 "movi v4.8b, #33 \n" // R * 0.2578 coefficient
2354 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2354 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2355 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 2355 "movi v6.8b, #13 \n" // B * 0.1016 coefficient
2356 "movi v7.8b, #16 \n" // Add 16 constant 2356 "movi v7.8b, #16 \n" // Add 16 constant
2357 "1: \n" 2357 "1: \n"
2358 MEMACCESS(0) 2358 MEMACCESS(0)
2359 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. 2359 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
2360 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2360 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2361 "umull v16.8h, v0.8b, v4.8b \n" // R 2361 "umull v16.8h, v0.8b, v4.8b \n" // R
2362 "umlal v16.8h, v1.8b, v5.8b \n" // G 2362 "umlal v16.8h, v1.8b, v5.8b \n" // G
2363 "umlal v16.8h, v2.8b, v6.8b \n" // B 2363 "umlal v16.8h, v2.8b, v6.8b \n" // B
2364 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2364 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2365 "uqadd v0.8b, v0.8b, v7.8b \n" 2365 "uqadd v0.8b, v0.8b, v7.8b \n"
2366 MEMACCESS(1) 2366 MEMACCESS(1)
2367 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2367 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2368 "b.gt 1b \n" 2368 "b.gt 1b \n"
2369 : "+r"(src_raw), // %0 2369 : "+r"(src_raw), // %0
2370 "+r"(dst_y), // %1 2370 "+r"(dst_y), // %1
2371 "+r"(pix) // %2 2371 "+r"(width) // %2
2372 : 2372 :
2373 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2373 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2374 ); 2374 );
2375 } 2375 }
2376 #endif // HAS_RAWTOYROW_NEON 2376 #endif // HAS_RAWTOYROW_NEON
2377 2377
2378 // Bilinear filter 16x2 -> 16x1 2378 // Bilinear filter 16x2 -> 16x1
2379 #ifdef HAS_INTERPOLATEROW_NEON 2379 #ifdef HAS_INTERPOLATEROW_NEON
2380 void InterpolateRow_NEON(uint8* dst_ptr, 2380 void InterpolateRow_NEON(uint8* dst_ptr,
2381 const uint8* src_ptr, ptrdiff_t src_stride, 2381 const uint8* src_ptr, ptrdiff_t src_stride,
(...skipping 690 matching lines...) Expand 10 before | Expand all | Expand 10 after
3072 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 3072 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
3073 ); 3073 );
3074 } 3074 }
3075 #endif // HAS_SOBELYROW_NEON 3075 #endif // HAS_SOBELYROW_NEON
3076 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 3076 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
3077 3077
3078 #ifdef __cplusplus 3078 #ifdef __cplusplus
3079 } // extern "C" 3079 } // extern "C"
3080 } // namespace libyuv 3080 } // namespace libyuv
3081 #endif 3081 #endif
OLDNEW
« no previous file with comments | « source/row_neon.cc ('k') | source/row_win.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698