| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 884 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 895 : "+r"(src), // %0 | 895 : "+r"(src), // %0 |
| 896 "+r"(dst), // %1 | 896 "+r"(dst), // %1 |
| 897 "+r"(width64) // %2 | 897 "+r"(width64) // %2 |
| 898 : "r"((ptrdiff_t)-16) // %3 | 898 : "r"((ptrdiff_t)-16) // %3 |
| 899 : "cc", "memory", "v0" | 899 : "cc", "memory", "v0" |
| 900 ); | 900 ); |
| 901 } | 901 } |
| 902 #endif // HAS_ARGBMIRRORROW_NEON | 902 #endif // HAS_ARGBMIRRORROW_NEON |
| 903 | 903 |
| 904 #ifdef HAS_RGB24TOARGBROW_NEON | 904 #ifdef HAS_RGB24TOARGBROW_NEON |
| 905 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { | 905 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { |
| 906 asm volatile ( | 906 asm volatile ( |
| 907 "movi v4.8b, #255 \n" // Alpha | 907 "movi v4.8b, #255 \n" // Alpha |
| 908 "1: \n" | 908 "1: \n" |
| 909 MEMACCESS(0) | 909 MEMACCESS(0) |
| 910 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. | 910 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. |
| 911 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 911 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 912 MEMACCESS(1) | 912 MEMACCESS(1) |
| 913 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels | 913 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels |
| 914 "b.gt 1b \n" | 914 "b.gt 1b \n" |
| 915 : "+r"(src_rgb24), // %0 | 915 : "+r"(src_rgb24), // %0 |
| 916 "+r"(dst_argb), // %1 | 916 "+r"(dst_argb), // %1 |
| 917 "+r"(pix) // %2 | 917 "+r"(width) // %2 |
| 918 : | 918 : |
| 919 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List | 919 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List |
| 920 ); | 920 ); |
| 921 } | 921 } |
| 922 #endif // HAS_RGB24TOARGBROW_NEON | 922 #endif // HAS_RGB24TOARGBROW_NEON |
| 923 | 923 |
| 924 #ifdef HAS_RAWTOARGBROW_NEON | 924 #ifdef HAS_RAWTOARGBROW_NEON |
| 925 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { | 925 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { |
| 926 asm volatile ( | 926 asm volatile ( |
| 927 "movi v5.8b, #255 \n" // Alpha | 927 "movi v5.8b, #255 \n" // Alpha |
| 928 "1: \n" | 928 "1: \n" |
| 929 MEMACCESS(0) | 929 MEMACCESS(0) |
| 930 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b | 930 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b |
| 931 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 931 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 932 "orr v3.8b, v1.8b, v1.8b \n" // move g | 932 "orr v3.8b, v1.8b, v1.8b \n" // move g |
| 933 "orr v4.8b, v0.8b, v0.8b \n" // move r | 933 "orr v4.8b, v0.8b, v0.8b \n" // move r |
| 934 MEMACCESS(1) | 934 MEMACCESS(1) |
| 935 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a | 935 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a |
| 936 "b.gt 1b \n" | 936 "b.gt 1b \n" |
| 937 : "+r"(src_raw), // %0 | 937 : "+r"(src_raw), // %0 |
| 938 "+r"(dst_argb), // %1 | 938 "+r"(dst_argb), // %1 |
| 939 "+r"(pix) // %2 | 939 "+r"(width) // %2 |
| 940 : | 940 : |
| 941 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List | 941 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List |
| 942 ); | 942 ); |
| 943 } | 943 } |
| 944 #endif // HAS_RAWTOARGBROW_NEON | 944 #endif // HAS_RAWTOARGBROW_NEON |
| 945 | 945 |
| 946 #define RGB565TOARGB \ | 946 #define RGB565TOARGB \ |
| 947 "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ | 947 "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ |
| 948 "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ | 948 "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ |
| 949 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ | 949 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ |
| 950 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ | 950 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ |
| 951 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ | 951 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ |
| 952 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ | 952 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ |
| 953 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ | 953 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ |
| 954 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ | 954 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ |
| 955 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ | 955 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ |
| 956 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ | 956 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ |
| 957 "dup v2.2D, v0.D[1] \n" /* R */ | 957 "dup v2.2D, v0.D[1] \n" /* R */ |
| 958 | 958 |
| 959 #ifdef HAS_RGB565TOARGBROW_NEON | 959 #ifdef HAS_RGB565TOARGBROW_NEON |
| 960 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { | 960 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { |
| 961 asm volatile ( | 961 asm volatile ( |
| 962 "movi v3.8b, #255 \n" // Alpha | 962 "movi v3.8b, #255 \n" // Alpha |
| 963 "1: \n" | 963 "1: \n" |
| 964 MEMACCESS(0) | 964 MEMACCESS(0) |
| 965 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. | 965 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. |
| 966 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 966 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 967 RGB565TOARGB | 967 RGB565TOARGB |
| 968 MEMACCESS(1) | 968 MEMACCESS(1) |
| 969 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 969 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
| 970 "b.gt 1b \n" | 970 "b.gt 1b \n" |
| 971 : "+r"(src_rgb565), // %0 | 971 : "+r"(src_rgb565), // %0 |
| 972 "+r"(dst_argb), // %1 | 972 "+r"(dst_argb), // %1 |
| 973 "+r"(pix) // %2 | 973 "+r"(width) // %2 |
| 974 : | 974 : |
| 975 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List | 975 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List |
| 976 ); | 976 ); |
| 977 } | 977 } |
| 978 #endif // HAS_RGB565TOARGBROW_NEON | 978 #endif // HAS_RGB565TOARGBROW_NEON |
| 979 | 979 |
| 980 #define ARGB1555TOARGB \ | 980 #define ARGB1555TOARGB \ |
| 981 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ | 981 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ |
| 982 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ | 982 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ |
| 983 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ | 983 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ |
| (...skipping 25 matching lines...) Expand all Loading... |
| 1009 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ | 1009 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ |
| 1010 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ | 1010 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ |
| 1011 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ | 1011 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ |
| 1012 \ | 1012 \ |
| 1013 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ | 1013 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ |
| 1014 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ | 1014 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ |
| 1015 "dup v1.2D, v0.D[1] \n" /* G */ \ | 1015 "dup v1.2D, v0.D[1] \n" /* G */ \ |
| 1016 | 1016 |
| 1017 #ifdef HAS_ARGB1555TOARGBROW_NEON | 1017 #ifdef HAS_ARGB1555TOARGBROW_NEON |
| 1018 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, | 1018 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, |
| 1019 int pix) { | 1019 int width) { |
| 1020 asm volatile ( | 1020 asm volatile ( |
| 1021 "movi v3.8b, #255 \n" // Alpha | 1021 "movi v3.8b, #255 \n" // Alpha |
| 1022 "1: \n" | 1022 "1: \n" |
| 1023 MEMACCESS(0) | 1023 MEMACCESS(0) |
| 1024 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. | 1024 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. |
| 1025 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1025 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1026 ARGB1555TOARGB | 1026 ARGB1555TOARGB |
| 1027 MEMACCESS(1) | 1027 MEMACCESS(1) |
| 1028 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 1028 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
| 1029 "b.gt 1b \n" | 1029 "b.gt 1b \n" |
| 1030 : "+r"(src_argb1555), // %0 | 1030 : "+r"(src_argb1555), // %0 |
| 1031 "+r"(dst_argb), // %1 | 1031 "+r"(dst_argb), // %1 |
| 1032 "+r"(pix) // %2 | 1032 "+r"(width) // %2 |
| 1033 : | 1033 : |
| 1034 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 1034 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 1035 ); | 1035 ); |
| 1036 } | 1036 } |
| 1037 #endif // HAS_ARGB1555TOARGBROW_NEON | 1037 #endif // HAS_ARGB1555TOARGBROW_NEON |
| 1038 | 1038 |
| 1039 #define ARGB4444TOARGB \ | 1039 #define ARGB4444TOARGB \ |
| 1040 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ | 1040 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ |
| 1041 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ | 1041 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ |
| 1042 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ | 1042 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ |
| 1043 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ | 1043 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ |
| 1044 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ | 1044 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ |
| 1045 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ | 1045 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ |
| 1046 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ | 1046 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ |
| 1047 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ | 1047 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ |
| 1048 "dup v0.2D, v2.D[1] \n" \ | 1048 "dup v0.2D, v2.D[1] \n" \ |
| 1049 "dup v1.2D, v3.D[1] \n" | 1049 "dup v1.2D, v3.D[1] \n" |
| 1050 | 1050 |
| 1051 #ifdef HAS_ARGB4444TOARGBROW_NEON | 1051 #ifdef HAS_ARGB4444TOARGBROW_NEON |
| 1052 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, | 1052 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, |
| 1053 int pix) { | 1053 int width) { |
| 1054 asm volatile ( | 1054 asm volatile ( |
| 1055 "1: \n" | 1055 "1: \n" |
| 1056 MEMACCESS(0) | 1056 MEMACCESS(0) |
| 1057 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. | 1057 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. |
| 1058 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1058 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1059 ARGB4444TOARGB | 1059 ARGB4444TOARGB |
| 1060 MEMACCESS(1) | 1060 MEMACCESS(1) |
| 1061 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 1061 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
| 1062 "b.gt 1b \n" | 1062 "b.gt 1b \n" |
| 1063 : "+r"(src_argb4444), // %0 | 1063 : "+r"(src_argb4444), // %0 |
| 1064 "+r"(dst_argb), // %1 | 1064 "+r"(dst_argb), // %1 |
| 1065 "+r"(pix) // %2 | 1065 "+r"(width) // %2 |
| 1066 : | 1066 : |
| 1067 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List | 1067 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List |
| 1068 ); | 1068 ); |
| 1069 } | 1069 } |
| 1070 #endif // HAS_ARGB4444TOARGBROW_NEON | 1070 #endif // HAS_ARGB4444TOARGBROW_NEON |
| 1071 | 1071 |
| 1072 #ifdef HAS_ARGBTORGB24ROW_NEON | 1072 #ifdef HAS_ARGBTORGB24ROW_NEON |
| 1073 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { | 1073 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { |
| 1074 asm volatile ( | 1074 asm volatile ( |
| 1075 "1: \n" | 1075 "1: \n" |
| 1076 MEMACCESS(0) | 1076 MEMACCESS(0) |
| 1077 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels | 1077 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels |
| 1078 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1078 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1079 MEMACCESS(1) | 1079 MEMACCESS(1) |
| 1080 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. | 1080 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. |
| 1081 "b.gt 1b \n" | 1081 "b.gt 1b \n" |
| 1082 : "+r"(src_argb), // %0 | 1082 : "+r"(src_argb), // %0 |
| 1083 "+r"(dst_rgb24), // %1 | 1083 "+r"(dst_rgb24), // %1 |
| 1084 "+r"(pix) // %2 | 1084 "+r"(width) // %2 |
| 1085 : | 1085 : |
| 1086 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List | 1086 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List |
| 1087 ); | 1087 ); |
| 1088 } | 1088 } |
| 1089 #endif // HAS_ARGBTORGB24ROW_NEON | 1089 #endif // HAS_ARGBTORGB24ROW_NEON |
| 1090 | 1090 |
| 1091 #ifdef HAS_ARGBTORAWROW_NEON | 1091 #ifdef HAS_ARGBTORAWROW_NEON |
| 1092 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { | 1092 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { |
| 1093 asm volatile ( | 1093 asm volatile ( |
| 1094 "1: \n" | 1094 "1: \n" |
| 1095 MEMACCESS(0) | 1095 MEMACCESS(0) |
| 1096 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a | 1096 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a |
| 1097 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1097 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1098 "orr v4.8b, v2.8b, v2.8b \n" // mov g | 1098 "orr v4.8b, v2.8b, v2.8b \n" // mov g |
| 1099 "orr v5.8b, v1.8b, v1.8b \n" // mov b | 1099 "orr v5.8b, v1.8b, v1.8b \n" // mov b |
| 1100 MEMACCESS(1) | 1100 MEMACCESS(1) |
| 1101 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b | 1101 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b |
| 1102 "b.gt 1b \n" | 1102 "b.gt 1b \n" |
| 1103 : "+r"(src_argb), // %0 | 1103 : "+r"(src_argb), // %0 |
| 1104 "+r"(dst_raw), // %1 | 1104 "+r"(dst_raw), // %1 |
| 1105 "+r"(pix) // %2 | 1105 "+r"(width) // %2 |
| 1106 : | 1106 : |
| 1107 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List | 1107 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List |
| 1108 ); | 1108 ); |
| 1109 } | 1109 } |
| 1110 #endif // HAS_ARGBTORAWROW_NEON | 1110 #endif // HAS_ARGBTORAWROW_NEON |
| 1111 | 1111 |
| 1112 #ifdef HAS_YUY2TOYROW_NEON | 1112 #ifdef HAS_YUY2TOYROW_NEON |
| 1113 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { | 1113 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { |
| 1114 asm volatile ( | 1114 asm volatile ( |
| 1115 "1: \n" | 1115 "1: \n" |
| 1116 MEMACCESS(0) | 1116 MEMACCESS(0) |
| 1117 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. | 1117 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. |
| 1118 "subs %w2, %w2, #16 \n" // 16 processed per loop. | 1118 "subs %w2, %w2, #16 \n" // 16 processed per loop. |
| 1119 MEMACCESS(1) | 1119 MEMACCESS(1) |
| 1120 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. | 1120 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. |
| 1121 "b.gt 1b \n" | 1121 "b.gt 1b \n" |
| 1122 : "+r"(src_yuy2), // %0 | 1122 : "+r"(src_yuy2), // %0 |
| 1123 "+r"(dst_y), // %1 | 1123 "+r"(dst_y), // %1 |
| 1124 "+r"(pix) // %2 | 1124 "+r"(width) // %2 |
| 1125 : | 1125 : |
| 1126 : "cc", "memory", "v0", "v1" // Clobber List | 1126 : "cc", "memory", "v0", "v1" // Clobber List |
| 1127 ); | 1127 ); |
| 1128 } | 1128 } |
| 1129 #endif // HAS_YUY2TOYROW_NEON | 1129 #endif // HAS_YUY2TOYROW_NEON |
| 1130 | 1130 |
| 1131 #ifdef HAS_UYVYTOYROW_NEON | 1131 #ifdef HAS_UYVYTOYROW_NEON |
| 1132 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { | 1132 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { |
| 1133 asm volatile ( | 1133 asm volatile ( |
| 1134 "1: \n" | 1134 "1: \n" |
| 1135 MEMACCESS(0) | 1135 MEMACCESS(0) |
| 1136 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. | 1136 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. |
| 1137 "subs %w2, %w2, #16 \n" // 16 processed per loop. | 1137 "subs %w2, %w2, #16 \n" // 16 processed per loop. |
| 1138 MEMACCESS(1) | 1138 MEMACCESS(1) |
| 1139 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. | 1139 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. |
| 1140 "b.gt 1b \n" | 1140 "b.gt 1b \n" |
| 1141 : "+r"(src_uyvy), // %0 | 1141 : "+r"(src_uyvy), // %0 |
| 1142 "+r"(dst_y), // %1 | 1142 "+r"(dst_y), // %1 |
| 1143 "+r"(pix) // %2 | 1143 "+r"(width) // %2 |
| 1144 : | 1144 : |
| 1145 : "cc", "memory", "v0", "v1" // Clobber List | 1145 : "cc", "memory", "v0", "v1" // Clobber List |
| 1146 ); | 1146 ); |
| 1147 } | 1147 } |
| 1148 #endif // HAS_UYVYTOYROW_NEON | 1148 #endif // HAS_UYVYTOYROW_NEON |
| 1149 | 1149 |
| 1150 #ifdef HAS_YUY2TOUV422ROW_NEON | 1150 #ifdef HAS_YUY2TOUV422ROW_NEON |
| 1151 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, | 1151 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, |
| 1152 int pix) { | 1152 int width) { |
| 1153 asm volatile ( | 1153 asm volatile ( |
| 1154 "1: \n" | 1154 "1: \n" |
| 1155 MEMACCESS(0) | 1155 MEMACCESS(0) |
| 1156 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels | 1156 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels |
| 1157 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. | 1157 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. |
| 1158 MEMACCESS(1) | 1158 MEMACCESS(1) |
| 1159 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. | 1159 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. |
| 1160 MEMACCESS(2) | 1160 MEMACCESS(2) |
| 1161 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. | 1161 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. |
| 1162 "b.gt 1b \n" | 1162 "b.gt 1b \n" |
| 1163 : "+r"(src_yuy2), // %0 | 1163 : "+r"(src_yuy2), // %0 |
| 1164 "+r"(dst_u), // %1 | 1164 "+r"(dst_u), // %1 |
| 1165 "+r"(dst_v), // %2 | 1165 "+r"(dst_v), // %2 |
| 1166 "+r"(pix) // %3 | 1166 "+r"(width) // %3 |
| 1167 : | 1167 : |
| 1168 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 1168 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 1169 ); | 1169 ); |
| 1170 } | 1170 } |
| 1171 #endif // HAS_YUY2TOUV422ROW_NEON | 1171 #endif // HAS_YUY2TOUV422ROW_NEON |
| 1172 | 1172 |
| 1173 #ifdef HAS_UYVYTOUV422ROW_NEON | 1173 #ifdef HAS_UYVYTOUV422ROW_NEON |
| 1174 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, | 1174 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, |
| 1175 int pix) { | 1175 int width) { |
| 1176 asm volatile ( | 1176 asm volatile ( |
| 1177 "1: \n" | 1177 "1: \n" |
| 1178 MEMACCESS(0) | 1178 MEMACCESS(0) |
| 1179 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels | 1179 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels |
| 1180 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. | 1180 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. |
| 1181 MEMACCESS(1) | 1181 MEMACCESS(1) |
| 1182 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. | 1182 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. |
| 1183 MEMACCESS(2) | 1183 MEMACCESS(2) |
| 1184 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. | 1184 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. |
| 1185 "b.gt 1b \n" | 1185 "b.gt 1b \n" |
| 1186 : "+r"(src_uyvy), // %0 | 1186 : "+r"(src_uyvy), // %0 |
| 1187 "+r"(dst_u), // %1 | 1187 "+r"(dst_u), // %1 |
| 1188 "+r"(dst_v), // %2 | 1188 "+r"(dst_v), // %2 |
| 1189 "+r"(pix) // %3 | 1189 "+r"(width) // %3 |
| 1190 : | 1190 : |
| 1191 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 1191 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 1192 ); | 1192 ); |
| 1193 } | 1193 } |
| 1194 #endif // HAS_UYVYTOUV422ROW_NEON | 1194 #endif // HAS_UYVYTOUV422ROW_NEON |
| 1195 | 1195 |
| 1196 #ifdef HAS_YUY2TOUVROW_NEON | 1196 #ifdef HAS_YUY2TOUVROW_NEON |
| 1197 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, | 1197 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, |
| 1198 uint8* dst_u, uint8* dst_v, int pix) { | 1198 uint8* dst_u, uint8* dst_v, int width) { |
| 1199 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; | 1199 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; |
| 1200 asm volatile ( | 1200 asm volatile ( |
| 1201 "1: \n" | 1201 "1: \n" |
| 1202 MEMACCESS(0) | 1202 MEMACCESS(0) |
| 1203 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels | 1203 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels |
| 1204 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. | 1204 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. |
| 1205 MEMACCESS(1) | 1205 MEMACCESS(1) |
| 1206 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row | 1206 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row |
| 1207 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U | 1207 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U |
| 1208 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V | 1208 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V |
| 1209 MEMACCESS(2) | 1209 MEMACCESS(2) |
| 1210 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. | 1210 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. |
| 1211 MEMACCESS(3) | 1211 MEMACCESS(3) |
| 1212 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. | 1212 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. |
| 1213 "b.gt 1b \n" | 1213 "b.gt 1b \n" |
| 1214 : "+r"(src_yuy2), // %0 | 1214 : "+r"(src_yuy2), // %0 |
| 1215 "+r"(src_yuy2b), // %1 | 1215 "+r"(src_yuy2b), // %1 |
| 1216 "+r"(dst_u), // %2 | 1216 "+r"(dst_u), // %2 |
| 1217 "+r"(dst_v), // %3 | 1217 "+r"(dst_v), // %3 |
| 1218 "+r"(pix) // %4 | 1218 "+r"(width) // %4 |
| 1219 : | 1219 : |
| 1220 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", | 1220 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", |
| 1221 "v5", "v6", "v7" // Clobber List | 1221 "v5", "v6", "v7" // Clobber List |
| 1222 ); | 1222 ); |
| 1223 } | 1223 } |
| 1224 #endif // HAS_YUY2TOUVROW_NEON | 1224 #endif // HAS_YUY2TOUVROW_NEON |
| 1225 | 1225 |
| 1226 #ifdef HAS_UYVYTOUVROW_NEON | 1226 #ifdef HAS_UYVYTOUVROW_NEON |
| 1227 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, | 1227 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, |
| 1228 uint8* dst_u, uint8* dst_v, int pix) { | 1228 uint8* dst_u, uint8* dst_v, int width) { |
| 1229 const uint8* src_uyvyb = src_uyvy + stride_uyvy; | 1229 const uint8* src_uyvyb = src_uyvy + stride_uyvy; |
| 1230 asm volatile ( | 1230 asm volatile ( |
| 1231 "1: \n" | 1231 "1: \n" |
| 1232 MEMACCESS(0) | 1232 MEMACCESS(0) |
| 1233 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels | 1233 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels |
| 1234 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. | 1234 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. |
| 1235 MEMACCESS(1) | 1235 MEMACCESS(1) |
| 1236 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row | 1236 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row |
| 1237 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U | 1237 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U |
| 1238 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V | 1238 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V |
| 1239 MEMACCESS(2) | 1239 MEMACCESS(2) |
| 1240 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. | 1240 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. |
| 1241 MEMACCESS(3) | 1241 MEMACCESS(3) |
| 1242 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. | 1242 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. |
| 1243 "b.gt 1b \n" | 1243 "b.gt 1b \n" |
| 1244 : "+r"(src_uyvy), // %0 | 1244 : "+r"(src_uyvy), // %0 |
| 1245 "+r"(src_uyvyb), // %1 | 1245 "+r"(src_uyvyb), // %1 |
| 1246 "+r"(dst_u), // %2 | 1246 "+r"(dst_u), // %2 |
| 1247 "+r"(dst_v), // %3 | 1247 "+r"(dst_v), // %3 |
| 1248 "+r"(pix) // %4 | 1248 "+r"(width) // %4 |
| 1249 : | 1249 : |
| 1250 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", | 1250 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", |
| 1251 "v5", "v6", "v7" // Clobber List | 1251 "v5", "v6", "v7" // Clobber List |
| 1252 ); | 1252 ); |
| 1253 } | 1253 } |
| 1254 #endif // HAS_UYVYTOUVROW_NEON | 1254 #endif // HAS_UYVYTOUVROW_NEON |
| 1255 | 1255 |
| 1256 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 1256 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
| 1257 #ifdef HAS_ARGBSHUFFLEROW_NEON | 1257 #ifdef HAS_ARGBSHUFFLEROW_NEON |
| 1258 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, | 1258 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, |
| 1259 const uint8* shuffler, int pix) { | 1259 const uint8* shuffler, int width) { |
| 1260 asm volatile ( | 1260 asm volatile ( |
| 1261 MEMACCESS(3) | 1261 MEMACCESS(3) |
| 1262 "ld1 {v2.16b}, [%3] \n" // shuffler | 1262 "ld1 {v2.16b}, [%3] \n" // shuffler |
| 1263 "1: \n" | 1263 "1: \n" |
| 1264 MEMACCESS(0) | 1264 MEMACCESS(0) |
| 1265 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. | 1265 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. |
| 1266 "subs %w2, %w2, #4 \n" // 4 processed per loop | 1266 "subs %w2, %w2, #4 \n" // 4 processed per loop |
| 1267 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels | 1267 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels |
| 1268 MEMACCESS(1) | 1268 MEMACCESS(1) |
| 1269 "st1 {v1.16b}, [%1], #16 \n" // store 4. | 1269 "st1 {v1.16b}, [%1], #16 \n" // store 4. |
| 1270 "b.gt 1b \n" | 1270 "b.gt 1b \n" |
| 1271 : "+r"(src_argb), // %0 | 1271 : "+r"(src_argb), // %0 |
| 1272 "+r"(dst_argb), // %1 | 1272 "+r"(dst_argb), // %1 |
| 1273 "+r"(pix) // %2 | 1273 "+r"(width) // %2 |
| 1274 : "r"(shuffler) // %3 | 1274 : "r"(shuffler) // %3 |
| 1275 : "cc", "memory", "v0", "v1", "v2" // Clobber List | 1275 : "cc", "memory", "v0", "v1", "v2" // Clobber List |
| 1276 ); | 1276 ); |
| 1277 } | 1277 } |
| 1278 #endif // HAS_ARGBSHUFFLEROW_NEON | 1278 #endif // HAS_ARGBSHUFFLEROW_NEON |
| 1279 | 1279 |
| 1280 #ifdef HAS_I422TOYUY2ROW_NEON | 1280 #ifdef HAS_I422TOYUY2ROW_NEON |
| 1281 void I422ToYUY2Row_NEON(const uint8* src_y, | 1281 void I422ToYUY2Row_NEON(const uint8* src_y, |
| 1282 const uint8* src_u, | 1282 const uint8* src_u, |
| 1283 const uint8* src_v, | 1283 const uint8* src_v, |
| (...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1329 "+r"(src_v), // %2 | 1329 "+r"(src_v), // %2 |
| 1330 "+r"(dst_uyvy), // %3 | 1330 "+r"(dst_uyvy), // %3 |
| 1331 "+r"(width) // %4 | 1331 "+r"(width) // %4 |
| 1332 : | 1332 : |
| 1333 : "cc", "memory", "v0", "v1", "v2", "v3" | 1333 : "cc", "memory", "v0", "v1", "v2", "v3" |
| 1334 ); | 1334 ); |
| 1335 } | 1335 } |
| 1336 #endif // HAS_I422TOUYVYROW_NEON | 1336 #endif // HAS_I422TOUYVYROW_NEON |
| 1337 | 1337 |
| 1338 #ifdef HAS_ARGBTORGB565ROW_NEON | 1338 #ifdef HAS_ARGBTORGB565ROW_NEON |
| 1339 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { | 1339 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { |
| 1340 asm volatile ( | 1340 asm volatile ( |
| 1341 "1: \n" | 1341 "1: \n" |
| 1342 MEMACCESS(0) | 1342 MEMACCESS(0) |
| 1343 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels | 1343 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels |
| 1344 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1344 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1345 ARGBTORGB565 | 1345 ARGBTORGB565 |
| 1346 MEMACCESS(1) | 1346 MEMACCESS(1) |
| 1347 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. | 1347 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. |
| 1348 "b.gt 1b \n" | 1348 "b.gt 1b \n" |
| 1349 : "+r"(src_argb), // %0 | 1349 : "+r"(src_argb), // %0 |
| 1350 "+r"(dst_rgb565), // %1 | 1350 "+r"(dst_rgb565), // %1 |
| 1351 "+r"(pix) // %2 | 1351 "+r"(width) // %2 |
| 1352 : | 1352 : |
| 1353 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" | 1353 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" |
| 1354 ); | 1354 ); |
| 1355 } | 1355 } |
| 1356 #endif // HAS_ARGBTORGB565ROW_NEON | 1356 #endif // HAS_ARGBTORGB565ROW_NEON |
| 1357 | 1357 |
| 1358 #ifdef HAS_ARGBTORGB565DITHERROW_NEON | 1358 #ifdef HAS_ARGBTORGB565DITHERROW_NEON |
| 1359 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, | 1359 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, |
| 1360 const uint32 dither4, int width) { | 1360 const uint32 dither4, int width) { |
| 1361 asm volatile ( | 1361 asm volatile ( |
| (...skipping 13 matching lines...) Expand all Loading... |
| 1375 : "r"(src_argb), // %1 | 1375 : "r"(src_argb), // %1 |
| 1376 "r"(dither4), // %2 | 1376 "r"(dither4), // %2 |
| 1377 "r"(width) // %3 | 1377 "r"(width) // %3 |
| 1378 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" | 1378 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" |
| 1379 ); | 1379 ); |
| 1380 } | 1380 } |
| 1381 #endif // HAS_ARGBTORGB565DITHERROW_NEON | 1381 #endif // HAS_ARGBTORGB565DITHERROW_NEON |
| 1382 | 1382 |
| 1383 #ifdef HAS_ARGBTOARGB1555ROW_NEON | 1383 #ifdef HAS_ARGBTOARGB1555ROW_NEON |
| 1384 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, | 1384 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, |
| 1385 int pix) { | 1385 int width) { |
| 1386 asm volatile ( | 1386 asm volatile ( |
| 1387 "1: \n" | 1387 "1: \n" |
| 1388 MEMACCESS(0) | 1388 MEMACCESS(0) |
| 1389 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels | 1389 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels |
| 1390 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1390 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1391 ARGBTOARGB1555 | 1391 ARGBTOARGB1555 |
| 1392 MEMACCESS(1) | 1392 MEMACCESS(1) |
| 1393 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. | 1393 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. |
| 1394 "b.gt 1b \n" | 1394 "b.gt 1b \n" |
| 1395 : "+r"(src_argb), // %0 | 1395 : "+r"(src_argb), // %0 |
| 1396 "+r"(dst_argb1555), // %1 | 1396 "+r"(dst_argb1555), // %1 |
| 1397 "+r"(pix) // %2 | 1397 "+r"(width) // %2 |
| 1398 : | 1398 : |
| 1399 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" | 1399 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" |
| 1400 ); | 1400 ); |
| 1401 } | 1401 } |
| 1402 #endif // HAS_ARGBTOARGB1555ROW_NEON | 1402 #endif // HAS_ARGBTOARGB1555ROW_NEON |
| 1403 | 1403 |
| 1404 #ifdef HAS_ARGBTOARGB4444ROW_NEON | 1404 #ifdef HAS_ARGBTOARGB4444ROW_NEON |
| 1405 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, | 1405 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, |
| 1406 int pix) { | 1406 int width) { |
| 1407 asm volatile ( | 1407 asm volatile ( |
| 1408 "movi v4.16b, #0x0f \n" // bits to clear with vbic. | 1408 "movi v4.16b, #0x0f \n" // bits to clear with vbic. |
| 1409 "1: \n" | 1409 "1: \n" |
| 1410 MEMACCESS(0) | 1410 MEMACCESS(0) |
| 1411 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels | 1411 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels |
| 1412 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1412 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1413 ARGBTOARGB4444 | 1413 ARGBTOARGB4444 |
| 1414 MEMACCESS(1) | 1414 MEMACCESS(1) |
| 1415 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. | 1415 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. |
| 1416 "b.gt 1b \n" | 1416 "b.gt 1b \n" |
| 1417 : "+r"(src_argb), // %0 | 1417 : "+r"(src_argb), // %0 |
| 1418 "+r"(dst_argb4444), // %1 | 1418 "+r"(dst_argb4444), // %1 |
| 1419 "+r"(pix) // %2 | 1419 "+r"(width) // %2 |
| 1420 : | 1420 : |
| 1421 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" | 1421 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" |
| 1422 ); | 1422 ); |
| 1423 } | 1423 } |
| 1424 #endif // HAS_ARGBTOARGB4444ROW_NEON | 1424 #endif // HAS_ARGBTOARGB4444ROW_NEON |
| 1425 | 1425 |
| 1426 #ifdef HAS_ARGBTOYROW_NEON | 1426 #ifdef HAS_ARGBTOYROW_NEON |
| 1427 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { | 1427 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { |
| 1428 asm volatile ( | 1428 asm volatile ( |
| 1429 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 1429 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
| 1430 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 1430 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
| 1431 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 1431 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
| 1432 "movi v7.8b, #16 \n" // Add 16 constant | 1432 "movi v7.8b, #16 \n" // Add 16 constant |
| 1433 "1: \n" | 1433 "1: \n" |
| 1434 MEMACCESS(0) | 1434 MEMACCESS(0) |
| 1435 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 1435 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
| 1436 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1436 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1437 "umull v3.8h, v0.8b, v4.8b \n" // B | 1437 "umull v3.8h, v0.8b, v4.8b \n" // B |
| 1438 "umlal v3.8h, v1.8b, v5.8b \n" // G | 1438 "umlal v3.8h, v1.8b, v5.8b \n" // G |
| 1439 "umlal v3.8h, v2.8b, v6.8b \n" // R | 1439 "umlal v3.8h, v2.8b, v6.8b \n" // R |
| 1440 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 1440 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
| 1441 "uqadd v0.8b, v0.8b, v7.8b \n" | 1441 "uqadd v0.8b, v0.8b, v7.8b \n" |
| 1442 MEMACCESS(1) | 1442 MEMACCESS(1) |
| 1443 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 1443 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 1444 "b.gt 1b \n" | 1444 "b.gt 1b \n" |
| 1445 : "+r"(src_argb), // %0 | 1445 : "+r"(src_argb), // %0 |
| 1446 "+r"(dst_y), // %1 | 1446 "+r"(dst_y), // %1 |
| 1447 "+r"(pix) // %2 | 1447 "+r"(width) // %2 |
| 1448 : | 1448 : |
| 1449 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 1449 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
| 1450 ); | 1450 ); |
| 1451 } | 1451 } |
| 1452 #endif // HAS_ARGBTOYROW_NEON | 1452 #endif // HAS_ARGBTOYROW_NEON |
| 1453 | 1453 |
| 1454 #ifdef HAS_ARGBTOYJROW_NEON | 1454 #ifdef HAS_ARGBTOYJROW_NEON |
| 1455 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { | 1455 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { |
| 1456 asm volatile ( | 1456 asm volatile ( |
| 1457 "movi v4.8b, #15 \n" // B * 0.11400 coefficient | 1457 "movi v4.8b, #15 \n" // B * 0.11400 coefficient |
| 1458 "movi v5.8b, #75 \n" // G * 0.58700 coefficient | 1458 "movi v5.8b, #75 \n" // G * 0.58700 coefficient |
| 1459 "movi v6.8b, #38 \n" // R * 0.29900 coefficient | 1459 "movi v6.8b, #38 \n" // R * 0.29900 coefficient |
| 1460 "1: \n" | 1460 "1: \n" |
| 1461 MEMACCESS(0) | 1461 MEMACCESS(0) |
| 1462 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 1462 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
| 1463 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1463 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1464 "umull v3.8h, v0.8b, v4.8b \n" // B | 1464 "umull v3.8h, v0.8b, v4.8b \n" // B |
| 1465 "umlal v3.8h, v1.8b, v5.8b \n" // G | 1465 "umlal v3.8h, v1.8b, v5.8b \n" // G |
| 1466 "umlal v3.8h, v2.8b, v6.8b \n" // R | 1466 "umlal v3.8h, v2.8b, v6.8b \n" // R |
| 1467 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y | 1467 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y |
| 1468 MEMACCESS(1) | 1468 MEMACCESS(1) |
| 1469 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 1469 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 1470 "b.gt 1b \n" | 1470 "b.gt 1b \n" |
| 1471 : "+r"(src_argb), // %0 | 1471 : "+r"(src_argb), // %0 |
| 1472 "+r"(dst_y), // %1 | 1472 "+r"(dst_y), // %1 |
| 1473 "+r"(pix) // %2 | 1473 "+r"(width) // %2 |
| 1474 : | 1474 : |
| 1475 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" | 1475 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" |
| 1476 ); | 1476 ); |
| 1477 } | 1477 } |
| 1478 #endif // HAS_ARGBTOYJROW_NEON | 1478 #endif // HAS_ARGBTOYJROW_NEON |
| 1479 | 1479 |
| 1480 // 8x1 pixels. | 1480 // 8x1 pixels. |
| 1481 #ifdef HAS_ARGBTOUV444ROW_NEON | 1481 #ifdef HAS_ARGBTOUV444ROW_NEON |
| 1482 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | 1482 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |
| 1483 int pix) { | 1483 int width) { |
| 1484 asm volatile ( | 1484 asm volatile ( |
| 1485 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient | 1485 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient |
| 1486 "movi v25.8b, #74 \n" // UG -0.5781 coefficient | 1486 "movi v25.8b, #74 \n" // UG -0.5781 coefficient |
| 1487 "movi v26.8b, #38 \n" // UR -0.2969 coefficient | 1487 "movi v26.8b, #38 \n" // UR -0.2969 coefficient |
| 1488 "movi v27.8b, #18 \n" // VB -0.1406 coefficient | 1488 "movi v27.8b, #18 \n" // VB -0.1406 coefficient |
| 1489 "movi v28.8b, #94 \n" // VG -0.7344 coefficient | 1489 "movi v28.8b, #94 \n" // VG -0.7344 coefficient |
| 1490 "movi v29.16b,#0x80 \n" // 128.5 (0x8080 in 16-bit) | 1490 "movi v29.16b,#0x80 \n" // 128.5 (0x8080 in 16-bit) |
| 1491 "1: \n" | 1491 "1: \n" |
| 1492 MEMACCESS(0) | 1492 MEMACCESS(0) |
| 1493 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 1493 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
| (...skipping 12 matching lines...) Expand all Loading... |
| 1506 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V | 1506 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V |
| 1507 | 1507 |
| 1508 MEMACCESS(1) | 1508 MEMACCESS(1) |
| 1509 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. | 1509 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. |
| 1510 MEMACCESS(2) | 1510 MEMACCESS(2) |
| 1511 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. | 1511 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. |
| 1512 "b.gt 1b \n" | 1512 "b.gt 1b \n" |
| 1513 : "+r"(src_argb), // %0 | 1513 : "+r"(src_argb), // %0 |
| 1514 "+r"(dst_u), // %1 | 1514 "+r"(dst_u), // %1 |
| 1515 "+r"(dst_v), // %2 | 1515 "+r"(dst_v), // %2 |
| 1516 "+r"(pix) // %3 | 1516 "+r"(width) // %3 |
| 1517 : | 1517 : |
| 1518 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", | 1518 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", |
| 1519 "v24", "v25", "v26", "v27", "v28", "v29" | 1519 "v24", "v25", "v26", "v27", "v28", "v29" |
| 1520 ); | 1520 ); |
| 1521 } | 1521 } |
| 1522 #endif // HAS_ARGBTOUV444ROW_NEON | 1522 #endif // HAS_ARGBTOUV444ROW_NEON |
| 1523 | 1523 |
| 1524 // 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. | 1524 // 16x1 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
| 1525 #ifdef HAS_ARGBTOUV422ROW_NEON | 1525 #ifdef HAS_ARGBTOUV422ROW_NEON |
| 1526 void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | 1526 void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |
| 1527 int pix) { | 1527 int width) { |
| 1528 asm volatile ( | 1528 asm volatile ( |
| 1529 RGBTOUV_SETUP_REG | 1529 RGBTOUV_SETUP_REG |
| 1530 "1: \n" | 1530 "1: \n" |
| 1531 MEMACCESS(0) | 1531 MEMACCESS(0) |
| 1532 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1532 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
| 1533 | 1533 |
| 1534 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. | 1534 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. |
| 1535 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1535 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
| 1536 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. | 1536 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. |
| 1537 | 1537 |
| (...skipping 12 matching lines...) Expand all Loading... |
| 1550 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V | 1550 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V |
| 1551 | 1551 |
| 1552 MEMACCESS(1) | 1552 MEMACCESS(1) |
| 1553 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. | 1553 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. |
| 1554 MEMACCESS(2) | 1554 MEMACCESS(2) |
| 1555 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. | 1555 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. |
| 1556 "b.gt 1b \n" | 1556 "b.gt 1b \n" |
| 1557 : "+r"(src_argb), // %0 | 1557 : "+r"(src_argb), // %0 |
| 1558 "+r"(dst_u), // %1 | 1558 "+r"(dst_u), // %1 |
| 1559 "+r"(dst_v), // %2 | 1559 "+r"(dst_v), // %2 |
| 1560 "+r"(pix) // %3 | 1560 "+r"(width) // %3 |
| 1561 : | 1561 : |
| 1562 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1562 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1563 "v20", "v21", "v22", "v23", "v24", "v25" | 1563 "v20", "v21", "v22", "v23", "v24", "v25" |
| 1564 ); | 1564 ); |
| 1565 } | 1565 } |
| 1566 #endif // HAS_ARGBTOUV422ROW_NEON | 1566 #endif // HAS_ARGBTOUV422ROW_NEON |
| 1567 | 1567 |
| 1568 // 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. | 1568 // 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. |
| 1569 #ifdef HAS_ARGBTOUV411ROW_NEON | 1569 #ifdef HAS_ARGBTOUV411ROW_NEON |
| 1570 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | 1570 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |
| 1571 int pix) { | 1571 int width) { |
| 1572 asm volatile ( | 1572 asm volatile ( |
| 1573 RGBTOUV_SETUP_REG | 1573 RGBTOUV_SETUP_REG |
| 1574 "1: \n" | 1574 "1: \n" |
| 1575 MEMACCESS(0) | 1575 MEMACCESS(0) |
| 1576 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1576 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
| 1577 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. | 1577 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. |
| 1578 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1578 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
| 1579 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. | 1579 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. |
| 1580 MEMACCESS(0) | 1580 MEMACCESS(0) |
| 1581 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16. | 1581 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16. |
| (...skipping 21 matching lines...) Expand all Loading... |
| 1603 "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U | 1603 "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U |
| 1604 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V | 1604 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V |
| 1605 MEMACCESS(1) | 1605 MEMACCESS(1) |
| 1606 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. | 1606 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. |
| 1607 MEMACCESS(2) | 1607 MEMACCESS(2) |
| 1608 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. | 1608 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. |
| 1609 "b.gt 1b \n" | 1609 "b.gt 1b \n" |
| 1610 : "+r"(src_argb), // %0 | 1610 : "+r"(src_argb), // %0 |
| 1611 "+r"(dst_u), // %1 | 1611 "+r"(dst_u), // %1 |
| 1612 "+r"(dst_v), // %2 | 1612 "+r"(dst_v), // %2 |
| 1613 "+r"(pix) // %3 | 1613 "+r"(width) // %3 |
| 1614 : | 1614 : |
| 1615 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1615 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1616 "v20", "v21", "v22", "v23", "v24", "v25" | 1616 "v20", "v21", "v22", "v23", "v24", "v25" |
| 1617 ); | 1617 ); |
| 1618 } | 1618 } |
| 1619 #endif // HAS_ARGBTOUV411ROW_NEON | 1619 #endif // HAS_ARGBTOUV411ROW_NEON |
| 1620 | 1620 |
| 1621 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. | 1621 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
| 1622 #define RGBTOUV(QB, QG, QR) \ | 1622 #define RGBTOUV(QB, QG, QR) \ |
| 1623 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ | 1623 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ |
| 1624 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ | 1624 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ |
| 1625 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ | 1625 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ |
| 1626 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ | 1626 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ |
| 1627 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ | 1627 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ |
| 1628 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ | 1628 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ |
| 1629 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ | 1629 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ |
| 1630 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ | 1630 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ |
| 1631 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ | 1631 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ |
| 1632 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ | 1632 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ |
| 1633 | 1633 |
| 1634 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. | 1634 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. |
| 1635 // TODO(fbarchard): consider ptrdiff_t for all strides. | 1635 // TODO(fbarchard): consider ptrdiff_t for all strides. |
| 1636 | 1636 |
| 1637 #ifdef HAS_ARGBTOUVROW_NEON | 1637 #ifdef HAS_ARGBTOUVROW_NEON |
| 1638 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, | 1638 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, |
| 1639 uint8* dst_u, uint8* dst_v, int pix) { | 1639 uint8* dst_u, uint8* dst_v, int width) { |
| 1640 const uint8* src_argb_1 = src_argb + src_stride_argb; | 1640 const uint8* src_argb_1 = src_argb + src_stride_argb; |
| 1641 asm volatile ( | 1641 asm volatile ( |
| 1642 RGBTOUV_SETUP_REG | 1642 RGBTOUV_SETUP_REG |
| 1643 "1: \n" | 1643 "1: \n" |
| 1644 MEMACCESS(0) | 1644 MEMACCESS(0) |
| 1645 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1645 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
| 1646 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. | 1646 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. |
| 1647 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1647 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
| 1648 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. | 1648 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. |
| 1649 | 1649 |
| (...skipping 11 matching lines...) Expand all Loading... |
| 1661 RGBTOUV(v0.8h, v1.8h, v2.8h) | 1661 RGBTOUV(v0.8h, v1.8h, v2.8h) |
| 1662 MEMACCESS(2) | 1662 MEMACCESS(2) |
| 1663 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1663 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
| 1664 MEMACCESS(3) | 1664 MEMACCESS(3) |
| 1665 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1665 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
| 1666 "b.gt 1b \n" | 1666 "b.gt 1b \n" |
| 1667 : "+r"(src_argb), // %0 | 1667 : "+r"(src_argb), // %0 |
| 1668 "+r"(src_argb_1), // %1 | 1668 "+r"(src_argb_1), // %1 |
| 1669 "+r"(dst_u), // %2 | 1669 "+r"(dst_u), // %2 |
| 1670 "+r"(dst_v), // %3 | 1670 "+r"(dst_v), // %3 |
| 1671 "+r"(pix) // %4 | 1671 "+r"(width) // %4 |
| 1672 : | 1672 : |
| 1673 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1673 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1674 "v20", "v21", "v22", "v23", "v24", "v25" | 1674 "v20", "v21", "v22", "v23", "v24", "v25" |
| 1675 ); | 1675 ); |
| 1676 } | 1676 } |
| 1677 #endif // HAS_ARGBTOUVROW_NEON | 1677 #endif // HAS_ARGBTOUVROW_NEON |
| 1678 | 1678 |
| 1679 // TODO(fbarchard): Subsample match C code. | 1679 // TODO(fbarchard): Subsample match C code. |
| 1680 #ifdef HAS_ARGBTOUVJROW_NEON | 1680 #ifdef HAS_ARGBTOUVJROW_NEON |
| 1681 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, | 1681 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, |
| 1682 uint8* dst_u, uint8* dst_v, int pix) { | 1682 uint8* dst_u, uint8* dst_v, int width) { |
| 1683 const uint8* src_argb_1 = src_argb + src_stride_argb; | 1683 const uint8* src_argb_1 = src_argb + src_stride_argb; |
| 1684 asm volatile ( | 1684 asm volatile ( |
| 1685 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 | 1685 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 |
| 1686 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 | 1686 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 |
| 1687 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 | 1687 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 |
| 1688 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 | 1688 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 |
| 1689 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 | 1689 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 |
| 1690 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) | 1690 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) |
| 1691 "1: \n" | 1691 "1: \n" |
| 1692 MEMACCESS(0) | 1692 MEMACCESS(0) |
| (...skipping 15 matching lines...) Expand all Loading... |
| 1708 RGBTOUV(v0.8h, v1.8h, v2.8h) | 1708 RGBTOUV(v0.8h, v1.8h, v2.8h) |
| 1709 MEMACCESS(2) | 1709 MEMACCESS(2) |
| 1710 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1710 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
| 1711 MEMACCESS(3) | 1711 MEMACCESS(3) |
| 1712 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1712 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
| 1713 "b.gt 1b \n" | 1713 "b.gt 1b \n" |
| 1714 : "+r"(src_argb), // %0 | 1714 : "+r"(src_argb), // %0 |
| 1715 "+r"(src_argb_1), // %1 | 1715 "+r"(src_argb_1), // %1 |
| 1716 "+r"(dst_u), // %2 | 1716 "+r"(dst_u), // %2 |
| 1717 "+r"(dst_v), // %3 | 1717 "+r"(dst_v), // %3 |
| 1718 "+r"(pix) // %4 | 1718 "+r"(width) // %4 |
| 1719 : | 1719 : |
| 1720 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1720 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1721 "v20", "v21", "v22", "v23", "v24", "v25" | 1721 "v20", "v21", "v22", "v23", "v24", "v25" |
| 1722 ); | 1722 ); |
| 1723 } | 1723 } |
| 1724 #endif // HAS_ARGBTOUVJROW_NEON | 1724 #endif // HAS_ARGBTOUVJROW_NEON |
| 1725 | 1725 |
| 1726 #ifdef HAS_BGRATOUVROW_NEON | 1726 #ifdef HAS_BGRATOUVROW_NEON |
| 1727 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, | 1727 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, |
| 1728 uint8* dst_u, uint8* dst_v, int pix) { | 1728 uint8* dst_u, uint8* dst_v, int width) { |
| 1729 const uint8* src_bgra_1 = src_bgra + src_stride_bgra; | 1729 const uint8* src_bgra_1 = src_bgra + src_stride_bgra; |
| 1730 asm volatile ( | 1730 asm volatile ( |
| 1731 RGBTOUV_SETUP_REG | 1731 RGBTOUV_SETUP_REG |
| 1732 "1: \n" | 1732 "1: \n" |
| 1733 MEMACCESS(0) | 1733 MEMACCESS(0) |
| 1734 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1734 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
| 1735 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. | 1735 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. |
| 1736 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. | 1736 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. |
| 1737 "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. | 1737 "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. |
| 1738 MEMACCESS(1) | 1738 MEMACCESS(1) |
| (...skipping 10 matching lines...) Expand all Loading... |
| 1749 RGBTOUV(v0.8h, v1.8h, v2.8h) | 1749 RGBTOUV(v0.8h, v1.8h, v2.8h) |
| 1750 MEMACCESS(2) | 1750 MEMACCESS(2) |
| 1751 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1751 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
| 1752 MEMACCESS(3) | 1752 MEMACCESS(3) |
| 1753 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1753 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
| 1754 "b.gt 1b \n" | 1754 "b.gt 1b \n" |
| 1755 : "+r"(src_bgra), // %0 | 1755 : "+r"(src_bgra), // %0 |
| 1756 "+r"(src_bgra_1), // %1 | 1756 "+r"(src_bgra_1), // %1 |
| 1757 "+r"(dst_u), // %2 | 1757 "+r"(dst_u), // %2 |
| 1758 "+r"(dst_v), // %3 | 1758 "+r"(dst_v), // %3 |
| 1759 "+r"(pix) // %4 | 1759 "+r"(width) // %4 |
| 1760 : | 1760 : |
| 1761 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1761 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1762 "v20", "v21", "v22", "v23", "v24", "v25" | 1762 "v20", "v21", "v22", "v23", "v24", "v25" |
| 1763 ); | 1763 ); |
| 1764 } | 1764 } |
| 1765 #endif // HAS_BGRATOUVROW_NEON | 1765 #endif // HAS_BGRATOUVROW_NEON |
| 1766 | 1766 |
| 1767 #ifdef HAS_ABGRTOUVROW_NEON | 1767 #ifdef HAS_ABGRTOUVROW_NEON |
| 1768 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, | 1768 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, |
| 1769 uint8* dst_u, uint8* dst_v, int pix) { | 1769 uint8* dst_u, uint8* dst_v, int width) { |
| 1770 const uint8* src_abgr_1 = src_abgr + src_stride_abgr; | 1770 const uint8* src_abgr_1 = src_abgr + src_stride_abgr; |
| 1771 asm volatile ( | 1771 asm volatile ( |
| 1772 RGBTOUV_SETUP_REG | 1772 RGBTOUV_SETUP_REG |
| 1773 "1: \n" | 1773 "1: \n" |
| 1774 MEMACCESS(0) | 1774 MEMACCESS(0) |
| 1775 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1775 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
| 1776 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. | 1776 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. |
| 1777 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1777 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
| 1778 "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. | 1778 "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. |
| 1779 MEMACCESS(1) | 1779 MEMACCESS(1) |
| (...skipping 10 matching lines...) Expand all Loading... |
| 1790 RGBTOUV(v0.8h, v2.8h, v1.8h) | 1790 RGBTOUV(v0.8h, v2.8h, v1.8h) |
| 1791 MEMACCESS(2) | 1791 MEMACCESS(2) |
| 1792 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1792 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
| 1793 MEMACCESS(3) | 1793 MEMACCESS(3) |
| 1794 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1794 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
| 1795 "b.gt 1b \n" | 1795 "b.gt 1b \n" |
| 1796 : "+r"(src_abgr), // %0 | 1796 : "+r"(src_abgr), // %0 |
| 1797 "+r"(src_abgr_1), // %1 | 1797 "+r"(src_abgr_1), // %1 |
| 1798 "+r"(dst_u), // %2 | 1798 "+r"(dst_u), // %2 |
| 1799 "+r"(dst_v), // %3 | 1799 "+r"(dst_v), // %3 |
| 1800 "+r"(pix) // %4 | 1800 "+r"(width) // %4 |
| 1801 : | 1801 : |
| 1802 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1802 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1803 "v20", "v21", "v22", "v23", "v24", "v25" | 1803 "v20", "v21", "v22", "v23", "v24", "v25" |
| 1804 ); | 1804 ); |
| 1805 } | 1805 } |
| 1806 #endif // HAS_ABGRTOUVROW_NEON | 1806 #endif // HAS_ABGRTOUVROW_NEON |
| 1807 | 1807 |
| 1808 #ifdef HAS_RGBATOUVROW_NEON | 1808 #ifdef HAS_RGBATOUVROW_NEON |
| 1809 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, | 1809 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, |
| 1810 uint8* dst_u, uint8* dst_v, int pix) { | 1810 uint8* dst_u, uint8* dst_v, int width) { |
| 1811 const uint8* src_rgba_1 = src_rgba + src_stride_rgba; | 1811 const uint8* src_rgba_1 = src_rgba + src_stride_rgba; |
| 1812 asm volatile ( | 1812 asm volatile ( |
| 1813 RGBTOUV_SETUP_REG | 1813 RGBTOUV_SETUP_REG |
| 1814 "1: \n" | 1814 "1: \n" |
| 1815 MEMACCESS(0) | 1815 MEMACCESS(0) |
| 1816 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1816 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
| 1817 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. | 1817 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. |
| 1818 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. | 1818 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. |
| 1819 "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. | 1819 "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. |
| 1820 MEMACCESS(1) | 1820 MEMACCESS(1) |
| (...skipping 10 matching lines...) Expand all Loading... |
| 1831 RGBTOUV(v0.8h, v1.8h, v2.8h) | 1831 RGBTOUV(v0.8h, v1.8h, v2.8h) |
| 1832 MEMACCESS(2) | 1832 MEMACCESS(2) |
| 1833 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1833 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
| 1834 MEMACCESS(3) | 1834 MEMACCESS(3) |
| 1835 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1835 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
| 1836 "b.gt 1b \n" | 1836 "b.gt 1b \n" |
| 1837 : "+r"(src_rgba), // %0 | 1837 : "+r"(src_rgba), // %0 |
| 1838 "+r"(src_rgba_1), // %1 | 1838 "+r"(src_rgba_1), // %1 |
| 1839 "+r"(dst_u), // %2 | 1839 "+r"(dst_u), // %2 |
| 1840 "+r"(dst_v), // %3 | 1840 "+r"(dst_v), // %3 |
| 1841 "+r"(pix) // %4 | 1841 "+r"(width) // %4 |
| 1842 : | 1842 : |
| 1843 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1843 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1844 "v20", "v21", "v22", "v23", "v24", "v25" | 1844 "v20", "v21", "v22", "v23", "v24", "v25" |
| 1845 ); | 1845 ); |
| 1846 } | 1846 } |
| 1847 #endif // HAS_RGBATOUVROW_NEON | 1847 #endif // HAS_RGBATOUVROW_NEON |
| 1848 | 1848 |
| 1849 #ifdef HAS_RGB24TOUVROW_NEON | 1849 #ifdef HAS_RGB24TOUVROW_NEON |
| 1850 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, | 1850 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, |
| 1851 uint8* dst_u, uint8* dst_v, int pix) { | 1851 uint8* dst_u, uint8* dst_v, int width) { |
| 1852 const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; | 1852 const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; |
| 1853 asm volatile ( | 1853 asm volatile ( |
| 1854 RGBTOUV_SETUP_REG | 1854 RGBTOUV_SETUP_REG |
| 1855 "1: \n" | 1855 "1: \n" |
| 1856 MEMACCESS(0) | 1856 MEMACCESS(0) |
| 1857 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. | 1857 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. |
| 1858 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. | 1858 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. |
| 1859 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1859 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
| 1860 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. | 1860 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. |
| 1861 MEMACCESS(1) | 1861 MEMACCESS(1) |
| (...skipping 10 matching lines...) Expand all Loading... |
| 1872 RGBTOUV(v0.8h, v1.8h, v2.8h) | 1872 RGBTOUV(v0.8h, v1.8h, v2.8h) |
| 1873 MEMACCESS(2) | 1873 MEMACCESS(2) |
| 1874 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1874 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
| 1875 MEMACCESS(3) | 1875 MEMACCESS(3) |
| 1876 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1876 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
| 1877 "b.gt 1b \n" | 1877 "b.gt 1b \n" |
| 1878 : "+r"(src_rgb24), // %0 | 1878 : "+r"(src_rgb24), // %0 |
| 1879 "+r"(src_rgb24_1), // %1 | 1879 "+r"(src_rgb24_1), // %1 |
| 1880 "+r"(dst_u), // %2 | 1880 "+r"(dst_u), // %2 |
| 1881 "+r"(dst_v), // %3 | 1881 "+r"(dst_v), // %3 |
| 1882 "+r"(pix) // %4 | 1882 "+r"(width) // %4 |
| 1883 : | 1883 : |
| 1884 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1884 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1885 "v20", "v21", "v22", "v23", "v24", "v25" | 1885 "v20", "v21", "v22", "v23", "v24", "v25" |
| 1886 ); | 1886 ); |
| 1887 } | 1887 } |
| 1888 #endif // HAS_RGB24TOUVROW_NEON | 1888 #endif // HAS_RGB24TOUVROW_NEON |
| 1889 | 1889 |
| 1890 #ifdef HAS_RAWTOUVROW_NEON | 1890 #ifdef HAS_RAWTOUVROW_NEON |
| 1891 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, | 1891 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, |
| 1892 uint8* dst_u, uint8* dst_v, int pix) { | 1892 uint8* dst_u, uint8* dst_v, int width) { |
| 1893 const uint8* src_raw_1 = src_raw + src_stride_raw; | 1893 const uint8* src_raw_1 = src_raw + src_stride_raw; |
| 1894 asm volatile ( | 1894 asm volatile ( |
| 1895 RGBTOUV_SETUP_REG | 1895 RGBTOUV_SETUP_REG |
| 1896 "1: \n" | 1896 "1: \n" |
| 1897 MEMACCESS(0) | 1897 MEMACCESS(0) |
| 1898 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels. | 1898 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels. |
| 1899 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. | 1899 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. |
| 1900 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1900 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
| 1901 "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. | 1901 "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. |
| 1902 MEMACCESS(1) | 1902 MEMACCESS(1) |
| (...skipping 10 matching lines...) Expand all Loading... |
| 1913 RGBTOUV(v2.8h, v1.8h, v0.8h) | 1913 RGBTOUV(v2.8h, v1.8h, v0.8h) |
| 1914 MEMACCESS(2) | 1914 MEMACCESS(2) |
| 1915 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1915 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
| 1916 MEMACCESS(3) | 1916 MEMACCESS(3) |
| 1917 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1917 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
| 1918 "b.gt 1b \n" | 1918 "b.gt 1b \n" |
| 1919 : "+r"(src_raw), // %0 | 1919 : "+r"(src_raw), // %0 |
| 1920 "+r"(src_raw_1), // %1 | 1920 "+r"(src_raw_1), // %1 |
| 1921 "+r"(dst_u), // %2 | 1921 "+r"(dst_u), // %2 |
| 1922 "+r"(dst_v), // %3 | 1922 "+r"(dst_v), // %3 |
| 1923 "+r"(pix) // %4 | 1923 "+r"(width) // %4 |
| 1924 : | 1924 : |
| 1925 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1925 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1926 "v20", "v21", "v22", "v23", "v24", "v25" | 1926 "v20", "v21", "v22", "v23", "v24", "v25" |
| 1927 ); | 1927 ); |
| 1928 } | 1928 } |
| 1929 #endif // HAS_RAWTOUVROW_NEON | 1929 #endif // HAS_RAWTOUVROW_NEON |
| 1930 | 1930 |
| 1931 // 16x2 pixels -> 8x1. pix is number of rgb565 pixels. e.g. 16. | 1931 // 16x2 pixels -> 8x1. width is number of rgb565 pixels. e.g. 16. |
| 1932 #ifdef HAS_RGB565TOUVROW_NEON | 1932 #ifdef HAS_RGB565TOUVROW_NEON |
| 1933 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, | 1933 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, |
| 1934 uint8* dst_u, uint8* dst_v, int pix) { | 1934 uint8* dst_u, uint8* dst_v, int width) { |
| 1935 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; | 1935 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; |
| 1936 asm volatile ( | 1936 asm volatile ( |
| 1937 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 | 1937 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 |
| 1938 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 | 1938 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 |
| 1939 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 | 1939 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 |
| 1940 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 | 1940 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 |
| 1941 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 | 1941 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 |
| 1942 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) | 1942 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) |
| 1943 "1: \n" | 1943 "1: \n" |
| 1944 MEMACCESS(0) | 1944 MEMACCESS(0) |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1988 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V | 1988 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V |
| 1989 MEMACCESS(2) | 1989 MEMACCESS(2) |
| 1990 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1990 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
| 1991 MEMACCESS(3) | 1991 MEMACCESS(3) |
| 1992 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1992 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
| 1993 "b.gt 1b \n" | 1993 "b.gt 1b \n" |
| 1994 : "+r"(src_rgb565), // %0 | 1994 : "+r"(src_rgb565), // %0 |
| 1995 "+r"(src_rgb565_1), // %1 | 1995 "+r"(src_rgb565_1), // %1 |
| 1996 "+r"(dst_u), // %2 | 1996 "+r"(dst_u), // %2 |
| 1997 "+r"(dst_v), // %3 | 1997 "+r"(dst_v), // %3 |
| 1998 "+r"(pix) // %4 | 1998 "+r"(width) // %4 |
| 1999 : | 1999 : |
| 2000 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 2000 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 2001 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", | 2001 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", |
| 2002 "v25", "v26", "v27" | 2002 "v25", "v26", "v27" |
| 2003 ); | 2003 ); |
| 2004 } | 2004 } |
| 2005 #endif // HAS_RGB565TOUVROW_NEON | 2005 #endif // HAS_RGB565TOUVROW_NEON |
| 2006 | 2006 |
| 2007 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. | 2007 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
| 2008 #ifdef HAS_ARGB1555TOUVROW_NEON | 2008 #ifdef HAS_ARGB1555TOUVROW_NEON |
| 2009 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, | 2009 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, |
| 2010 uint8* dst_u, uint8* dst_v, int pix) { | 2010 uint8* dst_u, uint8* dst_v, int width) { |
| 2011 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; | 2011 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; |
| 2012 asm volatile ( | 2012 asm volatile ( |
| 2013 RGBTOUV_SETUP_REG | 2013 RGBTOUV_SETUP_REG |
| 2014 "1: \n" | 2014 "1: \n" |
| 2015 MEMACCESS(0) | 2015 MEMACCESS(0) |
| 2016 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. | 2016 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. |
| 2017 RGB555TOARGB | 2017 RGB555TOARGB |
| 2018 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. | 2018 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. |
| 2019 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. | 2019 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. |
| 2020 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. | 2020 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. |
| (...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2059 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V | 2059 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V |
| 2060 MEMACCESS(2) | 2060 MEMACCESS(2) |
| 2061 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 2061 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
| 2062 MEMACCESS(3) | 2062 MEMACCESS(3) |
| 2063 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 2063 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
| 2064 "b.gt 1b \n" | 2064 "b.gt 1b \n" |
| 2065 : "+r"(src_argb1555), // %0 | 2065 : "+r"(src_argb1555), // %0 |
| 2066 "+r"(src_argb1555_1), // %1 | 2066 "+r"(src_argb1555_1), // %1 |
| 2067 "+r"(dst_u), // %2 | 2067 "+r"(dst_u), // %2 |
| 2068 "+r"(dst_v), // %3 | 2068 "+r"(dst_v), // %3 |
| 2069 "+r"(pix) // %4 | 2069 "+r"(width) // %4 |
| 2070 : | 2070 : |
| 2071 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", | 2071 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", |
| 2072 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", | 2072 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", |
| 2073 "v26", "v27", "v28" | 2073 "v26", "v27", "v28" |
| 2074 ); | 2074 ); |
| 2075 } | 2075 } |
| 2076 #endif // HAS_ARGB1555TOUVROW_NEON | 2076 #endif // HAS_ARGB1555TOUVROW_NEON |
| 2077 | 2077 |
| 2078 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. | 2078 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
| 2079 #ifdef HAS_ARGB4444TOUVROW_NEON | 2079 #ifdef HAS_ARGB4444TOUVROW_NEON |
| 2080 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, | 2080 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, |
| 2081 uint8* dst_u, uint8* dst_v, int pix) { | 2081 uint8* dst_u, uint8* dst_v, int width) { |
| 2082 const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; | 2082 const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; |
| 2083 asm volatile ( | 2083 asm volatile ( |
| 2084 RGBTOUV_SETUP_REG | 2084 RGBTOUV_SETUP_REG |
| 2085 "1: \n" | 2085 "1: \n" |
| 2086 MEMACCESS(0) | 2086 MEMACCESS(0) |
| 2087 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. | 2087 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. |
| 2088 ARGB4444TOARGB | 2088 ARGB4444TOARGB |
| 2089 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. | 2089 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. |
| 2090 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. | 2090 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. |
| 2091 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. | 2091 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. |
| (...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2130 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V | 2130 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V |
| 2131 MEMACCESS(2) | 2131 MEMACCESS(2) |
| 2132 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 2132 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
| 2133 MEMACCESS(3) | 2133 MEMACCESS(3) |
| 2134 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 2134 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
| 2135 "b.gt 1b \n" | 2135 "b.gt 1b \n" |
| 2136 : "+r"(src_argb4444), // %0 | 2136 : "+r"(src_argb4444), // %0 |
| 2137 "+r"(src_argb4444_1), // %1 | 2137 "+r"(src_argb4444_1), // %1 |
| 2138 "+r"(dst_u), // %2 | 2138 "+r"(dst_u), // %2 |
| 2139 "+r"(dst_v), // %3 | 2139 "+r"(dst_v), // %3 |
| 2140 "+r"(pix) // %4 | 2140 "+r"(width) // %4 |
| 2141 : | 2141 : |
| 2142 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", | 2142 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", |
| 2143 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", | 2143 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", |
| 2144 "v26", "v27", "v28" | 2144 "v26", "v27", "v28" |
| 2145 | 2145 |
| 2146 ); | 2146 ); |
| 2147 } | 2147 } |
| 2148 #endif // HAS_ARGB4444TOUVROW_NEON | 2148 #endif // HAS_ARGB4444TOUVROW_NEON |
| 2149 | 2149 |
| 2150 #ifdef HAS_RGB565TOYROW_NEON | 2150 #ifdef HAS_RGB565TOYROW_NEON |
| 2151 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { | 2151 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { |
| 2152 asm volatile ( | 2152 asm volatile ( |
| 2153 "movi v24.8b, #13 \n" // B * 0.1016 coefficient | 2153 "movi v24.8b, #13 \n" // B * 0.1016 coefficient |
| 2154 "movi v25.8b, #65 \n" // G * 0.5078 coefficient | 2154 "movi v25.8b, #65 \n" // G * 0.5078 coefficient |
| 2155 "movi v26.8b, #33 \n" // R * 0.2578 coefficient | 2155 "movi v26.8b, #33 \n" // R * 0.2578 coefficient |
| 2156 "movi v27.8b, #16 \n" // Add 16 constant | 2156 "movi v27.8b, #16 \n" // Add 16 constant |
| 2157 "1: \n" | 2157 "1: \n" |
| 2158 MEMACCESS(0) | 2158 MEMACCESS(0) |
| 2159 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. | 2159 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. |
| 2160 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2160 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2161 RGB565TOARGB | 2161 RGB565TOARGB |
| 2162 "umull v3.8h, v0.8b, v24.8b \n" // B | 2162 "umull v3.8h, v0.8b, v24.8b \n" // B |
| 2163 "umlal v3.8h, v1.8b, v25.8b \n" // G | 2163 "umlal v3.8h, v1.8b, v25.8b \n" // G |
| 2164 "umlal v3.8h, v2.8b, v26.8b \n" // R | 2164 "umlal v3.8h, v2.8b, v26.8b \n" // R |
| 2165 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 2165 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
| 2166 "uqadd v0.8b, v0.8b, v27.8b \n" | 2166 "uqadd v0.8b, v0.8b, v27.8b \n" |
| 2167 MEMACCESS(1) | 2167 MEMACCESS(1) |
| 2168 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2168 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 2169 "b.gt 1b \n" | 2169 "b.gt 1b \n" |
| 2170 : "+r"(src_rgb565), // %0 | 2170 : "+r"(src_rgb565), // %0 |
| 2171 "+r"(dst_y), // %1 | 2171 "+r"(dst_y), // %1 |
| 2172 "+r"(pix) // %2 | 2172 "+r"(width) // %2 |
| 2173 : | 2173 : |
| 2174 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", | 2174 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", |
| 2175 "v24", "v25", "v26", "v27" | 2175 "v24", "v25", "v26", "v27" |
| 2176 ); | 2176 ); |
| 2177 } | 2177 } |
| 2178 #endif // HAS_RGB565TOYROW_NEON | 2178 #endif // HAS_RGB565TOYROW_NEON |
| 2179 | 2179 |
| 2180 #ifdef HAS_ARGB1555TOYROW_NEON | 2180 #ifdef HAS_ARGB1555TOYROW_NEON |
| 2181 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { | 2181 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { |
| 2182 asm volatile ( | 2182 asm volatile ( |
| 2183 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 2183 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
| 2184 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2184 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
| 2185 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 2185 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
| 2186 "movi v7.8b, #16 \n" // Add 16 constant | 2186 "movi v7.8b, #16 \n" // Add 16 constant |
| 2187 "1: \n" | 2187 "1: \n" |
| 2188 MEMACCESS(0) | 2188 MEMACCESS(0) |
| 2189 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. | 2189 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. |
| 2190 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2190 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2191 ARGB1555TOARGB | 2191 ARGB1555TOARGB |
| 2192 "umull v3.8h, v0.8b, v4.8b \n" // B | 2192 "umull v3.8h, v0.8b, v4.8b \n" // B |
| 2193 "umlal v3.8h, v1.8b, v5.8b \n" // G | 2193 "umlal v3.8h, v1.8b, v5.8b \n" // G |
| 2194 "umlal v3.8h, v2.8b, v6.8b \n" // R | 2194 "umlal v3.8h, v2.8b, v6.8b \n" // R |
| 2195 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 2195 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
| 2196 "uqadd v0.8b, v0.8b, v7.8b \n" | 2196 "uqadd v0.8b, v0.8b, v7.8b \n" |
| 2197 MEMACCESS(1) | 2197 MEMACCESS(1) |
| 2198 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2198 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 2199 "b.gt 1b \n" | 2199 "b.gt 1b \n" |
| 2200 : "+r"(src_argb1555), // %0 | 2200 : "+r"(src_argb1555), // %0 |
| 2201 "+r"(dst_y), // %1 | 2201 "+r"(dst_y), // %1 |
| 2202 "+r"(pix) // %2 | 2202 "+r"(width) // %2 |
| 2203 : | 2203 : |
| 2204 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 2204 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
| 2205 ); | 2205 ); |
| 2206 } | 2206 } |
| 2207 #endif // HAS_ARGB1555TOYROW_NEON | 2207 #endif // HAS_ARGB1555TOYROW_NEON |
| 2208 | 2208 |
| 2209 #ifdef HAS_ARGB4444TOYROW_NEON | 2209 #ifdef HAS_ARGB4444TOYROW_NEON |
| 2210 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { | 2210 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { |
| 2211 asm volatile ( | 2211 asm volatile ( |
| 2212 "movi v24.8b, #13 \n" // B * 0.1016 coefficient | 2212 "movi v24.8b, #13 \n" // B * 0.1016 coefficient |
| 2213 "movi v25.8b, #65 \n" // G * 0.5078 coefficient | 2213 "movi v25.8b, #65 \n" // G * 0.5078 coefficient |
| 2214 "movi v26.8b, #33 \n" // R * 0.2578 coefficient | 2214 "movi v26.8b, #33 \n" // R * 0.2578 coefficient |
| 2215 "movi v27.8b, #16 \n" // Add 16 constant | 2215 "movi v27.8b, #16 \n" // Add 16 constant |
| 2216 "1: \n" | 2216 "1: \n" |
| 2217 MEMACCESS(0) | 2217 MEMACCESS(0) |
| 2218 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. | 2218 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. |
| 2219 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2219 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2220 ARGB4444TOARGB | 2220 ARGB4444TOARGB |
| 2221 "umull v3.8h, v0.8b, v24.8b \n" // B | 2221 "umull v3.8h, v0.8b, v24.8b \n" // B |
| 2222 "umlal v3.8h, v1.8b, v25.8b \n" // G | 2222 "umlal v3.8h, v1.8b, v25.8b \n" // G |
| 2223 "umlal v3.8h, v2.8b, v26.8b \n" // R | 2223 "umlal v3.8h, v2.8b, v26.8b \n" // R |
| 2224 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 2224 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
| 2225 "uqadd v0.8b, v0.8b, v27.8b \n" | 2225 "uqadd v0.8b, v0.8b, v27.8b \n" |
| 2226 MEMACCESS(1) | 2226 MEMACCESS(1) |
| 2227 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2227 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 2228 "b.gt 1b \n" | 2228 "b.gt 1b \n" |
| 2229 : "+r"(src_argb4444), // %0 | 2229 : "+r"(src_argb4444), // %0 |
| 2230 "+r"(dst_y), // %1 | 2230 "+r"(dst_y), // %1 |
| 2231 "+r"(pix) // %2 | 2231 "+r"(width) // %2 |
| 2232 : | 2232 : |
| 2233 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" | 2233 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" |
| 2234 ); | 2234 ); |
| 2235 } | 2235 } |
| 2236 #endif // HAS_ARGB4444TOYROW_NEON | 2236 #endif // HAS_ARGB4444TOYROW_NEON |
| 2237 | 2237 |
| 2238 #ifdef HAS_BGRATOYROW_NEON | 2238 #ifdef HAS_BGRATOYROW_NEON |
| 2239 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { | 2239 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { |
| 2240 asm volatile ( | 2240 asm volatile ( |
| 2241 "movi v4.8b, #33 \n" // R * 0.2578 coefficient | 2241 "movi v4.8b, #33 \n" // R * 0.2578 coefficient |
| 2242 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2242 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
| 2243 "movi v6.8b, #13 \n" // B * 0.1016 coefficient | 2243 "movi v6.8b, #13 \n" // B * 0.1016 coefficient |
| 2244 "movi v7.8b, #16 \n" // Add 16 constant | 2244 "movi v7.8b, #16 \n" // Add 16 constant |
| 2245 "1: \n" | 2245 "1: \n" |
| 2246 MEMACCESS(0) | 2246 MEMACCESS(0) |
| 2247 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. | 2247 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. |
| 2248 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2248 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2249 "umull v16.8h, v1.8b, v4.8b \n" // R | 2249 "umull v16.8h, v1.8b, v4.8b \n" // R |
| 2250 "umlal v16.8h, v2.8b, v5.8b \n" // G | 2250 "umlal v16.8h, v2.8b, v5.8b \n" // G |
| 2251 "umlal v16.8h, v3.8b, v6.8b \n" // B | 2251 "umlal v16.8h, v3.8b, v6.8b \n" // B |
| 2252 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2252 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
| 2253 "uqadd v0.8b, v0.8b, v7.8b \n" | 2253 "uqadd v0.8b, v0.8b, v7.8b \n" |
| 2254 MEMACCESS(1) | 2254 MEMACCESS(1) |
| 2255 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2255 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 2256 "b.gt 1b \n" | 2256 "b.gt 1b \n" |
| 2257 : "+r"(src_bgra), // %0 | 2257 : "+r"(src_bgra), // %0 |
| 2258 "+r"(dst_y), // %1 | 2258 "+r"(dst_y), // %1 |
| 2259 "+r"(pix) // %2 | 2259 "+r"(width) // %2 |
| 2260 : | 2260 : |
| 2261 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2261 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
| 2262 ); | 2262 ); |
| 2263 } | 2263 } |
| 2264 #endif // HAS_BGRATOYROW_NEON | 2264 #endif // HAS_BGRATOYROW_NEON |
| 2265 | 2265 |
| 2266 #ifdef HAS_ABGRTOYROW_NEON | 2266 #ifdef HAS_ABGRTOYROW_NEON |
| 2267 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { | 2267 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { |
| 2268 asm volatile ( | 2268 asm volatile ( |
| 2269 "movi v4.8b, #33 \n" // R * 0.2578 coefficient | 2269 "movi v4.8b, #33 \n" // R * 0.2578 coefficient |
| 2270 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2270 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
| 2271 "movi v6.8b, #13 \n" // B * 0.1016 coefficient | 2271 "movi v6.8b, #13 \n" // B * 0.1016 coefficient |
| 2272 "movi v7.8b, #16 \n" // Add 16 constant | 2272 "movi v7.8b, #16 \n" // Add 16 constant |
| 2273 "1: \n" | 2273 "1: \n" |
| 2274 MEMACCESS(0) | 2274 MEMACCESS(0) |
| 2275 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. | 2275 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. |
| 2276 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2276 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2277 "umull v16.8h, v0.8b, v4.8b \n" // R | 2277 "umull v16.8h, v0.8b, v4.8b \n" // R |
| 2278 "umlal v16.8h, v1.8b, v5.8b \n" // G | 2278 "umlal v16.8h, v1.8b, v5.8b \n" // G |
| 2279 "umlal v16.8h, v2.8b, v6.8b \n" // B | 2279 "umlal v16.8h, v2.8b, v6.8b \n" // B |
| 2280 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2280 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
| 2281 "uqadd v0.8b, v0.8b, v7.8b \n" | 2281 "uqadd v0.8b, v0.8b, v7.8b \n" |
| 2282 MEMACCESS(1) | 2282 MEMACCESS(1) |
| 2283 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2283 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 2284 "b.gt 1b \n" | 2284 "b.gt 1b \n" |
| 2285 : "+r"(src_abgr), // %0 | 2285 : "+r"(src_abgr), // %0 |
| 2286 "+r"(dst_y), // %1 | 2286 "+r"(dst_y), // %1 |
| 2287 "+r"(pix) // %2 | 2287 "+r"(width) // %2 |
| 2288 : | 2288 : |
| 2289 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2289 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
| 2290 ); | 2290 ); |
| 2291 } | 2291 } |
| 2292 #endif // HAS_ABGRTOYROW_NEON | 2292 #endif // HAS_ABGRTOYROW_NEON |
| 2293 | 2293 |
| 2294 #ifdef HAS_RGBATOYROW_NEON | 2294 #ifdef HAS_RGBATOYROW_NEON |
| 2295 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { | 2295 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { |
| 2296 asm volatile ( | 2296 asm volatile ( |
| 2297 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 2297 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
| 2298 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2298 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
| 2299 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 2299 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
| 2300 "movi v7.8b, #16 \n" // Add 16 constant | 2300 "movi v7.8b, #16 \n" // Add 16 constant |
| 2301 "1: \n" | 2301 "1: \n" |
| 2302 MEMACCESS(0) | 2302 MEMACCESS(0) |
| 2303 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. | 2303 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. |
| 2304 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2304 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2305 "umull v16.8h, v1.8b, v4.8b \n" // B | 2305 "umull v16.8h, v1.8b, v4.8b \n" // B |
| 2306 "umlal v16.8h, v2.8b, v5.8b \n" // G | 2306 "umlal v16.8h, v2.8b, v5.8b \n" // G |
| 2307 "umlal v16.8h, v3.8b, v6.8b \n" // R | 2307 "umlal v16.8h, v3.8b, v6.8b \n" // R |
| 2308 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2308 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
| 2309 "uqadd v0.8b, v0.8b, v7.8b \n" | 2309 "uqadd v0.8b, v0.8b, v7.8b \n" |
| 2310 MEMACCESS(1) | 2310 MEMACCESS(1) |
| 2311 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2311 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 2312 "b.gt 1b \n" | 2312 "b.gt 1b \n" |
| 2313 : "+r"(src_rgba), // %0 | 2313 : "+r"(src_rgba), // %0 |
| 2314 "+r"(dst_y), // %1 | 2314 "+r"(dst_y), // %1 |
| 2315 "+r"(pix) // %2 | 2315 "+r"(width) // %2 |
| 2316 : | 2316 : |
| 2317 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2317 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
| 2318 ); | 2318 ); |
| 2319 } | 2319 } |
| 2320 #endif // HAS_RGBATOYROW_NEON | 2320 #endif // HAS_RGBATOYROW_NEON |
| 2321 | 2321 |
| 2322 #ifdef HAS_RGB24TOYROW_NEON | 2322 #ifdef HAS_RGB24TOYROW_NEON |
| 2323 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { | 2323 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { |
| 2324 asm volatile ( | 2324 asm volatile ( |
| 2325 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 2325 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
| 2326 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2326 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
| 2327 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 2327 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
| 2328 "movi v7.8b, #16 \n" // Add 16 constant | 2328 "movi v7.8b, #16 \n" // Add 16 constant |
| 2329 "1: \n" | 2329 "1: \n" |
| 2330 MEMACCESS(0) | 2330 MEMACCESS(0) |
| 2331 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. | 2331 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. |
| 2332 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2332 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2333 "umull v16.8h, v0.8b, v4.8b \n" // B | 2333 "umull v16.8h, v0.8b, v4.8b \n" // B |
| 2334 "umlal v16.8h, v1.8b, v5.8b \n" // G | 2334 "umlal v16.8h, v1.8b, v5.8b \n" // G |
| 2335 "umlal v16.8h, v2.8b, v6.8b \n" // R | 2335 "umlal v16.8h, v2.8b, v6.8b \n" // R |
| 2336 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2336 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
| 2337 "uqadd v0.8b, v0.8b, v7.8b \n" | 2337 "uqadd v0.8b, v0.8b, v7.8b \n" |
| 2338 MEMACCESS(1) | 2338 MEMACCESS(1) |
| 2339 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2339 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 2340 "b.gt 1b \n" | 2340 "b.gt 1b \n" |
| 2341 : "+r"(src_rgb24), // %0 | 2341 : "+r"(src_rgb24), // %0 |
| 2342 "+r"(dst_y), // %1 | 2342 "+r"(dst_y), // %1 |
| 2343 "+r"(pix) // %2 | 2343 "+r"(width) // %2 |
| 2344 : | 2344 : |
| 2345 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2345 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
| 2346 ); | 2346 ); |
| 2347 } | 2347 } |
| 2348 #endif // HAS_RGB24TOYROW_NEON | 2348 #endif // HAS_RGB24TOYROW_NEON |
| 2349 | 2349 |
| 2350 #ifdef HAS_RAWTOYROW_NEON | 2350 #ifdef HAS_RAWTOYROW_NEON |
| 2351 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { | 2351 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { |
| 2352 asm volatile ( | 2352 asm volatile ( |
| 2353 "movi v4.8b, #33 \n" // R * 0.2578 coefficient | 2353 "movi v4.8b, #33 \n" // R * 0.2578 coefficient |
| 2354 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2354 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
| 2355 "movi v6.8b, #13 \n" // B * 0.1016 coefficient | 2355 "movi v6.8b, #13 \n" // B * 0.1016 coefficient |
| 2356 "movi v7.8b, #16 \n" // Add 16 constant | 2356 "movi v7.8b, #16 \n" // Add 16 constant |
| 2357 "1: \n" | 2357 "1: \n" |
| 2358 MEMACCESS(0) | 2358 MEMACCESS(0) |
| 2359 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. | 2359 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. |
| 2360 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2360 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2361 "umull v16.8h, v0.8b, v4.8b \n" // R | 2361 "umull v16.8h, v0.8b, v4.8b \n" // R |
| 2362 "umlal v16.8h, v1.8b, v5.8b \n" // G | 2362 "umlal v16.8h, v1.8b, v5.8b \n" // G |
| 2363 "umlal v16.8h, v2.8b, v6.8b \n" // B | 2363 "umlal v16.8h, v2.8b, v6.8b \n" // B |
| 2364 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2364 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
| 2365 "uqadd v0.8b, v0.8b, v7.8b \n" | 2365 "uqadd v0.8b, v0.8b, v7.8b \n" |
| 2366 MEMACCESS(1) | 2366 MEMACCESS(1) |
| 2367 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2367 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 2368 "b.gt 1b \n" | 2368 "b.gt 1b \n" |
| 2369 : "+r"(src_raw), // %0 | 2369 : "+r"(src_raw), // %0 |
| 2370 "+r"(dst_y), // %1 | 2370 "+r"(dst_y), // %1 |
| 2371 "+r"(pix) // %2 | 2371 "+r"(width) // %2 |
| 2372 : | 2372 : |
| 2373 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2373 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
| 2374 ); | 2374 ); |
| 2375 } | 2375 } |
| 2376 #endif // HAS_RAWTOYROW_NEON | 2376 #endif // HAS_RAWTOYROW_NEON |
| 2377 | 2377 |
| 2378 // Bilinear filter 16x2 -> 16x1 | 2378 // Bilinear filter 16x2 -> 16x1 |
| 2379 #ifdef HAS_INTERPOLATEROW_NEON | 2379 #ifdef HAS_INTERPOLATEROW_NEON |
| 2380 void InterpolateRow_NEON(uint8* dst_ptr, | 2380 void InterpolateRow_NEON(uint8* dst_ptr, |
| 2381 const uint8* src_ptr, ptrdiff_t src_stride, | 2381 const uint8* src_ptr, ptrdiff_t src_stride, |
| (...skipping 690 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3072 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 3072 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 3073 ); | 3073 ); |
| 3074 } | 3074 } |
| 3075 #endif // HAS_SOBELYROW_NEON | 3075 #endif // HAS_SOBELYROW_NEON |
| 3076 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 3076 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
| 3077 | 3077 |
| 3078 #ifdef __cplusplus | 3078 #ifdef __cplusplus |
| 3079 } // extern "C" | 3079 } // extern "C" |
| 3080 } // namespace libyuv | 3080 } // namespace libyuv |
| 3081 #endif | 3081 #endif |
| OLD | NEW |