| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 806 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 817 #endif // HAS_UYVYTOARGBROW_NEON | 817 #endif // HAS_UYVYTOARGBROW_NEON |
| 818 | 818 |
| 819 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. | 819 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. |
| 820 #ifdef HAS_SPLITUVROW_NEON | 820 #ifdef HAS_SPLITUVROW_NEON |
| 821 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 821 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |
| 822 int width) { | 822 int width) { |
| 823 asm volatile ( | 823 asm volatile ( |
| 824 ".p2align 2 \n" | 824 ".p2align 2 \n" |
| 825 "1: \n" | 825 "1: \n" |
| 826 MEMACCESS(0) | 826 MEMACCESS(0) |
| 827 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV | 827 "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV |
| 828 "subs %3, %3, #16 \n" // 16 processed per loop | 828 "subs %3, %3, #16 \n" // 16 processed per loop |
| 829 MEMACCESS(1) | 829 MEMACCESS(1) |
| 830 "vst1.8 {q0}, [%1]! \n" // store U | 830 "st1 {v0.16b}, [%1], #16 \n" // store U |
| 831 MEMACCESS(2) | 831 MEMACCESS(2) |
| 832 "vst1.8 {q1}, [%2]! \n" // store V | 832 "st1 {v1.16b}, [%2], #16 \n" // store V |
| 833 "bgt 1b \n" | 833 "bgt 1b \n" |
| 834 : "+r"(src_uv), // %0 | 834 : "+r"(src_uv), // %0 |
| 835 "+r"(dst_u), // %1 | 835 "+r"(dst_u), // %1 |
| 836 "+r"(dst_v), // %2 | 836 "+r"(dst_v), // %2 |
| 837 "+r"(width) // %3 // Output registers | 837 "+r"(width) // %3 // Output registers |
| 838 : // Input registers | 838 : // Input registers |
| 839 : "cc", "memory", "q0", "q1" // Clobber List | 839 : "cc", "memory", "v0", "v1" // Clobber List |
| 840 ); | 840 ); |
| 841 } | 841 } |
| 842 #endif // HAS_SPLITUVROW_NEON | 842 #endif // HAS_SPLITUVROW_NEON |
| 843 | 843 |
| 844 // Reads 16 U's and V's and writes out 16 pairs of UV. | 844 // Reads 16 U's and V's and writes out 16 pairs of UV. |
| 845 #ifdef HAS_MERGEUVROW_NEON | 845 #ifdef HAS_MERGEUVROW_NEON |
| 846 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 846 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
| 847 int width) { | 847 int width) { |
| 848 asm volatile ( | 848 asm volatile ( |
| 849 ".p2align 2 \n" | 849 ".p2align 2 \n" |
| 850 "1: \n" | 850 "1: \n" |
| 851 MEMACCESS(0) | 851 MEMACCESS(0) |
| 852 "vld1.8 {q0}, [%0]! \n" // load U | 852 "ld1 {v0.16b}, [%0], #16 \n" // load U |
| 853 MEMACCESS(1) | 853 MEMACCESS(1) |
| 854 "vld1.8 {q1}, [%1]! \n" // load V | 854 "ld1 {v1.16b}, [%1], #16 \n" // load V |
| 855 "subs %3, %3, #16 \n" // 16 processed per loop | 855 "subs %3, %3, #16 \n" // 16 processed per loop |
| 856 MEMACCESS(2) | 856 MEMACCESS(2) |
| 857 "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV | 857 "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV |
| 858 "bgt 1b \n" | 858 "bgt 1b \n" |
| 859 : | 859 : |
| 860 "+r"(src_u), // %0 | 860 "+r"(src_u), // %0 |
| 861 "+r"(src_v), // %1 | 861 "+r"(src_v), // %1 |
| 862 "+r"(dst_uv), // %2 | 862 "+r"(dst_uv), // %2 |
| 863 "+r"(width) // %3 // Output registers | 863 "+r"(width) // %3 // Output registers |
| 864 : // Input registers | 864 : // Input registers |
| 865 : "cc", "memory", "q0", "q1" // Clobber List | 865 : "cc", "memory", "v0", "v1" // Clobber List |
| 866 ); | 866 ); |
| 867 } | 867 } |
| 868 #endif // HAS_MERGEUVROW_NEON | 868 #endif // HAS_MERGEUVROW_NEON |
| 869 | 869 |
| 870 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. | 870 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. |
| 871 #ifdef HAS_COPYROW_NEON | 871 #ifdef HAS_COPYROW_NEON |
| 872 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { | 872 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { |
| 873 asm volatile ( | 873 asm volatile ( |
| 874 ".p2align 2 \n" | 874 ".p2align 2 \n" |
| 875 "1: \n" | 875 "1: \n" |
| 876 MEMACCESS(0) | 876 MEMACCESS(0) |
| 877 "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 | 877 "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32 |
| 878 "subs %2, %2, #32 \n" // 32 processed per loop | 878 "subs %2, %2, #32 \n" // 32 processed per loop |
| 879 MEMACCESS(1) | 879 MEMACCESS(1) |
| 880 "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 | 880 "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32 |
| 881 "bgt 1b \n" | 881 "bgt 1b \n" |
| 882 : "+r"(src), // %0 | 882 : "+r"(src), // %0 |
| 883 "+r"(dst), // %1 | 883 "+r"(dst), // %1 |
| 884 "+r"(count) // %2 // Output registers | 884 "+r"(count) // %2 // Output registers |
| 885 : // Input registers | 885 : // Input registers |
| 886 : "cc", "memory", "q0", "q1" // Clobber List | 886 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 887 ); | 887 ); |
| 888 } | 888 } |
| 889 #endif // HAS_COPYROW_NEON | 889 #endif // HAS_COPYROW_NEON |
| 890 | 890 |
| 891 // SetRow8 writes 'count' bytes using a 32 bit value repeated. | 891 // SetRow8 writes 'count' bytes using a 32 bit value repeated. |
| 892 #ifdef HAS_SETROW_NEON | 892 #ifdef HAS_SETROW_NEON |
| 893 void SetRow_NEON(uint8* dst, uint32 v32, int count) { | 893 void SetRow_NEON(uint8* dst, uint32 v32, int count) { |
| 894 asm volatile ( | 894 asm volatile ( |
| 895 "vdup.u32 q0, %2 \n" // duplicate 4 ints | 895 "dup v0.4s, %w2 \n" // duplicate 4 ints |
| 896 "1: \n" | 896 "1: \n" |
| 897 "subs %1, %1, #16 \n" // 16 bytes per loop | 897 "subs %1, %1, #16 \n" // 16 bytes per loop |
| 898 MEMACCESS(0) | 898 MEMACCESS(0) |
| 899 "vst1.8 {q0}, [%0]! \n" // store | 899 "st1 {v0.16b}, [%0], #16 \n" // store |
| 900 "bgt 1b \n" | 900 "bgt 1b \n" |
| 901 : "+r"(dst), // %0 | 901 : "+r"(dst), // %0 |
| 902 "+r"(count) // %1 | 902 "+r"(count) // %1 |
| 903 : "r"(v32) // %2 | 903 : "r"(v32) // %2 |
| 904 : "cc", "memory", "q0" | 904 : "cc", "memory", "v0" |
| 905 ); | 905 ); |
| 906 } | 906 } |
| 907 #endif // HAS_SETROW_NEON | 907 #endif // HAS_SETROW_NEON |
| 908 | 908 |
| 909 // TODO(fbarchard): Make fully assembler | 909 // TODO(fbarchard): Make fully assembler |
| 910 // SetRow32 writes 'count' words using a 32 bit value repeated. | 910 // SetRow32 writes 'count' words using a 32 bit value repeated. |
| 911 #ifdef HAS_ARGBSETROWS_NEON | 911 #ifdef HAS_ARGBSETROWS_NEON |
| 912 void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, | 912 void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, |
| 913 int dst_stride, int height) { | 913 int dst_stride, int height) { |
| 914 for (int y = 0; y < height; ++y) { | 914 for (int y = 0; y < height; ++y) { |
| 915 SetRow_NEON(dst, v32, width << 2); | 915 SetRow_NEON(dst, v32, width << 2); |
| 916 dst += dst_stride; | 916 dst += dst_stride; |
| 917 } | 917 } |
| 918 } | 918 } |
| 919 #endif // HAS_ARGBSETROWS_NEON | 919 #endif // HAS_ARGBSETROWS_NEON |
| 920 | 920 |
| 921 #ifdef HAS_MIRRORROW_NEON | 921 #ifdef HAS_MIRRORROW_NEON |
| 922 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { | 922 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { |
| 923 asm volatile ( | 923 asm volatile ( |
| 924 // Start at end of source row. | 924 // Start at end of source row. |
| 925 "mov r3, #-16 \n" | |
| 926 "add %0, %0, %2 \n" | 925 "add %0, %0, %2 \n" |
| 927 "sub %0, #16 \n" | 926 "sub %0, %0, #16 \n" |
| 928 | 927 |
| 929 ".p2align 2 \n" | 928 ".p2align 2 \n" |
| 930 "1: \n" | 929 "1: \n" |
| 931 MEMACCESS(0) | 930 MEMACCESS(0) |
| 932 "vld1.8 {q0}, [%0], r3 \n" // src -= 16 | 931 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 |
| 933 "subs %2, #16 \n" // 16 pixels per loop. | 932 "subs %2, %2, #16 \n" // 16 pixels per loop. |
| 934 "vrev64.8 q0, q0 \n" | 933 "rev64 v0.16b, v0.16b \n" |
| 935 MEMACCESS(1) | 934 MEMACCESS(1) |
| 936 "vst1.8 {d1}, [%1]! \n" // dst += 16 | 935 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 |
| 937 MEMACCESS(1) | 936 MEMACCESS(1) |
| 938 "vst1.8 {d0}, [%1]! \n" | 937 "st1 {v0.D}[0], [%1], #8 \n" |
| 939 "bgt 1b \n" | 938 "bgt 1b \n" |
| 940 : "+r"(src), // %0 | 939 : "+r"(src), // %0 |
| 941 "+r"(dst), // %1 | 940 "+r"(dst), // %1 |
| 942 "+r"(width) // %2 | 941 "+r"(width) // %2 |
| 943 : | 942 : "r"((ptrdiff_t)-16) // %3 |
| 944 : "cc", "memory", "r3", "q0" | 943 : "cc", "memory", "v0" |
| 945 ); | 944 ); |
| 946 } | 945 } |
| 947 #endif // HAS_MIRRORROW_NEON | 946 #endif // HAS_MIRRORROW_NEON |
| 948 | 947 |
| 949 #ifdef HAS_MIRRORUVROW_NEON | 948 #ifdef HAS_MIRRORUVROW_NEON |
| 950 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 949 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |
| 951 int width) { | 950 int width) { |
| 952 asm volatile ( | 951 asm volatile ( |
| 953 // Start at end of source row. | 952 // Start at end of source row. |
| 954 "mov r12, #-16 \n" | |
| 955 "add %0, %0, %3, lsl #1 \n" | 953 "add %0, %0, %3, lsl #1 \n" |
| 956 "sub %0, #16 \n" | 954 "sub %0, %0, #16 \n" |
| 957 | 955 |
| 958 ".p2align 2 \n" | 956 ".p2align 2 \n" |
| 959 "1: \n" | 957 "1: \n" |
| 960 MEMACCESS(0) | 958 MEMACCESS(0) |
| 961 "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 | 959 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 |
| 962 "subs %3, #8 \n" // 8 pixels per loop. | 960 "subs %3, %3, #8 \n" // 8 pixels per loop. |
| 963 "vrev64.8 q0, q0 \n" | 961 "rev64 v0.8b, v0.8b \n" |
| 962 "rev64 v1.8b, v1.8b \n" |
| 964 MEMACCESS(1) | 963 MEMACCESS(1) |
| 965 "vst1.8 {d0}, [%1]! \n" // dst += 8 | 964 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 |
| 966 MEMACCESS(2) | 965 MEMACCESS(2) |
| 967 "vst1.8 {d1}, [%2]! \n" | 966 "st1 {v1.8b}, [%2], #8 \n" |
| 968 "bgt 1b \n" | 967 "bgt 1b \n" |
| 969 : "+r"(src_uv), // %0 | 968 : "+r"(src_uv), // %0 |
| 970 "+r"(dst_u), // %1 | 969 "+r"(dst_u), // %1 |
| 971 "+r"(dst_v), // %2 | 970 "+r"(dst_v), // %2 |
| 972 "+r"(width) // %3 | 971 "+r"(width) // %3 |
| 973 : | 972 : "r"((ptrdiff_t)-16) // %4 |
| 974 : "cc", "memory", "r12", "q0" | 973 : "cc", "memory", "v0", "v1" |
| 975 ); | 974 ); |
| 976 } | 975 } |
| 977 #endif // HAS_MIRRORUVROW_NEON | 976 #endif // HAS_MIRRORUVROW_NEON |
| 978 | 977 |
| 979 #ifdef HAS_ARGBMIRRORROW_NEON | 978 #ifdef HAS_ARGBMIRRORROW_NEON |
| 980 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { | 979 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { |
| 981 asm volatile ( | 980 asm volatile ( |
| 982 // Start at end of source row. | 981 // Start at end of source row. |
| 983 "mov r3, #-16 \n" | |
| 984 "add %0, %0, %2, lsl #2 \n" | 982 "add %0, %0, %2, lsl #2 \n" |
| 985 "sub %0, #16 \n" | 983 "sub %0, %0, #16 \n" |
| 986 | 984 |
| 987 ".p2align 2 \n" | 985 ".p2align 2 \n" |
| 988 "1: \n" | 986 "1: \n" |
| 989 MEMACCESS(0) | 987 MEMACCESS(0) |
| 990 "vld1.8 {q0}, [%0], r3 \n" // src -= 16 | 988 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 |
| 991 "subs %2, #4 \n" // 4 pixels per loop. | 989 "subs %2, %2, #4 \n" // 4 pixels per loop. |
| 992 "vrev64.32 q0, q0 \n" | 990 "rev64 v0.4s, v0.4s \n" |
| 993 MEMACCESS(1) | 991 MEMACCESS(1) |
| 994 "vst1.8 {d1}, [%1]! \n" // dst += 16 | 992 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 |
| 995 MEMACCESS(1) | 993 MEMACCESS(1) |
| 996 "vst1.8 {d0}, [%1]! \n" | 994 "st1 {v0.D}[0], [%1], #8 \n" |
| 997 "bgt 1b \n" | 995 "bgt 1b \n" |
| 998 : "+r"(src), // %0 | 996 : "+r"(src), // %0 |
| 999 "+r"(dst), // %1 | 997 "+r"(dst), // %1 |
| 1000 "+r"(width) // %2 | 998 "+r"(width) // %2 |
| 1001 : | 999 : "r"((ptrdiff_t)-16) // %3 |
| 1002 : "cc", "memory", "r3", "q0" | 1000 : "cc", "memory", "v0" |
| 1003 ); | 1001 ); |
| 1004 } | 1002 } |
| 1005 #endif // HAS_ARGBMIRRORROW_NEON | 1003 #endif // HAS_ARGBMIRRORROW_NEON |
| 1006 | 1004 |
| 1007 #ifdef HAS_RGB24TOARGBROW_NEON | 1005 #ifdef HAS_RGB24TOARGBROW_NEON |
| 1008 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { | 1006 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { |
| 1009 asm volatile ( | 1007 asm volatile ( |
| 1010 "vmov.u8 d4, #255 \n" // Alpha | 1008 "movi v4.8b, #255 \n" // Alpha |
| 1011 ".p2align 2 \n" | 1009 ".p2align 2 \n" |
| 1012 "1: \n" | 1010 "1: \n" |
| 1013 MEMACCESS(0) | 1011 MEMACCESS(0) |
| 1014 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. | 1012 "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. |
| 1015 "subs %2, %2, #8 \n" // 8 processed per loop. | 1013 "subs %2, %2, #8 \n" // 8 processed per loop. |
| 1016 MEMACCESS(1) | 1014 MEMACCESS(1) |
| 1017 "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. | 1015 "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB. |
| 1018 "bgt 1b \n" | 1016 "bgt 1b \n" |
| 1019 : "+r"(src_rgb24), // %0 | 1017 : "+r"(src_rgb24), // %0 |
| 1020 "+r"(dst_argb), // %1 | 1018 "+r"(dst_argb), // %1 |
| 1021 "+r"(pix) // %2 | 1019 "+r"(pix) // %2 |
| 1022 : | 1020 : |
| 1023 : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List | 1021 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List |
| 1024 ); | 1022 ); |
| 1025 } | 1023 } |
| 1026 #endif // HAS_RGB24TOARGBROW_NEON | 1024 #endif // HAS_RGB24TOARGBROW_NEON |
| 1027 | 1025 |
| 1028 #ifdef HAS_RAWTOARGBROW_NEON | 1026 #ifdef HAS_RAWTOARGBROW_NEON |
| 1029 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { | 1027 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { |
| 1030 asm volatile ( | 1028 asm volatile ( |
| 1031 "vmov.u8 d4, #255 \n" // Alpha | 1029 "movi v5.8b, #255 \n" // Alpha |
| 1032 ".p2align 2 \n" | 1030 ".p2align 2 \n" |
| 1033 "1: \n" | 1031 "1: \n" |
| 1034 MEMACCESS(0) | 1032 MEMACCESS(0) |
| 1035 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. | 1033 "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b |
| 1036 "subs %2, %2, #8 \n" // 8 processed per loop. | 1034 "subs %2, %2, #8 \n" // 8 processed per loop. |
| 1037 "vswp.u8 d1, d3 \n" // swap R, B | 1035 "mov v3.8b, v1.8b \n" // move g |
| 1036 "mov v4.8b, v0.8b \n" // move r |
| 1038 MEMACCESS(1) | 1037 MEMACCESS(1) |
| 1039 "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. | 1038 "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a |
| 1040 "bgt 1b \n" | 1039 "bgt 1b \n" |
| 1041 : "+r"(src_raw), // %0 | 1040 : "+r"(src_raw), // %0 |
| 1042 "+r"(dst_argb), // %1 | 1041 "+r"(dst_argb), // %1 |
| 1043 "+r"(pix) // %2 | 1042 "+r"(pix) // %2 |
| 1044 : | 1043 : |
| 1045 : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List | 1044 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List |
| 1046 ); | 1045 ); |
| 1047 } | 1046 } |
| 1048 #endif // HAS_RAWTOARGBROW_NEON | 1047 #endif // HAS_RAWTOARGBROW_NEON |
| 1049 | 1048 |
| 1050 #define RGB565TOARGB \ | 1049 #define RGB565TOARGB \ |
| 1051 "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ | 1050 "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ |
| 1052 "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ | 1051 "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ |
| 1053 "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ | 1052 "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ |
| 1054 "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ | 1053 "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ |
| 1055 "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ | 1054 "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ |
| (...skipping 107 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1163 ); | 1162 ); |
| 1164 } | 1163 } |
| 1165 #endif // HAS_ARGB4444TOARGBROW_NEON | 1164 #endif // HAS_ARGB4444TOARGBROW_NEON |
| 1166 | 1165 |
| 1167 #ifdef HAS_ARGBTORGB24ROW_NEON | 1166 #ifdef HAS_ARGBTORGB24ROW_NEON |
| 1168 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { | 1167 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { |
| 1169 asm volatile ( | 1168 asm volatile ( |
| 1170 ".p2align 2 \n" | 1169 ".p2align 2 \n" |
| 1171 "1: \n" | 1170 "1: \n" |
| 1172 MEMACCESS(0) | 1171 MEMACCESS(0) |
| 1173 "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. | 1172 "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB. |
| 1174 "subs %2, %2, #8 \n" // 8 processed per loop. | 1173 "subs %2, %2, #8 \n" // 8 processed per loop. |
| 1175 MEMACCESS(1) | 1174 MEMACCESS(1) |
| 1176 "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. | 1175 "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. |
| 1177 "bgt 1b \n" | 1176 "bgt 1b \n" |
| 1178 : "+r"(src_argb), // %0 | 1177 : "+r"(src_argb), // %0 |
| 1179 "+r"(dst_rgb24), // %1 | 1178 "+r"(dst_rgb24), // %1 |
| 1180 "+r"(pix) // %2 | 1179 "+r"(pix) // %2 |
| 1181 : | 1180 : |
| 1182 : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List | 1181 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List |
| 1183 ); | 1182 ); |
| 1184 } | 1183 } |
| 1185 #endif // HAS_ARGBTORGB24ROW_NEON | 1184 #endif // HAS_ARGBTORGB24ROW_NEON |
| 1186 | 1185 |
| 1187 #ifdef HAS_ARGBTORAWROW_NEON | 1186 #ifdef HAS_ARGBTORAWROW_NEON |
| 1188 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { | 1187 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { |
| 1189 asm volatile ( | 1188 asm volatile ( |
| 1190 ".p2align 2 \n" | 1189 ".p2align 2 \n" |
| 1191 "1: \n" | 1190 "1: \n" |
| 1192 MEMACCESS(0) | 1191 MEMACCESS(0) |
| 1193 "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. | 1192 "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a |
| 1194 "subs %2, %2, #8 \n" // 8 processed per loop. | 1193 "subs %2, %2, #8 \n" // 8 processed per loop. |
| 1195 "vswp.u8 d1, d3 \n" // swap R, B | 1194 "mov v4.8b, v2.8b \n" // mov g |
| 1195 "mov v5.8b, v1.8b \n" // mov b |
| 1196 MEMACCESS(1) | 1196 MEMACCESS(1) |
| 1197 "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. | 1197 "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b |
| 1198 "bgt 1b \n" | 1198 "bgt 1b \n" |
| 1199 : "+r"(src_argb), // %0 | 1199 : "+r"(src_argb), // %0 |
| 1200 "+r"(dst_raw), // %1 | 1200 "+r"(dst_raw), // %1 |
| 1201 "+r"(pix) // %2 | 1201 "+r"(pix) // %2 |
| 1202 : | 1202 : |
| 1203 : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List | 1203 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List |
| 1204 ); | 1204 ); |
| 1205 } | 1205 } |
| 1206 #endif // HAS_ARGBTORAWROW_NEON | 1206 #endif // HAS_ARGBTORAWROW_NEON |
| 1207 | 1207 |
| 1208 #ifdef HAS_YUY2TOYROW_NEON | 1208 #ifdef HAS_YUY2TOYROW_NEON |
| 1209 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { | 1209 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { |
| 1210 asm volatile ( | 1210 asm volatile ( |
| 1211 ".p2align 2 \n" | 1211 ".p2align 2 \n" |
| 1212 "1: \n" | 1212 "1: \n" |
| 1213 MEMACCESS(0) | 1213 MEMACCESS(0) |
| 1214 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. | 1214 "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. |
| 1215 "subs %2, %2, #16 \n" // 16 processed per loop. | 1215 "subs %2, %2, #16 \n" // 16 processed per loop. |
| 1216 MEMACCESS(1) | 1216 MEMACCESS(1) |
| 1217 "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. | 1217 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. |
| 1218 "bgt 1b \n" | 1218 "bgt 1b \n" |
| 1219 : "+r"(src_yuy2), // %0 | 1219 : "+r"(src_yuy2), // %0 |
| 1220 "+r"(dst_y), // %1 | 1220 "+r"(dst_y), // %1 |
| 1221 "+r"(pix) // %2 | 1221 "+r"(pix) // %2 |
| 1222 : | 1222 : |
| 1223 : "cc", "memory", "q0", "q1" // Clobber List | 1223 : "cc", "memory", "v0", "v1" // Clobber List |
| 1224 ); | 1224 ); |
| 1225 } | 1225 } |
| 1226 #endif // HAS_YUY2TOYROW_NEON | 1226 #endif // HAS_YUY2TOYROW_NEON |
| 1227 | 1227 |
| 1228 #ifdef HAS_UYVYTOYROW_NEON | 1228 #ifdef HAS_UYVYTOYROW_NEON |
| 1229 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { | 1229 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { |
| 1230 asm volatile ( | 1230 asm volatile ( |
| 1231 ".p2align 2 \n" | 1231 ".p2align 2 \n" |
| 1232 "1: \n" | 1232 "1: \n" |
| 1233 MEMACCESS(0) | 1233 MEMACCESS(0) |
| 1234 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. | 1234 "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. |
| 1235 "subs %2, %2, #16 \n" // 16 processed per loop. | 1235 "subs %2, %2, #16 \n" // 16 processed per loop. |
| 1236 MEMACCESS(1) | 1236 MEMACCESS(1) |
| 1237 "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. | 1237 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. |
| 1238 "bgt 1b \n" | 1238 "bgt 1b \n" |
| 1239 : "+r"(src_uyvy), // %0 | 1239 : "+r"(src_uyvy), // %0 |
| 1240 "+r"(dst_y), // %1 | 1240 "+r"(dst_y), // %1 |
| 1241 "+r"(pix) // %2 | 1241 "+r"(pix) // %2 |
| 1242 : | 1242 : |
| 1243 : "cc", "memory", "q0", "q1" // Clobber List | 1243 : "cc", "memory", "v0", "v1" // Clobber List |
| 1244 ); | 1244 ); |
| 1245 } | 1245 } |
| 1246 #endif // HAS_UYVYTOYROW_NEON | 1246 #endif // HAS_UYVYTOYROW_NEON |
| 1247 | 1247 |
| 1248 #ifdef HAS_YUY2TOUV422ROW_NEON | 1248 #ifdef HAS_YUY2TOUV422ROW_NEON |
| 1249 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, | 1249 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, |
| 1250 int pix) { | 1250 int pix) { |
| 1251 asm volatile ( | 1251 asm volatile ( |
| 1252 ".p2align 2 \n" | 1252 ".p2align 2 \n" |
| 1253 "1: \n" | 1253 "1: \n" |
| 1254 MEMACCESS(0) | 1254 MEMACCESS(0) |
| 1255 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. | 1255 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. |
| 1256 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. | 1256 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. |
| 1257 MEMACCESS(1) | 1257 MEMACCESS(1) |
| 1258 "vst1.8 {d1}, [%1]! \n" // store 8 U. | 1258 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. |
| 1259 MEMACCESS(2) | 1259 MEMACCESS(2) |
| 1260 "vst1.8 {d3}, [%2]! \n" // store 8 V. | 1260 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. |
| 1261 "bgt 1b \n" | 1261 "bgt 1b \n" |
| 1262 : "+r"(src_yuy2), // %0 | 1262 : "+r"(src_yuy2), // %0 |
| 1263 "+r"(dst_u), // %1 | 1263 "+r"(dst_u), // %1 |
| 1264 "+r"(dst_v), // %2 | 1264 "+r"(dst_v), // %2 |
| 1265 "+r"(pix) // %3 | 1265 "+r"(pix) // %3 |
| 1266 : | 1266 : |
| 1267 : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List | 1267 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 1268 ); | 1268 ); |
| 1269 } | 1269 } |
| 1270 #endif // HAS_YUY2TOUV422ROW_NEON | 1270 #endif // HAS_YUY2TOUV422ROW_NEON |
| 1271 | 1271 |
| 1272 #ifdef HAS_UYVYTOUV422ROW_NEON | 1272 #ifdef HAS_UYVYTOUV422ROW_NEON |
| 1273 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, | 1273 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, |
| 1274 int pix) { | 1274 int pix) { |
| 1275 asm volatile ( | 1275 asm volatile ( |
| 1276 ".p2align 2 \n" | 1276 ".p2align 2 \n" |
| 1277 "1: \n" | 1277 "1: \n" |
| 1278 MEMACCESS(0) | 1278 MEMACCESS(0) |
| 1279 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. | 1279 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. |
| 1280 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. | 1280 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. |
| 1281 MEMACCESS(1) | 1281 MEMACCESS(1) |
| 1282 "vst1.8 {d0}, [%1]! \n" // store 8 U. | 1282 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. |
| 1283 MEMACCESS(2) | 1283 MEMACCESS(2) |
| 1284 "vst1.8 {d2}, [%2]! \n" // store 8 V. | 1284 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. |
| 1285 "bgt 1b \n" | 1285 "bgt 1b \n" |
| 1286 : "+r"(src_uyvy), // %0 | 1286 : "+r"(src_uyvy), // %0 |
| 1287 "+r"(dst_u), // %1 | 1287 "+r"(dst_u), // %1 |
| 1288 "+r"(dst_v), // %2 | 1288 "+r"(dst_v), // %2 |
| 1289 "+r"(pix) // %3 | 1289 "+r"(pix) // %3 |
| 1290 : | 1290 : |
| 1291 : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List | 1291 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 1292 ); | 1292 ); |
| 1293 } | 1293 } |
| 1294 #endif // HAS_UYVYTOUV422ROW_NEON | 1294 #endif // HAS_UYVYTOUV422ROW_NEON |
| 1295 | 1295 |
| 1296 #ifdef HAS_YUY2TOUVROW_NEON | 1296 #ifdef HAS_YUY2TOUVROW_NEON |
| 1297 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, | 1297 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, |
| 1298 uint8* dst_u, uint8* dst_v, int pix) { | 1298 uint8* dst_u, uint8* dst_v, int pix) { |
| 1299 asm volatile ( | 1299 asm volatile ( |
| 1300 "add %1, %0, %1 \n" // stride + src_yuy2 | 1300 "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2 |
| 1301 ".p2align 2 \n" | 1301 ".p2align 2 \n" |
| 1302 "1: \n" | 1302 "1: \n" |
| 1303 MEMACCESS(0) | 1303 MEMACCESS(0) |
| 1304 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. | 1304 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. |
| 1305 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. | 1305 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. |
| 1306 MEMACCESS(1) | 1306 MEMACCESS(1) |
| 1307 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. | 1307 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2. |
| 1308 "vrhadd.u8 d1, d1, d5 \n" // average rows of U | 1308 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U |
| 1309 "vrhadd.u8 d3, d3, d7 \n" // average rows of V | 1309 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V |
| 1310 MEMACCESS(2) | 1310 MEMACCESS(2) |
| 1311 "vst1.8 {d1}, [%2]! \n" // store 8 U. | 1311 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. |
| 1312 MEMACCESS(3) | 1312 MEMACCESS(3) |
| 1313 "vst1.8 {d3}, [%3]! \n" // store 8 V. | 1313 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. |
| 1314 "bgt 1b \n" | 1314 "bgt 1b \n" |
| 1315 : "+r"(src_yuy2), // %0 | 1315 : "+r"(src_yuy2), // %0 |
| 1316 "+r"(stride_yuy2), // %1 | 1316 "+r"(stride_yuy2), // %1 |
| 1317 "+r"(dst_u), // %2 | 1317 "+r"(dst_u), // %2 |
| 1318 "+r"(dst_v), // %3 | 1318 "+r"(dst_v), // %3 |
| 1319 "+r"(pix) // %4 | 1319 "+r"(pix) // %4 |
| 1320 : | 1320 : |
| 1321 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List | 1321 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List |
| 1322 ); | 1322 ); |
| 1323 } | 1323 } |
| 1324 #endif // HAS_YUY2TOUVROW_NEON | 1324 #endif // HAS_YUY2TOUVROW_NEON |
| 1325 | 1325 |
| 1326 #ifdef HAS_UYVYTOUVROW_NEON | 1326 #ifdef HAS_UYVYTOUVROW_NEON |
| 1327 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, | 1327 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, |
| 1328 uint8* dst_u, uint8* dst_v, int pix) { | 1328 uint8* dst_u, uint8* dst_v, int pix) { |
| 1329 asm volatile ( | 1329 asm volatile ( |
| 1330 "add %1, %0, %1 \n" // stride + src_uyvy | 1330 "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy |
| 1331 ".p2align 2 \n" | 1331 ".p2align 2 \n" |
| 1332 "1: \n" | 1332 "1: \n" |
| 1333 MEMACCESS(0) | 1333 MEMACCESS(0) |
| 1334 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. | 1334 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. |
| 1335 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. | 1335 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. |
| 1336 MEMACCESS(1) | 1336 MEMACCESS(1) |
| 1337 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. | 1337 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY. |
| 1338 "vrhadd.u8 d0, d0, d4 \n" // average rows of U | 1338 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U |
| 1339 "vrhadd.u8 d2, d2, d6 \n" // average rows of V | 1339 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V |
| 1340 MEMACCESS(2) | 1340 MEMACCESS(2) |
| 1341 "vst1.8 {d0}, [%2]! \n" // store 8 U. | 1341 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. |
| 1342 MEMACCESS(3) | 1342 MEMACCESS(3) |
| 1343 "vst1.8 {d2}, [%3]! \n" // store 8 V. | 1343 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. |
| 1344 "bgt 1b \n" | 1344 "bgt 1b \n" |
| 1345 : "+r"(src_uyvy), // %0 | 1345 : "+r"(src_uyvy), // %0 |
| 1346 "+r"(stride_uyvy), // %1 | 1346 "+r"(stride_uyvy), // %1 |
| 1347 "+r"(dst_u), // %2 | 1347 "+r"(dst_u), // %2 |
| 1348 "+r"(dst_v), // %3 | 1348 "+r"(dst_v), // %3 |
| 1349 "+r"(pix) // %4 | 1349 "+r"(pix) // %4 |
| 1350 : | 1350 : |
| 1351 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List | 1351 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List |
| 1352 ); | 1352 ); |
| 1353 } | 1353 } |
| 1354 #endif // HAS_UYVYTOUVROW_NEON | 1354 #endif // HAS_UYVYTOUVROW_NEON |
| 1355 | 1355 |
| 1356 #ifdef HAS_HALFROW_NEON | 1356 #ifdef HAS_HALFROW_NEON |
| 1357 void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, | 1357 void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, |
| 1358 uint8* dst_uv, int pix) { | 1358 uint8* dst_uv, int pix) { |
| 1359 asm volatile ( | 1359 asm volatile ( |
| 1360 // change the stride to row 2 pointer | 1360 // change the stride to row 2 pointer |
| 1361 "add %1, %0 \n" | 1361 "add %x1, %x0, %w1, sxtw \n" |
| 1362 "1: \n" | 1362 "1: \n" |
| 1363 MEMACCESS(0) | 1363 MEMACCESS(0) |
| 1364 "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. | 1364 "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels. |
| 1365 "subs %3, %3, #16 \n" // 16 processed per loop | 1365 "subs %3, %3, #16 \n" // 16 processed per loop |
| 1366 MEMACCESS(1) | 1366 MEMACCESS(1) |
| 1367 "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. | 1367 "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels. |
| 1368 "vrhadd.u8 q0, q1 \n" // average row 1 and 2 | 1368 "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2 |
| 1369 MEMACCESS(2) | 1369 MEMACCESS(2) |
| 1370 "vst1.8 {q0}, [%2]! \n" | 1370 "st1 {v0.16b}, [%2], #16 \n" |
| 1371 "bgt 1b \n" | 1371 "bgt 1b \n" |
| 1372 : "+r"(src_uv), // %0 | 1372 : "+r"(src_uv), // %0 |
| 1373 "+r"(src_uv_stride), // %1 | 1373 "+r"(src_uv_stride), // %1 |
| 1374 "+r"(dst_uv), // %2 | 1374 "+r"(dst_uv), // %2 |
| 1375 "+r"(pix) // %3 | 1375 "+r"(pix) // %3 |
| 1376 : | 1376 : |
| 1377 : "cc", "memory", "q0", "q1" // Clobber List | 1377 : "cc", "memory", "v0", "v1" // Clobber List |
| 1378 ); | 1378 ); |
| 1379 } | 1379 } |
| 1380 #endif // HAS_HALFROW_NEON | 1380 #endif // HAS_HALFROW_NEON |
| 1381 | 1381 |
| 1382 // Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG | 1382 // Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG |
| 1383 #ifdef HAS_ARGBTOBAYERROW_NEON | 1383 #ifdef HAS_ARGBTOBAYERROW_NEON |
| 1384 void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, | 1384 void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, |
| 1385 uint32 selector, int pix) { | 1385 uint32 selector, int pix) { |
| 1386 asm volatile ( | 1386 asm volatile ( |
| 1387 "vmov.u32 d6[0], %3 \n" // selector | 1387 "mov v2.s[0], %w3 \n" // selector |
| 1388 "1: \n" | 1388 "1: \n" |
| 1389 MEMACCESS(0) | 1389 MEMACCESS(0) |
| 1390 "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. | 1390 "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 8 pixels. |
| 1391 "subs %2, %2, #8 \n" // 8 processed per loop | 1391 "subs %2, %2, #8 \n" // 8 processed per loop |
| 1392 "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels | 1392 "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels |
| 1393 "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels | 1393 "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels |
| 1394 "vtrn.u32 d4, d5 \n" // combine 8 pixels | 1394 "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels |
| 1395 MEMACCESS(1) | 1395 MEMACCESS(1) |
| 1396 "vst1.8 {d4}, [%1]! \n" // store 8. | 1396 "st1 {v4.8b}, [%1], #8 \n" // store 8. |
| 1397 "bgt 1b \n" | 1397 "bgt 1b \n" |
| 1398 : "+r"(src_argb), // %0 | 1398 : "+r"(src_argb), // %0 |
| 1399 "+r"(dst_bayer), // %1 | 1399 "+r"(dst_bayer), // %1 |
| 1400 "+r"(pix) // %2 | 1400 "+r"(pix) // %2 |
| 1401 : "r"(selector) // %3 | 1401 : "r"(selector) // %3 |
| 1402 : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List | 1402 : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List |
| 1403 ); | 1403 ); |
| 1404 } | 1404 } |
| 1405 #endif // HAS_ARGBTOBAYERROW_NEON | 1405 #endif // HAS_ARGBTOBAYERROW_NEON |
| 1406 | 1406 |
| 1407 // Select G channels from ARGB. e.g. GGGGGGGG | 1407 // Select G channels from ARGB. e.g. GGGGGGGG |
| 1408 #ifdef HAS_ARGBTOBAYERGGROW_NEON | 1408 #ifdef HAS_ARGBTOBAYERGGROW_NEON |
| 1409 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, | 1409 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, |
| 1410 uint32 /*selector*/, int pix) { | 1410 uint32 /*selector*/, int pix) { |
| 1411 asm volatile ( | 1411 asm volatile ( |
| 1412 "1: \n" | 1412 "1: \n" |
| 1413 MEMACCESS(0) | 1413 MEMACCESS(0) |
| 1414 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. | 1414 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels. |
| 1415 "subs %2, %2, #8 \n" // 8 processed per loop | 1415 "subs %2, %2, #8 \n" // 8 processed per loop |
| 1416 MEMACCESS(1) | 1416 MEMACCESS(1) |
| 1417 "vst1.8 {d1}, [%1]! \n" // store 8 G's. | 1417 "st1 {v1.8b}, [%1], #8 \n" // store 8 G's. |
| 1418 "bgt 1b \n" | 1418 "bgt 1b \n" |
| 1419 : "+r"(src_argb), // %0 | 1419 : "+r"(src_argb), // %0 |
| 1420 "+r"(dst_bayer), // %1 | 1420 "+r"(dst_bayer), // %1 |
| 1421 "+r"(pix) // %2 | 1421 "+r"(pix) // %2 |
| 1422 : | 1422 : |
| 1423 : "cc", "memory", "q0", "q1" // Clobber List | 1423 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 1424 ); | 1424 ); |
| 1425 } | 1425 } |
| 1426 #endif // HAS_ARGBTOBAYERGGROW_NEON | 1426 #endif // HAS_ARGBTOBAYERGGROW_NEON |
| 1427 | 1427 |
| 1428 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 1428 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
| 1429 #ifdef HAS_ARGBSHUFFLEROW_NEON | 1429 #ifdef HAS_ARGBSHUFFLEROW_NEON |
| 1430 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, | 1430 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, |
| 1431 const uint8* shuffler, int pix) { | 1431 const uint8* shuffler, int pix) { |
| 1432 asm volatile ( | 1432 asm volatile ( |
| 1433 MEMACCESS(3) | 1433 MEMACCESS(3) |
| 1434 "vld1.8 {q2}, [%3] \n" // shuffler | 1434 "ld1 {v2.16b}, [%3] \n" // shuffler |
| 1435 "1: \n" | 1435 "1: \n" |
| 1436 MEMACCESS(0) | 1436 MEMACCESS(0) |
| 1437 "vld1.8 {q0}, [%0]! \n" // load 4 pixels. | 1437 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. |
| 1438 "subs %2, %2, #4 \n" // 4 processed per loop | 1438 "subs %2, %2, #4 \n" // 4 processed per loop |
| 1439 "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels | 1439 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels |
| 1440 "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels | |
| 1441 MEMACCESS(1) | 1440 MEMACCESS(1) |
| 1442 "vst1.8 {q1}, [%1]! \n" // store 4. | 1441 "st1 {v1.16b}, [%1], #16 \n" // store 4. |
| 1443 "bgt 1b \n" | 1442 "bgt 1b \n" |
| 1444 : "+r"(src_argb), // %0 | 1443 : "+r"(src_argb), // %0 |
| 1445 "+r"(dst_argb), // %1 | 1444 "+r"(dst_argb), // %1 |
| 1446 "+r"(pix) // %2 | 1445 "+r"(pix) // %2 |
| 1447 : "r"(shuffler) // %3 | 1446 : "r"(shuffler) // %3 |
| 1448 : "cc", "memory", "q0", "q1", "q2" // Clobber List | 1447 : "cc", "memory", "v0", "v1", "v2" // Clobber List |
| 1449 ); | 1448 ); |
| 1450 } | 1449 } |
| 1451 #endif // HAS_ARGBSHUFFLEROW_NEON | 1450 #endif // HAS_ARGBSHUFFLEROW_NEON |
| 1452 | 1451 |
| 1453 #ifdef HAS_I422TOYUY2ROW_NEON | 1452 #ifdef HAS_I422TOYUY2ROW_NEON |
| 1454 void I422ToYUY2Row_NEON(const uint8* src_y, | 1453 void I422ToYUY2Row_NEON(const uint8* src_y, |
| 1455 const uint8* src_u, | 1454 const uint8* src_u, |
| 1456 const uint8* src_v, | 1455 const uint8* src_v, |
| 1457 uint8* dst_yuy2, int width) { | 1456 uint8* dst_yuy2, int width) { |
| 1458 asm volatile ( | 1457 asm volatile ( |
| 1459 ".p2align 2 \n" | 1458 ".p2align 2 \n" |
| 1460 "1: \n" | 1459 "1: \n" |
| 1461 MEMACCESS(0) | 1460 MEMACCESS(0) |
| 1462 "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys | 1461 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys |
| 1462 "mov v2.8b, v1.8b \n" |
| 1463 MEMACCESS(1) | 1463 MEMACCESS(1) |
| 1464 "vld1.8 {d1}, [%1]! \n" // load 8 Us | 1464 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us |
| 1465 MEMACCESS(2) | 1465 MEMACCESS(2) |
| 1466 "vld1.8 {d3}, [%2]! \n" // load 8 Vs | 1466 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs |
| 1467 "subs %4, %4, #16 \n" // 16 pixels | 1467 "subs %4, %4, #16 \n" // 16 pixels |
| 1468 MEMACCESS(3) | 1468 MEMACCESS(3) |
| 1469 "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. | 1469 "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels. |
| 1470 "bgt 1b \n" | 1470 "bgt 1b \n" |
| 1471 : "+r"(src_y), // %0 | 1471 : "+r"(src_y), // %0 |
| 1472 "+r"(src_u), // %1 | 1472 "+r"(src_u), // %1 |
| 1473 "+r"(src_v), // %2 | 1473 "+r"(src_v), // %2 |
| 1474 "+r"(dst_yuy2), // %3 | 1474 "+r"(dst_yuy2), // %3 |
| 1475 "+r"(width) // %4 | 1475 "+r"(width) // %4 |
| 1476 : | 1476 : |
| 1477 : "cc", "memory", "d0", "d1", "d2", "d3" | 1477 : "cc", "memory", "v0", "v1", "v2", "v3" |
| 1478 ); | 1478 ); |
| 1479 } | 1479 } |
| 1480 #endif // HAS_I422TOYUY2ROW_NEON | 1480 #endif // HAS_I422TOYUY2ROW_NEON |
| 1481 | 1481 |
| 1482 #ifdef HAS_I422TOUYVYROW_NEON | 1482 #ifdef HAS_I422TOUYVYROW_NEON |
| 1483 void I422ToUYVYRow_NEON(const uint8* src_y, | 1483 void I422ToUYVYRow_NEON(const uint8* src_y, |
| 1484 const uint8* src_u, | 1484 const uint8* src_u, |
| 1485 const uint8* src_v, | 1485 const uint8* src_v, |
| 1486 uint8* dst_uyvy, int width) { | 1486 uint8* dst_uyvy, int width) { |
| 1487 asm volatile ( | 1487 asm volatile ( |
| 1488 ".p2align 2 \n" | 1488 ".p2align 2 \n" |
| 1489 "1: \n" | 1489 "1: \n" |
| 1490 MEMACCESS(0) | 1490 MEMACCESS(0) |
| 1491 "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys | 1491 "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys |
| 1492 "mov v3.8b, v2.8b \n" |
| 1492 MEMACCESS(1) | 1493 MEMACCESS(1) |
| 1493 "vld1.8 {d0}, [%1]! \n" // load 8 Us | 1494 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us |
| 1494 MEMACCESS(2) | 1495 MEMACCESS(2) |
| 1495 "vld1.8 {d2}, [%2]! \n" // load 8 Vs | 1496 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs |
| 1496 "subs %4, %4, #16 \n" // 16 pixels | 1497 "subs %4, %4, #16 \n" // 16 pixels |
| 1497 MEMACCESS(3) | 1498 MEMACCESS(3) |
| 1498 "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. | 1499 "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels. |
| 1499 "bgt 1b \n" | 1500 "bgt 1b \n" |
| 1500 : "+r"(src_y), // %0 | 1501 : "+r"(src_y), // %0 |
| 1501 "+r"(src_u), // %1 | 1502 "+r"(src_u), // %1 |
| 1502 "+r"(src_v), // %2 | 1503 "+r"(src_v), // %2 |
| 1503 "+r"(dst_uyvy), // %3 | 1504 "+r"(dst_uyvy), // %3 |
| 1504 "+r"(width) // %4 | 1505 "+r"(width) // %4 |
| 1505 : | 1506 : |
| 1506 : "cc", "memory", "d0", "d1", "d2", "d3" | 1507 : "cc", "memory", "v0", "v1", "v2", "v3" |
| 1507 ); | 1508 ); |
| 1508 } | 1509 } |
| 1509 #endif // HAS_I422TOUYVYROW_NEON | 1510 #endif // HAS_I422TOUYVYROW_NEON |
| 1510 | 1511 |
| 1511 #ifdef HAS_ARGBTORGB565ROW_NEON | 1512 #ifdef HAS_ARGBTORGB565ROW_NEON |
| 1512 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { | 1513 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { |
| 1513 asm volatile ( | 1514 asm volatile ( |
| 1514 ".p2align 2 \n" | 1515 ".p2align 2 \n" |
| 1515 "1: \n" | 1516 "1: \n" |
| 1516 MEMACCESS(0) | 1517 MEMACCESS(0) |
| (...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1570 "+r"(pix) // %2 | 1571 "+r"(pix) // %2 |
| 1571 : | 1572 : |
| 1572 : "cc", "memory", "q0", "q8", "q9", "q10", "q11" | 1573 : "cc", "memory", "q0", "q8", "q9", "q10", "q11" |
| 1573 ); | 1574 ); |
| 1574 } | 1575 } |
| 1575 #endif // HAS_ARGBTOARGB4444ROW_NEON | 1576 #endif // HAS_ARGBTOARGB4444ROW_NEON |
| 1576 | 1577 |
| 1577 #ifdef HAS_ARGBTOYROW_NEON | 1578 #ifdef HAS_ARGBTOYROW_NEON |
| 1578 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { | 1579 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { |
| 1579 asm volatile ( | 1580 asm volatile ( |
| 1580 "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient | 1581 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
| 1581 "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient | 1582 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
| 1582 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient | 1583 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
| 1583 "vmov.u8 d27, #16 \n" // Add 16 constant | 1584 "movi v7.8b, #16 \n" // Add 16 constant |
| 1584 ".p2align 2 \n" | 1585 ".p2align 2 \n" |
| 1585 "1: \n" | 1586 "1: \n" |
| 1586 MEMACCESS(0) | 1587 MEMACCESS(0) |
| 1587 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. | 1588 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
| 1588 "subs %2, %2, #8 \n" // 8 processed per loop. | 1589 "subs %2, %2, #8 \n" // 8 processed per loop. |
| 1589 "vmull.u8 q2, d0, d24 \n" // B | 1590 "umull v3.8h, v0.8b, v4.8b \n" // B |
| 1590 "vmlal.u8 q2, d1, d25 \n" // G | 1591 "umlal v3.8h, v1.8b, v5.8b \n" // G |
| 1591 "vmlal.u8 q2, d2, d26 \n" // R | 1592 "umlal v3.8h, v2.8b, v6.8b \n" // R |
| 1592 "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y | 1593 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
| 1593 "vqadd.u8 d0, d27 \n" | 1594 "uqadd v0.8b, v0.8b, v7.8b \n" |
| 1594 MEMACCESS(1) | 1595 MEMACCESS(1) |
| 1595 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. | 1596 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 1596 "bgt 1b \n" | 1597 "bgt 1b \n" |
| 1597 : "+r"(src_argb), // %0 | 1598 : "+r"(src_argb), // %0 |
| 1598 "+r"(dst_y), // %1 | 1599 "+r"(dst_y), // %1 |
| 1599 "+r"(pix) // %2 | 1600 "+r"(pix) // %2 |
| 1600 : | 1601 : |
| 1601 : "cc", "memory", "q0", "q1", "q2", "q12", "q13" | 1602 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
| 1602 ); | 1603 ); |
| 1603 } | 1604 } |
| 1604 #endif // HAS_ARGBTOYROW_NEON | 1605 #endif // HAS_ARGBTOYROW_NEON |
| 1605 | 1606 |
| 1606 #ifdef HAS_ARGBTOYJROW_NEON | 1607 #ifdef HAS_ARGBTOYJROW_NEON |
| 1607 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { | 1608 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { |
| 1608 asm volatile ( | 1609 asm volatile ( |
| 1609 "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient | 1610 "movi v4.8b, #15 \n" // B * 0.11400 coefficient |
| 1610 "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient | 1611 "movi v5.8b, #75 \n" // G * 0.58700 coefficient |
| 1611 "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient | 1612 "movi v6.8b, #38 \n" // R * 0.29900 coefficient |
| 1612 ".p2align 2 \n" | 1613 ".p2align 2 \n" |
| 1613 "1: \n" | 1614 "1: \n" |
| 1614 MEMACCESS(0) | 1615 MEMACCESS(0) |
| 1615 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. | 1616 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
| 1616 "subs %2, %2, #8 \n" // 8 processed per loop. | 1617 "subs %2, %2, #8 \n" // 8 processed per loop. |
| 1617 "vmull.u8 q2, d0, d24 \n" // B | 1618 "umull v3.8h, v0.8b, v4.8b \n" // B |
| 1618 "vmlal.u8 q2, d1, d25 \n" // G | 1619 "umlal v3.8h, v1.8b, v5.8b \n" // G |
| 1619 "vmlal.u8 q2, d2, d26 \n" // R | 1620 "umlal v3.8h, v2.8b, v6.8b \n" // R |
| 1620 "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y | 1621 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y |
| 1621 MEMACCESS(1) | 1622 MEMACCESS(1) |
| 1622 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. | 1623 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 1623 "bgt 1b \n" | 1624 "bgt 1b \n" |
| 1624 : "+r"(src_argb), // %0 | 1625 : "+r"(src_argb), // %0 |
| 1625 "+r"(dst_y), // %1 | 1626 "+r"(dst_y), // %1 |
| 1626 "+r"(pix) // %2 | 1627 "+r"(pix) // %2 |
| 1627 : | 1628 : |
| 1628 : "cc", "memory", "q0", "q1", "q2", "q12", "q13" | 1629 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" |
| 1629 ); | 1630 ); |
| 1630 } | 1631 } |
| 1631 #endif // HAS_ARGBTOYJROW_NEON | 1632 #endif // HAS_ARGBTOYJROW_NEON |
| 1632 | 1633 |
| 1633 // 8x1 pixels. | 1634 // 8x1 pixels. |
| 1634 #ifdef HAS_ARGBTOUV444ROW_NEON | 1635 #ifdef HAS_ARGBTOUV444ROW_NEON |
| 1635 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | 1636 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |
| 1636 int pix) { | 1637 int pix) { |
| 1637 asm volatile ( | 1638 asm volatile ( |
| 1638 "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient | 1639 "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient |
| (...skipping 1402 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3041 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. | 3042 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. |
| 3042 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. | 3043 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. |
| 3043 #ifdef HAS_ARGBMULTIPLYROW_NEON | 3044 #ifdef HAS_ARGBMULTIPLYROW_NEON |
| 3044 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 3045 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
| 3045 uint8* dst_argb, int width) { | 3046 uint8* dst_argb, int width) { |
| 3046 asm volatile ( | 3047 asm volatile ( |
| 3047 // 8 pixel loop. | 3048 // 8 pixel loop. |
| 3048 ".p2align 2 \n" | 3049 ".p2align 2 \n" |
| 3049 "1: \n" | 3050 "1: \n" |
| 3050 MEMACCESS(0) | 3051 MEMACCESS(0) |
| 3051 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. | 3052 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
| 3052 MEMACCESS(1) | 3053 MEMACCESS(1) |
| 3053 "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. | 3054 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. |
| 3054 "subs %3, %3, #8 \n" // 8 processed per loop. | 3055 "subs %3, %3, #8 \n" // 8 processed per loop. |
| 3055 "vmull.u8 q0, d0, d1 \n" // multiply B | 3056 "umull v0.8h, v0.8b, v4.8b \n" // multiply B |
| 3056 "vmull.u8 q1, d2, d3 \n" // multiply G | 3057 "umull v1.8h, v1.8b, v5.8b \n" // multiply G |
| 3057 "vmull.u8 q2, d4, d5 \n" // multiply R | 3058 "umull v2.8h, v2.8b, v6.8b \n" // multiply R |
| 3058 "vmull.u8 q3, d6, d7 \n" // multiply A | 3059 "umull v3.8h, v3.8b, v7.8b \n" // multiply A |
| 3059 "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B | 3060 "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B |
| 3060 "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G | 3061 "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G |
| 3061 "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R | 3062 "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R |
| 3062 "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A | 3063 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A |
| 3063 MEMACCESS(2) | 3064 MEMACCESS(2) |
| 3064 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. | 3065 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. |
| 3065 "bgt 1b \n" | 3066 "bgt 1b \n" |
| 3066 | 3067 |
| 3067 : "+r"(src_argb0), // %0 | 3068 : "+r"(src_argb0), // %0 |
| 3068 "+r"(src_argb1), // %1 | 3069 "+r"(src_argb1), // %1 |
| 3069 "+r"(dst_argb), // %2 | 3070 "+r"(dst_argb), // %2 |
| 3070 "+r"(width) // %3 | 3071 "+r"(width) // %3 |
| 3071 : | 3072 : |
| 3072 : "cc", "memory", "q0", "q1", "q2", "q3" | 3073 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
| 3073 ); | 3074 ); |
| 3074 } | 3075 } |
| 3075 #endif // HAS_ARGBMULTIPLYROW_NEON | 3076 #endif // HAS_ARGBMULTIPLYROW_NEON |
| 3076 | 3077 |
| 3077 // Add 2 rows of ARGB pixels together, 8 pixels at a time. | 3078 // Add 2 rows of ARGB pixels together, 8 pixels at a time. |
| 3078 #ifdef HAS_ARGBADDROW_NEON | 3079 #ifdef HAS_ARGBADDROW_NEON |
| 3079 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 3080 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
| 3080 uint8* dst_argb, int width) { | 3081 uint8* dst_argb, int width) { |
| 3081 asm volatile ( | 3082 asm volatile ( |
| 3082 // 8 pixel loop. | 3083 // 8 pixel loop. |
| 3083 ".p2align 2 \n" | 3084 ".p2align 2 \n" |
| 3084 "1: \n" | 3085 "1: \n" |
| 3085 MEMACCESS(0) | 3086 MEMACCESS(0) |
| 3086 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. | 3087 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
| 3087 MEMACCESS(1) | 3088 MEMACCESS(1) |
| 3088 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. | 3089 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. |
| 3089 "subs %3, %3, #8 \n" // 8 processed per loop. | 3090 "subs %3, %3, #8 \n" // 8 processed per loop. |
| 3090 "vqadd.u8 q0, q0, q2 \n" // add B, G | 3091 "uqadd v0.8b, v0.8b, v4.8b \n" |
| 3091 "vqadd.u8 q1, q1, q3 \n" // add R, A | 3092 "uqadd v1.8b, v1.8b, v5.8b \n" |
| 3093 "uqadd v2.8b, v2.8b, v6.8b \n" |
| 3094 "uqadd v3.8b, v3.8b, v7.8b \n" |
| 3092 MEMACCESS(2) | 3095 MEMACCESS(2) |
| 3093 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. | 3096 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. |
| 3094 "bgt 1b \n" | 3097 "bgt 1b \n" |
| 3095 | 3098 |
| 3096 : "+r"(src_argb0), // %0 | 3099 : "+r"(src_argb0), // %0 |
| 3097 "+r"(src_argb1), // %1 | 3100 "+r"(src_argb1), // %1 |
| 3098 "+r"(dst_argb), // %2 | 3101 "+r"(dst_argb), // %2 |
| 3099 "+r"(width) // %3 | 3102 "+r"(width) // %3 |
| 3100 : | 3103 : |
| 3101 : "cc", "memory", "q0", "q1", "q2", "q3" | 3104 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
| 3102 ); | 3105 ); |
| 3103 } | 3106 } |
| 3104 #endif // HAS_ARGBADDROW_NEON | 3107 #endif // HAS_ARGBADDROW_NEON |
| 3105 | 3108 |
| 3106 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. | 3109 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. |
| 3107 #ifdef HAS_ARGBSUBTRACTROW_NEON | 3110 #ifdef HAS_ARGBSUBTRACTROW_NEON |
| 3108 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 3111 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
| 3109 uint8* dst_argb, int width) { | 3112 uint8* dst_argb, int width) { |
| 3110 asm volatile ( | 3113 asm volatile ( |
| 3111 // 8 pixel loop. | 3114 // 8 pixel loop. |
| 3112 ".p2align 2 \n" | 3115 ".p2align 2 \n" |
| 3113 "1: \n" | 3116 "1: \n" |
| 3114 MEMACCESS(0) | 3117 MEMACCESS(0) |
| 3115 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. | 3118 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
| 3116 MEMACCESS(1) | 3119 MEMACCESS(1) |
| 3117 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. | 3120 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. |
| 3118 "subs %3, %3, #8 \n" // 8 processed per loop. | 3121 "subs %3, %3, #8 \n" // 8 processed per loop. |
| 3119 "vqsub.u8 q0, q0, q2 \n" // subtract B, G | 3122 "uqsub v0.8b, v0.8b, v4.8b \n" |
| 3120 "vqsub.u8 q1, q1, q3 \n" // subtract R, A | 3123 "uqsub v1.8b, v1.8b, v5.8b \n" |
| 3124 "uqsub v2.8b, v2.8b, v6.8b \n" |
| 3125 "uqsub v3.8b, v3.8b, v7.8b \n" |
| 3121 MEMACCESS(2) | 3126 MEMACCESS(2) |
| 3122 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. | 3127 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. |
| 3123 "bgt 1b \n" | 3128 "bgt 1b \n" |
| 3124 | 3129 |
| 3125 : "+r"(src_argb0), // %0 | 3130 : "+r"(src_argb0), // %0 |
| 3126 "+r"(src_argb1), // %1 | 3131 "+r"(src_argb1), // %1 |
| 3127 "+r"(dst_argb), // %2 | 3132 "+r"(dst_argb), // %2 |
| 3128 "+r"(width) // %3 | 3133 "+r"(width) // %3 |
| 3129 : | 3134 : |
| 3130 : "cc", "memory", "q0", "q1", "q2", "q3" | 3135 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
| 3131 ); | 3136 ); |
| 3132 } | 3137 } |
| 3133 #endif // HAS_ARGBSUBTRACTROW_NEON | 3138 #endif // HAS_ARGBSUBTRACTROW_NEON |
| 3134 | 3139 |
| 3135 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. | 3140 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. |
| 3136 // A = 255 | 3141 // A = 255 |
| 3137 // R = Sobel | 3142 // R = Sobel |
| 3138 // G = Sobel | 3143 // G = Sobel |
| 3139 // B = Sobel | 3144 // B = Sobel |
| 3140 #ifdef HAS_SOBELROW_NEON | 3145 #ifdef HAS_SOBELROW_NEON |
| 3141 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, | 3146 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, |
| 3142 uint8* dst_argb, int width) { | 3147 uint8* dst_argb, int width) { |
| 3143 asm volatile ( | 3148 asm volatile ( |
| 3144 "vmov.u8 d3, #255 \n" // alpha | 3149 "movi v3.8b, #255 \n" // alpha |
| 3145 // 8 pixel loop. | 3150 // 8 pixel loop. |
| 3146 ".p2align 2 \n" | 3151 ".p2align 2 \n" |
| 3147 "1: \n" | 3152 "1: \n" |
| 3148 MEMACCESS(0) | 3153 MEMACCESS(0) |
| 3149 "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. | 3154 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. |
| 3150 MEMACCESS(1) | 3155 MEMACCESS(1) |
| 3151 "vld1.8 {d1}, [%1]! \n" // load 8 sobely. | 3156 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. |
| 3152 "subs %3, %3, #8 \n" // 8 processed per loop. | 3157 "subs %3, %3, #8 \n" // 8 processed per loop. |
| 3153 "vqadd.u8 d0, d0, d1 \n" // add | 3158 "uqadd v0.8b, v0.8b, v1.8b \n" // add |
| 3154 "vmov.u8 d1, d0 \n" | 3159 "mov v1.8b, v0.8b \n" |
| 3155 "vmov.u8 d2, d0 \n" | 3160 "mov v2.8b, v0.8b \n" |
| 3156 MEMACCESS(2) | 3161 MEMACCESS(2) |
| 3157 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. | 3162 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. |
| 3158 "bgt 1b \n" | 3163 "bgt 1b \n" |
| 3159 : "+r"(src_sobelx), // %0 | 3164 : "+r"(src_sobelx), // %0 |
| 3160 "+r"(src_sobely), // %1 | 3165 "+r"(src_sobely), // %1 |
| 3161 "+r"(dst_argb), // %2 | 3166 "+r"(dst_argb), // %2 |
| 3162 "+r"(width) // %3 | 3167 "+r"(width) // %3 |
| 3163 : | 3168 : |
| 3164 : "cc", "memory", "q0", "q1" | 3169 : "cc", "memory", "v0", "v1", "v2", "v3" |
| 3165 ); | 3170 ); |
| 3166 } | 3171 } |
| 3167 #endif // HAS_SOBELROW_NEON | 3172 #endif // HAS_SOBELROW_NEON |
| 3168 | 3173 |
| 3169 // Adds Sobel X and Sobel Y and stores Sobel into plane. | 3174 // Adds Sobel X and Sobel Y and stores Sobel into plane. |
| 3170 #ifdef HAS_SOBELTOPLANEROW_NEON | 3175 #ifdef HAS_SOBELTOPLANEROW_NEON |
| 3171 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, | 3176 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, |
| 3172 uint8* dst_y, int width) { | 3177 uint8* dst_y, int width) { |
| 3173 asm volatile ( | 3178 asm volatile ( |
| 3174 // 16 pixel loop. | 3179 // 16 pixel loop. |
| 3175 ".p2align 2 \n" | 3180 ".p2align 2 \n" |
| 3176 "1: \n" | 3181 "1: \n" |
| 3177 MEMACCESS(0) | 3182 MEMACCESS(0) |
| 3178 "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. | 3183 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. |
| 3179 MEMACCESS(1) | 3184 MEMACCESS(1) |
| 3180 "vld1.8 {q1}, [%1]! \n" // load 16 sobely. | 3185 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. |
| 3181 "subs %3, %3, #16 \n" // 16 processed per loop. | 3186 "subs %3, %3, #16 \n" // 16 processed per loop. |
| 3182 "vqadd.u8 q0, q0, q1 \n" // add | 3187 "uqadd v0.16b, v0.16b, v1.16b \n" // add |
| 3183 MEMACCESS(2) | 3188 MEMACCESS(2) |
| 3184 "vst1.8 {q0}, [%2]! \n" // store 16 pixels. | 3189 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. |
| 3185 "bgt 1b \n" | 3190 "bgt 1b \n" |
| 3186 : "+r"(src_sobelx), // %0 | 3191 : "+r"(src_sobelx), // %0 |
| 3187 "+r"(src_sobely), // %1 | 3192 "+r"(src_sobely), // %1 |
| 3188 "+r"(dst_y), // %2 | 3193 "+r"(dst_y), // %2 |
| 3189 "+r"(width) // %3 | 3194 "+r"(width) // %3 |
| 3190 : | 3195 : |
| 3191 : "cc", "memory", "q0", "q1" | 3196 : "cc", "memory", "v0", "v1" |
| 3192 ); | 3197 ); |
| 3193 } | 3198 } |
| 3194 #endif // HAS_SOBELTOPLANEROW_NEON | 3199 #endif // HAS_SOBELTOPLANEROW_NEON |
| 3195 | 3200 |
| 3196 // Mixes Sobel X, Sobel Y and Sobel into ARGB. | 3201 // Mixes Sobel X, Sobel Y and Sobel into ARGB. |
| 3197 // A = 255 | 3202 // A = 255 |
| 3198 // R = Sobel X | 3203 // R = Sobel X |
| 3199 // G = Sobel | 3204 // G = Sobel |
| 3200 // B = Sobel Y | 3205 // B = Sobel Y |
| 3201 #ifdef HAS_SOBELXYROW_NEON | 3206 #ifdef HAS_SOBELXYROW_NEON |
| 3202 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, | 3207 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, |
| 3203 uint8* dst_argb, int width) { | 3208 uint8* dst_argb, int width) { |
| 3204 asm volatile ( | 3209 asm volatile ( |
| 3205 "vmov.u8 d3, #255 \n" // alpha | 3210 "movi v3.8b, #255 \n" // alpha |
| 3206 // 8 pixel loop. | 3211 // 8 pixel loop. |
| 3207 ".p2align 2 \n" | 3212 ".p2align 2 \n" |
| 3208 "1: \n" | 3213 "1: \n" |
| 3209 MEMACCESS(0) | 3214 MEMACCESS(0) |
| 3210 "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. | 3215 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. |
| 3211 MEMACCESS(1) | 3216 MEMACCESS(1) |
| 3212 "vld1.8 {d0}, [%1]! \n" // load 8 sobely. | 3217 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. |
| 3213 "subs %3, %3, #8 \n" // 8 processed per loop. | 3218 "subs %3, %3, #8 \n" // 8 processed per loop. |
| 3214 "vqadd.u8 d1, d0, d2 \n" // add | 3219 "uqadd v1.8b, v0.8b, v2.8b \n" // add |
| 3215 MEMACCESS(2) | 3220 MEMACCESS(2) |
| 3216 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. | 3221 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. |
| 3217 "bgt 1b \n" | 3222 "bgt 1b \n" |
| 3218 : "+r"(src_sobelx), // %0 | 3223 : "+r"(src_sobelx), // %0 |
| 3219 "+r"(src_sobely), // %1 | 3224 "+r"(src_sobely), // %1 |
| 3220 "+r"(dst_argb), // %2 | 3225 "+r"(dst_argb), // %2 |
| 3221 "+r"(width) // %3 | 3226 "+r"(width) // %3 |
| 3222 : | 3227 : |
| 3223 : "cc", "memory", "q0", "q1" | 3228 : "cc", "memory", "v0", "v1", "v2", "v3" |
| 3224 ); | 3229 ); |
| 3225 } | 3230 } |
| 3226 #endif // HAS_SOBELXYROW_NEON | 3231 #endif // HAS_SOBELXYROW_NEON |
| 3227 | 3232 |
| 3228 // SobelX as a matrix is | 3233 // SobelX as a matrix is |
| 3229 // -1 0 1 | 3234 // -1 0 1 |
| 3230 // -2 0 2 | 3235 // -2 0 2 |
| 3231 // -1 0 1 | 3236 // -1 0 1 |
| 3232 #ifdef HAS_SOBELXROW_NEON | 3237 #ifdef HAS_SOBELXROW_NEON |
| 3233 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, | 3238 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, |
| 3234 const uint8* src_y2, uint8* dst_sobelx, int width) { | 3239 const uint8* src_y2, uint8* dst_sobelx, int width) { |
| 3235 asm volatile ( | 3240 asm volatile ( |
| 3236 ".p2align 2 \n" | 3241 ".p2align 2 \n" |
| 3237 "1: \n" | 3242 "1: \n" |
| 3238 MEMACCESS(0) | 3243 MEMACCESS(0) |
| 3239 "vld1.8 {d0}, [%0],%5 \n" // top | 3244 "ld1 {v0.8b}, [%0],%5 \n" // top |
| 3240 MEMACCESS(0) | 3245 MEMACCESS(0) |
| 3241 "vld1.8 {d1}, [%0],%6 \n" | 3246 "ld1 {v1.8b}, [%0],%6 \n" |
| 3242 "vsubl.u8 q0, d0, d1 \n" | 3247 "usubl v0.8h, v0.8b, v1.8b \n" |
| 3243 MEMACCESS(1) | 3248 MEMACCESS(1) |
| 3244 "vld1.8 {d2}, [%1],%5 \n" // center * 2 | 3249 "ld1 {v2.8b}, [%1],%5 \n" // center * 2 |
| 3245 MEMACCESS(1) | 3250 MEMACCESS(1) |
| 3246 "vld1.8 {d3}, [%1],%6 \n" | 3251 "ld1 {v3.8b}, [%1],%6 \n" |
| 3247 "vsubl.u8 q1, d2, d3 \n" | 3252 "usubl v1.8h, v2.8b, v3.8b \n" |
| 3248 "vadd.s16 q0, q0, q1 \n" | 3253 "add v0.8h, v0.8h, v1.8h \n" |
| 3249 "vadd.s16 q0, q0, q1 \n" | 3254 "add v0.8h, v0.8h, v1.8h \n" |
| 3250 MEMACCESS(2) | 3255 MEMACCESS(2) |
| 3251 "vld1.8 {d2}, [%2],%5 \n" // bottom | 3256 "ld1 {v2.8b}, [%2],%5 \n" // bottom |
| 3252 MEMACCESS(2) | 3257 MEMACCESS(2) |
| 3253 "vld1.8 {d3}, [%2],%6 \n" | 3258 "ld1 {v3.8b}, [%2],%6 \n" |
| 3254 "subs %4, %4, #8 \n" // 8 pixels | 3259 "subs %4, %4, #8 \n" // 8 pixels |
| 3255 "vsubl.u8 q1, d2, d3 \n" | 3260 "usubl v1.8h, v2.8b, v3.8b \n" |
| 3256 "vadd.s16 q0, q0, q1 \n" | 3261 "add v0.8h, v0.8h, v1.8h \n" |
| 3257 "vabs.s16 q0, q0 \n" | 3262 "abs v0.8h, v0.8h \n" |
| 3258 "vqmovn.u16 d0, q0 \n" | 3263 "uqxtn v0.8b, v0.8h \n" |
| 3259 MEMACCESS(3) | 3264 MEMACCESS(3) |
| 3260 "vst1.8 {d0}, [%3]! \n" // store 8 sobelx | 3265 "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx |
| 3261 "bgt 1b \n" | 3266 "bgt 1b \n" |
| 3262 : "+r"(src_y0), // %0 | 3267 : "+r"(src_y0), // %0 |
| 3263 "+r"(src_y1), // %1 | 3268 "+r"(src_y1), // %1 |
| 3264 "+r"(src_y2), // %2 | 3269 "+r"(src_y2), // %2 |
| 3265 "+r"(dst_sobelx), // %3 | 3270 "+r"(dst_sobelx), // %3 |
| 3266 "+r"(width) // %4 | 3271 "+r"(width) // %4 |
| 3267 : "r"(2), // %5 | 3272 : "r"(2), // %5 |
| 3268 "r"(6) // %6 | 3273 "r"(6) // %6 |
| 3269 : "cc", "memory", "q0", "q1" // Clobber List | 3274 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 3270 ); | 3275 ); |
| 3271 } | 3276 } |
| 3272 #endif // HAS_SOBELXROW_NEON | 3277 #endif // HAS_SOBELXROW_NEON |
| 3273 | 3278 |
| 3274 // SobelY as a matrix is | 3279 // SobelY as a matrix is |
| 3275 // -1 -2 -1 | 3280 // -1 -2 -1 |
| 3276 // 0 0 0 | 3281 // 0 0 0 |
| 3277 // 1 2 1 | 3282 // 1 2 1 |
| 3278 #ifdef HAS_SOBELYROW_NEON | 3283 #ifdef HAS_SOBELYROW_NEON |
| 3279 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, | 3284 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, |
| 3280 uint8* dst_sobely, int width) { | 3285 uint8* dst_sobely, int width) { |
| 3281 asm volatile ( | 3286 asm volatile ( |
| 3282 ".p2align 2 \n" | 3287 ".p2align 2 \n" |
| 3283 "1: \n" | 3288 "1: \n" |
| 3284 MEMACCESS(0) | 3289 MEMACCESS(0) |
| 3285 "vld1.8 {d0}, [%0],%4 \n" // left | 3290 "ld1 {v0.8b}, [%0],%4 \n" // left |
| 3286 MEMACCESS(1) | 3291 MEMACCESS(1) |
| 3287 "vld1.8 {d1}, [%1],%4 \n" | 3292 "ld1 {v1.8b}, [%1],%4 \n" |
| 3288 "vsubl.u8 q0, d0, d1 \n" | 3293 "usubl v0.8h, v0.8b, v1.8b \n" |
| 3289 MEMACCESS(0) | 3294 MEMACCESS(0) |
| 3290 "vld1.8 {d2}, [%0],%4 \n" // center * 2 | 3295 "ld1 {v2.8b}, [%0],%4 \n" // center * 2 |
| 3291 MEMACCESS(1) | 3296 MEMACCESS(1) |
| 3292 "vld1.8 {d3}, [%1],%4 \n" | 3297 "ld1 {v3.8b}, [%1],%4 \n" |
| 3293 "vsubl.u8 q1, d2, d3 \n" | 3298 "usubl v1.8h, v2.8b, v3.8b \n" |
| 3294 "vadd.s16 q0, q0, q1 \n" | 3299 "add v0.8h, v0.8h, v1.8h \n" |
| 3295 "vadd.s16 q0, q0, q1 \n" | 3300 "add v0.8h, v0.8h, v1.8h \n" |
| 3296 MEMACCESS(0) | 3301 MEMACCESS(0) |
| 3297 "vld1.8 {d2}, [%0],%5 \n" // right | 3302 "ld1 {v2.8b}, [%0],%5 \n" // right |
| 3298 MEMACCESS(1) | 3303 MEMACCESS(1) |
| 3299 "vld1.8 {d3}, [%1],%5 \n" | 3304 "ld1 {v3.8b}, [%1],%5 \n" |
| 3300 "subs %3, %3, #8 \n" // 8 pixels | 3305 "subs %3, %3, #8 \n" // 8 pixels |
| 3301 "vsubl.u8 q1, d2, d3 \n" | 3306 "usubl v1.8h, v2.8b, v3.8b \n" |
| 3302 "vadd.s16 q0, q0, q1 \n" | 3307 "add v0.8h, v0.8h, v1.8h \n" |
| 3303 "vabs.s16 q0, q0 \n" | 3308 "abs v0.8h, v0.8h \n" |
| 3304 "vqmovn.u16 d0, q0 \n" | 3309 "uqxtn v0.8b, v0.8h \n" |
| 3305 MEMACCESS(2) | 3310 MEMACCESS(2) |
| 3306 "vst1.8 {d0}, [%2]! \n" // store 8 sobely | 3311 "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely |
| 3307 "bgt 1b \n" | 3312 "bgt 1b \n" |
| 3308 : "+r"(src_y0), // %0 | 3313 : "+r"(src_y0), // %0 |
| 3309 "+r"(src_y1), // %1 | 3314 "+r"(src_y1), // %1 |
| 3310 "+r"(dst_sobely), // %2 | 3315 "+r"(dst_sobely), // %2 |
| 3311 "+r"(width) // %3 | 3316 "+r"(width) // %3 |
| 3312 : "r"(1), // %4 | 3317 : "r"(1), // %4 |
| 3313 "r"(6) // %5 | 3318 "r"(6) // %5 |
| 3314 : "cc", "memory", "q0", "q1" // Clobber List | 3319 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 3315 ); | 3320 ); |
| 3316 } | 3321 } |
| 3317 #endif // HAS_SOBELYROW_NEON | 3322 #endif // HAS_SOBELYROW_NEON |
| 3318 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 3323 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
| 3319 | 3324 |
| 3320 #ifdef __cplusplus | 3325 #ifdef __cplusplus |
| 3321 } // extern "C" | 3326 } // extern "C" |
| 3322 } // namespace libyuv | 3327 } // namespace libyuv |
| 3323 #endif | 3328 #endif |
| OLD | NEW |