OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 1878 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1889 __asm lea esi, [esi + 8] \ | 1889 __asm lea esi, [esi + 8] \ |
1890 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ | 1890 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
1891 __asm vpermq ymm0, ymm0, 0xd8 \ | 1891 __asm vpermq ymm0, ymm0, 0xd8 \ |
1892 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ | 1892 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
1893 __asm vmovdqu xmm4, [eax] /* Y */ \ | 1893 __asm vmovdqu xmm4, [eax] /* Y */ \ |
1894 __asm vpermq ymm4, ymm4, 0xd8 \ | 1894 __asm vpermq ymm4, ymm4, 0xd8 \ |
1895 __asm vpunpcklbw ymm4, ymm4, ymm4 \ | 1895 __asm vpunpcklbw ymm4, ymm4, ymm4 \ |
1896 __asm lea eax, [eax + 16] \ | 1896 __asm lea eax, [eax + 16] \ |
1897 } | 1897 } |
1898 | 1898 |
| 1899 // Read 8 U and 8 V bytes (4:2:2), upsample to 16 UV pairs; also read 16 Y and 16 alpha bytes. |
| 1900 #define READYUVA422_AVX2 __asm { \ |
| 1901 __asm vmovq xmm0, qword ptr [esi] /* 8 bytes of U */ \ |
| 1902 __asm vmovq xmm1, qword ptr [esi + edi] /* 8 bytes of V; edi is the V - U offset set up by the row functions */ \ |
| 1903 __asm lea esi, [esi + 8] /* advance U (and, via edi, V) by 8 bytes */ \ |
| 1904 __asm vpunpcklbw ymm0, ymm0, ymm1 /* interleave U and V bytes into UV pairs */ \ |
| 1905 __asm vpermq ymm0, ymm0, 0xd8 /* reorder qwords 0,2,1,3 so each 128-bit lane holds 4 UV pairs in its low half */ \ |
| 1906 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample): duplicate each UV pair */ \ |
| 1907 __asm vmovdqu xmm4, [eax] /* 16 bytes of Y */ \ |
| 1908 __asm vpermq ymm4, ymm4, 0xd8 /* reorder qwords 0,2,1,3: 8 Y bytes in the low half of each lane */ \ |
| 1909 __asm vpunpcklbw ymm4, ymm4, ymm4 /* duplicate each Y byte into a 16-bit word */ \ |
| 1910 __asm lea eax, [eax + 16] /* advance Y by 16 bytes */ \ |
| 1911 __asm vmovdqu xmm5, [ebp] /* 16 bytes of A */ \ |
| 1912 __asm vpermq ymm5, ymm5, 0xd8 /* same qword reorder for alpha; NOTE(review): presumably consumed by the STORE*_AVX2 macros, which are not visible here */ \ |
| 1913 __asm lea ebp, [ebp + 16] /* advance A by 16 bytes */ \ |
| 1914 } |
| 1915 |
1899 // Read 4 UV from 411, upsample to 16 UV. | 1916 // Read 4 UV from 411, upsample to 16 UV. |
1900 #define READYUV411_AVX2 __asm { \ | 1917 #define READYUV411_AVX2 __asm { \ |
1901 __asm vmovd xmm0, dword ptr [esi] /* U */ \ | 1918 __asm vmovd xmm0, dword ptr [esi] /* U */ \ |
1902 __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \ | 1919 __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \ |
1903 __asm lea esi, [esi + 4] \ | 1920 __asm lea esi, [esi + 4] \ |
1904 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ | 1921 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
1905 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ | 1922 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
1906 __asm vpermq ymm0, ymm0, 0xd8 \ | 1923 __asm vpermq ymm0, ymm0, 0xd8 \ |
1907 __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ | 1924 __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ |
1908 __asm vmovdqu xmm4, [eax] /* Y */ \ | 1925 __asm vmovdqu xmm4, [eax] /* Y */ \ |
(...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2050 | 2067 |
2051 pop ebx | 2068 pop ebx |
2052 pop edi | 2069 pop edi |
2053 pop esi | 2070 pop esi |
2054 vzeroupper | 2071 vzeroupper |
2055 ret | 2072 ret |
2056 } | 2073 } |
2057 } | 2074 } |
2058 #endif // HAS_I422TOARGBROW_AVX2 | 2075 #endif // HAS_I422TOARGBROW_AVX2 |
2059 | 2076 |
| 2077 #ifdef HAS_I422ALPHATOARGBROW_AVX2 |
| 2078 // 16 pixels per loop iteration. |
| 2079 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. Naked function: arguments are read straight off the stack, and width is consumed 16 at a time, so the final iteration is a full 16 pixels even when width is not a multiple of 16. |
| 2080 __declspec(naked) |
| 2081 void I422AlphaToARGBRow_AVX2(const uint8* y_buf, |
| 2082 const uint8* u_buf, |
| 2083 const uint8* v_buf, |
| 2084 const uint8* a_buf, |
| 2085 uint8* dst_argb, |
| 2086 struct YuvConstants* yuvconstants, |
| 2087 int width) { |
| 2088 __asm { |
| 2089 push esi /* esi, edi, ebx and ebp are saved here and restored before ret */ |
| 2090 push edi |
| 2091 push ebx |
| 2092 push ebp |
| 2093 mov eax, [esp + 16 + 4] // Y: y_buf; the 16 skips the four registers pushed above |
| 2094 mov esi, [esp + 16 + 8] // U: u_buf |
| 2095 mov edi, [esp + 16 + 12] // V: v_buf |
| 2096 mov ebp, [esp + 16 + 16] // A: a_buf |
| 2097 mov edx, [esp + 16 + 20] // argb: dst_argb |
| 2098 mov ebx, [esp + 16 + 24] // yuvconstants |
| 2099 mov ecx, [esp + 16 + 28] // width |
| 2100 sub edi, esi /* edi = v_buf - u_buf, so V is addressed as [esi + edi] */ |
| 2101 |
| 2102 convertloop: |
| 2103 READYUVA422_AVX2 /* load and upsample UV, load Y and A; advances eax, esi, ebp */ |
| 2104 YUVTORGB_AVX2(ebx) /* defined outside this view; NOTE(review): presumably converts to RGB using the constants at ebx */ |
| 2105 STOREARGB_AVX2 /* defined outside this view; NOTE(review): presumably writes 16 ARGB pixels at edx and advances it */ |
| 2106 |
| 2107 sub ecx, 16 /* 16 pixels handled per iteration */ |
| 2108 jg convertloop /* repeat while pixels remain */ |
| 2109 |
| 2110 pop ebp |
| 2111 pop ebx |
| 2112 pop edi |
| 2113 pop esi |
| 2114 vzeroupper /* zero the upper halves of the ymm registers before returning */ |
| 2115 ret |
| 2116 } |
| 2117 } |
| 2118 #endif // HAS_I422ALPHATOARGBROW_AVX2 |
| 2119 |
| 2120 #ifdef HAS_I422ALPHATOABGRROW_AVX2 |
| 2121 // 16 pixels per loop iteration. |
| 2122 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR. Naked function: arguments are read straight off the stack, and width is consumed 16 at a time, so the final iteration is a full 16 pixels even when width is not a multiple of 16. |
| 2123 __declspec(naked) |
| 2124 void I422AlphaToABGRRow_AVX2(const uint8* y_buf, |
| 2125 const uint8* u_buf, |
| 2126 const uint8* v_buf, |
| 2127 const uint8* a_buf, |
| 2128 uint8* dst_abgr, |
| 2129 struct YuvConstants* yuvconstants, |
| 2130 int width) { |
| 2131 __asm { |
| 2132 push esi /* esi, edi, ebx and ebp are saved here and restored before ret */ |
| 2133 push edi |
| 2134 push ebx |
| 2135 push ebp |
| 2136 mov eax, [esp + 16 + 4] // Y: y_buf; the 16 skips the four registers pushed above |
| 2137 mov esi, [esp + 16 + 8] // U: u_buf |
| 2138 mov edi, [esp + 16 + 12] // V: v_buf |
| 2139 mov ebp, [esp + 16 + 16] // A: a_buf |
| 2140 mov edx, [esp + 16 + 20] // abgr: dst_abgr |
| 2141 mov ebx, [esp + 16 + 24] // yuvconstants |
| 2142 mov ecx, [esp + 16 + 28] // width |
| 2143 sub edi, esi /* edi = v_buf - u_buf, so V is addressed as [esi + edi] */ |
| 2144 |
| 2145 convertloop: |
| 2146 READYUVA422_AVX2 /* load and upsample UV, load Y and A; advances eax, esi, ebp */ |
| 2147 YUVTORGB_AVX2(ebx) /* defined outside this view; NOTE(review): presumably converts to RGB using the constants at ebx */ |
| 2148 STOREABGR_AVX2 /* defined outside this view; NOTE(review): presumably writes 16 ABGR pixels at edx and advances it */ |
| 2149 |
| 2150 sub ecx, 16 /* 16 pixels handled per iteration */ |
| 2151 jg convertloop /* repeat while pixels remain */ |
| 2152 |
| 2153 pop ebp |
| 2154 pop ebx |
| 2155 pop edi |
| 2156 pop esi |
| 2157 vzeroupper /* zero the upper halves of the ymm registers before returning */ |
| 2158 ret |
| 2159 } |
| 2160 } |
| 2161 #endif // HAS_I422ALPHATOABGRROW_AVX2 |
| 2162 |
2060 #ifdef HAS_I444TOARGBROW_AVX2 | 2163 #ifdef HAS_I444TOARGBROW_AVX2 |
2061 // 16 pixels | 2164 // 16 pixels |
2062 // 16 UV values with 16 Y producing 16 ARGB (64 bytes). | 2165 // 16 UV values with 16 Y producing 16 ARGB (64 bytes). |
2063 __declspec(naked) | 2166 __declspec(naked) |
2064 void I444ToARGBRow_AVX2(const uint8* y_buf, | 2167 void I444ToARGBRow_AVX2(const uint8* y_buf, |
2065 const uint8* u_buf, | 2168 const uint8* u_buf, |
2066 const uint8* v_buf, | 2169 const uint8* v_buf, |
2067 uint8* dst_argb, | 2170 uint8* dst_argb, |
2068 struct YuvConstants* yuvconstants, | 2171 struct YuvConstants* yuvconstants, |
2069 int width) { | 2172 int width) { |
(...skipping 771 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2841 jg convertloop | 2944 jg convertloop |
2842 | 2945 |
2843 pop ebx | 2946 pop ebx |
2844 pop edi | 2947 pop edi |
2845 pop esi | 2948 pop esi |
2846 ret | 2949 ret |
2847 } | 2950 } |
2848 } | 2951 } |
2849 | 2952 |
2850 // 8 pixels. | 2953 // 8 pixels. |
2851 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB (32 bytes). | 2954 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. |
2852 __declspec(naked) | 2955 __declspec(naked) /* no compiler-generated prologue/epilogue; the body reads its arguments off the stack itself */ |
2853 void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, | 2956 void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, |
2854 const uint8* u_buf, | 2957 const uint8* u_buf, |
2855 const uint8* v_buf, | 2958 const uint8* v_buf, |
2856 const uint8* a_buf, | 2959 const uint8* a_buf, |
2857 uint8* dst_argb, | 2960 uint8* dst_argb, |
2858 struct YuvConstants* yuvconstants, | 2961 struct YuvConstants* yuvconstants, |
2859 int width) { | 2962 int width) { |
2860 __asm { | 2963 __asm { |
2861 push esi | 2964 push esi /* esi, edi, ebx and ebp are saved here and restored before ret */ |
2862 push edi | 2965 push edi |
2863 push ebx | 2966 push ebx |
2864 push ebp | 2967 push ebp |
2865 mov eax, [esp + 16 + 4] // Y | 2968 mov eax, [esp + 16 + 4] // Y: y_buf; the 16 skips the four registers pushed above |
2866 mov esi, [esp + 16 + 8] // U | 2969 mov esi, [esp + 16 + 8] // U: u_buf |
2867 mov edi, [esp + 16 + 12] // V | 2970 mov edi, [esp + 16 + 12] // V: v_buf |
2868 mov ebp, [esp + 16 + 16] // A | 2971 mov ebp, [esp + 16 + 16] // A: a_buf |
2869 mov edx, [esp + 16 + 20] // argb | 2972 mov edx, [esp + 16 + 20] // argb: dst_argb |
2870 mov ebx, [esp + 16 + 24] // yuvconstants | 2973 mov ebx, [esp + 16 + 24] // yuvconstants |
2871 mov ecx, [esp + 16 + 28] // width | 2974 mov ecx, [esp + 16 + 28] // width |
2872 sub edi, esi | 2975 sub edi, esi /* edi = v_buf - u_buf */ |
2873 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | |
2874 | 2976 |
2875 convertloop: | 2977 convertloop: |
2876 READYUVA422 | 2978 READYUVA422 /* defined outside this view; NOTE(review): presumably loads Y, U, V and A for 8 pixels */ |
2877 YUVTORGB(ebx) | 2979 YUVTORGB(ebx) /* defined outside this view; NOTE(review): presumably converts to RGB using the constants at ebx */ |
2878 STOREARGB | 2980 STOREARGB /* defined outside this view; NOTE(review): presumably writes 8 ARGB pixels at edx */ |
2879 | 2981 |
2880 sub ecx, 8 | 2982 sub ecx, 8 /* 8 pixels handled per iteration */ |
2881 jg convertloop | 2983 jg convertloop /* repeat while pixels remain */ |
2882 | 2984 |
2883 pop ebp | 2985 pop ebp |
2884 pop ebx | 2986 pop ebx |
2885 pop edi | 2987 pop edi |
2886 pop esi | 2988 pop esi |
2887 ret | 2989 ret |
2888 } | 2990 } |
2889 } | 2991 } |
2890 | 2992 |
2891 // 8 pixels. | 2993 // 8 pixels. |
2892 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ABGR (32 bytes). | 2994 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ABGR. |
2893 __declspec(naked) | 2995 __declspec(naked) |
2894 void I422AlphaToABGRRow_SSSE3(const uint8* y_buf, | 2996 void I422AlphaToABGRRow_SSSE3(const uint8* y_buf, |
2895 const uint8* u_buf, | 2997 const uint8* u_buf, |
2896 const uint8* v_buf, | 2998 const uint8* v_buf, |
2897 const uint8* a_buf, | 2999 const uint8* a_buf, |
2898 uint8* dst_abgr, | 3000 uint8* dst_abgr, |
2899 struct YuvConstants* yuvconstants, | 3001 struct YuvConstants* yuvconstants, |
2900 int width) { | 3002 int width) { |
2901 __asm { | 3003 __asm { |
2902 push esi | 3004 push esi |
2903 push edi | 3005 push edi |
2904 push ebx | 3006 push ebx |
2905 push ebp | 3007 push ebp |
2906 mov eax, [esp + 16 + 4] // Y | 3008 mov eax, [esp + 16 + 4] // Y |
2907 mov esi, [esp + 16 + 8] // U | 3009 mov esi, [esp + 16 + 8] // U |
2908 mov edi, [esp + 16 + 12] // V | 3010 mov edi, [esp + 16 + 12] // V |
2909 mov ebp, [esp + 16 + 16] // A | 3011 mov ebp, [esp + 16 + 16] // A |
2910 mov edx, [esp + 16 + 20] // abgr | 3012 mov edx, [esp + 16 + 20] // abgr |
2911 mov ebx, [esp + 16 + 24] // yuvconstants | 3013 mov ebx, [esp + 16 + 24] // yuvconstants |
2912 mov ecx, [esp + 16 + 28] // width | 3014 mov ecx, [esp + 16 + 28] // width |
2913 sub edi, esi | 3015 sub edi, esi |
2914 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | |
2915 | 3016 |
2916 convertloop: | 3017 convertloop: |
2917 READYUVA422 | 3018 READYUVA422 |
2918 YUVTORGB(ebx) | 3019 YUVTORGB(ebx) |
2919 STOREABGR | 3020 STOREABGR |
2920 | 3021 |
2921 sub ecx, 8 | 3022 sub ecx, 8 |
2922 jg convertloop | 3023 jg convertloop |
2923 | 3024 |
2924 pop ebp | 3025 pop ebp |
(...skipping 3626 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6551 } | 6652 } |
6552 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6653 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
6553 | 6654 |
6554 #endif // defined(_M_X64) | 6655 #endif // defined(_M_X64) |
6555 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6656 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
6556 | 6657 |
6557 #ifdef __cplusplus | 6658 #ifdef __cplusplus |
6558 } // extern "C" | 6659 } // extern "C" |
6559 } // namespace libyuv | 6660 } // namespace libyuv |
6560 #endif | 6661 #endif |
OLD | NEW |