| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 1878 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1889 __asm lea esi, [esi + 8] \ | 1889 __asm lea esi, [esi + 8] \ |
| 1890 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ | 1890 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
| 1891 __asm vpermq ymm0, ymm0, 0xd8 \ | 1891 __asm vpermq ymm0, ymm0, 0xd8 \ |
| 1892 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ | 1892 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
| 1893 __asm vmovdqu xmm4, [eax] /* Y */ \ | 1893 __asm vmovdqu xmm4, [eax] /* Y */ \ |
| 1894 __asm vpermq ymm4, ymm4, 0xd8 \ | 1894 __asm vpermq ymm4, ymm4, 0xd8 \ |
| 1895 __asm vpunpcklbw ymm4, ymm4, ymm4 \ | 1895 __asm vpunpcklbw ymm4, ymm4, ymm4 \ |
| 1896 __asm lea eax, [eax + 16] \ | 1896 __asm lea eax, [eax + 16] \ |
| 1897 } | 1897 } |
| 1898 | 1898 |
| 1899 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. |
| 1900 #define READYUVA422_AVX2 __asm { \ |
| 1901 __asm vmovq xmm0, qword ptr [esi] /* U */ \ |
| 1902 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ |
| 1903 __asm lea esi, [esi + 8] \ |
| 1904 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
| 1905 __asm vpermq ymm0, ymm0, 0xd8 \ |
| 1906 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
| 1907 __asm vmovdqu xmm4, [eax] /* Y */ \ |
| 1908 __asm vpermq ymm4, ymm4, 0xd8 \ |
| 1909 __asm vpunpcklbw ymm4, ymm4, ymm4 \ |
| 1910 __asm lea eax, [eax + 16] \ |
| 1911 __asm vmovdqu xmm5, [ebp] /* A */ \ |
| 1912 __asm vpermq ymm5, ymm5, 0xd8 \ |
| 1913 __asm lea ebp, [ebp + 16] \ |
| 1914 } |
| 1915 |
| 1899 // Read 4 UV from 411, upsample to 16 UV. | 1916 // Read 4 UV from 411, upsample to 16 UV. |
| 1900 #define READYUV411_AVX2 __asm { \ | 1917 #define READYUV411_AVX2 __asm { \ |
| 1901 __asm vmovd xmm0, dword ptr [esi] /* U */ \ | 1918 __asm vmovd xmm0, dword ptr [esi] /* U */ \ |
| 1902 __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \ | 1919 __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \ |
| 1903 __asm lea esi, [esi + 4] \ | 1920 __asm lea esi, [esi + 4] \ |
| 1904 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ | 1921 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
| 1905 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ | 1922 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
| 1906 __asm vpermq ymm0, ymm0, 0xd8 \ | 1923 __asm vpermq ymm0, ymm0, 0xd8 \ |
| 1907 __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ | 1924 __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ |
| 1908 __asm vmovdqu xmm4, [eax] /* Y */ \ | 1925 __asm vmovdqu xmm4, [eax] /* Y */ \ |
| (...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2050 | 2067 |
| 2051 pop ebx | 2068 pop ebx |
| 2052 pop edi | 2069 pop edi |
| 2053 pop esi | 2070 pop esi |
| 2054 vzeroupper | 2071 vzeroupper |
| 2055 ret | 2072 ret |
| 2056 } | 2073 } |
| 2057 } | 2074 } |
| 2058 #endif // HAS_I422TOARGBROW_AVX2 | 2075 #endif // HAS_I422TOARGBROW_AVX2 |
| 2059 | 2076 |
| 2077 #ifdef HAS_I422ALPHATOARGBROW_AVX2 |
| 2078 // 16 pixels |
| 2079 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. |
| 2080 __declspec(naked) |
| 2081 void I422AlphaToARGBRow_AVX2(const uint8* y_buf, |
| 2082 const uint8* u_buf, |
| 2083 const uint8* v_buf, |
| 2084 const uint8* a_buf, |
| 2085 uint8* dst_argb, |
| 2086 struct YuvConstants* yuvconstants, |
| 2087 int width) { |
| 2088 __asm { |
| 2089 push esi |
| 2090 push edi |
| 2091 push ebx |
| 2092 push ebp |
| 2093 mov eax, [esp + 16 + 4] // Y |
| 2094 mov esi, [esp + 16 + 8] // U |
| 2095 mov edi, [esp + 16 + 12] // V |
| 2096 mov ebp, [esp + 16 + 16] // A |
| 2097 mov edx, [esp + 16 + 20] // argb |
| 2098 mov ebx, [esp + 16 + 24] // yuvconstants |
| 2099 mov ecx, [esp + 16 + 28] // width |
| 2100 sub edi, esi |
| 2101 |
| 2102 convertloop: |
| 2103 READYUVA422_AVX2 |
| 2104 YUVTORGB_AVX2(ebx) |
| 2105 STOREARGB_AVX2 |
| 2106 |
| 2107 sub ecx, 16 |
| 2108 jg convertloop |
| 2109 |
| 2110 pop ebp |
| 2111 pop ebx |
| 2112 pop edi |
| 2113 pop esi |
| 2114 vzeroupper |
| 2115 ret |
| 2116 } |
| 2117 } |
| 2118 #endif // HAS_I422ALPHATOARGBROW_AVX2 |
| 2119 |
| 2120 #ifdef HAS_I422ALPHATOABGRROW_AVX2 |
| 2121 // 16 pixels |
| 2122 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR. |
| 2123 __declspec(naked) |
| 2124 void I422AlphaToABGRRow_AVX2(const uint8* y_buf, |
| 2125 const uint8* u_buf, |
| 2126 const uint8* v_buf, |
| 2127 const uint8* a_buf, |
| 2128 uint8* dst_abgr, |
| 2129 struct YuvConstants* yuvconstants, |
| 2130 int width) { |
| 2131 __asm { |
| 2132 push esi |
| 2133 push edi |
| 2134 push ebx |
| 2135 push ebp |
| 2136 mov eax, [esp + 16 + 4] // Y |
| 2137 mov esi, [esp + 16 + 8] // U |
| 2138 mov edi, [esp + 16 + 12] // V |
| 2139 mov ebp, [esp + 16 + 16] // A |
| 2140 mov edx, [esp + 16 + 20] // abgr |
| 2141 mov ebx, [esp + 16 + 24] // yuvconstants |
| 2142 mov ecx, [esp + 16 + 28] // width |
| 2143 sub edi, esi |
| 2144 |
| 2145 convertloop: |
| 2146 READYUVA422_AVX2 |
| 2147 YUVTORGB_AVX2(ebx) |
| 2148 STOREABGR_AVX2 |
| 2149 |
| 2150 sub ecx, 16 |
| 2151 jg convertloop |
| 2152 |
| 2153 pop ebp |
| 2154 pop ebx |
| 2155 pop edi |
| 2156 pop esi |
| 2157 vzeroupper |
| 2158 ret |
| 2159 } |
| 2160 } |
| 2161 #endif // HAS_I422ALPHATOABGRROW_AVX2 |
| 2162 |
| 2060 #ifdef HAS_I444TOARGBROW_AVX2 | 2163 #ifdef HAS_I444TOARGBROW_AVX2 |
| 2061 // 16 pixels | 2164 // 16 pixels |
| 2062 // 16 UV values with 16 Y producing 16 ARGB (64 bytes). | 2165 // 16 UV values with 16 Y producing 16 ARGB (64 bytes). |
| 2063 __declspec(naked) | 2166 __declspec(naked) |
| 2064 void I444ToARGBRow_AVX2(const uint8* y_buf, | 2167 void I444ToARGBRow_AVX2(const uint8* y_buf, |
| 2065 const uint8* u_buf, | 2168 const uint8* u_buf, |
| 2066 const uint8* v_buf, | 2169 const uint8* v_buf, |
| 2067 uint8* dst_argb, | 2170 uint8* dst_argb, |
| 2068 struct YuvConstants* yuvconstants, | 2171 struct YuvConstants* yuvconstants, |
| 2069 int width) { | 2172 int width) { |
| (...skipping 771 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2841 jg convertloop | 2944 jg convertloop |
| 2842 | 2945 |
| 2843 pop ebx | 2946 pop ebx |
| 2844 pop edi | 2947 pop edi |
| 2845 pop esi | 2948 pop esi |
| 2846 ret | 2949 ret |
| 2847 } | 2950 } |
| 2848 } | 2951 } |
| 2849 | 2952 |
| 2850 // 8 pixels. | 2953 // 8 pixels. |
| 2851 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB (32 bytes). | 2954 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. |
| 2852 __declspec(naked) | 2955 __declspec(naked) |
| 2853 void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, | 2956 void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, |
| 2854 const uint8* u_buf, | 2957 const uint8* u_buf, |
| 2855 const uint8* v_buf, | 2958 const uint8* v_buf, |
| 2856 const uint8* a_buf, | 2959 const uint8* a_buf, |
| 2857 uint8* dst_argb, | 2960 uint8* dst_argb, |
| 2858 struct YuvConstants* yuvconstants, | 2961 struct YuvConstants* yuvconstants, |
| 2859 int width) { | 2962 int width) { |
| 2860 __asm { | 2963 __asm { |
| 2861 push esi | 2964 push esi |
| 2862 push edi | 2965 push edi |
| 2863 push ebx | 2966 push ebx |
| 2864 push ebp | 2967 push ebp |
| 2865 mov eax, [esp + 16 + 4] // Y | 2968 mov eax, [esp + 16 + 4] // Y |
| 2866 mov esi, [esp + 16 + 8] // U | 2969 mov esi, [esp + 16 + 8] // U |
| 2867 mov edi, [esp + 16 + 12] // V | 2970 mov edi, [esp + 16 + 12] // V |
| 2868 mov ebp, [esp + 16 + 16] // A | 2971 mov ebp, [esp + 16 + 16] // A |
| 2869 mov edx, [esp + 16 + 20] // argb | 2972 mov edx, [esp + 16 + 20] // argb |
| 2870 mov ebx, [esp + 16 + 24] // yuvconstants | 2973 mov ebx, [esp + 16 + 24] // yuvconstants |
| 2871 mov ecx, [esp + 16 + 28] // width | 2974 mov ecx, [esp + 16 + 28] // width |
| 2872 sub edi, esi | 2975 sub edi, esi |
| 2873 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | |
| 2874 | 2976 |
| 2875 convertloop: | 2977 convertloop: |
| 2876 READYUVA422 | 2978 READYUVA422 |
| 2877 YUVTORGB(ebx) | 2979 YUVTORGB(ebx) |
| 2878 STOREARGB | 2980 STOREARGB |
| 2879 | 2981 |
| 2880 sub ecx, 8 | 2982 sub ecx, 8 |
| 2881 jg convertloop | 2983 jg convertloop |
| 2882 | 2984 |
| 2883 pop ebp | 2985 pop ebp |
| 2884 pop ebx | 2986 pop ebx |
| 2885 pop edi | 2987 pop edi |
| 2886 pop esi | 2988 pop esi |
| 2887 ret | 2989 ret |
| 2888 } | 2990 } |
| 2889 } | 2991 } |
| 2890 | 2992 |
| 2891 // 8 pixels. | 2993 // 8 pixels. |
| 2892 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ABGR (32 bytes). | 2994 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ABGR. |
| 2893 __declspec(naked) | 2995 __declspec(naked) |
| 2894 void I422AlphaToABGRRow_SSSE3(const uint8* y_buf, | 2996 void I422AlphaToABGRRow_SSSE3(const uint8* y_buf, |
| 2895 const uint8* u_buf, | 2997 const uint8* u_buf, |
| 2896 const uint8* v_buf, | 2998 const uint8* v_buf, |
| 2897 const uint8* a_buf, | 2999 const uint8* a_buf, |
| 2898 uint8* dst_abgr, | 3000 uint8* dst_abgr, |
| 2899 struct YuvConstants* yuvconstants, | 3001 struct YuvConstants* yuvconstants, |
| 2900 int width) { | 3002 int width) { |
| 2901 __asm { | 3003 __asm { |
| 2902 push esi | 3004 push esi |
| 2903 push edi | 3005 push edi |
| 2904 push ebx | 3006 push ebx |
| 2905 push ebp | 3007 push ebp |
| 2906 mov eax, [esp + 16 + 4] // Y | 3008 mov eax, [esp + 16 + 4] // Y |
| 2907 mov esi, [esp + 16 + 8] // U | 3009 mov esi, [esp + 16 + 8] // U |
| 2908 mov edi, [esp + 16 + 12] // V | 3010 mov edi, [esp + 16 + 12] // V |
| 2909 mov ebp, [esp + 16 + 16] // A | 3011 mov ebp, [esp + 16 + 16] // A |
| 2910 mov edx, [esp + 16 + 20] // abgr | 3012 mov edx, [esp + 16 + 20] // abgr |
| 2911 mov ebx, [esp + 16 + 24] // yuvconstants | 3013 mov ebx, [esp + 16 + 24] // yuvconstants |
| 2912 mov ecx, [esp + 16 + 28] // width | 3014 mov ecx, [esp + 16 + 28] // width |
| 2913 sub edi, esi | 3015 sub edi, esi |
| 2914 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | |
| 2915 | 3016 |
| 2916 convertloop: | 3017 convertloop: |
| 2917 READYUVA422 | 3018 READYUVA422 |
| 2918 YUVTORGB(ebx) | 3019 YUVTORGB(ebx) |
| 2919 STOREABGR | 3020 STOREABGR |
| 2920 | 3021 |
| 2921 sub ecx, 8 | 3022 sub ecx, 8 |
| 2922 jg convertloop | 3023 jg convertloop |
| 2923 | 3024 |
| 2924 pop ebp | 3025 pop ebp |
| (...skipping 3626 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 6551 } | 6652 } |
| 6552 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6653 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 6553 | 6654 |
| 6554 #endif // defined(_M_X64) | 6655 #endif // defined(_M_X64) |
| 6555 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6656 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
| 6556 | 6657 |
| 6557 #ifdef __cplusplus | 6658 #ifdef __cplusplus |
| 6558 } // extern "C" | 6659 } // extern "C" |
| 6559 } // namespace libyuv | 6660 } // namespace libyuv |
| 6560 #endif | 6661 #endif |
| OLD | NEW |