OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 22 matching lines...) Expand all Loading... |
33 xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ | 33 xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ |
34 xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ | 34 xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ |
35 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ | 35 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ |
36 xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ | 36 xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ |
37 u_buf += 4; \ | 37 u_buf += 4; \ |
38 xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ | 38 xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ |
39 xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ | 39 xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ |
40 y_buf += 8; \ | 40 y_buf += 8; \ |
41 | 41 |
42 // Convert 8 pixels: 8 UV and 8 Y. | 42 // Convert 8 pixels: 8 UV and 8 Y. |
43 #define YUVTORGB(YuvConstants) \ | 43 #define YUVTORGB(yuvconstants) \ |
44 xmm1 = _mm_loadu_si128(&xmm0); \ | 44 xmm1 = _mm_loadu_si128(&xmm0); \ |
45 xmm2 = _mm_loadu_si128(&xmm0); \ | 45 xmm2 = _mm_loadu_si128(&xmm0); \ |
46 xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)YuvConstants->kUVToB); \ | 46 xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ |
47 xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)YuvConstants->kUVToG); \ | 47 xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ |
48 xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)YuvConstants->kUVToR); \ | 48 xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ |
49 xmm0 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasB, xmm0); \ | 49 xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ |
50 xmm1 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasG, xmm1); \ | 50 xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ |
51 xmm2 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasR, xmm2); \ | 51 xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ |
52 xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)YuvConstants->kYToRgb); \ | 52 xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ |
53 xmm0 = _mm_adds_epi16(xmm0, xmm4); \ | 53 xmm0 = _mm_adds_epi16(xmm0, xmm4); \ |
54 xmm1 = _mm_adds_epi16(xmm1, xmm4); \ | 54 xmm1 = _mm_adds_epi16(xmm1, xmm4); \ |
55 xmm2 = _mm_adds_epi16(xmm2, xmm4); \ | 55 xmm2 = _mm_adds_epi16(xmm2, xmm4); \ |
56 xmm0 = _mm_srai_epi16(xmm0, 6); \ | 56 xmm0 = _mm_srai_epi16(xmm0, 6); \ |
57 xmm1 = _mm_srai_epi16(xmm1, 6); \ | 57 xmm1 = _mm_srai_epi16(xmm1, 6); \ |
58 xmm2 = _mm_srai_epi16(xmm2, 6); \ | 58 xmm2 = _mm_srai_epi16(xmm2, 6); \ |
59 xmm0 = _mm_packus_epi16(xmm0, xmm0); \ | 59 xmm0 = _mm_packus_epi16(xmm0, xmm0); \ |
60 xmm1 = _mm_packus_epi16(xmm1, xmm1); \ | 60 xmm1 = _mm_packus_epi16(xmm1, xmm1); \ |
61 xmm2 = _mm_packus_epi16(xmm2, xmm2); | 61 xmm2 = _mm_packus_epi16(xmm2, xmm2); |
62 | 62 |
(...skipping 1960 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2023 __declspec(naked) | 2023 __declspec(naked) |
2024 void I422ToARGBRow_AVX2(const uint8* y_buf, | 2024 void I422ToARGBRow_AVX2(const uint8* y_buf, |
2025 const uint8* u_buf, | 2025 const uint8* u_buf, |
2026 const uint8* v_buf, | 2026 const uint8* v_buf, |
2027 uint8* dst_argb, | 2027 uint8* dst_argb, |
2028 struct YuvConstants* yuvconstants, | 2028 struct YuvConstants* yuvconstants, |
2029 int width) { | 2029 int width) { |
2030 __asm { | 2030 __asm { |
2031 push esi | 2031 push esi |
2032 push edi | 2032 push edi |
2033 push ebp | 2033 push ebx |
2034 mov eax, [esp + 12 + 4] // Y | 2034 mov eax, [esp + 12 + 4] // Y |
2035 mov esi, [esp + 12 + 8] // U | 2035 mov esi, [esp + 12 + 8] // U |
2036 mov edi, [esp + 12 + 12] // V | 2036 mov edi, [esp + 12 + 12] // V |
2037 mov edx, [esp + 12 + 16] // argb | 2037 mov edx, [esp + 12 + 16] // argb |
2038 mov ebp, [esp + 12 + 20] // yuvconstants | 2038 mov ebx, [esp + 12 + 20] // yuvconstants |
2039 mov ecx, [esp + 12 + 24] // width | 2039 mov ecx, [esp + 12 + 24] // width |
2040 sub edi, esi | 2040 sub edi, esi |
2041 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2041 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
2042 | 2042 |
2043 convertloop: | 2043 convertloop: |
2044 READYUV422_AVX2 | 2044 READYUV422_AVX2 |
2045 YUVTORGB_AVX2(ebp) | 2045 YUVTORGB_AVX2(ebx) |
2046 STOREARGB_AVX2 | 2046 STOREARGB_AVX2 |
2047 | 2047 |
2048 sub ecx, 16 | 2048 sub ecx, 16 |
2049 jg convertloop | 2049 jg convertloop |
2050 | 2050 |
2051 pop ebp | 2051 pop ebx |
2052 pop edi | 2052 pop edi |
2053 pop esi | 2053 pop esi |
2054 vzeroupper | 2054 vzeroupper |
2055 ret | 2055 ret |
2056 } | 2056 } |
2057 } | 2057 } |
2058 #endif // HAS_I422TOARGBROW_AVX2 | 2058 #endif // HAS_I422TOARGBROW_AVX2 |
2059 | 2059 |
2060 #ifdef HAS_I444TOARGBROW_AVX2 | 2060 #ifdef HAS_I444TOARGBROW_AVX2 |
2061 // 16 pixels | 2061 // 16 pixels |
2062 // 16 UV values with 16 Y producing 16 ARGB (64 bytes). | 2062 // 16 UV values with 16 Y producing 16 ARGB (64 bytes). |
2063 __declspec(naked) | 2063 __declspec(naked) |
2064 void I444ToARGBRow_AVX2(const uint8* y_buf, | 2064 void I444ToARGBRow_AVX2(const uint8* y_buf, |
2065 const uint8* u_buf, | 2065 const uint8* u_buf, |
2066 const uint8* v_buf, | 2066 const uint8* v_buf, |
2067 uint8* dst_argb, | 2067 uint8* dst_argb, |
2068 struct YuvConstants* yuvconstants, | 2068 struct YuvConstants* yuvconstants, |
2069 int width) { | 2069 int width) { |
2070 __asm { | 2070 __asm { |
2071 push esi | 2071 push esi |
2072 push edi | 2072 push edi |
2073 push ebp | 2073 push ebx |
2074 mov eax, [esp + 12 + 4] // Y | 2074 mov eax, [esp + 12 + 4] // Y |
2075 mov esi, [esp + 12 + 8] // U | 2075 mov esi, [esp + 12 + 8] // U |
2076 mov edi, [esp + 12 + 12] // V | 2076 mov edi, [esp + 12 + 12] // V |
2077 mov edx, [esp + 12 + 16] // argb | 2077 mov edx, [esp + 12 + 16] // argb |
2078 mov ebp, [esp + 12 + 20] // yuvconstants | 2078 mov ebx, [esp + 12 + 20] // yuvconstants |
2079 mov ecx, [esp + 12 + 24] // width | 2079 mov ecx, [esp + 12 + 24] // width |
2080 sub edi, esi | 2080 sub edi, esi |
2081 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2081 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
2082 convertloop: | 2082 convertloop: |
2083 READYUV444_AVX2 | 2083 READYUV444_AVX2 |
2084 YUVTORGB_AVX2(ebp) | 2084 YUVTORGB_AVX2(ebx) |
2085 STOREARGB_AVX2 | 2085 STOREARGB_AVX2 |
2086 | 2086 |
2087 sub ecx, 16 | 2087 sub ecx, 16 |
2088 jg convertloop | 2088 jg convertloop |
2089 | 2089 |
2090 pop ebp | 2090 pop ebx |
2091 pop edi | 2091 pop edi |
2092 pop esi | 2092 pop esi |
2093 vzeroupper | 2093 vzeroupper |
2094 ret | 2094 ret |
2095 } | 2095 } |
2096 } | 2096 } |
2097 #endif // HAS_I444TOARGBROW_AVX2 | 2097 #endif // HAS_I444TOARGBROW_AVX2 |
2098 | 2098 |
2099 #ifdef HAS_I444TOABGRROW_AVX2 | 2099 #ifdef HAS_I444TOABGRROW_AVX2 |
2100 // 16 pixels | 2100 // 16 pixels |
2101 // 16 UV values with 16 Y producing 16 ABGR (64 bytes). | 2101 // 16 UV values with 16 Y producing 16 ABGR (64 bytes). |
2102 __declspec(naked) | 2102 __declspec(naked) |
2103 void I444ToABGRRow_AVX2(const uint8* y_buf, | 2103 void I444ToABGRRow_AVX2(const uint8* y_buf, |
2104 const uint8* u_buf, | 2104 const uint8* u_buf, |
2105 const uint8* v_buf, | 2105 const uint8* v_buf, |
2106 uint8* dst_abgr, | 2106 uint8* dst_abgr, |
2107 struct YuvConstants* yuvconstants, | 2107 struct YuvConstants* yuvconstants, |
2108 int width) { | 2108 int width) { |
2109 __asm { | 2109 __asm { |
2110 push esi | 2110 push esi |
2111 push edi | 2111 push edi |
2112 push ebp | 2112 push ebx |
2113 mov eax, [esp + 12 + 4] // Y | 2113 mov eax, [esp + 12 + 4] // Y |
2114 mov esi, [esp + 12 + 8] // U | 2114 mov esi, [esp + 12 + 8] // U |
2115 mov edi, [esp + 12 + 12] // V | 2115 mov edi, [esp + 12 + 12] // V |
2116 mov edx, [esp + 12 + 16] // abgr | 2116 mov edx, [esp + 12 + 16] // abgr |
2117 mov ebp, [esp + 12 + 20] // yuvconstants | 2117 mov ebx, [esp + 12 + 20] // yuvconstants |
2118 mov ecx, [esp + 12 + 24] // width | 2118 mov ecx, [esp + 12 + 24] // width |
2119 sub edi, esi | 2119 sub edi, esi |
2120 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2120 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
2121 convertloop: | 2121 convertloop: |
2122 READYUV444_AVX2 | 2122 READYUV444_AVX2 |
2123 YUVTORGB_AVX2(ebp) | 2123 YUVTORGB_AVX2(ebx) |
2124 STOREABGR_AVX2 | 2124 STOREABGR_AVX2 |
2125 | 2125 |
2126 sub ecx, 16 | 2126 sub ecx, 16 |
2127 jg convertloop | 2127 jg convertloop |
2128 | 2128 |
2129 pop ebp | 2129 pop ebx |
2130 pop edi | 2130 pop edi |
2131 pop esi | 2131 pop esi |
2132 vzeroupper | 2132 vzeroupper |
2133 ret | 2133 ret |
2134 } | 2134 } |
2135 } | 2135 } |
2136 #endif // HAS_I444TOABGRROW_AVX2 | 2136 #endif // HAS_I444TOABGRROW_AVX2 |
2137 | 2137 |
2138 #ifdef HAS_I411TOARGBROW_AVX2 | 2138 #ifdef HAS_I411TOARGBROW_AVX2 |
2139 // 16 pixels | 2139 // 16 pixels |
2140 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2140 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
2141 __declspec(naked) | 2141 __declspec(naked) |
2142 void I411ToARGBRow_AVX2(const uint8* y_buf, | 2142 void I411ToARGBRow_AVX2(const uint8* y_buf, |
2143 const uint8* u_buf, | 2143 const uint8* u_buf, |
2144 const uint8* v_buf, | 2144 const uint8* v_buf, |
2145 uint8* dst_argb, | 2145 uint8* dst_argb, |
2146 struct YuvConstants* yuvconstants, | 2146 struct YuvConstants* yuvconstants, |
2147 int width) { | 2147 int width) { |
2148 __asm { | 2148 __asm { |
2149 push esi | 2149 push esi |
2150 push edi | 2150 push edi |
2151 push ebp | 2151 push ebx |
2152 mov eax, [esp + 12 + 4] // Y | 2152 mov eax, [esp + 12 + 4] // Y |
2153 mov esi, [esp + 12 + 8] // U | 2153 mov esi, [esp + 12 + 8] // U |
2154 mov edi, [esp + 12 + 12] // V | 2154 mov edi, [esp + 12 + 12] // V |
2155 mov edx, [esp + 12 + 16] // argb | 2155 mov edx, [esp + 12 + 16] // argb |
2156 mov ebp, [esp + 12 + 20] // yuvconstants | 2156 mov ebx, [esp + 12 + 20] // yuvconstants |
2157 mov ecx, [esp + 12 + 24] // width | 2157 mov ecx, [esp + 12 + 24] // width |
2158 sub edi, esi | 2158 sub edi, esi |
2159 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2159 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
2160 | 2160 |
2161 convertloop: | 2161 convertloop: |
2162 READYUV411_AVX2 | 2162 READYUV411_AVX2 |
2163 YUVTORGB_AVX2(ebp) | 2163 YUVTORGB_AVX2(ebx) |
2164 STOREARGB_AVX2 | 2164 STOREARGB_AVX2 |
2165 | 2165 |
2166 sub ecx, 16 | 2166 sub ecx, 16 |
2167 jg convertloop | 2167 jg convertloop |
2168 | 2168 |
2169 pop ebp | 2169 pop ebx |
2170 pop edi | 2170 pop edi |
2171 pop esi | 2171 pop esi |
2172 vzeroupper | 2172 vzeroupper |
2173 ret | 2173 ret |
2174 } | 2174 } |
2175 } | 2175 } |
2176 #endif // HAS_I411TOARGBROW_AVX2 | 2176 #endif // HAS_I411TOARGBROW_AVX2 |
2177 | 2177 |
2178 #ifdef HAS_NV12TOARGBROW_AVX2 | 2178 #ifdef HAS_NV12TOARGBROW_AVX2 |
2179 // 16 pixels. | 2179 // 16 pixels. |
2180 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2180 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
2181 __declspec(naked) | 2181 __declspec(naked) |
2182 void NV12ToARGBRow_AVX2(const uint8* y_buf, | 2182 void NV12ToARGBRow_AVX2(const uint8* y_buf, |
2183 const uint8* uv_buf, | 2183 const uint8* uv_buf, |
2184 uint8* dst_argb, | 2184 uint8* dst_argb, |
2185 struct YuvConstants* yuvconstants, | 2185 struct YuvConstants* yuvconstants, |
2186 int width) { | 2186 int width) { |
2187 __asm { | 2187 __asm { |
2188 push esi | 2188 push esi |
2189 push ebp | 2189 push ebx |
2190 mov eax, [esp + 8 + 4] // Y | 2190 mov eax, [esp + 8 + 4] // Y |
2191 mov esi, [esp + 8 + 8] // UV | 2191 mov esi, [esp + 8 + 8] // UV |
2192 mov edx, [esp + 8 + 12] // argb | 2192 mov edx, [esp + 8 + 12] // argb |
2193 mov ebp, [esp + 8 + 16] // yuvconstants | 2193 mov ebx, [esp + 8 + 16] // yuvconstants |
2194 mov ecx, [esp + 8 + 20] // width | 2194 mov ecx, [esp + 8 + 20] // width |
2195 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2195 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
2196 | 2196 |
2197 convertloop: | 2197 convertloop: |
2198 READNV12_AVX2 | 2198 READNV12_AVX2 |
2199 YUVTORGB_AVX2(ebp) | 2199 YUVTORGB_AVX2(ebx) |
2200 STOREARGB_AVX2 | 2200 STOREARGB_AVX2 |
2201 | 2201 |
2202 sub ecx, 16 | 2202 sub ecx, 16 |
2203 jg convertloop | 2203 jg convertloop |
2204 | 2204 |
2205 pop ebp | 2205 pop ebx |
2206 pop esi | 2206 pop esi |
2207 vzeroupper | 2207 vzeroupper |
2208 ret | 2208 ret |
2209 } | 2209 } |
2210 } | 2210 } |
2211 #endif // HAS_NV12TOARGBROW_AVX2 | 2211 #endif // HAS_NV12TOARGBROW_AVX2 |
2212 | 2212 |
2213 // 16 pixels. | 2213 // 16 pixels. |
2214 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). | 2214 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
2215 __declspec(naked) | 2215 __declspec(naked) |
2216 void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, | 2216 void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, |
2217 uint8* dst_argb, | 2217 uint8* dst_argb, |
2218 struct YuvConstants* yuvconstants, | 2218 struct YuvConstants* yuvconstants, |
2219 int width) { | 2219 int width) { |
2220 __asm { | 2220 __asm { |
2221 push ebp | 2221 push ebx |
2222 mov eax, [esp + 4 + 4] // yuy2 | 2222 mov eax, [esp + 4 + 4] // yuy2 |
2223 mov edx, [esp + 4 + 8] // argb | 2223 mov edx, [esp + 4 + 8] // argb |
2224 mov ebp, [esp + 4 + 12] // yuvconstants | 2224 mov ebx, [esp + 4 + 12] // yuvconstants |
2225 mov ecx, [esp + 4 + 16] // width | 2225 mov ecx, [esp + 4 + 16] // width |
2226 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2226 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
2227 | 2227 |
2228 convertloop: | 2228 convertloop: |
2229 READYUY2_AVX2 | 2229 READYUY2_AVX2 |
2230 YUVTORGB_AVX2(ebp) | 2230 YUVTORGB_AVX2(ebx) |
2231 STOREARGB_AVX2 | 2231 STOREARGB_AVX2 |
2232 | 2232 |
2233 sub ecx, 16 | 2233 sub ecx, 16 |
2234 jg convertloop | 2234 jg convertloop |
2235 | 2235 |
2236 pop ebp | 2236 pop ebx |
2237 vzeroupper | 2237 vzeroupper |
2238 ret | 2238 ret |
2239 } | 2239 } |
2240 } | 2240 } |
2241 | 2241 |
2242 // 16 pixels. | 2242 // 16 pixels. |
2243 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). | 2243 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
2244 __declspec(naked) | 2244 __declspec(naked) |
2245 void UYVYToARGBRow_AVX2(const uint8* src_uyvy, | 2245 void UYVYToARGBRow_AVX2(const uint8* src_uyvy, |
2246 uint8* dst_argb, | 2246 uint8* dst_argb, |
2247 struct YuvConstants* yuvconstants, | 2247 struct YuvConstants* yuvconstants, |
2248 int width) { | 2248 int width) { |
2249 __asm { | 2249 __asm { |
2250 push ebp | 2250 push ebx |
2251 mov eax, [esp + 4 + 4] // uyvy | 2251 mov eax, [esp + 4 + 4] // uyvy |
2252 mov edx, [esp + 4 + 8] // argb | 2252 mov edx, [esp + 4 + 8] // argb |
2253 mov ebp, [esp + 4 + 12] // yuvconstants | 2253 mov ebx, [esp + 4 + 12] // yuvconstants |
2254 mov ecx, [esp + 4 + 16] // width | 2254 mov ecx, [esp + 4 + 16] // width |
2255 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2255 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
2256 | 2256 |
2257 convertloop: | 2257 convertloop: |
2258 READUYVY_AVX2 | 2258 READUYVY_AVX2 |
2259 YUVTORGB_AVX2(ebp) | 2259 YUVTORGB_AVX2(ebx) |
2260 STOREARGB_AVX2 | 2260 STOREARGB_AVX2 |
2261 | 2261 |
2262 sub ecx, 16 | 2262 sub ecx, 16 |
2263 jg convertloop | 2263 jg convertloop |
2264 | 2264 |
2265 pop ebp | 2265 pop ebx |
2266 vzeroupper | 2266 vzeroupper |
2267 ret | 2267 ret |
2268 } | 2268 } |
2269 } | 2269 } |
2270 | 2270 |
2271 | 2271 |
2272 #ifdef HAS_I422TOBGRAROW_AVX2 | 2272 #ifdef HAS_I422TOBGRAROW_AVX2 |
2273 // 16 pixels | 2273 // 16 pixels |
2274 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). | 2274 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). |
2275 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. | 2275 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. |
2276 __declspec(naked) | 2276 __declspec(naked) |
2277 void I422ToBGRARow_AVX2(const uint8* y_buf, | 2277 void I422ToBGRARow_AVX2(const uint8* y_buf, |
2278 const uint8* u_buf, | 2278 const uint8* u_buf, |
2279 const uint8* v_buf, | 2279 const uint8* v_buf, |
2280 uint8* dst_argb, | 2280 uint8* dst_argb, |
2281 struct YuvConstants* yuvconstants, | 2281 struct YuvConstants* yuvconstants, |
2282 int width) { | 2282 int width) { |
2283 __asm { | 2283 __asm { |
2284 push esi | 2284 push esi |
2285 push edi | 2285 push edi |
2286 push ebp | 2286 push ebx |
2287 mov eax, [esp + 12 + 4] // Y | 2287 mov eax, [esp + 12 + 4] // Y |
2288 mov esi, [esp + 12 + 8] // U | 2288 mov esi, [esp + 12 + 8] // U |
2289 mov edi, [esp + 12 + 12] // V | 2289 mov edi, [esp + 12 + 12] // V |
2290 mov edx, [esp + 12 + 16] // bgra | 2290 mov edx, [esp + 12 + 16] // bgra |
2291 mov ebp, [esp + 12 + 20] // yuvconstants | 2291 mov ebx, [esp + 12 + 20] // yuvconstants |
2292 mov ecx, [esp + 12 + 24] // width | 2292 mov ecx, [esp + 12 + 24] // width |
2293 sub edi, esi | 2293 sub edi, esi |
2294 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2294 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
2295 | 2295 |
2296 convertloop: | 2296 convertloop: |
2297 READYUV422_AVX2 | 2297 READYUV422_AVX2 |
2298 YUVTORGB_AVX2(ebp) | 2298 YUVTORGB_AVX2(ebx) |
2299 STOREBGRA_AVX2 | 2299 STOREBGRA_AVX2 |
2300 | 2300 |
2301 sub ecx, 16 | 2301 sub ecx, 16 |
2302 jg convertloop | 2302 jg convertloop |
2303 | 2303 |
2304 pop ebp | 2304 pop ebx |
2305 pop edi | 2305 pop edi |
2306 pop esi | 2306 pop esi |
2307 vzeroupper | 2307 vzeroupper |
2308 ret | 2308 ret |
2309 } | 2309 } |
2310 } | 2310 } |
2311 #endif // HAS_I422TOBGRAROW_AVX2 | 2311 #endif // HAS_I422TOBGRAROW_AVX2 |
2312 | 2312 |
2313 #ifdef HAS_I422TORGBAROW_AVX2 | 2313 #ifdef HAS_I422TORGBAROW_AVX2 |
2314 // 16 pixels | 2314 // 16 pixels |
2315 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). | 2315 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). |
2316 __declspec(naked) | 2316 __declspec(naked) |
2317 void I422ToRGBARow_AVX2(const uint8* y_buf, | 2317 void I422ToRGBARow_AVX2(const uint8* y_buf, |
2318 const uint8* u_buf, | 2318 const uint8* u_buf, |
2319 const uint8* v_buf, | 2319 const uint8* v_buf, |
2320 uint8* dst_argb, | 2320 uint8* dst_argb, |
2321 struct YuvConstants* yuvconstants, | 2321 struct YuvConstants* yuvconstants, |
2322 int width) { | 2322 int width) { |
2323 __asm { | 2323 __asm { |
2324 push esi | 2324 push esi |
2325 push edi | 2325 push edi |
2326 push ebp | 2326 push ebx |
2327 mov eax, [esp + 12 + 4] // Y | 2327 mov eax, [esp + 12 + 4] // Y |
2328 mov esi, [esp + 12 + 8] // U | 2328 mov esi, [esp + 12 + 8] // U |
2329 mov edi, [esp + 12 + 12] // V | 2329 mov edi, [esp + 12 + 12] // V |
2330 mov edx, [esp + 12 + 16] // rgba | 2330 mov edx, [esp + 12 + 16] // rgba |
2331 mov ebp, [esp + 12 + 20] // yuvconstants | 2331 mov ebx, [esp + 12 + 20] // yuvconstants |
2332 mov ecx, [esp + 12 + 24] // width | 2332 mov ecx, [esp + 12 + 24] // width |
2333 sub edi, esi | 2333 sub edi, esi |
2334 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2334 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
2335 | 2335 |
2336 convertloop: | 2336 convertloop: |
2337 READYUV422_AVX2 | 2337 READYUV422_AVX2 |
2338 YUVTORGB_AVX2(ebp) | 2338 YUVTORGB_AVX2(ebx) |
2339 STORERGBA_AVX2 | 2339 STORERGBA_AVX2 |
2340 | 2340 |
2341 sub ecx, 16 | 2341 sub ecx, 16 |
2342 jg convertloop | 2342 jg convertloop |
2343 | 2343 |
2344 pop ebp | 2344 pop ebx |
2345 pop edi | 2345 pop edi |
2346 pop esi | 2346 pop esi |
2347 vzeroupper | 2347 vzeroupper |
2348 ret | 2348 ret |
2349 } | 2349 } |
2350 } | 2350 } |
2351 #endif // HAS_I422TORGBAROW_AVX2 | 2351 #endif // HAS_I422TORGBAROW_AVX2 |
2352 | 2352 |
2353 #ifdef HAS_I422TOABGRROW_AVX2 | 2353 #ifdef HAS_I422TOABGRROW_AVX2 |
2354 // 16 pixels | 2354 // 16 pixels |
2355 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). | 2355 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). |
2356 __declspec(naked) | 2356 __declspec(naked) |
2357 void I422ToABGRRow_AVX2(const uint8* y_buf, | 2357 void I422ToABGRRow_AVX2(const uint8* y_buf, |
2358 const uint8* u_buf, | 2358 const uint8* u_buf, |
2359 const uint8* v_buf, | 2359 const uint8* v_buf, |
2360 uint8* dst_argb, | 2360 uint8* dst_argb, |
2361 struct YuvConstants* yuvconstants, | 2361 struct YuvConstants* yuvconstants, |
2362 int width) { | 2362 int width) { |
2363 __asm { | 2363 __asm { |
2364 push esi | 2364 push esi |
2365 push edi | 2365 push edi |
2366 push ebp | 2366 push ebx |
2367 mov eax, [esp + 12 + 4] // Y | 2367 mov eax, [esp + 12 + 4] // Y |
2368 mov esi, [esp + 12 + 8] // U | 2368 mov esi, [esp + 12 + 8] // U |
2369 mov edi, [esp + 12 + 12] // V | 2369 mov edi, [esp + 12 + 12] // V |
2370 mov edx, [esp + 12 + 16] // abgr | 2370 mov edx, [esp + 12 + 16] // abgr |
2371 mov ebp, [esp + 12 + 20] // yuvconstants | 2371 mov ebx, [esp + 12 + 20] // yuvconstants |
2372 mov ecx, [esp + 12 + 24] // width | 2372 mov ecx, [esp + 12 + 24] // width |
2373 sub edi, esi | 2373 sub edi, esi |
2374 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2374 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
2375 | 2375 |
2376 convertloop: | 2376 convertloop: |
2377 READYUV422_AVX2 | 2377 READYUV422_AVX2 |
2378 YUVTORGB_AVX2(ebp) | 2378 YUVTORGB_AVX2(ebx) |
2379 STOREABGR_AVX2 | 2379 STOREABGR_AVX2 |
2380 | 2380 |
2381 sub ecx, 16 | 2381 sub ecx, 16 |
2382 jg convertloop | 2382 jg convertloop |
2383 | 2383 |
2384 pop ebp | 2384 pop ebx |
2385 pop edi | 2385 pop edi |
2386 pop esi | 2386 pop esi |
2387 vzeroupper | 2387 vzeroupper |
2388 ret | 2388 ret |
2389 } | 2389 } |
2390 } | 2390 } |
2391 #endif // HAS_I422TOABGRROW_AVX2 | 2391 #endif // HAS_I422TOABGRROW_AVX2 |
2392 | 2392 |
2393 #if defined(HAS_I422TOARGBROW_SSSE3) | 2393 #if defined(HAS_I422TOARGBROW_SSSE3) |
2394 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. | 2394 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. |
(...skipping 213 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2608 __declspec(naked) | 2608 __declspec(naked) |
2609 void I444ToARGBRow_SSSE3(const uint8* y_buf, | 2609 void I444ToARGBRow_SSSE3(const uint8* y_buf, |
2610 const uint8* u_buf, | 2610 const uint8* u_buf, |
2611 const uint8* v_buf, | 2611 const uint8* v_buf, |
2612 uint8* dst_argb, | 2612 uint8* dst_argb, |
2613 struct YuvConstants* yuvconstants, | 2613 struct YuvConstants* yuvconstants, |
2614 int width) { | 2614 int width) { |
2615 __asm { | 2615 __asm { |
2616 push esi | 2616 push esi |
2617 push edi | 2617 push edi |
2618 push ebp | 2618 push ebx |
2619 mov eax, [esp + 12 + 4] // Y | 2619 mov eax, [esp + 12 + 4] // Y |
2620 mov esi, [esp + 12 + 8] // U | 2620 mov esi, [esp + 12 + 8] // U |
2621 mov edi, [esp + 12 + 12] // V | 2621 mov edi, [esp + 12 + 12] // V |
2622 mov edx, [esp + 12 + 16] // argb | 2622 mov edx, [esp + 12 + 16] // argb |
2623 mov ebp, [esp + 12 + 20] // yuvconstants | 2623 mov ebx, [esp + 12 + 20] // yuvconstants |
2624 mov ecx, [esp + 12 + 24] // width | 2624 mov ecx, [esp + 12 + 24] // width |
2625 sub edi, esi | 2625 sub edi, esi |
2626 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2626 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
2627 | 2627 |
2628 convertloop: | 2628 convertloop: |
2629 READYUV444 | 2629 READYUV444 |
2630 YUVTORGB(ebp) | 2630 YUVTORGB(ebx) |
2631 STOREARGB | 2631 STOREARGB |
2632 | 2632 |
2633 sub ecx, 8 | 2633 sub ecx, 8 |
2634 jg convertloop | 2634 jg convertloop |
2635 | 2635 |
2636 pop ebp | 2636 pop ebx |
2637 pop edi | 2637 pop edi |
2638 pop esi | 2638 pop esi |
2639 ret | 2639 ret |
2640 } | 2640 } |
2641 } | 2641 } |
2642 | 2642 |
2643 // 8 pixels. | 2643 // 8 pixels. |
2644 // 8 UV values, mixed with 8 Y producing 8 ABGR (32 bytes). | 2644 // 8 UV values, mixed with 8 Y producing 8 ABGR (32 bytes). |
2645 __declspec(naked) | 2645 __declspec(naked) |
2646 void I444ToABGRRow_SSSE3(const uint8* y_buf, | 2646 void I444ToABGRRow_SSSE3(const uint8* y_buf, |
2647 const uint8* u_buf, | 2647 const uint8* u_buf, |
2648 const uint8* v_buf, | 2648 const uint8* v_buf, |
2649 uint8* dst_abgr, | 2649 uint8* dst_abgr, |
2650 struct YuvConstants* yuvconstants, | 2650 struct YuvConstants* yuvconstants, |
2651 int width) { | 2651 int width) { |
2652 __asm { | 2652 __asm { |
2653 push esi | 2653 push esi |
2654 push edi | 2654 push edi |
2655 push ebp | 2655 push ebx |
2656 mov eax, [esp + 12 + 4] // Y | 2656 mov eax, [esp + 12 + 4] // Y |
2657 mov esi, [esp + 12 + 8] // U | 2657 mov esi, [esp + 12 + 8] // U |
2658 mov edi, [esp + 12 + 12] // V | 2658 mov edi, [esp + 12 + 12] // V |
2659 mov edx, [esp + 12 + 16] // abgr | 2659 mov edx, [esp + 12 + 16] // abgr |
2660 mov ebp, [esp + 12 + 20] // yuvconstants | 2660 mov ebx, [esp + 12 + 20] // yuvconstants |
2661 mov ecx, [esp + 12 + 24] // width | 2661 mov ecx, [esp + 12 + 24] // width |
2662 sub edi, esi | 2662 sub edi, esi |
2663 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2663 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
2664 | 2664 |
2665 convertloop: | 2665 convertloop: |
2666 READYUV444 | 2666 READYUV444 |
2667 YUVTORGB(ebp) | 2667 YUVTORGB(ebx) |
2668 STOREABGR | 2668 STOREABGR |
2669 | 2669 |
2670 sub ecx, 8 | 2670 sub ecx, 8 |
2671 jg convertloop | 2671 jg convertloop |
2672 | 2672 |
2673 pop ebp | 2673 pop ebx |
2674 pop edi | 2674 pop edi |
2675 pop esi | 2675 pop esi |
2676 ret | 2676 ret |
2677 } | 2677 } |
2678 } | 2678 } |
2679 | 2679 |
2680 // 8 pixels. | 2680 // 8 pixels. |
2681 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). | 2681 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). |
2682 __declspec(naked) | 2682 __declspec(naked) |
2683 void I422ToRGB24Row_SSSE3(const uint8* y_buf, | 2683 void I422ToRGB24Row_SSSE3(const uint8* y_buf, |
2684 const uint8* u_buf, | 2684 const uint8* u_buf, |
2685 const uint8* v_buf, | 2685 const uint8* v_buf, |
2686 uint8* dst_rgb24, | 2686 uint8* dst_rgb24, |
2687 struct YuvConstants* yuvconstants, | 2687 struct YuvConstants* yuvconstants, |
2688 int width) { | 2688 int width) { |
2689 __asm { | 2689 __asm { |
2690 push esi | 2690 push esi |
2691 push edi | 2691 push edi |
2692 push ebp | 2692 push ebx |
2693 mov eax, [esp + 12 + 4] // Y | 2693 mov eax, [esp + 12 + 4] // Y |
2694 mov esi, [esp + 12 + 8] // U | 2694 mov esi, [esp + 12 + 8] // U |
2695 mov edi, [esp + 12 + 12] // V | 2695 mov edi, [esp + 12 + 12] // V |
2696 mov edx, [esp + 12 + 16] // rgb24 | 2696 mov edx, [esp + 12 + 16] // rgb24 |
2697 mov ebp, [esp + 12 + 20] // yuvconstants | 2697 mov ebx, [esp + 12 + 20] // yuvconstants |
2698 mov ecx, [esp + 12 + 24] // width | 2698 mov ecx, [esp + 12 + 24] // width |
2699 sub edi, esi | 2699 sub edi, esi |
2700 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 | 2700 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 |
2701 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 | 2701 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 |
2702 | 2702 |
2703 convertloop: | 2703 convertloop: |
2704 READYUV422 | 2704 READYUV422 |
2705 YUVTORGB(ebp) | 2705 YUVTORGB(ebx) |
2706 STORERGB24 | 2706 STORERGB24 |
2707 | 2707 |
2708 sub ecx, 8 | 2708 sub ecx, 8 |
2709 jg convertloop | 2709 jg convertloop |
2710 | 2710 |
2711 pop ebp | 2711 pop ebx |
2712 pop edi | 2712 pop edi |
2713 pop esi | 2713 pop esi |
2714 ret | 2714 ret |
2715 } | 2715 } |
2716 } | 2716 } |
2717 | 2717 |
2718 // 8 pixels. | 2718 // 8 pixels. |
2719 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). | 2719 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). |
2720 __declspec(naked) | 2720 __declspec(naked) |
2721 void I422ToRAWRow_SSSE3(const uint8* y_buf, | 2721 void I422ToRAWRow_SSSE3(const uint8* y_buf, |
2722 const uint8* u_buf, | 2722 const uint8* u_buf, |
2723 const uint8* v_buf, | 2723 const uint8* v_buf, |
2724 uint8* dst_raw, | 2724 uint8* dst_raw, |
2725 struct YuvConstants* yuvconstants, | 2725 struct YuvConstants* yuvconstants, |
2726 int width) { | 2726 int width) { |
2727 __asm { | 2727 __asm { |
2728 push esi | 2728 push esi |
2729 push edi | 2729 push edi |
2730 push ebp | 2730 push ebx |
2731 mov eax, [esp + 12 + 4] // Y | 2731 mov eax, [esp + 12 + 4] // Y |
2732 mov esi, [esp + 12 + 8] // U | 2732 mov esi, [esp + 12 + 8] // U |
2733 mov edi, [esp + 12 + 12] // V | 2733 mov edi, [esp + 12 + 12] // V |
2734 mov edx, [esp + 12 + 16] // raw | 2734 mov edx, [esp + 12 + 16] // raw |
2735 mov ebp, [esp + 12 + 20] // yuvconstants | 2735 mov ebx, [esp + 12 + 20] // yuvconstants |
2736 mov ecx, [esp + 12 + 24] // width | 2736 mov ecx, [esp + 12 + 24] // width |
2737 sub edi, esi | 2737 sub edi, esi |
2738 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRAW_0 | 2738 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRAW_0 |
2739 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW | 2739 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW |
2740 | 2740 |
2741 convertloop: | 2741 convertloop: |
2742 READYUV422 | 2742 READYUV422 |
2743 YUVTORGB(ebp) | 2743 YUVTORGB(ebx) |
2744 STORERAW | 2744 STORERAW |
2745 | 2745 |
2746 sub ecx, 8 | 2746 sub ecx, 8 |
2747 jg convertloop | 2747 jg convertloop |
2748 | 2748 |
2749 pop ebp | 2749 pop ebx |
2750 pop edi | 2750 pop edi |
2751 pop esi | 2751 pop esi |
2752 ret | 2752 ret |
2753 } | 2753 } |
2754 } | 2754 } |
2755 | 2755 |
2756 // 8 pixels | 2756 // 8 pixels |
2757 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). | 2757 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). |
2758 __declspec(naked) | 2758 __declspec(naked) |
2759 void I422ToRGB565Row_SSSE3(const uint8* y_buf, | 2759 void I422ToRGB565Row_SSSE3(const uint8* y_buf, |
2760 const uint8* u_buf, | 2760 const uint8* u_buf, |
2761 const uint8* v_buf, | 2761 const uint8* v_buf, |
2762 uint8* rgb565_buf, | 2762 uint8* rgb565_buf, |
2763 struct YuvConstants* yuvconstants, | 2763 struct YuvConstants* yuvconstants, |
2764 int width) { | 2764 int width) { |
2765 __asm { | 2765 __asm { |
2766 push esi | 2766 push esi |
2767 push edi | 2767 push edi |
2768 push ebp | 2768 push ebx |
2769 mov eax, [esp + 12 + 4] // Y | 2769 mov eax, [esp + 12 + 4] // Y |
2770 mov esi, [esp + 12 + 8] // U | 2770 mov esi, [esp + 12 + 8] // U |
2771 mov edi, [esp + 12 + 12] // V | 2771 mov edi, [esp + 12 + 12] // V |
2772 mov edx, [esp + 12 + 16] // rgb565 | 2772 mov edx, [esp + 12 + 16] // rgb565 |
2773 mov ebp, [esp + 12 + 20] // yuvconstants | 2773 mov ebx, [esp + 12 + 20] // yuvconstants |
2774 mov ecx, [esp + 12 + 24] // width | 2774 mov ecx, [esp + 12 + 24] // width |
2775 sub edi, esi | 2775 sub edi, esi |
2776 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f | 2776 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f |
2777 psrld xmm5, 27 | 2777 psrld xmm5, 27 |
2778 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 | 2778 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 |
2779 psrld xmm6, 26 | 2779 psrld xmm6, 26 |
2780 pslld xmm6, 5 | 2780 pslld xmm6, 5 |
2781 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 | 2781 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 |
2782 pslld xmm7, 11 | 2782 pslld xmm7, 11 |
2783 | 2783 |
2784 convertloop: | 2784 convertloop: |
2785 READYUV422 | 2785 READYUV422 |
2786 YUVTORGB(ebp) | 2786 YUVTORGB(ebx) |
2787 STORERGB565 | 2787 STORERGB565 |
2788 | 2788 |
2789 sub ecx, 8 | 2789 sub ecx, 8 |
2790 jg convertloop | 2790 jg convertloop |
2791 | 2791 |
2792 pop ebp | 2792 pop ebx |
2793 pop edi | 2793 pop edi |
2794 pop esi | 2794 pop esi |
2795 ret | 2795 ret |
2796 } | 2796 } |
2797 } | 2797 } |
2798 | 2798 |
2799 // 8 pixels. | 2799 // 8 pixels. |
2800 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2800 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
// Naked function: the arguments are read straight off the stack. Three
// registers are pushed first (12 bytes), so argument k lives at
// [esp + 12 + 4 * k], the extra 4 bytes skipping the return address.
// In this diff the yuvconstants pointer moves from ebp (OLD) to ebx (NEW);
// the push/pop pair and the YUVTORGB argument change with it.
2801 __declspec(naked) | 2801 __declspec(naked) |
2802 void I422ToARGBRow_SSSE3(const uint8* y_buf, | 2802 void I422ToARGBRow_SSSE3(const uint8* y_buf, |
2803 const uint8* u_buf, | 2803 const uint8* u_buf, |
2804 const uint8* v_buf, | 2804 const uint8* v_buf, |
2805 uint8* dst_argb, | 2805 uint8* dst_argb, |
2806 struct YuvConstants* yuvconstants, | 2806 struct YuvConstants* yuvconstants, |
2807 int width) { | 2807 int width) { |
2808 __asm { | 2808 __asm { |
2809 push esi | 2809 push esi |
2810 push edi | 2810 push edi |
2811 push ebp | 2811 push ebx |
2812 mov eax, [esp + 12 + 4] // Y | 2812 mov eax, [esp + 12 + 4] // Y |
2813 mov esi, [esp + 12 + 8] // U | 2813 mov esi, [esp + 12 + 8] // U |
2814 mov edi, [esp + 12 + 12] // V | 2814 mov edi, [esp + 12 + 12] // V |
2815 mov edx, [esp + 12 + 16] // argb | 2815 mov edx, [esp + 12 + 16] // argb |
2816 mov ebp, [esp + 12 + 20] // yuvconstants | 2816 mov ebx, [esp + 12 + 20] // yuvconstants |
2817 mov ecx, [esp + 12 + 24] // width | 2817 mov ecx, [esp + 12 + 24] // width |
// edi becomes the V - U pointer difference, presumably so READYUV422 can
// reach V relative to esi - confirm against the macro definition.
2818 sub edi, esi | 2818 sub edi, esi |
2819 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2819 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
2820 | 2820 |
2821 convertloop: | 2821 convertloop: |
2822 READYUV422 | 2822 READYUV422 |
2823 YUVTORGB(ebp) | 2823 YUVTORGB(ebx) |
2824 STOREARGB | 2824 STOREARGB |
2825 | 2825 |
// Each pass accounts for 8 pixels; jg repeats while ecx is still above zero.
// NOTE(review): a width that is not a multiple of 8 still gets a full final
// pass - presumably callers round up or handle the tail; confirm.
2826 sub ecx, 8 | 2826 sub ecx, 8 |
2827 jg convertloop | 2827 jg convertloop |
2828 | 2828 |
// Restore the saved registers in reverse push order before returning.
2829 pop ebp | 2829 pop ebx |
2830 pop edi | 2830 pop edi |
2831 pop esi | 2831 pop esi |
2832 ret | 2832 ret |
2833 } | 2833 } |
2834 } | 2834 } |
2835 | 2835 |
2836 // 8 pixels. | 2836 // 8 pixels. |
2837 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2837 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
2838 // Similar to I420 but duplicate UV once more. | 2838 // Similar to I420 but duplicate UV once more. |
// Naked function: the arguments are read straight off the stack. Three
// registers are pushed first (12 bytes), so argument k lives at
// [esp + 12 + 4 * k], the extra 4 bytes skipping the return address.
// In this diff the yuvconstants pointer moves from ebp (OLD) to ebx (NEW);
// the push/pop pair and the YUVTORGB argument change with it.
2839 __declspec(naked) | 2839 __declspec(naked) |
2840 void I411ToARGBRow_SSSE3(const uint8* y_buf, | 2840 void I411ToARGBRow_SSSE3(const uint8* y_buf, |
2841 const uint8* u_buf, | 2841 const uint8* u_buf, |
2842 const uint8* v_buf, | 2842 const uint8* v_buf, |
2843 uint8* dst_argb, | 2843 uint8* dst_argb, |
2844 struct YuvConstants* yuvconstants, | 2844 struct YuvConstants* yuvconstants, |
2845 int width) { | 2845 int width) { |
2846 __asm { | 2846 __asm { |
2847 push esi | 2847 push esi |
2848 push edi | 2848 push edi |
2849 push ebp | 2849 push ebx |
2850 mov eax, [esp + 12 + 4] // Y | 2850 mov eax, [esp + 12 + 4] // Y |
2851 mov esi, [esp + 12 + 8] // U | 2851 mov esi, [esp + 12 + 8] // U |
2852 mov edi, [esp + 12 + 12] // V | 2852 mov edi, [esp + 12 + 12] // V |
2853 mov edx, [esp + 12 + 16] // argb | 2853 mov edx, [esp + 12 + 16] // argb |
2854 mov ebp, [esp + 12 + 20] // yuvconstants | 2854 mov ebx, [esp + 12 + 20] // yuvconstants |
2855 mov ecx, [esp + 12 + 24] // width | 2855 mov ecx, [esp + 12 + 24] // width |
// edi becomes the V - U pointer difference, presumably so READYUV411 can
// reach V relative to esi - confirm against the macro definition.
2856 sub edi, esi | 2856 sub edi, esi |
2857 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2857 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
2858 | 2858 |
2859 convertloop: | 2859 convertloop: |
2860 READYUV411 | 2860 READYUV411 |
2861 YUVTORGB(ebp) | 2861 YUVTORGB(ebx) |
2862 STOREARGB | 2862 STOREARGB |
2863 | 2863 |
// Each pass accounts for 8 pixels; jg repeats while ecx is still above zero.
// NOTE(review): a width that is not a multiple of 8 still gets a full final
// pass - presumably callers round up or handle the tail; confirm.
2864 sub ecx, 8 | 2864 sub ecx, 8 |
2865 jg convertloop | 2865 jg convertloop |
2866 | 2866 |
// Restore the saved registers in reverse push order before returning.
2867 pop ebp | 2867 pop ebx |
2868 pop edi | 2868 pop edi |
2869 pop esi | 2869 pop esi |
2870 ret | 2870 ret |
2871 } | 2871 } |
2872 } | 2872 } |
2873 | 2873 |
2874 // 8 pixels. | 2874 // 8 pixels. |
2875 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2875 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
// Naked function: the arguments are read straight off the stack. Two
// registers are pushed first (8 bytes), so argument k lives at
// [esp + 8 + 4 * k], the extra 4 bytes skipping the return address.
// In this diff the yuvconstants pointer moves from ebp (OLD) to ebx (NEW);
// the push/pop pair and the YUVTORGB argument change with it.
2876 __declspec(naked) | 2876 __declspec(naked) |
2877 void NV12ToARGBRow_SSSE3(const uint8* y_buf, | 2877 void NV12ToARGBRow_SSSE3(const uint8* y_buf, |
2878 const uint8* uv_buf, | 2878 const uint8* uv_buf, |
2879 uint8* dst_argb, | 2879 uint8* dst_argb, |
2880 struct YuvConstants* yuvconstants, | 2880 struct YuvConstants* yuvconstants, |
2881 int width) { | 2881 int width) { |
2882 __asm { | 2882 __asm { |
2883 push esi | 2883 push esi |
2884 push ebp | 2884 push ebx |
2885 mov eax, [esp + 8 + 4] // Y | 2885 mov eax, [esp + 8 + 4] // Y |
2886 mov esi, [esp + 8 + 8] // UV | 2886 mov esi, [esp + 8 + 8] // UV |
2887 mov edx, [esp + 8 + 12] // argb | 2887 mov edx, [esp + 8 + 12] // argb |
2888 mov ebp, [esp + 8 + 16] // yuvconstants | 2888 mov ebx, [esp + 8 + 16] // yuvconstants |
2889 mov ecx, [esp + 8 + 20] // width | 2889 mov ecx, [esp + 8 + 20] // width |
2890 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2890 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
2891 | 2891 |
2892 convertloop: | 2892 convertloop: |
2893 READNV12 | 2893 READNV12 |
2894 YUVTORGB(ebp) | 2894 YUVTORGB(ebx) |
2895 STOREARGB | 2895 STOREARGB |
2896 | 2896 |
// Each pass accounts for 8 pixels; jg repeats while ecx is still above zero.
// NOTE(review): a width that is not a multiple of 8 still gets a full final
// pass - presumably callers round up or handle the tail; confirm.
2897 sub ecx, 8 | 2897 sub ecx, 8 |
2898 jg convertloop | 2898 jg convertloop |
2899 | 2899 |
// Restore the saved registers in reverse push order before returning.
2900 pop ebp | 2900 pop ebx |
2901 pop esi | 2901 pop esi |
2902 ret | 2902 ret |
2903 } | 2903 } |
2904 } | 2904 } |
2905 | 2905 |
2906 // 8 pixels. | 2906 // 8 pixels. |
2907 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). | 2907 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). |
// Naked function: the arguments are read straight off the stack. One
// register is pushed first (4 bytes), so argument k lives at
// [esp + 4 + 4 * k], the extra 4 bytes skipping the return address.
// In this diff the yuvconstants pointer moves from ebp (OLD) to ebx (NEW);
// the push/pop pair and the YUVTORGB argument change with it.
2908 __declspec(naked) | 2908 __declspec(naked) |
2909 void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, | 2909 void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, |
2910 uint8* dst_argb, | 2910 uint8* dst_argb, |
2911 struct YuvConstants* yuvconstants, | 2911 struct YuvConstants* yuvconstants, |
2912 int width) { | 2912 int width) { |
2913 __asm { | 2913 __asm { |
2914 push ebp | 2914 push ebx |
2915 mov eax, [esp + 4 + 4] // yuy2 | 2915 mov eax, [esp + 4 + 4] // yuy2 |
2916 mov edx, [esp + 4 + 8] // argb | 2916 mov edx, [esp + 4 + 8] // argb |
2917 mov ebp, [esp + 4 + 12] // yuvconstants | 2917 mov ebx, [esp + 4 + 12] // yuvconstants |
2918 mov ecx, [esp + 4 + 16] // width | 2918 mov ecx, [esp + 4 + 16] // width |
2919 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2919 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
2920 | 2920 |
2921 convertloop: | 2921 convertloop: |
2922 READYUY2 | 2922 READYUY2 |
2923 YUVTORGB(ebp) | 2923 YUVTORGB(ebx) |
2924 STOREARGB | 2924 STOREARGB |
2925 | 2925 |
// Each pass accounts for 8 pixels; jg repeats while ecx is still above zero.
// NOTE(review): a width that is not a multiple of 8 still gets a full final
// pass - presumably callers round up or handle the tail; confirm.
2926 sub ecx, 8 | 2926 sub ecx, 8 |
2927 jg convertloop | 2927 jg convertloop |
2928 | 2928 |
// Restore the saved register before returning.
2929 pop ebp | 2929 pop ebx |
2930 ret | 2930 ret |
2931 } | 2931 } |
2932 } | 2932 } |
2933 | 2933 |
2934 // 8 pixels. | 2934 // 8 pixels. |
2935 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). | 2935 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). |
// Naked function: the arguments are read straight off the stack. One
// register is pushed first (4 bytes), so argument k lives at
// [esp + 4 + 4 * k], the extra 4 bytes skipping the return address.
// In this diff the yuvconstants pointer moves from ebp (OLD) to ebx (NEW);
// the push/pop pair and the YUVTORGB argument change with it.
2936 __declspec(naked) | 2936 __declspec(naked) |
2937 void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, | 2937 void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, |
2938 uint8* dst_argb, | 2938 uint8* dst_argb, |
2939 struct YuvConstants* yuvconstants, | 2939 struct YuvConstants* yuvconstants, |
2940 int width) { | 2940 int width) { |
2941 __asm { | 2941 __asm { |
2942 push ebp | 2942 push ebx |
2943 mov eax, [esp + 4 + 4] // uyvy | 2943 mov eax, [esp + 4 + 4] // uyvy |
2944 mov edx, [esp + 4 + 8] // argb | 2944 mov edx, [esp + 4 + 8] // argb |
2945 mov ebp, [esp + 4 + 12] // yuvconstants | 2945 mov ebx, [esp + 4 + 12] // yuvconstants |
2946 mov ecx, [esp + 4 + 16] // width | 2946 mov ecx, [esp + 4 + 16] // width |
2947 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2947 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
2948 | 2948 |
2949 convertloop: | 2949 convertloop: |
2950 READUYVY | 2950 READUYVY |
2951 YUVTORGB(ebp) | 2951 YUVTORGB(ebx) |
2952 STOREARGB | 2952 STOREARGB |
2953 | 2953 |
// Each pass accounts for 8 pixels; jg repeats while ecx is still above zero.
// NOTE(review): a width that is not a multiple of 8 still gets a full final
// pass - presumably callers round up or handle the tail; confirm.
2954 sub ecx, 8 | 2954 sub ecx, 8 |
2955 jg convertloop | 2955 jg convertloop |
2956 | 2956 |
// Restore the saved register before returning.
2957 pop ebp | 2957 pop ebx |
2958 ret | 2958 ret |
2959 } | 2959 } |
2960 } | 2960 } |
2961 | 2961 |
// Converts a row from separate Y, U and V buffers into dst_bgra by looping
// over READYUV422, YUVTORGB and STOREBGRA, with width decremented by 8 per
// pass. The macros are defined outside this view; presumably each pass
// emits 8 BGRA pixels - confirm against STOREBGRA.
// Naked function: the arguments are read straight off the stack. Three
// registers are pushed first (12 bytes), so argument k lives at
// [esp + 12 + 4 * k], the extra 4 bytes skipping the return address.
// In this diff the yuvconstants pointer moves from ebp (OLD) to ebx (NEW);
// the push/pop pair and the YUVTORGB argument change with it.
2962 __declspec(naked) | 2962 __declspec(naked) |
2963 void I422ToBGRARow_SSSE3(const uint8* y_buf, | 2963 void I422ToBGRARow_SSSE3(const uint8* y_buf, |
2964 const uint8* u_buf, | 2964 const uint8* u_buf, |
2965 const uint8* v_buf, | 2965 const uint8* v_buf, |
2966 uint8* dst_bgra, | 2966 uint8* dst_bgra, |
2967 struct YuvConstants* yuvconstants, | 2967 struct YuvConstants* yuvconstants, |
2968 int width) { | 2968 int width) { |
2969 __asm { | 2969 __asm { |
2970 push esi | 2970 push esi |
2971 push edi | 2971 push edi |
2972 push ebp | 2972 push ebx |
2973 mov eax, [esp + 12 + 4] // Y | 2973 mov eax, [esp + 12 + 4] // Y |
2974 mov esi, [esp + 12 + 8] // U | 2974 mov esi, [esp + 12 + 8] // U |
2975 mov edi, [esp + 12 + 12] // V | 2975 mov edi, [esp + 12 + 12] // V |
2976 mov edx, [esp + 12 + 16] // bgra | 2976 mov edx, [esp + 12 + 16] // bgra |
2977 mov ebp, [esp + 12 + 20] // yuvconstants | 2977 mov ebx, [esp + 12 + 20] // yuvconstants |
2978 mov ecx, [esp + 12 + 24] // width | 2978 mov ecx, [esp + 12 + 24] // width |
// edi becomes the V - U pointer difference, presumably so READYUV422 can
// reach V relative to esi - confirm against the macro definition.
2979 sub edi, esi | 2979 sub edi, esi |
2980 | 2980 |
2981 convertloop: | 2981 convertloop: |
2982 READYUV422 | 2982 READYUV422 |
2983 YUVTORGB(ebp) | 2983 YUVTORGB(ebx) |
2984 STOREBGRA | 2984 STOREBGRA |
2985 | 2985 |
// jg repeats the loop while ecx is still above zero after subtracting 8.
// NOTE(review): a width that is not a multiple of 8 still gets a full final
// pass - presumably callers round up or handle the tail; confirm.
2986 sub ecx, 8 | 2986 sub ecx, 8 |
2987 jg convertloop | 2987 jg convertloop |
2988 | 2988 |
// Restore the saved registers in reverse push order before returning.
2989 pop ebp | 2989 pop ebx |
2990 pop edi | 2990 pop edi |
2991 pop esi | 2991 pop esi |
2992 ret | 2992 ret |
2993 } | 2993 } |
2994 } | 2994 } |
2995 | 2995 |
// Converts a row from separate Y, U and V buffers into dst_abgr by looping
// over READYUV422, YUVTORGB and STOREABGR, with width decremented by 8 per
// pass. The macros are defined outside this view; presumably each pass
// emits 8 ABGR pixels - confirm against STOREABGR.
// Naked function: the arguments are read straight off the stack. Three
// registers are pushed first (12 bytes), so argument k lives at
// [esp + 12 + 4 * k], the extra 4 bytes skipping the return address.
// In this diff the yuvconstants pointer moves from ebp (OLD) to ebx (NEW);
// the push/pop pair and the YUVTORGB argument change with it.
2996 __declspec(naked) | 2996 __declspec(naked) |
2997 void I422ToABGRRow_SSSE3(const uint8* y_buf, | 2997 void I422ToABGRRow_SSSE3(const uint8* y_buf, |
2998 const uint8* u_buf, | 2998 const uint8* u_buf, |
2999 const uint8* v_buf, | 2999 const uint8* v_buf, |
3000 uint8* dst_abgr, | 3000 uint8* dst_abgr, |
3001 struct YuvConstants* yuvconstants, | 3001 struct YuvConstants* yuvconstants, |
3002 int width) { | 3002 int width) { |
3003 __asm { | 3003 __asm { |
3004 push esi | 3004 push esi |
3005 push edi | 3005 push edi |
3006 push ebp | 3006 push ebx |
3007 mov eax, [esp + 12 + 4] // Y | 3007 mov eax, [esp + 12 + 4] // Y |
3008 mov esi, [esp + 12 + 8] // U | 3008 mov esi, [esp + 12 + 8] // U |
3009 mov edi, [esp + 12 + 12] // V | 3009 mov edi, [esp + 12 + 12] // V |
3010 mov edx, [esp + 12 + 16] // abgr | 3010 mov edx, [esp + 12 + 16] // abgr |
3011 mov ebp, [esp + 12 + 20] // yuvconstants | 3011 mov ebx, [esp + 12 + 20] // yuvconstants |
3012 mov ecx, [esp + 12 + 24] // width | 3012 mov ecx, [esp + 12 + 24] // width |
// edi becomes the V - U pointer difference, presumably so READYUV422 can
// reach V relative to esi - confirm against the macro definition.
3013 sub edi, esi | 3013 sub edi, esi |
3014 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 3014 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
3015 | 3015 |
3016 convertloop: | 3016 convertloop: |
3017 READYUV422 | 3017 READYUV422 |
3018 YUVTORGB(ebp) | 3018 YUVTORGB(ebx) |
3019 STOREABGR | 3019 STOREABGR |
3020 | 3020 |
// jg repeats the loop while ecx is still above zero after subtracting 8.
// NOTE(review): a width that is not a multiple of 8 still gets a full final
// pass - presumably callers round up or handle the tail; confirm.
3021 sub ecx, 8 | 3021 sub ecx, 8 |
3022 jg convertloop | 3022 jg convertloop |
3023 | 3023 |
// Restore the saved registers in reverse push order before returning.
3024 pop ebp | 3024 pop ebx |
3025 pop edi | 3025 pop edi |
3026 pop esi | 3026 pop esi |
3027 ret | 3027 ret |
3028 } | 3028 } |
3029 } | 3029 } |
3030 | 3030 |
// Converts a row from separate Y, U and V buffers into dst_rgba by looping
// over READYUV422, YUVTORGB and STORERGBA, with width decremented by 8 per
// pass. The macros are defined outside this view; presumably each pass
// emits 8 RGBA pixels - confirm against STORERGBA.
// Naked function: the arguments are read straight off the stack. Three
// registers are pushed first (12 bytes), so argument k lives at
// [esp + 12 + 4 * k], the extra 4 bytes skipping the return address.
// In this diff the yuvconstants pointer moves from ebp (OLD) to ebx (NEW);
// the push/pop pair and the YUVTORGB argument change with it.
3031 __declspec(naked) | 3031 __declspec(naked) |
3032 void I422ToRGBARow_SSSE3(const uint8* y_buf, | 3032 void I422ToRGBARow_SSSE3(const uint8* y_buf, |
3033 const uint8* u_buf, | 3033 const uint8* u_buf, |
3034 const uint8* v_buf, | 3034 const uint8* v_buf, |
3035 uint8* dst_rgba, | 3035 uint8* dst_rgba, |
3036 struct YuvConstants* yuvconstants, | 3036 struct YuvConstants* yuvconstants, |
3037 int width) { | 3037 int width) { |
3038 __asm { | 3038 __asm { |
3039 push esi | 3039 push esi |
3040 push edi | 3040 push edi |
3041 push ebp | 3041 push ebx |
3042 mov eax, [esp + 12 + 4] // Y | 3042 mov eax, [esp + 12 + 4] // Y |
3043 mov esi, [esp + 12 + 8] // U | 3043 mov esi, [esp + 12 + 8] // U |
3044 mov edi, [esp + 12 + 12] // V | 3044 mov edi, [esp + 12 + 12] // V |
3045 mov edx, [esp + 12 + 16] // rgba | 3045 mov edx, [esp + 12 + 16] // rgba |
3046 mov ebp, [esp + 12 + 20] // yuvconstants | 3046 mov ebx, [esp + 12 + 20] // yuvconstants |
3047 mov ecx, [esp + 12 + 24] // width | 3047 mov ecx, [esp + 12 + 24] // width |
// edi becomes the V - U pointer difference, presumably so READYUV422 can
// reach V relative to esi - confirm against the macro definition.
3048 sub edi, esi | 3048 sub edi, esi |
3049 | 3049 |
3050 convertloop: | 3050 convertloop: |
3051 READYUV422 | 3051 READYUV422 |
3052 YUVTORGB(ebp) | 3052 YUVTORGB(ebx) |
3053 STORERGBA | 3053 STORERGBA |
3054 | 3054 |
// jg repeats the loop while ecx is still above zero after subtracting 8.
// NOTE(review): a width that is not a multiple of 8 still gets a full final
// pass - presumably callers round up or handle the tail; confirm.
3055 sub ecx, 8 | 3055 sub ecx, 8 |
3056 jg convertloop | 3056 jg convertloop |
3057 | 3057 |
// Restore the saved registers in reverse push order before returning.
3058 pop ebp | 3058 pop ebx |
3059 pop edi | 3059 pop edi |
3060 pop esi | 3060 pop esi |
3061 ret | 3061 ret |
3062 } | 3062 } |
3063 } | 3063 } |
3064 #endif // HAS_I422TOARGBROW_SSSE3 | 3064 #endif // HAS_I422TOARGBROW_SSSE3 |
3065 | 3065 |
3066 #ifdef HAS_I400TOARGBROW_SSE2 | 3066 #ifdef HAS_I400TOARGBROW_SSE2 |
3067 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). | 3067 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). |
3068 __declspec(naked) | 3068 __declspec(naked) |
(...skipping 3386 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6455 } | 6455 } |
6456 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6456 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
6457 | 6457 |
6458 #endif // defined(_M_X64) | 6458 #endif // defined(_M_X64) |
6459 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6459 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
6460 | 6460 |
6461 #ifdef __cplusplus | 6461 #ifdef __cplusplus |
6462 } // extern "C" | 6462 } // extern "C" |
6463 } // namespace libyuv | 6463 } // namespace libyuv |
6464 #endif | 6464 #endif |
OLD | NEW |