OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 225 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
236 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 output bytes at positions 0-7, next 4 at positions 12-15; positions 8-11 hold 128u. | 236 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 output bytes at positions 0-7, next 4 at positions 12-15; positions 8-11 hold 128u. |
237 static const uvec8 kShuffleMaskARGBToRGB24_0 = { | 237 static const uvec8 kShuffleMaskARGBToRGB24_0 = { |
238 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u | 238 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u |
239 }; | 239 }; |
240 | 240 |
241 // Shuffle table for converting ARGB to RAW. Takes the 3 low bytes of each 4-byte pixel in reversed order; first 8 output bytes at positions 0-7, next 4 at positions 12-15; positions 8-11 hold 128u. | 241 // Shuffle table for converting ARGB to RAW. Takes the 3 low bytes of each 4-byte pixel in reversed order; first 8 output bytes at positions 0-7, next 4 at positions 12-15; positions 8-11 hold 128u. |
242 static const uvec8 kShuffleMaskARGBToRAW_0 = { | 242 static const uvec8 kShuffleMaskARGBToRAW_0 = { |
243 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u | 243 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u |
244 }; | 244 }; |
245 | 245 |
| 246 // YUY2 shuf 16 Y to 32 Y: selects the even (Y) bytes, each twice. The 16 indices are repeated because vpshufb on ymm shuffles each 128-bit half separately; READYUY2 reads only the first 16 via xmmword ptr. |
| 247 static const lvec8 kShuffleYUY2Y = { |
| 248 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, |
| 249 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 |
| 250 }; |
| 251 |
| 252 // YUY2 shuf 8 UV to 16 UV: selects the odd (U, V) bytes, each UV pair twice. The 16 indices are repeated for the two 128-bit halves of a ymm vpshufb; READYUY2 reads only the first 16 via xmmword ptr. |
| 253 static const lvec8 kShuffleYUY2UV = { |
| 254 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15, |
| 255 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 |
| 256 }; |
| 257 |
| 258 // UYVY shuf 16 Y to 32 Y: selects the odd (Y) bytes, each twice. The 16 indices are repeated for the two 128-bit halves of a ymm vpshufb; READUYVY reads only the first 16 via xmmword ptr. |
| 259 static const lvec8 kShuffleUYVYY = { |
| 260 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, |
| 261 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 |
| 262 }; |
| 263 |
| 264 // UYVY shuf 8 UV to 16 UV: selects the even (U, V) bytes, each UV pair twice. The 16 indices are repeated for the two 128-bit halves of a ymm vpshufb; READUYVY reads only the first 16 via xmmword ptr. |
| 265 static const lvec8 kShuffleUYVYUV = { |
| 266 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, |
| 267 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 |
| 268 }; |
| 269 |
246 // Duplicates gray value 3 times and fills in alpha opaque. | 270 // Duplicates gray value 3 times and fills in alpha opaque. |
247 __declspec(naked) | 271 __declspec(naked) |
248 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { | 272 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { |
249 __asm { | 273 __asm { |
250 mov eax, [esp + 4] // src_y | 274 mov eax, [esp + 4] // src_y |
251 mov edx, [esp + 8] // dst_argb | 275 mov edx, [esp + 8] // dst_argb |
252 mov ecx, [esp + 12] // pix | 276 mov ecx, [esp + 12] // pix |
253 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 | 277 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
254 pslld xmm5, 24 | 278 pslld xmm5, 24 |
255 | 279 |
(...skipping 1636 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1892 __asm vmovdqu xmm0, [esi] /* UV */ \ | 1916 __asm vmovdqu xmm0, [esi] /* UV */ \ |
1893 __asm lea esi, [esi + 16] \ | 1917 __asm lea esi, [esi + 16] \ |
1894 __asm vpermq ymm0, ymm0, 0xd8 \ | 1918 __asm vpermq ymm0, ymm0, 0xd8 \ |
1895 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ | 1919 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
1896 __asm vmovdqu xmm4, [eax] /* Y */ \ | 1920 __asm vmovdqu xmm4, [eax] /* Y */ \ |
1897 __asm vpermq ymm4, ymm4, 0xd8 \ | 1921 __asm vpermq ymm4, ymm4, 0xd8 \ |
1898 __asm vpunpcklbw ymm4, ymm4, ymm4 \ | 1922 __asm vpunpcklbw ymm4, ymm4, ymm4 \ |
1899 __asm lea eax, [eax + 16] \ | 1923 __asm lea eax, [eax + 16] \ |
1900 } | 1924 } |
1901 | 1925 |
| 1926 // Read 8 YUY2 (32 bytes at [eax]) with 16 Y, each duplicated into ymm4, and upsample 8 UV to 16 UV in ymm0; advances eax by 32. |
| 1927 #define READYUY2_AVX2 __asm { \ |
| 1928 __asm vmovdqu ymm4, [eax] /* YUY2: load 32 bytes */ \ |
| 1929 __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y /* keep Y bytes, each twice */ \ |
| 1930 __asm vmovdqu ymm0, [eax] /* UV: reload the same 32 bytes */ \ |
| 1931 __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV /* keep UV bytes, each pair twice */ \ |
| 1932 __asm lea eax, [eax + 32] /* step past the 32 source bytes */ \ |
| 1933 } |
| 1934 |
| 1935 // Read 8 UYVY (32 bytes at [eax]) with 16 Y, each duplicated into ymm4, and upsample 8 UV to 16 UV in ymm0; advances eax by 32. |
| 1936 #define READUYVY_AVX2 __asm { \ |
| 1937 __asm vmovdqu ymm4, [eax] /* UYVY: load 32 bytes */ \ |
| 1938 __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY /* keep Y bytes, each twice */ \ |
| 1939 __asm vmovdqu ymm0, [eax] /* UV: reload the same 32 bytes */ \ |
| 1940 __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV /* keep UV bytes, each pair twice */ \ |
| 1941 __asm lea eax, [eax + 32] /* step past the 32 source bytes */ \ |
| 1942 } |
| 1943 |
1902 // Convert 16 pixels: 16 UV and 16 Y. | 1944 // Convert 16 pixels: 16 UV and 16 Y. |
1903 #define YUVTORGB_AVX2(YuvConstants) __asm { \ | 1945 #define YUVTORGB_AVX2(YuvConstants) __asm { \ |
1904 __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\ | 1946 __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\ |
1905 __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\ | 1947 __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\ |
1906 __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\ | 1948 __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\ |
1907 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \ | 1949 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \ |
1908 __asm vpsubw ymm2, ymm3, ymm2 \ | 1950 __asm vpsubw ymm2, ymm3, ymm2 \ |
1909 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \ | 1951 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \ |
1910 __asm vpsubw ymm1, ymm3, ymm1 \ | 1952 __asm vpsubw ymm1, ymm3, ymm1 \ |
1911 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ | 1953 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ |
(...skipping 249 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2161 jg convertloop | 2203 jg convertloop |
2162 | 2204 |
2163 pop ebp | 2205 pop ebp |
2164 pop esi | 2206 pop esi |
2165 vzeroupper | 2207 vzeroupper |
2166 ret | 2208 ret |
2167 } | 2209 } |
2168 } | 2210 } |
2169 #endif // HAS_NV12TOARGBROW_AVX2 | 2211 #endif // HAS_NV12TOARGBROW_AVX2 |
2170 | 2212 |
| 2213 // 16 pixels per loop iteration. The loop repeats while the remaining count is > 0, so the iteration count is width rounded up to a multiple of 16 (at least one). |
| 2214 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). NOTE(review): no HAS_* #ifdef guard here, unlike the neighbouring AVX2 rows - confirm intended. |
| 2215 __declspec(naked) |
| 2216 void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, |
| 2217 uint8* dst_argb, |
| 2218 struct YuvConstants* yuvconstants, |
| 2219 int width) { |
| 2220 __asm { |
| 2221 push ebp /* ebp is reused below to hold yuvconstants */ |
| 2222 mov eax, [esp + 4 + 4] // src_yuy2 (the first + 4 skips the ebp pushed above) |
| 2223 mov edx, [esp + 4 + 8] // dst_argb |
| 2224 mov ebp, [esp + 4 + 12] // yuvconstants |
| 2225 mov ecx, [esp + 4 + 16] // width |
| 2226 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha (presumably consumed by STOREARGB_AVX2, which is defined outside this view) |
| 2227 |
| 2228 convertloop: |
| 2229 READYUY2_AVX2 /* 32 bytes at [eax] -> Y in ymm4, UV in ymm0; eax += 32 */ |
| 2230 YUVTORGB_AVX2(ebp) /* ebp = yuvconstants */ |
| 2231 STOREARGB_AVX2 |
| 2232 |
| 2233 sub ecx, 16 /* 16 pixels handled this iteration */ |
| 2234 jg convertloop /* repeat while the remaining count is > 0 */ |
| 2235 |
| 2236 pop ebp |
| 2237 vzeroupper /* zero the upper halves of the ymm registers before returning */ |
| 2238 ret |
| 2239 } |
| 2240 } |
| 2241 |
| 2242 // 16 pixels per loop iteration. The loop repeats while the remaining count is > 0, so the iteration count is width rounded up to a multiple of 16 (at least one). |
| 2243 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). NOTE(review): no HAS_* #ifdef guard here, unlike the neighbouring AVX2 rows - confirm intended. |
| 2244 __declspec(naked) |
| 2245 void UYVYToARGBRow_AVX2(const uint8* src_uyvy, |
| 2246 uint8* dst_argb, |
| 2247 struct YuvConstants* yuvconstants, |
| 2248 int width) { |
| 2249 __asm { |
| 2250 push ebp /* ebp is reused below to hold yuvconstants */ |
| 2251 mov eax, [esp + 4 + 4] // src_uyvy (the first + 4 skips the ebp pushed above) |
| 2252 mov edx, [esp + 4 + 8] // dst_argb |
| 2253 mov ebp, [esp + 4 + 12] // yuvconstants |
| 2254 mov ecx, [esp + 4 + 16] // width |
| 2255 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha (presumably consumed by STOREARGB_AVX2, which is defined outside this view) |
| 2256 |
| 2257 convertloop: |
| 2258 READUYVY_AVX2 /* 32 bytes at [eax] -> Y in ymm4, UV in ymm0; eax += 32 */ |
| 2259 YUVTORGB_AVX2(ebp) /* ebp = yuvconstants */ |
| 2260 STOREARGB_AVX2 |
| 2261 |
| 2262 sub ecx, 16 /* 16 pixels handled this iteration */ |
| 2263 jg convertloop /* repeat while the remaining count is > 0 */ |
| 2264 |
| 2265 pop ebp |
| 2266 vzeroupper /* zero the upper halves of the ymm registers before returning */ |
| 2267 ret |
| 2268 } |
| 2269 } |
| 2270 |
| 2271 |
2171 #ifdef HAS_I422TOBGRAROW_AVX2 | 2272 #ifdef HAS_I422TOBGRAROW_AVX2 |
2172 // 16 pixels | 2273 // 16 pixels |
2173 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). | 2274 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). |
2174 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. | 2275 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. |
2175 __declspec(naked) | 2276 __declspec(naked) |
2176 void I422ToBGRARow_AVX2(const uint8* y_buf, | 2277 void I422ToBGRARow_AVX2(const uint8* y_buf, |
2177 const uint8* u_buf, | 2278 const uint8* u_buf, |
2178 const uint8* v_buf, | 2279 const uint8* v_buf, |
2179 uint8* dst_argb, | 2280 uint8* dst_argb, |
2180 struct YuvConstants* yuvconstants, | 2281 struct YuvConstants* yuvconstants, |
(...skipping 150 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2331 // Read 4 UV from NV12 at [esi] and upsample to 8 UV in xmm0; also read 8 Y at [eax] and duplicate each byte into xmm4. Advances esi and eax by 8. | 2432 // Read 4 UV from NV12 at [esi] and upsample to 8 UV in xmm0; also read 8 Y at [eax] and duplicate each byte into xmm4. Advances esi and eax by 8. |
2332 #define READNV12 __asm { \ | 2433 #define READNV12 __asm { \ |
2333 __asm movq xmm0, qword ptr [esi] /* UV */ \ | 2434 __asm movq xmm0, qword ptr [esi] /* UV */ \ |
2334 __asm lea esi, [esi + 8] \ | 2435 __asm lea esi, [esi + 8] \ |
2335 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ | 2436 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
2336 __asm movq xmm4, qword ptr [eax] /* 8 Y */ \ | 2437 __asm movq xmm4, qword ptr [eax] /* 8 Y */ \ |
2337 __asm punpcklbw xmm4, xmm4 /* YY: each Y byte twice */ \ | 2438 __asm punpcklbw xmm4, xmm4 /* YY: each Y byte twice */ \ |
2338 __asm lea eax, [eax + 8] \ | 2439 __asm lea eax, [eax + 8] \ |
2339 } | 2440 } |
2340 | 2441 |
2341 // YUY2 shuf 8 Y to 16 Y. | 2442 // Read 4 YUY2 (16 bytes at [eax]) with 8 Y, each duplicated into xmm4, and upsample 4 UV to 8 UV in xmm0; advances eax by 16. |
2342 static const vec8 kShuffleYUY2Y = { | |
2343 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 | |
2344 }; | |
2345 | |
2346 // YUY2 shuf 4 UV to 8 UV. | |
2347 static const vec8 kShuffleYUY2UV = { | |
2348 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 | |
2349 }; | |
2350 | |
2351 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. | |
2352 #define READYUY2 __asm { \ | 2443 #define READYUY2 __asm { \ |
2353 __asm movdqu xmm4, [eax] /* YUY2: load 16 bytes */ \ | 2444 __asm movdqu xmm4, [eax] /* YUY2: load 16 bytes */ \ |
2354 __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y /* keep Y bytes, each twice */ \ | 2445 __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y /* keep Y bytes, each twice */ \ |
2355 __asm movdqu xmm0, [eax] /* UV: reload the same 16 bytes */ \ | 2446 __asm movdqu xmm0, [eax] /* UV: reload the same 16 bytes */ \ |
2356 __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV /* keep UV bytes, each pair twice */ \ | 2447 __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV /* keep UV bytes, each pair twice */ \ |
2357 __asm lea eax, [eax + 16] /* step past the 16 source bytes */ \ | 2448 __asm lea eax, [eax + 16] /* step past the 16 source bytes */ \ |
2358 } | 2449 } |
2359 | 2450 |
2360 // UYVY shuf 8 Y to 16 Y. | 2451 // Read 4 UYVY (16 bytes at [eax]) with 8 Y, each duplicated into xmm4, and upsample 4 UV to 8 UV in xmm0; advances eax by 16. |
2361 static const vec8 kShuffleUYVYY = { | |
2362 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 | |
2363 }; | |
2364 | |
2365 // UYVY shuf 4 UV to 8 UV. | |
2366 static const vec8 kShuffleUYVYUV = { | |
2367 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 | |
2368 }; | |
2369 | |
2370 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. | |
2371 #define READUYVY __asm { \ | 2452 #define READUYVY __asm { \ |
2372 __asm movdqu xmm4, [eax] /* UYVY: load 16 bytes */ \ | 2453 __asm movdqu xmm4, [eax] /* UYVY: load 16 bytes */ \ |
2373 __asm pshufb xmm4, xmmword ptr kShuffleUYVYY /* keep Y bytes, each twice */ \ | 2454 __asm pshufb xmm4, xmmword ptr kShuffleUYVYY /* keep Y bytes, each twice */ \ |
2374 __asm movdqu xmm0, [eax] /* UV: reload the same 16 bytes */ \ | 2455 __asm movdqu xmm0, [eax] /* UV: reload the same 16 bytes */ \ |
2375 __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV /* keep UV bytes, each pair twice */ \ | 2456 __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV /* keep UV bytes, each pair twice */ \ |
2376 __asm lea eax, [eax + 16] /* step past the 16 source bytes */ \ | 2457 __asm lea eax, [eax + 16] /* step past the 16 source bytes */ \ |
2377 __asm lea eax, [eax + 8] /* OLD only: second advance, 24 bytes stepped per 16 loaded; removed in NEW */ \ | |
2378 } | 2458 } |
2379 | 2459 |
2380 // Convert 8 pixels: 8 UV and 8 Y. | 2460 // Convert 8 pixels: 8 UV and 8 Y. |
2381 #define YUVTORGB(YuvConstants) __asm { \ | 2461 #define YUVTORGB(YuvConstants) __asm { \ |
2382 __asm movdqa xmm1, xmm0 \ | 2462 __asm movdqa xmm1, xmm0 \ |
2383 __asm movdqa xmm2, xmm0 \ | 2463 __asm movdqa xmm2, xmm0 \ |
2384 __asm movdqa xmm3, xmm0 \ | 2464 __asm movdqa xmm3, xmm0 \ |
2385 __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \ | 2465 __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \ |
2386 __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \ | 2466 __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \ |
2387 __asm psubw xmm0, xmm1 \ | 2467 __asm psubw xmm0, xmm1 \ |
(...skipping 3987 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6375 } | 6455 } |
6376 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6456 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
6377 | 6457 |
6378 #endif // defined(_M_X64) | 6458 #endif // defined(_M_X64) |
6379 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6459 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
6380 | 6460 |
6381 #ifdef __cplusplus | 6461 #ifdef __cplusplus |
6382 } // extern "C" | 6462 } // extern "C" |
6383 } // namespace libyuv | 6463 } // namespace libyuv |
6384 #endif | 6464 #endif |
OLD | NEW |