Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(80)

Side by Side Diff: source/row_win.cc

Issue 1364813002: yuy2 avx2 initial change (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: avx2 yuy2/uyvy to argb Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/row_gcc.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 225 matching lines...) Expand 10 before | Expand all | Expand 10 after
236 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 236 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
237 static const uvec8 kShuffleMaskARGBToRGB24_0 = { 237 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
238 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u 238 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
239 }; 239 };
240 240
241 // Shuffle table for converting ARGB to RAW. 241 // Shuffle table for converting ARGB to RAW.
242 static const uvec8 kShuffleMaskARGBToRAW_0 = { 242 static const uvec8 kShuffleMaskARGBToRAW_0 = {
243 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u 243 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
244 }; 244 };
245 245
246 // YUY2 shuf 16 Y to 32 Y. Picks the even-offset bytes (0, 2, ... 14) and emits each one twice; both 16-entry rows are identical. The SSSE3 READYUY2 macro references this table as an xmmword, i.e. only the first row.
247 static const lvec8 kShuffleYUY2Y = {
248 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
249 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
250 };
251
252 // YUY2 shuf 8 UV to 16 UV. Picks the odd-offset byte pairs (1,3), (5,7), (9,11), (13,15) and emits each pair twice; both 16-entry rows are identical. The SSSE3 READYUY2 macro references this table as an xmmword, i.e. only the first row.
253 static const lvec8 kShuffleYUY2UV = {
254 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
255 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
256 };
257
258 // UYVY shuf 16 Y to 32 Y. Picks the odd-offset bytes (1, 3, ... 15) and emits each one twice; both 16-entry rows are identical. The SSSE3 READUYVY macro references this table as an xmmword, i.e. only the first row.
259 static const lvec8 kShuffleUYVYY = {
260 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
261 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
262 };
263
264 // UYVY shuf 8 UV to 16 UV. Picks the even-offset byte pairs (0,2), (4,6), (8,10), (12,14) and emits each pair twice; both 16-entry rows are identical. The SSSE3 READUYVY macro references this table as an xmmword, i.e. only the first row.
265 static const lvec8 kShuffleUYVYUV = {
266 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
267 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
268 };
269
246 // Duplicates gray value 3 times and fills in alpha opaque. 270 // Duplicates gray value 3 times and fills in alpha opaque.
247 __declspec(naked) 271 __declspec(naked)
248 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { 272 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
249 __asm { 273 __asm {
250 mov eax, [esp + 4] // src_y 274 mov eax, [esp + 4] // src_y
251 mov edx, [esp + 8] // dst_argb 275 mov edx, [esp + 8] // dst_argb
252 mov ecx, [esp + 12] // pix 276 mov ecx, [esp + 12] // pix
253 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 277 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
254 pslld xmm5, 24 278 pslld xmm5, 24
255 279
(...skipping 1636 matching lines...) Expand 10 before | Expand all | Expand 10 after
1892 __asm vmovdqu xmm0, [esi] /* UV */ \ 1916 __asm vmovdqu xmm0, [esi] /* UV */ \
1893 __asm lea esi, [esi + 16] \ 1917 __asm lea esi, [esi + 16] \
1894 __asm vpermq ymm0, ymm0, 0xd8 \ 1918 __asm vpermq ymm0, ymm0, 0xd8 \
1895 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ 1919 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1896 __asm vmovdqu xmm4, [eax] /* Y */ \ 1920 __asm vmovdqu xmm4, [eax] /* Y */ \
1897 __asm vpermq ymm4, ymm4, 0xd8 \ 1921 __asm vpermq ymm4, ymm4, 0xd8 \
1898 __asm vpunpcklbw ymm4, ymm4, ymm4 \ 1922 __asm vpunpcklbw ymm4, ymm4, ymm4 \
1899 __asm lea eax, [eax + 16] \ 1923 __asm lea eax, [eax + 16] \
1900 } 1924 }
1901 1925
1926 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. Loads the same 32 bytes at [eax] twice: ymm4 receives the Y bytes (each doubled via kShuffleYUY2Y), ymm0 receives the UV pairs (each pair doubled via kShuffleYUY2UV). Advances eax by 32.
1927 #define READYUY2_AVX2 __asm { \
1928 __asm vmovdqu ymm4, [eax] /* YUY2 */ \
1929 __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y /* keep Y bytes, each twice */ \
1930 __asm vmovdqu ymm0, [eax] /* UV */ \
1931 __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV /* keep UV pairs, each twice */ \
1932 __asm lea eax, [eax + 32] /* step past the 32 source bytes just read */ \
1933 }
1934
1935 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. Loads the same 32 bytes at [eax] twice: ymm4 receives the Y bytes (each doubled via kShuffleUYVYY), ymm0 receives the UV pairs (each pair doubled via kShuffleUYVYUV). Advances eax by 32.
1936 #define READUYVY_AVX2 __asm { \
1937 __asm vmovdqu ymm4, [eax] /* UYVY */ \
1938 __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY /* keep Y bytes, each twice */ \
1939 __asm vmovdqu ymm0, [eax] /* UV */ \
1940 __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV /* keep UV pairs, each twice */ \
1941 __asm lea eax, [eax + 32] /* step past the 32 source bytes just read */ \
1942 }
1943
1902 // Convert 16 pixels: 16 UV and 16 Y. 1944 // Convert 16 pixels: 16 UV and 16 Y.
1903 #define YUVTORGB_AVX2(YuvConstants) __asm { \ 1945 #define YUVTORGB_AVX2(YuvConstants) __asm { \
1904 __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\ 1946 __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
1905 __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\ 1947 __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
1906 __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\ 1948 __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
1907 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \ 1949 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
1908 __asm vpsubw ymm2, ymm3, ymm2 \ 1950 __asm vpsubw ymm2, ymm3, ymm2 \
1909 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \ 1951 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
1910 __asm vpsubw ymm1, ymm3, ymm1 \ 1952 __asm vpsubw ymm1, ymm3, ymm1 \
1911 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ 1953 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
(...skipping 249 matching lines...) Expand 10 before | Expand all | Expand 10 after
2161 jg convertloop 2203 jg convertloop
2162 2204
2163 pop ebp 2205 pop ebp
2164 pop esi 2206 pop esi
2165 vzeroupper 2207 vzeroupper
2166 ret 2208 ret
2167 } 2209 }
2168 } 2210 }
2169 #endif // HAS_NV12TOARGBROW_AVX2 2211 #endif // HAS_NV12TOARGBROW_AVX2
2170 2212
2213 // Converts a row of YUY2 to ARGB, 16 pixels per loop pass.
2214 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). NOTE(review): the neighbouring AVX2 row functions in this file sit inside #ifdef HAS_..._AVX2 guards; this one does not - confirm whether a guard is intended.
2215 __declspec(naked)
2216 void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
2217 uint8* dst_argb,
2218 struct YuvConstants* yuvconstants,
2219 int width) {
2220 __asm {
2221 push ebp // ebp is reused below to hold yuvconstants; restored before ret
2222 mov eax, [esp + 4 + 4] // yuy2 (the extra 4 in each offset skips the ebp pushed above)
2223 mov edx, [esp + 4 + 8] // argb
2224 mov ebp, [esp + 4 + 12] // yuvconstants
2225 mov ecx, [esp + 4 + 16] // width
2226 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2227
2228 convertloop:
2229 READYUY2_AVX2 // ymm4 = Y, ymm0 = UV; eax += 32
2230 YUVTORGB_AVX2(ebp)
2231 STOREARGB_AVX2 // presumably writes 64 bytes of ARGB at edx and advances it - macro body not visible here, confirm
2232
2233 sub ecx, 16 // 16 pixels handled per pass
2234 jg convertloop // NOTE(review): a full 16-pixel pass runs even when fewer than 16 remain - presumably callers pass a multiple of 16 or padded buffers, confirm
2235
2236 pop ebp
2237 vzeroupper
2238 ret
2239 }
2240 }
2241
2242 // Converts a row of UYVY to ARGB, 16 pixels per loop pass.
2243 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). NOTE(review): the neighbouring AVX2 row functions in this file sit inside #ifdef HAS_..._AVX2 guards; this one does not - confirm whether a guard is intended.
2244 __declspec(naked)
2245 void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
2246 uint8* dst_argb,
2247 struct YuvConstants* yuvconstants,
2248 int width) {
2249 __asm {
2250 push ebp // ebp is reused below to hold yuvconstants; restored before ret
2251 mov eax, [esp + 4 + 4] // uyvy (the extra 4 in each offset skips the ebp pushed above)
2252 mov edx, [esp + 4 + 8] // argb
2253 mov ebp, [esp + 4 + 12] // yuvconstants
2254 mov ecx, [esp + 4 + 16] // width
2255 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2256
2257 convertloop:
2258 READUYVY_AVX2 // ymm4 = Y, ymm0 = UV; eax += 32
2259 YUVTORGB_AVX2(ebp)
2260 STOREARGB_AVX2 // presumably writes 64 bytes of ARGB at edx and advances it - macro body not visible here, confirm
2261
2262 sub ecx, 16 // 16 pixels handled per pass
2263 jg convertloop // NOTE(review): a full 16-pixel pass runs even when fewer than 16 remain - presumably callers pass a multiple of 16 or padded buffers, confirm
2264
2265 pop ebp
2266 vzeroupper
2267 ret
2268 }
2269 }
2270
2271
2171 #ifdef HAS_I422TOBGRAROW_AVX2 2272 #ifdef HAS_I422TOBGRAROW_AVX2
2172 // 16 pixels 2273 // 16 pixels
2173 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). 2274 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
2174 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. 2275 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
2175 __declspec(naked) 2276 __declspec(naked)
2176 void I422ToBGRARow_AVX2(const uint8* y_buf, 2277 void I422ToBGRARow_AVX2(const uint8* y_buf,
2177 const uint8* u_buf, 2278 const uint8* u_buf,
2178 const uint8* v_buf, 2279 const uint8* v_buf,
2179 uint8* dst_argb, 2280 uint8* dst_argb,
2180 struct YuvConstants* yuvconstants, 2281 struct YuvConstants* yuvconstants,
(...skipping 150 matching lines...) Expand 10 before | Expand all | Expand 10 after
2331 // Read 4 UV from NV12, upsample to 8 UV. 2432 // Read 4 UV from NV12, upsample to 8 UV.
2332 #define READNV12 __asm { \ 2433 #define READNV12 __asm { \
2333 __asm movq xmm0, qword ptr [esi] /* UV */ \ 2434 __asm movq xmm0, qword ptr [esi] /* UV */ \
2334 __asm lea esi, [esi + 8] \ 2435 __asm lea esi, [esi + 8] \
2335 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ 2436 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2336 __asm movq xmm4, qword ptr [eax] \ 2437 __asm movq xmm4, qword ptr [eax] \
2337 __asm punpcklbw xmm4, xmm4 \ 2438 __asm punpcklbw xmm4, xmm4 \
2338 __asm lea eax, [eax + 8] \ 2439 __asm lea eax, [eax + 8] \
2339 } 2440 }
2340 2441
2341 // YUY2 shuf 8 Y to 16 Y. 2442 // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
2342 static const vec8 kShuffleYUY2Y = {
2343 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
2344 };
2345
2346 // YUY2 shuf 4 UV to 8 UV.
2347 static const vec8 kShuffleYUY2UV = {
2348 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
2349 };
2350
2351 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
2352 #define READYUY2 __asm { \ 2443 #define READYUY2 __asm { \
2353 __asm movdqu xmm4, [eax] /* YUY2 */ \ 2444 __asm movdqu xmm4, [eax] /* YUY2 */ \
2354 __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ 2445 __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
2355 __asm movdqu xmm0, [eax] /* UV */ \ 2446 __asm movdqu xmm0, [eax] /* UV */ \
2356 __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \ 2447 __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
2357 __asm lea eax, [eax + 16] \ 2448 __asm lea eax, [eax + 16] \
2358 } 2449 }
2359 2450
2360 // UYVY shuf 8 Y to 16 Y. 2451 // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
2361 static const vec8 kShuffleUYVYY = {
2362 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
2363 };
2364
2365 // UYVY shuf 4 UV to 8 UV.
2366 static const vec8 kShuffleUYVYUV = {
2367 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
2368 };
2369
2370 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
2371 #define READUYVY __asm { \ 2452 #define READUYVY __asm { \
2372 __asm movdqu xmm4, [eax] /* UYVY */ \ 2453 __asm movdqu xmm4, [eax] /* UYVY */ \
2373 __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ 2454 __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
2374 __asm movdqu xmm0, [eax] /* UV */ \ 2455 __asm movdqu xmm0, [eax] /* UV */ \
2375 __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \ 2456 __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
2376 __asm lea eax, [eax + 16] \ 2457 __asm lea eax, [eax + 16] \
2377 __asm lea eax, [eax + 8] \
2378 } 2458 }
2379 2459
2380 // Convert 8 pixels: 8 UV and 8 Y. 2460 // Convert 8 pixels: 8 UV and 8 Y.
2381 #define YUVTORGB(YuvConstants) __asm { \ 2461 #define YUVTORGB(YuvConstants) __asm { \
2382 __asm movdqa xmm1, xmm0 \ 2462 __asm movdqa xmm1, xmm0 \
2383 __asm movdqa xmm2, xmm0 \ 2463 __asm movdqa xmm2, xmm0 \
2384 __asm movdqa xmm3, xmm0 \ 2464 __asm movdqa xmm3, xmm0 \
2385 __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \ 2465 __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
2386 __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \ 2466 __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
2387 __asm psubw xmm0, xmm1 \ 2467 __asm psubw xmm0, xmm1 \
(...skipping 3987 matching lines...) Expand 10 before | Expand all | Expand 10 after
6375 } 6455 }
6376 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 6456 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6377 6457
6378 #endif // defined(_M_X64) 6458 #endif // defined(_M_X64)
6379 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) 6459 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6380 6460
6381 #ifdef __cplusplus 6461 #ifdef __cplusplus
6382 } // extern "C" 6462 } // extern "C"
6383 } // namespace libyuv 6463 } // namespace libyuv
6384 #endif 6464 #endif
OLDNEW
« no previous file with comments | « source/row_gcc.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698