Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(181)

Side by Side Diff: source/row_gcc.cc

Issue 1364813002: yuy2 avx2 initial change (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: avx2 yuy2/uyvy to argb Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/row_common.cc ('k') | source/row_win.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // VERSION 2 1 // VERSION 2
2 /* 2 /*
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
4 * 4 *
5 * Use of this source code is governed by a BSD-style license 5 * Use of this source code is governed by a BSD-style license
6 * that can be found in the LICENSE file in the root of the source 6 * that can be found in the LICENSE file in the root of the source
7 * tree. An additional intellectual property rights grant can be found 7 * tree. An additional intellectual property rights grant can be found
8 * in the file PATENTS. All contributing project authors may 8 * in the file PATENTS. All contributing project authors may
9 * be found in the AUTHORS file in the root of the source tree. 9 * be found in the AUTHORS file in the root of the source tree.
10 */ 10 */
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after
133 133
134 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 134 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
135 static uvec8 kShuffleMaskARGBToRGB24_0 = { 135 static uvec8 kShuffleMaskARGBToRGB24_0 = {
136 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u 136 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
137 }; 137 };
138 138
139 // Shuffle table for converting ARGB to RAW. 139 // Shuffle table for converting ARGB to RAW.
140 static uvec8 kShuffleMaskARGBToRAW_0 = { 140 static uvec8 kShuffleMaskARGBToRAW_0 = {
141 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u 141 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
142 }; 142 };
143
144 // YUY2 shuf 16 Y to 32 Y.
145 static const lvec8 kShuffleYUY2Y = {
146 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
147 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
148 };
149
150 // YUY2 shuf 8 UV to 16 UV.
151 static const lvec8 kShuffleYUY2UV = {
152 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
153 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
154 };
155
156 // UYVY shuf 16 Y to 32 Y.
157 static const lvec8 kShuffleUYVYY = {
158 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
159 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
160 };
161
162 // UYVY shuf 8 UV to 16 UV.
163 static const lvec8 kShuffleUYVYUV = {
164 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
165 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
166 };
143 #endif // HAS_RGB24TOARGBROW_SSSE3 167 #endif // HAS_RGB24TOARGBROW_SSSE3
144 168
145 #ifdef HAS_J400TOARGBROW_SSE2 169 #ifdef HAS_J400TOARGBROW_SSE2
146 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { 170 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
147 asm volatile ( 171 asm volatile (
148 "pcmpeqb %%xmm5,%%xmm5 \n" 172 "pcmpeqb %%xmm5,%%xmm5 \n"
149 "pslld $0x18,%%xmm5 \n" 173 "pslld $0x18,%%xmm5 \n"
150 LABELALIGN 174 LABELALIGN
151 "1: \n" 175 "1: \n"
152 "movq " MEMACCESS(0) ",%%xmm0 \n" 176 "movq " MEMACCESS(0) ",%%xmm0 \n"
(...skipping 1201 matching lines...) Expand 10 before | Expand all | Expand 10 after
1354 1378
1355 // Read 4 UV from NV12, upsample to 8 UV 1379 // Read 4 UV from NV12, upsample to 8 UV
1356 #define READNV12 \ 1380 #define READNV12 \
1357 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ 1381 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
1358 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ 1382 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
1359 "punpcklwd %%xmm0,%%xmm0 \n" \ 1383 "punpcklwd %%xmm0,%%xmm0 \n" \
1360 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ 1384 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
1361 "punpcklbw %%xmm4,%%xmm4 \n" \ 1385 "punpcklbw %%xmm4,%%xmm4 \n" \
1362 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" 1386 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
1363 1387
1364 // YUY2 shuf 8 Y to 16 Y.
1365 static const vec8 kShuffleYUY2Y = {
1366 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
1367 };
1368
1369 // YUY2 shuf 4 UV to 8 UV.
1370 static const vec8 kShuffleYUY2UV = {
1371 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
1372 };
1373
1374 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. 1388 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
1375 #define READYUY2 \ 1389 #define READYUY2 \
1376 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ 1390 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
1377 "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ 1391 "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
1378 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ 1392 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \
1379 "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ 1393 "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
1380 "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" 1394 "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"
1381 1395
1382 // UYVY shuf 8 Y to 16 Y.
1383 static const vec8 kShuffleUYVYY = {
1384 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
1385 };
1386
1387 // UYVY shuf 4 UV to 8 UV.
1388 static const vec8 kShuffleUYVYUV = {
1389 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
1390 };
1391
1392 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. 1396 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
1393 #define READUYVY \ 1397 #define READUYVY \
1394 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ 1398 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
1395 "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ 1399 "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
1396 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \ 1400 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \
1397 "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ 1401 "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
1398 "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n" 1402 "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"
1399 1403
1400 // Convert 8 pixels: 8 UV and 8 Y 1404 // Convert 8 pixels: 8 UV and 8 Y
1401 #define YUVTORGB(yuvconstants) \ 1405 #define YUVTORGB(yuvconstants) \
(...skipping 13 matching lines...) Expand all
1415 "paddsw %%xmm4,%%xmm0 \n" \ 1419 "paddsw %%xmm4,%%xmm0 \n" \
1416 "paddsw %%xmm4,%%xmm1 \n" \ 1420 "paddsw %%xmm4,%%xmm1 \n" \
1417 "paddsw %%xmm4,%%xmm2 \n" \ 1421 "paddsw %%xmm4,%%xmm2 \n" \
1418 "psraw $0x6,%%xmm0 \n" \ 1422 "psraw $0x6,%%xmm0 \n" \
1419 "psraw $0x6,%%xmm1 \n" \ 1423 "psraw $0x6,%%xmm1 \n" \
1420 "psraw $0x6,%%xmm2 \n" \ 1424 "psraw $0x6,%%xmm2 \n" \
1421 "packuswb %%xmm0,%%xmm0 \n" \ 1425 "packuswb %%xmm0,%%xmm0 \n" \
1422 "packuswb %%xmm1,%%xmm1 \n" \ 1426 "packuswb %%xmm1,%%xmm1 \n" \
1423 "packuswb %%xmm2,%%xmm2 \n" 1427 "packuswb %%xmm2,%%xmm2 \n"
1424 1428
1425 // Store 8 ARGB values. Assumes XMM5 is zero. 1429 // Store 8 ARGB values. Assumes XMM5 is set.
1426 #define STOREARGB \ 1430 #define STOREARGB \
1427 "punpcklbw %%xmm1,%%xmm0 \n" \ 1431 "punpcklbw %%xmm1,%%xmm0 \n" \
1428 "punpcklbw %%xmm5,%%xmm2 \n" \ 1432 "punpcklbw %%xmm5,%%xmm2 \n" \
1429 "movdqa %%xmm0,%%xmm1 \n" \ 1433 "movdqa %%xmm0,%%xmm1 \n" \
1430 "punpcklwd %%xmm2,%%xmm0 \n" \ 1434 "punpcklwd %%xmm2,%%xmm0 \n" \
1431 "punpckhwd %%xmm2,%%xmm1 \n" \ 1435 "punpckhwd %%xmm2,%%xmm1 \n" \
1432 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ 1436 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
1433 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ 1437 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
1434 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" 1438 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
1435 1439
1436 // Store 8 BGRA values. Assumes XMM5 is zero. 1440 // Store 8 BGRA values.
1437 #define STOREBGRA \ 1441 #define STOREBGRA \
1438 "pcmpeqb %%xmm5,%%xmm5 \n" \ 1442 "pcmpeqb %%xmm5,%%xmm5 \n" \
1439 "punpcklbw %%xmm0,%%xmm1 \n" \ 1443 "punpcklbw %%xmm0,%%xmm1 \n" \
1440 "punpcklbw %%xmm2,%%xmm5 \n" \ 1444 "punpcklbw %%xmm2,%%xmm5 \n" \
1441 "movdqa %%xmm5,%%xmm0 \n" \ 1445 "movdqa %%xmm5,%%xmm0 \n" \
1442 "punpcklwd %%xmm1,%%xmm5 \n" \ 1446 "punpcklwd %%xmm1,%%xmm5 \n" \
1443 "punpckhwd %%xmm1,%%xmm0 \n" \ 1447 "punpckhwd %%xmm1,%%xmm0 \n" \
1444 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ 1448 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \
1445 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \ 1449 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \
1446 "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n" 1450 "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n"
1447 1451
1448 // Store 8 ABGR values. Assumes XMM5 is zero. 1452 // Store 8 ABGR values. Assumes XMM5 is set.
1449 #define STOREABGR \ 1453 #define STOREABGR \
1450 "punpcklbw %%xmm1,%%xmm2 \n" \ 1454 "punpcklbw %%xmm1,%%xmm2 \n" \
1451 "punpcklbw %%xmm5,%%xmm0 \n" \ 1455 "punpcklbw %%xmm5,%%xmm0 \n" \
1452 "movdqa %%xmm2,%%xmm1 \n" \ 1456 "movdqa %%xmm2,%%xmm1 \n" \
1453 "punpcklwd %%xmm0,%%xmm2 \n" \ 1457 "punpcklwd %%xmm0,%%xmm2 \n" \
1454 "punpckhwd %%xmm0,%%xmm1 \n" \ 1458 "punpckhwd %%xmm0,%%xmm1 \n" \
1455 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ 1459 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \
1456 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \ 1460 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \
1457 "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n" 1461 "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n"
1458 1462
1459 // Store 8 RGBA values. Assumes XMM5 is zero. 1463 // Store 8 RGBA values.
1460 #define STORERGBA \ 1464 #define STORERGBA \
1461 "pcmpeqb %%xmm5,%%xmm5 \n" \ 1465 "pcmpeqb %%xmm5,%%xmm5 \n" \
1462 "punpcklbw %%xmm2,%%xmm1 \n" \ 1466 "punpcklbw %%xmm2,%%xmm1 \n" \
1463 "punpcklbw %%xmm0,%%xmm5 \n" \ 1467 "punpcklbw %%xmm0,%%xmm5 \n" \
1464 "movdqa %%xmm5,%%xmm0 \n" \ 1468 "movdqa %%xmm5,%%xmm0 \n" \
1465 "punpcklwd %%xmm1,%%xmm5 \n" \ 1469 "punpcklwd %%xmm1,%%xmm5 \n" \
1466 "punpckhwd %%xmm1,%%xmm0 \n" \ 1470 "punpckhwd %%xmm1,%%xmm0 \n" \
1467 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ 1471 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
1468 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ 1472 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
1469 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" 1473 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
1515 [u_buf]"+r"(u_buf), // %[u_buf] 1519 [u_buf]"+r"(u_buf), // %[u_buf]
1516 [v_buf]"+r"(v_buf), // %[v_buf] 1520 [v_buf]"+r"(v_buf), // %[v_buf]
1517 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] 1521 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
1518 [width]"+rm"(width) // %[width] 1522 [width]"+rm"(width) // %[width]
1519 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 1523 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1520 : "memory", "cc", NACL_R14 1524 : "memory", "cc", NACL_R14
1521 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1525 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1522 ); 1526 );
1523 } 1527 }
1524 1528
1525 // TODO(fbarchard): Consider putting masks into constants.
1526 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, 1529 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1527 const uint8* u_buf, 1530 const uint8* u_buf,
1528 const uint8* v_buf, 1531 const uint8* v_buf,
1529 uint8* dst_rgb24, 1532 uint8* dst_rgb24,
1530 struct YuvConstants* yuvconstants, 1533 struct YuvConstants* yuvconstants,
1531 int width) { 1534 int width) {
1532 asm volatile ( 1535 asm volatile (
1533 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" 1536 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1534 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" 1537 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
1535 "sub %[u_buf],%[v_buf] \n" 1538 "sub %[u_buf],%[v_buf] \n"
(...skipping 286 matching lines...) Expand 10 before | Expand all | Expand 10 after
1822 1825
1823 #endif // HAS_I422TOARGBROW_SSSE3 1826 #endif // HAS_I422TOARGBROW_SSSE3
1824 1827
1825 // Read 8 UV from 422, upsample to 16 UV. 1828 // Read 8 UV from 422, upsample to 16 UV. Read 16 Y.
1826 #define READYUV422_AVX2 \ 1829 #define READYUV422_AVX2 \
1827 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ 1830 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1828 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ 1831 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
1829 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ 1832 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
1830 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ 1833 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
1831 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ 1834 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1832 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" 1835 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
1836 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
1837 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
1838 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
1839 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
1840
1841 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
1842 #define READYUY2_AVX2 \
1843 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
1844 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
1845 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \
1846 "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
1847 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n"
1848
1849 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
1850 #define READUYVY_AVX2 \
1851 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
1852 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
1853 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \
1854 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
1855 "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"
1833 1856
1834 // Convert 16 pixels: 16 UV and 16 Y. 1857 // Convert 16 pixels: 16 UV and 16 Y.
1835 #define YUVTORGB_AVX2(YuvConstants) \ 1858 #define YUVTORGB_AVX2(YuvConstants) \
1836 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \ 1859 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \
1837 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \ 1860 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \
1838 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ 1861 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \
1839 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ 1862 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \
1840 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ 1863 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
1841 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \ 1864 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \
1842 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ 1865 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
1843 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \ 1866 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \
1844 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ 1867 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
1845 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \ 1868 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm4,%%ymm4 \n" \
1846 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ 1869 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
1847 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ 1870 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
1848 "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \ 1871 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
1849 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \ 1872 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
1850 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \ 1873 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
1851 "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \ 1874 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
1852 "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \ 1875 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
1853 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ 1876 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
1854 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ 1877 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
1855 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ 1878
1856 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ 1879 // Store 16 ARGB values. Assumes YMM5 is set.
1857 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ 1880 #define STOREARGB_AVX2 \
1858 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" 1881 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
1882 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1883 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
1884 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
1885 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
1886 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
1887 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \
1888 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) " \n" \
1889 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
1859 1890
1860 #if defined(HAS_I422TOBGRAROW_AVX2) 1891 #if defined(HAS_I422TOBGRAROW_AVX2)
1861 // 16 pixels 1892 // 16 pixels
1862 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). 1893 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
1863 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, 1894 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
1864 const uint8* u_buf, 1895 const uint8* u_buf,
1865 const uint8* v_buf, 1896 const uint8* v_buf,
1866 uint8* dst_bgra, 1897 uint8* dst_bgra,
1867 struct YuvConstants* yuvconstants, 1898 struct YuvConstants* yuvconstants,
1868 int width) { 1899 int width) {
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
1909 uint8* dst_argb, 1940 uint8* dst_argb,
1910 struct YuvConstants* yuvconstants, 1941 struct YuvConstants* yuvconstants,
1911 int width) { 1942 int width) {
1912 asm volatile ( 1943 asm volatile (
1913 "sub %[u_buf],%[v_buf] \n" 1944 "sub %[u_buf],%[v_buf] \n"
1914 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 1945 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
1915 LABELALIGN 1946 LABELALIGN
1916 "1: \n" 1947 "1: \n"
1917 READYUV422_AVX2 1948 READYUV422_AVX2
1918 YUVTORGB_AVX2(yuvconstants) 1949 YUVTORGB_AVX2(yuvconstants)
1919 1950 STOREARGB_AVX2
1920 // Step 3: Weave into ARGB
1921 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
1922 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
1923 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
1924 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
1925 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
1926 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
1927
1928 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
1929 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
1930 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
1931 "sub $0x10,%[width] \n" 1951 "sub $0x10,%[width] \n"
1932 "jg 1b \n" 1952 "jg 1b \n"
1933 "vzeroupper \n" 1953 "vzeroupper \n"
1934 : [y_buf]"+r"(y_buf), // %[y_buf] 1954 : [y_buf]"+r"(y_buf), // %[y_buf]
1935 [u_buf]"+r"(u_buf), // %[u_buf] 1955 [u_buf]"+r"(u_buf), // %[u_buf]
1936 [v_buf]"+r"(v_buf), // %[v_buf] 1956 [v_buf]"+r"(v_buf), // %[v_buf]
1937 [dst_argb]"+r"(dst_argb), // %[dst_argb] 1957 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1938 [width]"+rm"(width) // %[width] 1958 [width]"+rm"(width) // %[width]
1939 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 1959 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1940 : "memory", "cc", NACL_R14 1960 : "memory", "cc", NACL_R14
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after
2020 [v_buf]"+r"(v_buf), // %[v_buf] 2040 [v_buf]"+r"(v_buf), // %[v_buf]
2021 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2041 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2022 [width]"+rm"(width) // %[width] 2042 [width]"+rm"(width) // %[width]
2023 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 2043 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2024 : "memory", "cc", NACL_R14 2044 : "memory", "cc", NACL_R14
2025 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2045 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2026 ); 2046 );
2027 } 2047 }
2028 #endif // HAS_I422TORGBAROW_AVX2 2048 #endif // HAS_I422TORGBAROW_AVX2
2029 2049
2050 #if defined(HAS_YUY2TOARGBROW_AVX2)
2051 // 16 pixels.
2052 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2053 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
2054 uint8* dst_argb,
2055 struct YuvConstants* yuvconstants,
2056 int width) {
2057
2058 asm volatile (
2059 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2060 LABELALIGN
2061 "1: \n"
2062 READYUY2_AVX2
2063 YUVTORGB_AVX2(yuvconstants)
2064 STOREARGB_AVX2
2065 "sub $0x10,%[width] \n"
2066 "jg 1b \n"
2067 "vzeroupper \n"
2068 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
2069 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2070 [width]"+rm"(width) // %[width]
2071 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2072 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
2073 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
2074 // Does not use r14.
2075 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2076 );
2077 }
2078 #endif // HAS_YUY2TOARGBROW_AVX2
2079
2080 #if defined(HAS_UYVYTOARGBROW_AVX2)
2081 // 16 pixels.
2082 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2083 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
2084 uint8* dst_argb,
2085 struct YuvConstants* yuvconstants,
2086 int width) {
2087
2088 asm volatile (
2089 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2090 LABELALIGN
2091 "1: \n"
2092 READUYVY_AVX2
2093 YUVTORGB_AVX2(yuvconstants)
2094 STOREARGB_AVX2
2095 "sub $0x10,%[width] \n"
2096 "jg 1b \n"
2097 "vzeroupper \n"
2098 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
2099 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2100 [width]"+rm"(width) // %[width]
2101 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2102 [kShuffleUYVYY]"m"(kShuffleUYVYY),
2103 [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
2104 // Does not use r14.
2105 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2106 );
2107 }
2108 #endif // HAS_UYVYTOARGBROW_AVX2
2109
2030 #ifdef HAS_I400TOARGBROW_SSE2 2110 #ifdef HAS_I400TOARGBROW_SSE2
2031 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { 2111 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
2032 asm volatile ( 2112 asm volatile (
2033 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 2113 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
2034 "movd %%eax,%%xmm2 \n" 2114 "movd %%eax,%%xmm2 \n"
2035 "pshufd $0x0,%%xmm2,%%xmm2 \n" 2115 "pshufd $0x0,%%xmm2,%%xmm2 \n"
2036 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 2116 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
2037 "movd %%eax,%%xmm3 \n" 2117 "movd %%eax,%%xmm3 \n"
2038 "pshufd $0x0,%%xmm3,%%xmm3 \n" 2118 "pshufd $0x0,%%xmm3,%%xmm3 \n"
2039 "pcmpeqb %%xmm4,%%xmm4 \n" 2119 "pcmpeqb %%xmm4,%%xmm4 \n"
(...skipping 3254 matching lines...) Expand 10 before | Expand all | Expand 10 after
5294 ); 5374 );
5295 } 5375 }
5296 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 5376 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5297 5377
5298 #endif // defined(__x86_64__) || defined(__i386__) 5378 #endif // defined(__x86_64__) || defined(__i386__)
5299 5379
5300 #ifdef __cplusplus 5380 #ifdef __cplusplus
5301 } // extern "C" 5381 } // extern "C"
5302 } // namespace libyuv 5382 } // namespace libyuv
5303 #endif 5383 #endif
OLDNEW
« no previous file with comments | « source/row_common.cc ('k') | source/row_win.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698