OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
133 | 133 |
134 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 | 134 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 |
135 static uvec8 kShuffleMaskARGBToRGB24_0 = { | 135 static uvec8 kShuffleMaskARGBToRGB24_0 = { |
136 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u | 136 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u |
137 }; | 137 }; |
138 | 138 |
139 // Shuffle table for converting ARGB to RAW. | 139 // Shuffle table for converting ARGB to RAW. |
140 static uvec8 kShuffleMaskARGBToRAW_0 = { | 140 static uvec8 kShuffleMaskARGBToRAW_0 = { |
141 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u | 141 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u |
142 }; | 142 }; |
| 143 |
| 144 // YUY2 shuf 16 Y to 32 Y. |
| 145 static const lvec8 kShuffleYUY2Y = { |
| 146 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, |
| 147 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 |
| 148 }; |
| 149 |
| 150 // YUY2 shuf 8 UV to 16 UV. |
| 151 static const lvec8 kShuffleYUY2UV = { |
| 152 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15, |
| 153 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 |
| 154 }; |
| 155 |
| 156 // UYVY shuf 16 Y to 32 Y. |
| 157 static const lvec8 kShuffleUYVYY = { |
| 158 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, |
| 159 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 |
| 160 }; |
| 161 |
| 162 // UYVY shuf 8 UV to 16 UV. |
| 163 static const lvec8 kShuffleUYVYUV = { |
| 164 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, |
| 165 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 |
| 166 }; |
143 #endif // HAS_RGB24TOARGBROW_SSSE3 | 167 #endif // HAS_RGB24TOARGBROW_SSSE3 |
144 | 168 |
145 #ifdef HAS_J400TOARGBROW_SSE2 | 169 #ifdef HAS_J400TOARGBROW_SSE2 |
146 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { | 170 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { |
147 asm volatile ( | 171 asm volatile ( |
148 "pcmpeqb %%xmm5,%%xmm5 \n" | 172 "pcmpeqb %%xmm5,%%xmm5 \n" |
149 "pslld $0x18,%%xmm5 \n" | 173 "pslld $0x18,%%xmm5 \n" |
150 LABELALIGN | 174 LABELALIGN |
151 "1: \n" | 175 "1: \n" |
152 "movq " MEMACCESS(0) ",%%xmm0 \n" | 176 "movq " MEMACCESS(0) ",%%xmm0 \n" |
(...skipping 1201 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1354 | 1378 |
1355 // Read 4 UV from NV12, upsample to 8 UV | 1379 // Read 4 UV from NV12, upsample to 8 UV |
1356 #define READNV12 \ | 1380 #define READNV12 \ |
1357 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ | 1381 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
1358 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ | 1382 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ |
1359 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1383 "punpcklwd %%xmm0,%%xmm0 \n" \ |
1360 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1384 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1361 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1385 "punpcklbw %%xmm4,%%xmm4 \n" \ |
1362 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | 1386 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
1363 | 1387 |
1364 // YUY2 shuf 8 Y to 16 Y. | |
1365 static const vec8 kShuffleYUY2Y = { | |
1366 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 | |
1367 }; | |
1368 | |
1369 // YUY2 shuf 4 UV to 8 UV. | |
1370 static const vec8 kShuffleYUY2UV = { | |
1371 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 | |
1372 }; | |
1373 | |
1374 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. | 1388 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. |
1375 #define READYUY2 \ | 1389 #define READYUY2 \ |
1376 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ | 1390 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ |
1377 "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ | 1391 "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ |
1378 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ | 1392 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ |
1379 "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ | 1393 "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ |
1380 "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" | 1394 "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" |
1381 | 1395 |
1382 // UYVY shuf 8 Y to 16 Y. | |
1383 static const vec8 kShuffleUYVYY = { | |
1384 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 | |
1385 }; | |
1386 | |
1387 // UYVY shuf 4 UV to 8 UV. | |
1388 static const vec8 kShuffleUYVYUV = { | |
1389 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 | |
1390 }; | |
1391 | |
1392 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. | 1396 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. |
1393 #define READUYVY \ | 1397 #define READUYVY \ |
1394 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ | 1398 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ |
1395 "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ | 1399 "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ |
1396 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \ | 1400 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \ |
1397 "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ | 1401 "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ |
1398 "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n" | 1402 "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n" |
1399 | 1403 |
1400 // Convert 8 pixels: 8 UV and 8 Y | 1404 // Convert 8 pixels: 8 UV and 8 Y |
1401 #define YUVTORGB(yuvconstants) \ | 1405 #define YUVTORGB(yuvconstants) \ |
(...skipping 13 matching lines...) Expand all Loading... |
1415 "paddsw %%xmm4,%%xmm0 \n" \ | 1419 "paddsw %%xmm4,%%xmm0 \n" \ |
1416 "paddsw %%xmm4,%%xmm1 \n" \ | 1420 "paddsw %%xmm4,%%xmm1 \n" \ |
1417 "paddsw %%xmm4,%%xmm2 \n" \ | 1421 "paddsw %%xmm4,%%xmm2 \n" \ |
1418 "psraw $0x6,%%xmm0 \n" \ | 1422 "psraw $0x6,%%xmm0 \n" \ |
1419 "psraw $0x6,%%xmm1 \n" \ | 1423 "psraw $0x6,%%xmm1 \n" \ |
1420 "psraw $0x6,%%xmm2 \n" \ | 1424 "psraw $0x6,%%xmm2 \n" \ |
1421 "packuswb %%xmm0,%%xmm0 \n" \ | 1425 "packuswb %%xmm0,%%xmm0 \n" \ |
1422 "packuswb %%xmm1,%%xmm1 \n" \ | 1426 "packuswb %%xmm1,%%xmm1 \n" \ |
1423 "packuswb %%xmm2,%%xmm2 \n" | 1427 "packuswb %%xmm2,%%xmm2 \n" |
1424 | 1428 |
1425 // Store 8 ARGB values. Assumes XMM5 is zero. | 1429 // Store 8 ARGB values. Assumes XMM5 is set. |
1426 #define STOREARGB \ | 1430 #define STOREARGB \ |
1427 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1431 "punpcklbw %%xmm1,%%xmm0 \n" \ |
1428 "punpcklbw %%xmm5,%%xmm2 \n" \ | 1432 "punpcklbw %%xmm5,%%xmm2 \n" \ |
1429 "movdqa %%xmm0,%%xmm1 \n" \ | 1433 "movdqa %%xmm0,%%xmm1 \n" \ |
1430 "punpcklwd %%xmm2,%%xmm0 \n" \ | 1434 "punpcklwd %%xmm2,%%xmm0 \n" \ |
1431 "punpckhwd %%xmm2,%%xmm1 \n" \ | 1435 "punpckhwd %%xmm2,%%xmm1 \n" \ |
1432 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ | 1436 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ |
1433 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ | 1437 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ |
1434 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" | 1438 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" |
1435 | 1439 |
1436 // Store 8 BGRA values. Assumes XMM5 is zero. | 1440 // Store 8 BGRA values. |
1437 #define STOREBGRA \ | 1441 #define STOREBGRA \ |
1438 "pcmpeqb %%xmm5,%%xmm5 \n" \ | 1442 "pcmpeqb %%xmm5,%%xmm5 \n" \ |
1439 "punpcklbw %%xmm0,%%xmm1 \n" \ | 1443 "punpcklbw %%xmm0,%%xmm1 \n" \ |
1440 "punpcklbw %%xmm2,%%xmm5 \n" \ | 1444 "punpcklbw %%xmm2,%%xmm5 \n" \ |
1441 "movdqa %%xmm5,%%xmm0 \n" \ | 1445 "movdqa %%xmm5,%%xmm0 \n" \ |
1442 "punpcklwd %%xmm1,%%xmm5 \n" \ | 1446 "punpcklwd %%xmm1,%%xmm5 \n" \ |
1443 "punpckhwd %%xmm1,%%xmm0 \n" \ | 1447 "punpckhwd %%xmm1,%%xmm0 \n" \ |
1444 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ | 1448 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ |
1445 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \ | 1449 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \ |
1446 "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n" | 1450 "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n" |
1447 | 1451 |
1448 // Store 8 ABGR values. Assumes XMM5 is zero. | 1452 // Store 8 ABGR values. Assumes XMM5 is set. |
1449 #define STOREABGR \ | 1453 #define STOREABGR \ |
1450 "punpcklbw %%xmm1,%%xmm2 \n" \ | 1454 "punpcklbw %%xmm1,%%xmm2 \n" \ |
1451 "punpcklbw %%xmm5,%%xmm0 \n" \ | 1455 "punpcklbw %%xmm5,%%xmm0 \n" \ |
1452 "movdqa %%xmm2,%%xmm1 \n" \ | 1456 "movdqa %%xmm2,%%xmm1 \n" \ |
1453 "punpcklwd %%xmm0,%%xmm2 \n" \ | 1457 "punpcklwd %%xmm0,%%xmm2 \n" \ |
1454 "punpckhwd %%xmm0,%%xmm1 \n" \ | 1458 "punpckhwd %%xmm0,%%xmm1 \n" \ |
1455 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ | 1459 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ |
1456 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \ | 1460 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \ |
1457 "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n" | 1461 "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n" |
1458 | 1462 |
1459 // Store 8 RGBA values. Assumes XMM5 is zero. | 1463 // Store 8 RGBA values. |
1460 #define STORERGBA \ | 1464 #define STORERGBA \ |
1461 "pcmpeqb %%xmm5,%%xmm5 \n" \ | 1465 "pcmpeqb %%xmm5,%%xmm5 \n" \ |
1462 "punpcklbw %%xmm2,%%xmm1 \n" \ | 1466 "punpcklbw %%xmm2,%%xmm1 \n" \ |
1463 "punpcklbw %%xmm0,%%xmm5 \n" \ | 1467 "punpcklbw %%xmm0,%%xmm5 \n" \ |
1464 "movdqa %%xmm5,%%xmm0 \n" \ | 1468 "movdqa %%xmm5,%%xmm0 \n" \ |
1465 "punpcklwd %%xmm1,%%xmm5 \n" \ | 1469 "punpcklwd %%xmm1,%%xmm5 \n" \ |
1466 "punpckhwd %%xmm1,%%xmm0 \n" \ | 1470 "punpckhwd %%xmm1,%%xmm0 \n" \ |
1467 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ | 1471 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ |
1468 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ | 1472 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ |
1469 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" | 1473 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1515 [u_buf]"+r"(u_buf), // %[u_buf] | 1519 [u_buf]"+r"(u_buf), // %[u_buf] |
1516 [v_buf]"+r"(v_buf), // %[v_buf] | 1520 [v_buf]"+r"(v_buf), // %[v_buf] |
1517 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] | 1521 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
1518 [width]"+rm"(width) // %[width] | 1522 [width]"+rm"(width) // %[width] |
1519 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1523 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1520 : "memory", "cc", NACL_R14 | 1524 : "memory", "cc", NACL_R14 |
1521 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1525 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1522 ); | 1526 ); |
1523 } | 1527 } |
1524 | 1528 |
1525 // TODO(fbarchard): Consider putting masks into constants. | |
1526 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, | 1529 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, |
1527 const uint8* u_buf, | 1530 const uint8* u_buf, |
1528 const uint8* v_buf, | 1531 const uint8* v_buf, |
1529 uint8* dst_rgb24, | 1532 uint8* dst_rgb24, |
1530 struct YuvConstants* yuvconstants, | 1533 struct YuvConstants* yuvconstants, |
1531 int width) { | 1534 int width) { |
1532 asm volatile ( | 1535 asm volatile ( |
1533 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" | 1536 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" |
1534 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" | 1537 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" |
1535 "sub %[u_buf],%[v_buf] \n" | 1538 "sub %[u_buf],%[v_buf] \n" |
(...skipping 286 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1822 | 1825 |
1823 #endif // HAS_I422TOARGBROW_SSSE3 | 1826 #endif // HAS_I422TOARGBROW_SSSE3 |
1824 | 1827 |
1825 // Read 8 UV from 422, upsample to 16 UV. | 1828 // Read 8 UV from 422, upsample to 16 UV. |
1826 #define READYUV422_AVX2 \ | 1829 #define READYUV422_AVX2 \ |
1827 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1830 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
1828 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1831 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
1829 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ | 1832 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ |
1830 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | 1833 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
1831 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1834 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
1832 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" | 1835 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
| 1836 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1837 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
| 1838 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
| 1839 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
| 1840 |
 | 1841 // Read 8 YUY2 with 16 Y and update 8 UV to 16 UV. |
| 1842 #define READYUY2_AVX2 \ |
| 1843 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ |
| 1844 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ |
| 1845 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \ |
| 1846 "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ |
| 1847 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" |
| 1848 |
 | 1849 // Read 8 UYVY with 16 Y and update 8 UV to 16 UV. |
| 1850 #define READUYVY_AVX2 \ |
| 1851 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ |
| 1852 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ |
| 1853 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \ |
| 1854 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ |
| 1855 "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n" |
1833 | 1856 |
1834 // Convert 16 pixels: 16 UV and 16 Y. | 1857 // Convert 16 pixels: 16 UV and 16 Y. |
1835 #define YUVTORGB_AVX2(YuvConstants) \ | 1858 #define YUVTORGB_AVX2(YuvConstants) \ |
1836 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \ | 1859 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \ |
1837 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \ | 1860 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \ |
1838 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ | 1861 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ |
1839 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ | 1862 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ |
1840 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ | 1863 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ |
1841 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \ | 1864 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \ |
1842 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ | 1865 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ |
1843 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \ | 1866 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \ |
1844 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ | 1867 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ |
1845 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \ | 1868 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm4,%%ymm4 \n" \ |
1846 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ | 1869 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ |
1847 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ | 1870 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ |
1848 "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \ | 1871 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ |
1849 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \ | 1872 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ |
1850 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \ | 1873 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ |
1851 "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \ | 1874 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ |
1852 "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \ | 1875 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ |
1853 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ | 1876 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ |
1854 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ | 1877 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" |
1855 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ | 1878 |
1856 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ | 1879 // Store 16 ARGB values. Assumes XMM5 is set. |
1857 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ | 1880 #define STOREARGB_AVX2 \ |
1858 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" | 1881 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
| 1882 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
| 1883 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ |
| 1884 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ |
| 1885 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ |
| 1886 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ |
| 1887 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ |
| 1888 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) " \n" \ |
| 1889 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" |
1859 | 1890 |
1860 #if defined(HAS_I422TOBGRAROW_AVX2) | 1891 #if defined(HAS_I422TOBGRAROW_AVX2) |
1861 // 16 pixels | 1892 // 16 pixels |
1862 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). | 1893 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). |
1863 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, | 1894 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, |
1864 const uint8* u_buf, | 1895 const uint8* u_buf, |
1865 const uint8* v_buf, | 1896 const uint8* v_buf, |
1866 uint8* dst_bgra, | 1897 uint8* dst_bgra, |
1867 struct YuvConstants* yuvconstants, | 1898 struct YuvConstants* yuvconstants, |
1868 int width) { | 1899 int width) { |
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1909 uint8* dst_argb, | 1940 uint8* dst_argb, |
1910 struct YuvConstants* yuvconstants, | 1941 struct YuvConstants* yuvconstants, |
1911 int width) { | 1942 int width) { |
1912 asm volatile ( | 1943 asm volatile ( |
1913 "sub %[u_buf],%[v_buf] \n" | 1944 "sub %[u_buf],%[v_buf] \n" |
1914 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 1945 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
1915 LABELALIGN | 1946 LABELALIGN |
1916 "1: \n" | 1947 "1: \n" |
1917 READYUV422_AVX2 | 1948 READYUV422_AVX2 |
1918 YUVTORGB_AVX2(yuvconstants) | 1949 YUVTORGB_AVX2(yuvconstants) |
1919 | 1950 STOREARGB_AVX2 |
1920 // Step 3: Weave into ARGB | |
1921 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG | |
1922 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
1923 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA | |
1924 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | |
1925 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels | |
1926 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels | |
1927 | |
1928 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" | |
1929 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" | |
1930 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | |
1931 "sub $0x10,%[width] \n" | 1951 "sub $0x10,%[width] \n" |
1932 "jg 1b \n" | 1952 "jg 1b \n" |
1933 "vzeroupper \n" | 1953 "vzeroupper \n" |
1934 : [y_buf]"+r"(y_buf), // %[y_buf] | 1954 : [y_buf]"+r"(y_buf), // %[y_buf] |
1935 [u_buf]"+r"(u_buf), // %[u_buf] | 1955 [u_buf]"+r"(u_buf), // %[u_buf] |
1936 [v_buf]"+r"(v_buf), // %[v_buf] | 1956 [v_buf]"+r"(v_buf), // %[v_buf] |
1937 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1957 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1938 [width]"+rm"(width) // %[width] | 1958 [width]"+rm"(width) // %[width] |
1939 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1959 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1940 : "memory", "cc", NACL_R14 | 1960 : "memory", "cc", NACL_R14 |
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2020 [v_buf]"+r"(v_buf), // %[v_buf] | 2040 [v_buf]"+r"(v_buf), // %[v_buf] |
2021 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2041 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2022 [width]"+rm"(width) // %[width] | 2042 [width]"+rm"(width) // %[width] |
2023 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2043 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
2024 : "memory", "cc", NACL_R14 | 2044 : "memory", "cc", NACL_R14 |
2025 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2045 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2026 ); | 2046 ); |
2027 } | 2047 } |
2028 #endif // HAS_I422TORGBAROW_AVX2 | 2048 #endif // HAS_I422TORGBAROW_AVX2 |
2029 | 2049 |
| 2050 #if defined(HAS_YUY2TOARGBROW_AVX2) |
| 2051 // 16 pixels. |
| 2052 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
| 2053 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, |
| 2054 uint8* dst_argb, |
| 2055 struct YuvConstants* yuvconstants, |
| 2056 int width) { |
| 2057 |
| 2058 asm volatile ( |
| 2059 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2060 LABELALIGN |
| 2061 "1: \n" |
| 2062 READYUY2_AVX2 |
| 2063 YUVTORGB_AVX2(yuvconstants) |
| 2064 STOREARGB_AVX2 |
| 2065 "sub $0x10,%[width] \n" |
| 2066 "jg 1b \n" |
| 2067 "vzeroupper \n" |
| 2068 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] |
| 2069 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2070 [width]"+rm"(width) // %[width] |
| 2071 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
| 2072 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), |
| 2073 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) |
| 2074 // Does not use r14. |
| 2075 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2076 ); |
| 2077 } |
| 2078 #endif // HAS_YUY2TOARGBROW_AVX2 |
| 2079 |
| 2080 #if defined(HAS_UYVYTOARGBROW_AVX2) |
| 2081 // 16 pixels. |
| 2082 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
| 2083 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, |
| 2084 uint8* dst_argb, |
| 2085 struct YuvConstants* yuvconstants, |
| 2086 int width) { |
| 2087 |
| 2088 asm volatile ( |
| 2089 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2090 LABELALIGN |
| 2091 "1: \n" |
| 2092 READUYVY_AVX2 |
| 2093 YUVTORGB_AVX2(yuvconstants) |
| 2094 STOREARGB_AVX2 |
| 2095 "sub $0x10,%[width] \n" |
| 2096 "jg 1b \n" |
| 2097 "vzeroupper \n" |
| 2098 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] |
| 2099 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2100 [width]"+rm"(width) // %[width] |
| 2101 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
| 2102 [kShuffleUYVYY]"m"(kShuffleUYVYY), |
| 2103 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) |
| 2104 // Does not use r14. |
| 2105 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2106 ); |
| 2107 } |
| 2108 #endif // HAS_UYVYTOARGBROW_AVX2 |
| 2109 |
2030 #ifdef HAS_I400TOARGBROW_SSE2 | 2110 #ifdef HAS_I400TOARGBROW_SSE2 |
2031 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { | 2111 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { |
2032 asm volatile ( | 2112 asm volatile ( |
2033 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 | 2113 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 |
2034 "movd %%eax,%%xmm2 \n" | 2114 "movd %%eax,%%xmm2 \n" |
2035 "pshufd $0x0,%%xmm2,%%xmm2 \n" | 2115 "pshufd $0x0,%%xmm2,%%xmm2 \n" |
2036 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 | 2116 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 |
2037 "movd %%eax,%%xmm3 \n" | 2117 "movd %%eax,%%xmm3 \n" |
2038 "pshufd $0x0,%%xmm3,%%xmm3 \n" | 2118 "pshufd $0x0,%%xmm3,%%xmm3 \n" |
2039 "pcmpeqb %%xmm4,%%xmm4 \n" | 2119 "pcmpeqb %%xmm4,%%xmm4 \n" |
(...skipping 3254 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5294 ); | 5374 ); |
5295 } | 5375 } |
5296 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5376 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5297 | 5377 |
5298 #endif // defined(__x86_64__) || defined(__i386__) | 5378 #endif // defined(__x86_64__) || defined(__i386__) |
5299 | 5379 |
5300 #ifdef __cplusplus | 5380 #ifdef __cplusplus |
5301 } // extern "C" | 5381 } // extern "C" |
5302 } // namespace libyuv | 5382 } // namespace libyuv |
5303 #endif | 5383 #endif |
OLD | NEW |