| OLD | NEW |
| 1 // VERSION 2 | 1 // VERSION 2 |
| 2 /* | 2 /* |
| 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 4 * | 4 * |
| 5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
| 6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
| 7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
| 8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
| 9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
| 10 */ | 10 */ |
| (...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 133 | 133 |
| 134 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 | 134 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 |
| 135 static uvec8 kShuffleMaskARGBToRGB24_0 = { | 135 static uvec8 kShuffleMaskARGBToRGB24_0 = { |
| 136 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u | 136 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u |
| 137 }; | 137 }; |
| 138 | 138 |
| 139 // Shuffle table for converting ARGB to RAW. | 139 // Shuffle table for converting ARGB to RAW. |
| 140 static uvec8 kShuffleMaskARGBToRAW_0 = { | 140 static uvec8 kShuffleMaskARGBToRAW_0 = { |
| 141 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u | 141 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u |
| 142 }; | 142 }; |
| 143 |
| 144 // YUY2 shuf 16 Y to 32 Y. |
| 145 static const lvec8 kShuffleYUY2Y = { |
| 146 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, |
| 147 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 |
| 148 }; |
| 149 |
| 150 // YUY2 shuf 8 UV to 16 UV. |
| 151 static const lvec8 kShuffleYUY2UV = { |
| 152 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15, |
| 153 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 |
| 154 }; |
| 155 |
| 156 // UYVY shuf 16 Y to 32 Y. |
| 157 static const lvec8 kShuffleUYVYY = { |
| 158 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, |
| 159 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 |
| 160 }; |
| 161 |
| 162 // UYVY shuf 8 UV to 16 UV. |
| 163 static const lvec8 kShuffleUYVYUV = { |
| 164 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, |
| 165 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 |
| 166 }; |
| 143 #endif // HAS_RGB24TOARGBROW_SSSE3 | 167 #endif // HAS_RGB24TOARGBROW_SSSE3 |
| 144 | 168 |
| 145 #ifdef HAS_J400TOARGBROW_SSE2 | 169 #ifdef HAS_J400TOARGBROW_SSE2 |
| 146 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { | 170 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { |
| 147 asm volatile ( | 171 asm volatile ( |
| 148 "pcmpeqb %%xmm5,%%xmm5 \n" | 172 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 149 "pslld $0x18,%%xmm5 \n" | 173 "pslld $0x18,%%xmm5 \n" |
| 150 LABELALIGN | 174 LABELALIGN |
| 151 "1: \n" | 175 "1: \n" |
| 152 "movq " MEMACCESS(0) ",%%xmm0 \n" | 176 "movq " MEMACCESS(0) ",%%xmm0 \n" |
| (...skipping 1201 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1354 | 1378 |
| 1355 // Read 4 UV from NV12, upsample to 8 UV | 1379 // Read 4 UV from NV12, upsample to 8 UV |
| 1356 #define READNV12 \ | 1380 #define READNV12 \ |
| 1357 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ | 1381 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
| 1358 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ | 1382 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ |
| 1359 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1383 "punpcklwd %%xmm0,%%xmm0 \n" \ |
| 1360 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1384 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1361 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1385 "punpcklbw %%xmm4,%%xmm4 \n" \ |
| 1362 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | 1386 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
| 1363 | 1387 |
| 1364 // YUY2 shuf 8 Y to 16 Y. | |
| 1365 static const vec8 kShuffleYUY2Y = { | |
| 1366 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 | |
| 1367 }; | |
| 1368 | |
| 1369 // YUY2 shuf 4 UV to 8 UV. | |
| 1370 static const vec8 kShuffleYUY2UV = { | |
| 1371 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 | |
| 1372 }; | |
| 1373 | |
| 1374 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. | 1388 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. |
| 1375 #define READYUY2 \ | 1389 #define READYUY2 \ |
| 1376 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ | 1390 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ |
| 1377 "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ | 1391 "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ |
| 1378 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ | 1392 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ |
| 1379 "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ | 1393 "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ |
| 1380 "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" | 1394 "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" |
| 1381 | 1395 |
| 1382 // UYVY shuf 8 Y to 16 Y. | |
| 1383 static const vec8 kShuffleUYVYY = { | |
| 1384 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 | |
| 1385 }; | |
| 1386 | |
| 1387 // UYVY shuf 4 UV to 8 UV. | |
| 1388 static const vec8 kShuffleUYVYUV = { | |
| 1389 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 | |
| 1390 }; | |
| 1391 | |
| 1392 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. | 1396 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. |
| 1393 #define READUYVY \ | 1397 #define READUYVY \ |
| 1394 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ | 1398 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ |
| 1395 "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ | 1399 "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ |
| 1396 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \ | 1400 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \ |
| 1397 "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ | 1401 "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ |
| 1398 "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n" | 1402 "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n" |
| 1399 | 1403 |
| 1400 // Convert 8 pixels: 8 UV and 8 Y | 1404 // Convert 8 pixels: 8 UV and 8 Y |
| 1401 #define YUVTORGB(yuvconstants) \ | 1405 #define YUVTORGB(yuvconstants) \ |
| (...skipping 13 matching lines...) Expand all Loading... |
| 1415 "paddsw %%xmm4,%%xmm0 \n" \ | 1419 "paddsw %%xmm4,%%xmm0 \n" \ |
| 1416 "paddsw %%xmm4,%%xmm1 \n" \ | 1420 "paddsw %%xmm4,%%xmm1 \n" \ |
| 1417 "paddsw %%xmm4,%%xmm2 \n" \ | 1421 "paddsw %%xmm4,%%xmm2 \n" \ |
| 1418 "psraw $0x6,%%xmm0 \n" \ | 1422 "psraw $0x6,%%xmm0 \n" \ |
| 1419 "psraw $0x6,%%xmm1 \n" \ | 1423 "psraw $0x6,%%xmm1 \n" \ |
| 1420 "psraw $0x6,%%xmm2 \n" \ | 1424 "psraw $0x6,%%xmm2 \n" \ |
| 1421 "packuswb %%xmm0,%%xmm0 \n" \ | 1425 "packuswb %%xmm0,%%xmm0 \n" \ |
| 1422 "packuswb %%xmm1,%%xmm1 \n" \ | 1426 "packuswb %%xmm1,%%xmm1 \n" \ |
| 1423 "packuswb %%xmm2,%%xmm2 \n" | 1427 "packuswb %%xmm2,%%xmm2 \n" |
| 1424 | 1428 |
| 1425 // Store 8 ARGB values. Assumes XMM5 is zero. | 1429 // Store 8 ARGB values. Assumes XMM5 holds the alpha bytes. |
| 1426 #define STOREARGB \ | 1430 #define STOREARGB \ |
| 1427 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1431 "punpcklbw %%xmm1,%%xmm0 \n" \ |
| 1428 "punpcklbw %%xmm5,%%xmm2 \n" \ | 1432 "punpcklbw %%xmm5,%%xmm2 \n" \ |
| 1429 "movdqa %%xmm0,%%xmm1 \n" \ | 1433 "movdqa %%xmm0,%%xmm1 \n" \ |
| 1430 "punpcklwd %%xmm2,%%xmm0 \n" \ | 1434 "punpcklwd %%xmm2,%%xmm0 \n" \ |
| 1431 "punpckhwd %%xmm2,%%xmm1 \n" \ | 1435 "punpckhwd %%xmm2,%%xmm1 \n" \ |
| 1432 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ | 1436 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ |
| 1433 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ | 1437 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ |
| 1434 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" | 1438 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" |
| 1435 | 1439 |
| 1436 // Store 8 BGRA values. Assumes XMM5 is zero. | 1440 // Store 8 BGRA values. |
| 1437 #define STOREBGRA \ | 1441 #define STOREBGRA \ |
| 1438 "pcmpeqb %%xmm5,%%xmm5 \n" \ | 1442 "pcmpeqb %%xmm5,%%xmm5 \n" \ |
| 1439 "punpcklbw %%xmm0,%%xmm1 \n" \ | 1443 "punpcklbw %%xmm0,%%xmm1 \n" \ |
| 1440 "punpcklbw %%xmm2,%%xmm5 \n" \ | 1444 "punpcklbw %%xmm2,%%xmm5 \n" \ |
| 1441 "movdqa %%xmm5,%%xmm0 \n" \ | 1445 "movdqa %%xmm5,%%xmm0 \n" \ |
| 1442 "punpcklwd %%xmm1,%%xmm5 \n" \ | 1446 "punpcklwd %%xmm1,%%xmm5 \n" \ |
| 1443 "punpckhwd %%xmm1,%%xmm0 \n" \ | 1447 "punpckhwd %%xmm1,%%xmm0 \n" \ |
| 1444 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ | 1448 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ |
| 1445 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \ | 1449 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \ |
| 1446 "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n" | 1450 "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n" |
| 1447 | 1451 |
| 1448 // Store 8 ABGR values. Assumes XMM5 is zero. | 1452 // Store 8 ABGR values. Assumes XMM5 holds the alpha bytes. |
| 1449 #define STOREABGR \ | 1453 #define STOREABGR \ |
| 1450 "punpcklbw %%xmm1,%%xmm2 \n" \ | 1454 "punpcklbw %%xmm1,%%xmm2 \n" \ |
| 1451 "punpcklbw %%xmm5,%%xmm0 \n" \ | 1455 "punpcklbw %%xmm5,%%xmm0 \n" \ |
| 1452 "movdqa %%xmm2,%%xmm1 \n" \ | 1456 "movdqa %%xmm2,%%xmm1 \n" \ |
| 1453 "punpcklwd %%xmm0,%%xmm2 \n" \ | 1457 "punpcklwd %%xmm0,%%xmm2 \n" \ |
| 1454 "punpckhwd %%xmm0,%%xmm1 \n" \ | 1458 "punpckhwd %%xmm0,%%xmm1 \n" \ |
| 1455 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ | 1459 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ |
| 1456 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \ | 1460 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \ |
| 1457 "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n" | 1461 "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n" |
| 1458 | 1462 |
| 1459 // Store 8 RGBA values. Assumes XMM5 is zero. | 1463 // Store 8 RGBA values. |
| 1460 #define STORERGBA \ | 1464 #define STORERGBA \ |
| 1461 "pcmpeqb %%xmm5,%%xmm5 \n" \ | 1465 "pcmpeqb %%xmm5,%%xmm5 \n" \ |
| 1462 "punpcklbw %%xmm2,%%xmm1 \n" \ | 1466 "punpcklbw %%xmm2,%%xmm1 \n" \ |
| 1463 "punpcklbw %%xmm0,%%xmm5 \n" \ | 1467 "punpcklbw %%xmm0,%%xmm5 \n" \ |
| 1464 "movdqa %%xmm5,%%xmm0 \n" \ | 1468 "movdqa %%xmm5,%%xmm0 \n" \ |
| 1465 "punpcklwd %%xmm1,%%xmm5 \n" \ | 1469 "punpcklwd %%xmm1,%%xmm5 \n" \ |
| 1466 "punpckhwd %%xmm1,%%xmm0 \n" \ | 1470 "punpckhwd %%xmm1,%%xmm0 \n" \ |
| 1467 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ | 1471 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ |
| 1468 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ | 1472 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ |
| 1469 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" | 1473 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" |
| (...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1515 [u_buf]"+r"(u_buf), // %[u_buf] | 1519 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1516 [v_buf]"+r"(v_buf), // %[v_buf] | 1520 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1517 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] | 1521 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
| 1518 [width]"+rm"(width) // %[width] | 1522 [width]"+rm"(width) // %[width] |
| 1519 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1523 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1520 : "memory", "cc", NACL_R14 | 1524 : "memory", "cc", NACL_R14 |
| 1521 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1525 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 1522 ); | 1526 ); |
| 1523 } | 1527 } |
| 1524 | 1528 |
| 1525 // TODO(fbarchard): Consider putting masks into constants. | |
| 1526 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, | 1529 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, |
| 1527 const uint8* u_buf, | 1530 const uint8* u_buf, |
| 1528 const uint8* v_buf, | 1531 const uint8* v_buf, |
| 1529 uint8* dst_rgb24, | 1532 uint8* dst_rgb24, |
| 1530 struct YuvConstants* yuvconstants, | 1533 struct YuvConstants* yuvconstants, |
| 1531 int width) { | 1534 int width) { |
| 1532 asm volatile ( | 1535 asm volatile ( |
| 1533 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" | 1536 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" |
| 1534 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" | 1537 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" |
| 1535 "sub %[u_buf],%[v_buf] \n" | 1538 "sub %[u_buf],%[v_buf] \n" |
| (...skipping 286 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1822 | 1825 |
| 1823 #endif // HAS_I422TOARGBROW_SSSE3 | 1826 #endif // HAS_I422TOARGBROW_SSSE3 |
| 1824 | 1827 |
| 1825 // Read 8 UV from 422, upsample to 16 UV. | 1828 // Read 8 UV from 422, upsample to 16 UV. |
| 1826 #define READYUV422_AVX2 \ | 1829 #define READYUV422_AVX2 \ |
| 1827 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1830 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
| 1828 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1831 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
| 1829 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ | 1832 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ |
| 1830 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | 1833 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
| 1831 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1834 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
| 1832 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" | 1835 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
| 1836 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1837 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
| 1838 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
| 1839 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
| 1840 |
| 1841 // Read 8 YUY2 with 16 Y and update 8 UV to 16 UV. |
| 1842 #define READYUY2_AVX2 \ |
| 1843 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ |
| 1844 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ |
| 1845 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \ |
| 1846 "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ |
| 1847 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" |
| 1848 |
| 1849 // Read 8 UYVY with 16 Y and update 8 UV to 16 UV. |
| 1850 #define READUYVY_AVX2 \ |
| 1851 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ |
| 1852 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ |
| 1853 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \ |
| 1854 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ |
| 1855 "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n" |
| 1833 | 1856 |
| 1834 // Convert 16 pixels: 16 UV and 16 Y. | 1857 // Convert 16 pixels: 16 UV and 16 Y. |
| 1835 #define YUVTORGB_AVX2(YuvConstants) \ | 1858 #define YUVTORGB_AVX2(YuvConstants) \ |
| 1836 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \ | 1859 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \ |
| 1837 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \ | 1860 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \ |
| 1838 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ | 1861 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ |
| 1839 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ | 1862 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ |
| 1840 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ | 1863 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ |
| 1841 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \ | 1864 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \ |
| 1842 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ | 1865 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ |
| 1843 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \ | 1866 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \ |
| 1844 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ | 1867 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ |
| 1845 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \ | 1868 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm4,%%ymm4 \n" \ |
| 1846 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ | 1869 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ |
| 1847 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ | 1870 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ |
| 1848 "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \ | 1871 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ |
| 1849 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \ | 1872 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ |
| 1850 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \ | 1873 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ |
| 1851 "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \ | 1874 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ |
| 1852 "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \ | 1875 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ |
| 1853 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ | 1876 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ |
| 1854 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ | 1877 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" |
| 1855 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ | 1878 |
| 1856 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ | 1879 // Store 16 ARGB values. Assumes YMM5 is set. |
| 1857 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ | 1880 #define STOREARGB_AVX2 \ |
| 1858 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" | 1881 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
| 1882 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
| 1883 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ |
| 1884 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ |
| 1885 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ |
| 1886 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ |
| 1887 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ |
| 1888 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) " \n" \ |
| 1889 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" |
| 1859 | 1890 |
| 1860 #if defined(HAS_I422TOBGRAROW_AVX2) | 1891 #if defined(HAS_I422TOBGRAROW_AVX2) |
| 1861 // 16 pixels | 1892 // 16 pixels |
| 1862 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). | 1893 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). |
| 1863 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, | 1894 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, |
| 1864 const uint8* u_buf, | 1895 const uint8* u_buf, |
| 1865 const uint8* v_buf, | 1896 const uint8* v_buf, |
| 1866 uint8* dst_bgra, | 1897 uint8* dst_bgra, |
| 1867 struct YuvConstants* yuvconstants, | 1898 struct YuvConstants* yuvconstants, |
| 1868 int width) { | 1899 int width) { |
| (...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1909 uint8* dst_argb, | 1940 uint8* dst_argb, |
| 1910 struct YuvConstants* yuvconstants, | 1941 struct YuvConstants* yuvconstants, |
| 1911 int width) { | 1942 int width) { |
| 1912 asm volatile ( | 1943 asm volatile ( |
| 1913 "sub %[u_buf],%[v_buf] \n" | 1944 "sub %[u_buf],%[v_buf] \n" |
| 1914 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 1945 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 1915 LABELALIGN | 1946 LABELALIGN |
| 1916 "1: \n" | 1947 "1: \n" |
| 1917 READYUV422_AVX2 | 1948 READYUV422_AVX2 |
| 1918 YUVTORGB_AVX2(yuvconstants) | 1949 YUVTORGB_AVX2(yuvconstants) |
| 1919 | 1950 STOREARGB_AVX2 |
| 1920 // Step 3: Weave into ARGB | |
| 1921 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG | |
| 1922 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
| 1923 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA | |
| 1924 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | |
| 1925 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels | |
| 1926 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels | |
| 1927 | |
| 1928 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" | |
| 1929 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" | |
| 1930 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | |
| 1931 "sub $0x10,%[width] \n" | 1951 "sub $0x10,%[width] \n" |
| 1932 "jg 1b \n" | 1952 "jg 1b \n" |
| 1933 "vzeroupper \n" | 1953 "vzeroupper \n" |
| 1934 : [y_buf]"+r"(y_buf), // %[y_buf] | 1954 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1935 [u_buf]"+r"(u_buf), // %[u_buf] | 1955 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1936 [v_buf]"+r"(v_buf), // %[v_buf] | 1956 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1937 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1957 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1938 [width]"+rm"(width) // %[width] | 1958 [width]"+rm"(width) // %[width] |
| 1939 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1959 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1940 : "memory", "cc", NACL_R14 | 1960 : "memory", "cc", NACL_R14 |
| (...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2020 [v_buf]"+r"(v_buf), // %[v_buf] | 2040 [v_buf]"+r"(v_buf), // %[v_buf] |
| 2021 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2041 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2022 [width]"+rm"(width) // %[width] | 2042 [width]"+rm"(width) // %[width] |
| 2023 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2043 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 2024 : "memory", "cc", NACL_R14 | 2044 : "memory", "cc", NACL_R14 |
| 2025 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2045 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2026 ); | 2046 ); |
| 2027 } | 2047 } |
| 2028 #endif // HAS_I422TORGBAROW_AVX2 | 2048 #endif // HAS_I422TORGBAROW_AVX2 |
| 2029 | 2049 |
| 2050 #if defined(HAS_YUY2TOARGBROW_AVX2) |
| 2051 // 16 pixels. |
| 2052 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
| 2053 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, |
| 2054 uint8* dst_argb, |
| 2055 struct YuvConstants* yuvconstants, |
| 2056 int width) { |
| 2057 |
| 2058 asm volatile ( |
| 2059 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2060 LABELALIGN |
| 2061 "1: \n" |
| 2062 READYUY2_AVX2 |
| 2063 YUVTORGB_AVX2(yuvconstants) |
| 2064 STOREARGB_AVX2 |
| 2065 "sub $0x10,%[width] \n" |
| 2066 "jg 1b \n" |
| 2067 "vzeroupper \n" |
| 2068 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] |
| 2069 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2070 [width]"+rm"(width) // %[width] |
| 2071 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
| 2072 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), |
| 2073 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) |
| 2074 // Does not use r14. |
| 2075 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2076 ); |
| 2077 } |
| 2078 #endif // HAS_YUY2TOARGBROW_AVX2 |
| 2079 |
| 2080 #if defined(HAS_UYVYTOARGBROW_AVX2) |
| 2081 // 16 pixels. |
| 2082 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
| 2083 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, |
| 2084 uint8* dst_argb, |
| 2085 struct YuvConstants* yuvconstants, |
| 2086 int width) { |
| 2087 |
| 2088 asm volatile ( |
| 2089 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2090 LABELALIGN |
| 2091 "1: \n" |
| 2092 READUYVY_AVX2 |
| 2093 YUVTORGB_AVX2(yuvconstants) |
| 2094 STOREARGB_AVX2 |
| 2095 "sub $0x10,%[width] \n" |
| 2096 "jg 1b \n" |
| 2097 "vzeroupper \n" |
| 2098 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] |
| 2099 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2100 [width]"+rm"(width) // %[width] |
| 2101 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
| 2102 [kShuffleUYVYY]"m"(kShuffleUYVYY), |
| 2103 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) |
| 2104 // Does not use r14. |
| 2105 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2106 ); |
| 2107 } |
| 2108 #endif // HAS_UYVYTOARGBROW_AVX2 |
| 2109 |
| 2030 #ifdef HAS_I400TOARGBROW_SSE2 | 2110 #ifdef HAS_I400TOARGBROW_SSE2 |
| 2031 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { | 2111 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { |
| 2032 asm volatile ( | 2112 asm volatile ( |
| 2033 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 | 2113 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 |
| 2034 "movd %%eax,%%xmm2 \n" | 2114 "movd %%eax,%%xmm2 \n" |
| 2035 "pshufd $0x0,%%xmm2,%%xmm2 \n" | 2115 "pshufd $0x0,%%xmm2,%%xmm2 \n" |
| 2036 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 | 2116 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 |
| 2037 "movd %%eax,%%xmm3 \n" | 2117 "movd %%eax,%%xmm3 \n" |
| 2038 "pshufd $0x0,%%xmm3,%%xmm3 \n" | 2118 "pshufd $0x0,%%xmm3,%%xmm3 \n" |
| 2039 "pcmpeqb %%xmm4,%%xmm4 \n" | 2119 "pcmpeqb %%xmm4,%%xmm4 \n" |
| (...skipping 3254 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5294 ); | 5374 ); |
| 5295 } | 5375 } |
| 5296 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5376 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 5297 | 5377 |
| 5298 #endif // defined(__x86_64__) || defined(__i386__) | 5378 #endif // defined(__x86_64__) || defined(__i386__) |
| 5299 | 5379 |
| 5300 #ifdef __cplusplus | 5380 #ifdef __cplusplus |
| 5301 } // extern "C" | 5381 } // extern "C" |
| 5302 } // namespace libyuv | 5382 } // namespace libyuv |
| 5303 #endif | 5383 #endif |
| OLD | NEW |