OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 1332 matching lines...)
1343 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1343 "punpcklwd %%xmm0,%%xmm0 \n" \ |
1344 "punpckldq %%xmm0,%%xmm0 \n" | 1344 "punpckldq %%xmm0,%%xmm0 \n" |
1345 | 1345 |
1346 // Read 4 UV from NV12, upsample to 8 UV | 1346 // Read 4 UV from NV12, upsample to 8 UV |
1347 #define READNV12 \ | 1347 #define READNV12 \ |
1348 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ | 1348 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
1349 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ | 1349 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ |
1350 "punpcklwd %%xmm0,%%xmm0 \n" | 1350 "punpcklwd %%xmm0,%%xmm0 \n" |
1351 | 1351 |
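An aside on the READNV12 step: the punpcklwd above duplicates each 16-bit (U,V) pair so that one chroma sample covers two horizontal luma samples. A scalar rendering of that duplication, with illustrative names only:

    // Illustrative only: expand 4 interleaved (U,V) pairs into 8 pairs,
    // mirroring what punpcklwd does to the low 8 bytes of xmm0.
    static void UpsampleNV12UV2x(const unsigned char* uv4 /* 8 bytes in */,
                                 unsigned char* uv8 /* 16 bytes out */) {
      int i;
      for (i = 0; i < 4; ++i) {
        uv8[4 * i + 0] = uv4[2 * i + 0];  // U, even pixel
        uv8[4 * i + 1] = uv4[2 * i + 1];  // V, even pixel
        uv8[4 * i + 2] = uv4[2 * i + 0];  // U, odd pixel (duplicate)
        uv8[4 * i + 3] = uv4[2 * i + 1];  // V, odd pixel (duplicate)
      }
    }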
1352 // Convert 8 pixels: 8 UV and 8 Y | 1352 // Convert 8 pixels: 8 UV and 8 Y |
1353 #define YUVTORGB(YuvConstants) \ | 1353 #define YUVTORGB(yuvconstants) \ |
1354 "movdqa %%xmm0,%%xmm1 \n" \ | 1354 "movdqa %%xmm0,%%xmm1 \n" \ |
1355 "movdqa %%xmm0,%%xmm2 \n" \ | 1355 "movdqa %%xmm0,%%xmm2 \n" \ |
1356 "movdqa %%xmm0,%%xmm3 \n" \ | 1356 "movdqa %%xmm0,%%xmm3 \n" \ |
1357 "movdqa " MEMACCESS2(96, [YuvConstants]) ",%%xmm0 \n" \ | 1357 "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \ |
1358 "pmaddubsw " MEMACCESS([YuvConstants]) ",%%xmm1 \n" \ | 1358 "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \ |
1359 "psubw %%xmm1,%%xmm0 \n" \ | 1359 "psubw %%xmm1,%%xmm0 \n" \ |
1360 "movdqa " MEMACCESS2(128, [YuvConstants]) ",%%xmm1 \n" \ | 1360 "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \ |
1361 "pmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%xmm2 \n" \ | 1361 "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \ |
1362 "psubw %%xmm2,%%xmm1 \n" \ | 1362 "psubw %%xmm2,%%xmm1 \n" \ |
1363 "movdqa " MEMACCESS2(160, [YuvConstants]) ",%%xmm2 \n" \ | 1363 "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \ |
1364 "pmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%xmm3 \n" \ | 1364 "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \ |
1365 "psubw %%xmm3,%%xmm2 \n" \ | 1365 "psubw %%xmm3,%%xmm2 \n" \ |
1366 "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ | 1366 "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ |
1367 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ | 1367 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ |
1368 "punpcklbw %%xmm3,%%xmm3 \n" \ | 1368 "punpcklbw %%xmm3,%%xmm3 \n" \ |
1369 "pmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%xmm3 \n" \ | 1369 "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm3 \n" \ |
1370 "paddsw %%xmm3,%%xmm0 \n" \ | 1370 "paddsw %%xmm3,%%xmm0 \n" \ |
1371 "paddsw %%xmm3,%%xmm1 \n" \ | 1371 "paddsw %%xmm3,%%xmm1 \n" \ |
1372 "paddsw %%xmm3,%%xmm2 \n" \ | 1372 "paddsw %%xmm3,%%xmm2 \n" \ |
1373 "psraw $0x6,%%xmm0 \n" \ | 1373 "psraw $0x6,%%xmm0 \n" \ |
1374 "psraw $0x6,%%xmm1 \n" \ | 1374 "psraw $0x6,%%xmm1 \n" \ |
1375 "psraw $0x6,%%xmm2 \n" \ | 1375 "psraw $0x6,%%xmm2 \n" \ |
1376 "packuswb %%xmm0,%%xmm0 \n" \ | 1376 "packuswb %%xmm0,%%xmm0 \n" \ |
1377 "packuswb %%xmm1,%%xmm1 \n" \ | 1377 "packuswb %%xmm1,%%xmm1 \n" \ |
1378 "packuswb %%xmm2,%%xmm2 \n" | 1378 "packuswb %%xmm2,%%xmm2 \n" |
1379 | 1379 |
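For orientation, YUVTORGB reads its tables at fixed byte offsets from yuvconstants: the vectors at 0/32/64 are multiplied against the interleaved UV bytes, the words at 96/128/160 are the per-channel values those products are subtracted from, and 192 scales Y; the psraw $0x6 / packuswb tail makes this 6-bit fixed point with saturation. A rough scalar sketch of the same arithmetic, using assumed BT.601-style coefficients (not the real table values) and writing the -16/-128 range offsets out explicitly rather than folding them into bias words:

    // Sketch only: the coefficients are assumptions for illustration,
    // approximately BT.601 scaled by 64; the SIMD code takes the real
    // values from struct YuvConstants.
    static unsigned char Clamp0To255(int v) {
      return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    static void YuvToRgbPixel(unsigned char y, unsigned char u, unsigned char v,
                              unsigned char* b, unsigned char* g,
                              unsigned char* r) {
      const int yg = 75;   // ~1.164 * 64
      const int ub = 129;  // ~2.018 * 64
      const int ug = 25;   // ~0.391 * 64
      const int vg = 52;   // ~0.813 * 64
      const int vr = 102;  // ~1.596 * 64
      const int y1 = (y - 16) * yg;
      *b = Clamp0To255((y1 + ub * (u - 128)) >> 6);  // psraw $0x6 + packuswb
      *g = Clamp0To255((y1 - ug * (u - 128) - vg * (v - 128)) >> 6);
      *r = Clamp0To255((y1 + vr * (v - 128)) >> 6);
    }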
(...skipping 36 matching lines...)
1416 "pcmpeqb %%xmm5,%%xmm5 \n" \ | 1416 "pcmpeqb %%xmm5,%%xmm5 \n" \ |
1417 "punpcklbw %%xmm2,%%xmm1 \n" \ | 1417 "punpcklbw %%xmm2,%%xmm1 \n" \ |
1418 "punpcklbw %%xmm0,%%xmm5 \n" \ | 1418 "punpcklbw %%xmm0,%%xmm5 \n" \ |
1419 "movdqa %%xmm5,%%xmm0 \n" \ | 1419 "movdqa %%xmm5,%%xmm0 \n" \ |
1420 "punpcklwd %%xmm1,%%xmm5 \n" \ | 1420 "punpcklwd %%xmm1,%%xmm5 \n" \ |
1421 "punpckhwd %%xmm1,%%xmm0 \n" \ | 1421 "punpckhwd %%xmm1,%%xmm0 \n" \ |
1422 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ | 1422 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ |
1423 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ | 1423 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ |
1424 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" | 1424 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" |
1425 | 1425 |
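Read as byte lanes, the unpack sequence in STORERGBA interleaves the constant 0xFF alpha with B, then pairs that with G/R, so each pixel lands in memory as A,B,G,R. A scalar rendering of the weave for one group of 8 pixels (illustrative; inputs are the packed channel bytes produced by YUVTORGB):

    // Illustrative scalar equivalent of the STORERGBA weave.
    static void WeaveRGBA(const unsigned char* b, const unsigned char* g,
                          const unsigned char* r, unsigned char* dst_rgba) {
      int i;
      for (i = 0; i < 8; ++i) {
        dst_rgba[4 * i + 0] = 255;   // alpha from pcmpeqb %xmm5,%xmm5
        dst_rgba[4 * i + 1] = b[i];
        dst_rgba[4 * i + 2] = g[i];
        dst_rgba[4 * i + 3] = r[i];
      }
    }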
1426 void OMITFP I444ToARGBMatrixRow_SSSE3(const uint8* y_buf, | 1426 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, |
1427 const uint8* u_buf, | 1427 const uint8* u_buf, |
1428 const uint8* v_buf, | 1428 const uint8* v_buf, |
1429 uint8* dst_argb, | 1429 uint8* dst_argb, |
1430 struct YuvConstants* YuvConstants, | 1430 struct YuvConstants* yuvconstants, |
1431 int width) { | 1431 int width) { |
1432 asm volatile ( | 1432 asm volatile ( |
1433 "sub %[u_buf],%[v_buf] \n" | 1433 "sub %[u_buf],%[v_buf] \n" |
1434 "pcmpeqb %%xmm5,%%xmm5 \n" | 1434 "pcmpeqb %%xmm5,%%xmm5 \n" |
1435 LABELALIGN | 1435 LABELALIGN |
1436 "1: \n" | 1436 "1: \n" |
1437 READYUV444 | 1437 READYUV444 |
1438 YUVTORGB(YuvConstants) | 1438 YUVTORGB(yuvconstants) |
1439 STOREARGB | 1439 STOREARGB |
1440 "sub $0x8,%[width] \n" | 1440 "sub $0x8,%[width] \n" |
1441 "jg 1b \n" | 1441 "jg 1b \n" |
1442 : [y_buf]"+r"(y_buf), // %[y_buf] | 1442 : [y_buf]"+r"(y_buf), // %[y_buf] |
1443 [u_buf]"+r"(u_buf), // %[u_buf] | 1443 [u_buf]"+r"(u_buf), // %[u_buf] |
1444 [v_buf]"+r"(v_buf), // %[v_buf] | 1444 [v_buf]"+r"(v_buf), // %[v_buf] |
1445 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1445 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1446 [width]"+rm"(width) // %[width] | 1446 [width]"+rm"(width) // %[width] |
1447 : [YuvConstants]"r"(YuvConstants) // %[kYuvConstants] | 1447 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1448 : "memory", "cc", NACL_R14 | 1448 : "memory", "cc", NACL_R14 |
1449 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1449 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
1450 ); | 1450 ); |
1451 } | 1451 } |
1452 | 1452 |
1453 void OMITFP I444ToABGRMatrixRow_SSSE3(const uint8* y_buf, | 1453 void OMITFP I444ToABGRRow_SSSE3(const uint8* y_buf, |
1454 const uint8* u_buf, | 1454 const uint8* u_buf, |
1455 const uint8* v_buf, | 1455 const uint8* v_buf, |
1456 uint8* dst_abgr, | 1456 uint8* dst_abgr, |
1457 struct YuvConstants* YuvConstants, | 1457 struct YuvConstants* yuvconstants, |
1458 int width) { | 1458 int width) { |
1459 asm volatile ( | 1459 asm volatile ( |
1460 "sub %[u_buf],%[v_buf] \n" | 1460 "sub %[u_buf],%[v_buf] \n" |
1461 "pcmpeqb %%xmm5,%%xmm5 \n" | 1461 "pcmpeqb %%xmm5,%%xmm5 \n" |
1462 LABELALIGN | 1462 LABELALIGN |
1463 "1: \n" | 1463 "1: \n" |
1464 READYUV444 | 1464 READYUV444 |
1465 YUVTORGB(YuvConstants) | 1465 YUVTORGB(yuvconstants) |
1466 STOREABGR | 1466 STOREABGR |
1467 "sub $0x8,%[width] \n" | 1467 "sub $0x8,%[width] \n" |
1468 "jg 1b \n" | 1468 "jg 1b \n" |
1469 : [y_buf]"+r"(y_buf), // %[y_buf] | 1469 : [y_buf]"+r"(y_buf), // %[y_buf] |
1470 [u_buf]"+r"(u_buf), // %[u_buf] | 1470 [u_buf]"+r"(u_buf), // %[u_buf] |
1471 [v_buf]"+r"(v_buf), // %[v_buf] | 1471 [v_buf]"+r"(v_buf), // %[v_buf] |
1472 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] | 1472 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
1473 [width]"+rm"(width) // %[width] | 1473 [width]"+rm"(width) // %[width] |
1474 : [YuvConstants]"r"(YuvConstants) // %[kYuvConstants] | 1474 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1475 : "memory", "cc", NACL_R14 | 1475 : "memory", "cc", NACL_R14 |
1476 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1476 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
1477 ); | 1477 ); |
1478 } | 1478 } |
1479 | 1479 |
1480 // TODO(fbarchard): Consider putting masks into constants. | 1480 // TODO(fbarchard): Consider putting masks into constants. |
1481 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, | 1481 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, |
1482 const uint8* u_buf, | 1482 const uint8* u_buf, |
1483 const uint8* v_buf, | 1483 const uint8* v_buf, |
1484 uint8* dst_rgb24, | 1484 uint8* dst_rgb24, |
| 1485 struct YuvConstants* yuvconstants, |
1485 int width) { | 1486 int width) { |
1486 asm volatile ( | 1487 asm volatile ( |
1487 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" | 1488 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" |
1488 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" | 1489 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" |
1489 "sub %[u_buf],%[v_buf] \n" | 1490 "sub %[u_buf],%[v_buf] \n" |
1490 LABELALIGN | 1491 LABELALIGN |
1491 "1: \n" | 1492 "1: \n" |
1492 READYUV422 | 1493 READYUV422 |
1493 YUVTORGB(kYuvConstants) | 1494 YUVTORGB(yuvconstants) |
1494 "punpcklbw %%xmm1,%%xmm0 \n" | 1495 "punpcklbw %%xmm1,%%xmm0 \n" |
1495 "punpcklbw %%xmm2,%%xmm2 \n" | 1496 "punpcklbw %%xmm2,%%xmm2 \n" |
1496 "movdqa %%xmm0,%%xmm1 \n" | 1497 "movdqa %%xmm0,%%xmm1 \n" |
1497 "punpcklwd %%xmm2,%%xmm0 \n" | 1498 "punpcklwd %%xmm2,%%xmm0 \n" |
1498 "punpckhwd %%xmm2,%%xmm1 \n" | 1499 "punpckhwd %%xmm2,%%xmm1 \n" |
1499 "pshufb %%xmm5,%%xmm0 \n" | 1500 "pshufb %%xmm5,%%xmm0 \n" |
1500 "pshufb %%xmm6,%%xmm1 \n" | 1501 "pshufb %%xmm6,%%xmm1 \n" |
1501 "palignr $0xc,%%xmm0,%%xmm1 \n" | 1502 "palignr $0xc,%%xmm0,%%xmm1 \n" |
1502 "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" | 1503 "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" |
1503 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" | 1504 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" |
1504 "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" | 1505 "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" |
1505 "subl $0x8,%[width] \n" | 1506 "subl $0x8,%[width] \n" |
1506 "jg 1b \n" | 1507 "jg 1b \n" |
1507 : [y_buf]"+r"(y_buf), // %[y_buf] | 1508 : [y_buf]"+r"(y_buf), // %[y_buf] |
1508 [u_buf]"+r"(u_buf), // %[u_buf] | 1509 [u_buf]"+r"(u_buf), // %[u_buf] |
1509 [v_buf]"+r"(v_buf), // %[v_buf] | 1510 [v_buf]"+r"(v_buf), // %[v_buf] |
1510 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] | 1511 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] |
1511 // TODO(fbarchard): Make width a register for 32 bit. | 1512 // TODO(fbarchard): Make width a register for 32 bit. |
1512 #if defined(__i386__) && defined(__pic__) | 1513 #if defined(__i386__) && defined(__pic__) |
1513 [width]"+m"(width) // %[width] | 1514 [width]"+m"(width) // %[width] |
1514 #else | 1515 #else |
1515 [width]"+rm"(width) // %[width] | 1516 [width]"+rm"(width) // %[width] |
1516 #endif | 1517 #endif |
1517 : [kYuvConstants]"r"(&kYuvConstants.kUVToB), | 1518 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
1518 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), | 1519 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), |
1519 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) | 1520 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) |
1520 : "memory", "cc", NACL_R14 | 1521 : "memory", "cc", NACL_R14 |
1521 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" | 1522 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" |
1522 ); | 1523 ); |
1523 } | 1524 } |
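After YUVTORGB, the weave plus the two pshufb masks and the palignr collapse each pixel's four working bytes (B, G, R and a throwaway fourth lane) down to three, so 8 pixels become 24 bytes. The net packing, written as plain C over an intermediate 4-byte-per-pixel buffer (illustrative only):

    // Illustrative: pack 8 B,G,R,X quads into 24 bytes of RGB24.
    static void PackToRGB24(const unsigned char* bgrx /* 32 bytes in */,
                            unsigned char* dst_rgb24 /* 24 bytes out */) {
      int i;
      for (i = 0; i < 8; ++i) {
        dst_rgb24[3 * i + 0] = bgrx[4 * i + 0];  // B
        dst_rgb24[3 * i + 1] = bgrx[4 * i + 1];  // G
        dst_rgb24[3 * i + 2] = bgrx[4 * i + 2];  // R
      }
    }

The RAW variant that follows is the same packing with its shuffle masks emitting the channels in the opposite order.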
1524 | 1525 |
1525 void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, | 1526 void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, |
1526 const uint8* u_buf, | 1527 const uint8* u_buf, |
1527 const uint8* v_buf, | 1528 const uint8* v_buf, |
1528 uint8* dst_raw, | 1529 uint8* dst_raw, |
| 1530 struct YuvConstants* yuvconstants, |
1529 int width) { | 1531 int width) { |
1530 asm volatile ( | 1532 asm volatile ( |
1531 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" | 1533 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" |
1532 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" | 1534 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" |
1533 "sub %[u_buf],%[v_buf] \n" | 1535 "sub %[u_buf],%[v_buf] \n" |
1534 LABELALIGN | 1536 LABELALIGN |
1535 "1: \n" | 1537 "1: \n" |
1536 READYUV422 | 1538 READYUV422 |
1537 YUVTORGB(kYuvConstants) | 1539 YUVTORGB(yuvconstants) |
1538 "punpcklbw %%xmm1,%%xmm0 \n" | 1540 "punpcklbw %%xmm1,%%xmm0 \n" |
1539 "punpcklbw %%xmm2,%%xmm2 \n" | 1541 "punpcklbw %%xmm2,%%xmm2 \n" |
1540 "movdqa %%xmm0,%%xmm1 \n" | 1542 "movdqa %%xmm0,%%xmm1 \n" |
1541 "punpcklwd %%xmm2,%%xmm0 \n" | 1543 "punpcklwd %%xmm2,%%xmm0 \n" |
1542 "punpckhwd %%xmm2,%%xmm1 \n" | 1544 "punpckhwd %%xmm2,%%xmm1 \n" |
1543 "pshufb %%xmm5,%%xmm0 \n" | 1545 "pshufb %%xmm5,%%xmm0 \n" |
1544 "pshufb %%xmm6,%%xmm1 \n" | 1546 "pshufb %%xmm6,%%xmm1 \n" |
1545 "palignr $0xc,%%xmm0,%%xmm1 \n" | 1547 "palignr $0xc,%%xmm0,%%xmm1 \n" |
1546 "movq %%xmm0," MEMACCESS([dst_raw]) " \n" | 1548 "movq %%xmm0," MEMACCESS([dst_raw]) " \n" |
1547 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n" | 1549 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n" |
1548 "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n" | 1550 "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n" |
1549 "subl $0x8,%[width] \n" | 1551 "subl $0x8,%[width] \n" |
1550 "jg 1b \n" | 1552 "jg 1b \n" |
1551 : [y_buf]"+r"(y_buf), // %[y_buf] | 1553 : [y_buf]"+r"(y_buf), // %[y_buf] |
1552 [u_buf]"+r"(u_buf), // %[u_buf] | 1554 [u_buf]"+r"(u_buf), // %[u_buf] |
1553 [v_buf]"+r"(v_buf), // %[v_buf] | 1555 [v_buf]"+r"(v_buf), // %[v_buf] |
1554 [dst_raw]"+r"(dst_raw), // %[dst_raw] | 1556 [dst_raw]"+r"(dst_raw), // %[dst_raw] |
1555 // TODO(fbarchard): Make width a register for 32 bit. | 1557 // TODO(fbarchard): Make width a register for 32 bit. |
1556 #if defined(__i386__) && defined(__pic__) | 1558 #if defined(__i386__) && defined(__pic__) |
1557 [width]"+m"(width) // %[width] | 1559 [width]"+m"(width) // %[width] |
1558 #else | 1560 #else |
1559 [width]"+rm"(width) // %[width] | 1561 [width]"+rm"(width) // %[width] |
1560 #endif | 1562 #endif |
1561 : [kYuvConstants]"r"(&kYuvConstants.kUVToB), | 1563 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
1562 [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), | 1564 [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), |
1563 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) | 1565 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) |
1564 : "memory", "cc", NACL_R14 | 1566 : "memory", "cc", NACL_R14 |
1565 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" | 1567 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" |
1566 ); | 1568 ); |
1567 } | 1569 } |
1568 | 1570 |
1569 void OMITFP I422ToARGBMatrixRow_SSSE3(const uint8* y_buf, | 1571 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, |
1570 const uint8* u_buf, | 1572 const uint8* u_buf, |
1571 const uint8* v_buf, | 1573 const uint8* v_buf, |
1572 uint8* dst_argb, | 1574 uint8* dst_argb, |
1573 struct YuvConstants* YuvConstants, | 1575 struct YuvConstants* yuvconstants, |
1574 int width) { | 1576 int width) { |
1575 asm volatile ( | 1577 asm volatile ( |
1576 "sub %[u_buf],%[v_buf] \n" | 1578 "sub %[u_buf],%[v_buf] \n" |
1577 "pcmpeqb %%xmm5,%%xmm5 \n" | 1579 "pcmpeqb %%xmm5,%%xmm5 \n" |
1578 LABELALIGN | 1580 LABELALIGN |
1579 "1: \n" | 1581 "1: \n" |
1580 READYUV422 | 1582 READYUV422 |
1581 YUVTORGB(YuvConstants) | 1583 YUVTORGB(yuvconstants) |
1582 STOREARGB | 1584 STOREARGB |
1583 "sub $0x8,%[width] \n" | 1585 "sub $0x8,%[width] \n" |
1584 "jg 1b \n" | 1586 "jg 1b \n" |
1585 : [y_buf]"+r"(y_buf), // %[y_buf] | 1587 : [y_buf]"+r"(y_buf), // %[y_buf] |
1586 [u_buf]"+r"(u_buf), // %[u_buf] | 1588 [u_buf]"+r"(u_buf), // %[u_buf] |
1587 [v_buf]"+r"(v_buf), // %[v_buf] | 1589 [v_buf]"+r"(v_buf), // %[v_buf] |
1588 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1590 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1589 [width]"+rm"(width) // %[width] | 1591 [width]"+rm"(width) // %[width] |
1590 : [YuvConstants]"r"(YuvConstants) // %[kYuvConstants] | 1592 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1591 : "memory", "cc", NACL_R14 | 1593 : "memory", "cc", NACL_R14 |
1592 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1594 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
1593 ); | 1595 ); |
1594 } | 1596 } |
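With the matrix passed in as a parameter, a caller drives one of these kernels once per row and selects the colorspace by choosing which constants table to hand it. A hypothetical caller loop, not part of this file, assuming a width that is a multiple of 8 (uint8 being libyuv's byte typedef):

    // Hypothetical usage sketch: convert a whole I422 image one row at a time.
    // The YuvConstants table (e.g. a BT.601 or BT.709 set) is supplied by the
    // caller; nothing here is taken from this file except the row function.
    void ConvertI422ToARGB(const uint8* src_y, int stride_y,
                           const uint8* src_u, int stride_u,
                           const uint8* src_v, int stride_v,
                           uint8* dst_argb, int dst_stride,
                           struct YuvConstants* yuvconstants,
                           int width, int height) {
      int y;
      for (y = 0; y < height; ++y) {  // width assumed to be a multiple of 8
        I422ToARGBRow_SSSE3(src_y, src_u, src_v, dst_argb, yuvconstants, width);
        src_y += stride_y;
        src_u += stride_u;  // 4:2:2: chroma rows advance with every luma row
        src_v += stride_v;
        dst_argb += dst_stride;
      }
    }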
1595 | 1597 |
1596 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, | 1598 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, |
1597 const uint8* u_buf, | 1599 const uint8* u_buf, |
1598 const uint8* v_buf, | 1600 const uint8* v_buf, |
1599 uint8* dst_argb, | 1601 uint8* dst_argb, |
| 1602 struct YuvConstants* yuvconstants, |
1600 int width) { | 1603 int width) { |
1601 asm volatile ( | 1604 asm volatile ( |
1602 "sub %[u_buf],%[v_buf] \n" | 1605 "sub %[u_buf],%[v_buf] \n" |
1603 "pcmpeqb %%xmm5,%%xmm5 \n" | 1606 "pcmpeqb %%xmm5,%%xmm5 \n" |
1604 LABELALIGN | 1607 LABELALIGN |
1605 "1: \n" | 1608 "1: \n" |
1606 READYUV411 | 1609 READYUV411 |
1607 YUVTORGB(kYuvConstants) | 1610 YUVTORGB(yuvconstants) |
1608 STOREARGB | 1611 STOREARGB |
1609 "sub $0x8,%[width] \n" | 1612 "sub $0x8,%[width] \n" |
1610 "jg 1b \n" | 1613 "jg 1b \n" |
1611 : [y_buf]"+r"(y_buf), // %[y_buf] | 1614 : [y_buf]"+r"(y_buf), // %[y_buf] |
1612 [u_buf]"+r"(u_buf), // %[u_buf] | 1615 [u_buf]"+r"(u_buf), // %[u_buf] |
1613 [v_buf]"+r"(v_buf), // %[v_buf] | 1616 [v_buf]"+r"(v_buf), // %[v_buf] |
1614 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1617 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1615 [width]"+rm"(width) // %[width] | 1618 [width]"+rm"(width) // %[width] |
1616 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | 1619 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1617 : "memory", "cc", NACL_R14 | 1620 : "memory", "cc", NACL_R14 |
1618 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1621 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
1619 ); | 1622 ); |
1620 } | 1623 } |
1621 | 1624 |
1622 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, | 1625 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, |
1623 const uint8* uv_buf, | 1626 const uint8* uv_buf, |
1624 uint8* dst_argb, | 1627 uint8* dst_argb, |
| 1628 struct YuvConstants* yuvconstants, |
1625 int width) { | 1629 int width) { |
1626 asm volatile ( | 1630 asm volatile ( |
1627 "pcmpeqb %%xmm5,%%xmm5 \n" | 1631 "pcmpeqb %%xmm5,%%xmm5 \n" |
1628 LABELALIGN | 1632 LABELALIGN |
1629 "1: \n" | 1633 "1: \n" |
1630 READNV12 | 1634 READNV12 |
1631 YUVTORGB(kYuvConstants) | 1635 YUVTORGB(yuvconstants) |
1632 STOREARGB | 1636 STOREARGB |
1633 "sub $0x8,%[width] \n" | 1637 "sub $0x8,%[width] \n" |
1634 "jg 1b \n" | 1638 "jg 1b \n" |
1635 : [y_buf]"+r"(y_buf), // %[y_buf] | |
1636 [uv_buf]"+r"(uv_buf), // %[uv_buf] | |
1637 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
1638 [width]"+rm"(width) // %[width] | |
1639 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
1640 // Does not use r14. | |
1641 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
1642 ); | |
1643 } | |
1644 | |
1645 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, | |
1646 const uint8* uv_buf, | |
1647 uint8* dst_argb, | |
1648 int width) { | |
1649 asm volatile ( | |
1650 "pcmpeqb %%xmm5,%%xmm5 \n" | |
1651 LABELALIGN | |
1652 "1: \n" | |
1653 READNV12 | |
1654 YUVTORGB(kYuvConstants) | |
1655 STOREARGB | |
1656 "sub $0x8,%[width] \n" | |
1657 "jg 1b \n" | |
1658 : [y_buf]"+r"(y_buf), // %[y_buf] | 1639 : [y_buf]"+r"(y_buf), // %[y_buf] |
1659 [uv_buf]"+r"(uv_buf), // %[uv_buf] | 1640 [uv_buf]"+r"(uv_buf), // %[uv_buf] |
1660 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1641 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1661 [width]"+rm"(width) // %[width] | 1642 [width]"+rm"(width) // %[width] |
1662 : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants] | 1643 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1663 // Does not use r14. | 1644 // Does not use r14. |
1664 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1645 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
1665 ); | 1646 ); |
1666 } | 1647 } |
1667 | 1648 |
1668 void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, | 1649 void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, |
1669 const uint8* u_buf, | 1650 const uint8* u_buf, |
1670 const uint8* v_buf, | 1651 const uint8* v_buf, |
1671 uint8* dst_bgra, | 1652 uint8* dst_bgra, |
| 1653 struct YuvConstants* yuvconstants, |
1672 int width) { | 1654 int width) { |
1673 asm volatile ( | 1655 asm volatile ( |
1674 "sub %[u_buf],%[v_buf] \n" | 1656 "sub %[u_buf],%[v_buf] \n" |
1675 "pcmpeqb %%xmm5,%%xmm5 \n" | 1657 "pcmpeqb %%xmm5,%%xmm5 \n" |
1676 LABELALIGN | 1658 LABELALIGN |
1677 "1: \n" | 1659 "1: \n" |
1678 READYUV422 | 1660 READYUV422 |
1679 YUVTORGB(kYuvConstants) | 1661 YUVTORGB(yuvconstants) |
1680 STOREBGRA | 1662 STOREBGRA |
1681 "sub $0x8,%[width] \n" | 1663 "sub $0x8,%[width] \n" |
1682 "jg 1b \n" | 1664 "jg 1b \n" |
1683 : [y_buf]"+r"(y_buf), // %[y_buf] | 1665 : [y_buf]"+r"(y_buf), // %[y_buf] |
1684 [u_buf]"+r"(u_buf), // %[u_buf] | 1666 [u_buf]"+r"(u_buf), // %[u_buf] |
1685 [v_buf]"+r"(v_buf), // %[v_buf] | 1667 [v_buf]"+r"(v_buf), // %[v_buf] |
1686 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] | 1668 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] |
1687 [width]"+rm"(width) // %[width] | 1669 [width]"+rm"(width) // %[width] |
1688 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | 1670 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1689 : "memory", "cc", NACL_R14 | 1671 : "memory", "cc", NACL_R14 |
1690 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1672 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
1691 ); | 1673 ); |
1692 } | 1674 } |
1693 | 1675 |
1694 void OMITFP I422ToABGRMatrixRow_SSSE3(const uint8* y_buf, | 1676 void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, |
1695 const uint8* u_buf, | 1677 const uint8* u_buf, |
1696 const uint8* v_buf, | 1678 const uint8* v_buf, |
1697 uint8* dst_abgr, | 1679 uint8* dst_abgr, |
1698 struct YuvConstants* YuvConstants, | 1680 struct YuvConstants* yuvconstants, |
1699 int width) { | 1681 int width) { |
1700 asm volatile ( | 1682 asm volatile ( |
1701 "sub %[u_buf],%[v_buf] \n" | 1683 "sub %[u_buf],%[v_buf] \n" |
1702 "pcmpeqb %%xmm5,%%xmm5 \n" | 1684 "pcmpeqb %%xmm5,%%xmm5 \n" |
1703 LABELALIGN | 1685 LABELALIGN |
1704 "1: \n" | 1686 "1: \n" |
1705 READYUV422 | 1687 READYUV422 |
1706 YUVTORGB(kYuvConstants) | 1688 YUVTORGB(yuvconstants) |
1707 STOREABGR | 1689 STOREABGR |
1708 "sub $0x8,%[width] \n" | 1690 "sub $0x8,%[width] \n" |
1709 "jg 1b \n" | 1691 "jg 1b \n" |
1710 : [y_buf]"+r"(y_buf), // %[y_buf] | 1692 : [y_buf]"+r"(y_buf), // %[y_buf] |
1711 [u_buf]"+r"(u_buf), // %[u_buf] | 1693 [u_buf]"+r"(u_buf), // %[u_buf] |
1712 [v_buf]"+r"(v_buf), // %[v_buf] | 1694 [v_buf]"+r"(v_buf), // %[v_buf] |
1713 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] | 1695 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
1714 [width]"+rm"(width) // %[width] | 1696 [width]"+rm"(width) // %[width] |
1715 : [kYuvConstants]"r"(YuvConstants) // %[kYuvConstants] | 1697 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1716 : "memory", "cc", NACL_R14 | 1698 : "memory", "cc", NACL_R14 |
1717 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1699 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
1718 ); | 1700 ); |
1719 } | 1701 } |
1720 | 1702 |
1721 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, | 1703 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, |
1722 const uint8* u_buf, | 1704 const uint8* u_buf, |
1723 const uint8* v_buf, | 1705 const uint8* v_buf, |
1724 uint8* dst_rgba, | 1706 uint8* dst_rgba, |
| 1707 struct YuvConstants* yuvconstants, |
1725 int width) { | 1708 int width) { |
1726 asm volatile ( | 1709 asm volatile ( |
1727 "sub %[u_buf],%[v_buf] \n" | 1710 "sub %[u_buf],%[v_buf] \n" |
1728 "pcmpeqb %%xmm5,%%xmm5 \n" | 1711 "pcmpeqb %%xmm5,%%xmm5 \n" |
1729 LABELALIGN | 1712 LABELALIGN |
1730 "1: \n" | 1713 "1: \n" |
1731 READYUV422 | 1714 READYUV422 |
1732 YUVTORGB(kYuvConstants) | 1715 YUVTORGB(yuvconstants) |
1733 STORERGBA | 1716 STORERGBA |
1734 "sub $0x8,%[width] \n" | 1717 "sub $0x8,%[width] \n" |
1735 "jg 1b \n" | 1718 "jg 1b \n" |
1736 : [y_buf]"+r"(y_buf), // %[y_buf] | 1719 : [y_buf]"+r"(y_buf), // %[y_buf] |
1737 [u_buf]"+r"(u_buf), // %[u_buf] | 1720 [u_buf]"+r"(u_buf), // %[u_buf] |
1738 [v_buf]"+r"(v_buf), // %[v_buf] | 1721 [v_buf]"+r"(v_buf), // %[v_buf] |
1739 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] | 1722 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] |
1740 [width]"+rm"(width) // %[width] | 1723 [width]"+rm"(width) // %[width] |
1741 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | 1724 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1742 : "memory", "cc", NACL_R14 | 1725 : "memory", "cc", NACL_R14 |
1743 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1726 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
1744 ); | 1727 ); |
1745 } | 1728 } |
1746 | 1729 |
1747 #endif // HAS_I422TOARGBROW_SSSE3 | 1730 #endif // HAS_I422TOARGBROW_SSSE3 |
1748 | 1731 |
1749 // Read 8 UV from 422, upsample to 16 UV. | 1732 // Read 8 UV from 422, upsample to 16 UV. |
1750 #define READYUV422_AVX2 \ | 1733 #define READYUV422_AVX2 \ |
1751 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1734 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
(...skipping 29 matching lines...)
1781 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ | 1764 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ |
1782 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" | 1765 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" |
1783 | 1766 |
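The AVX2 read differs from the NV12 case only in that U and V arrive in separate planes: 8 bytes of U and 8 of V are combined, and each resulting pair is duplicated so one chroma sample covers two of the 16 pixels. Conceptually (scalar illustration; the macro body is elided above):

    // Illustrative: planar 4:2:2 chroma fetch for 16 pixels.
    static void ReadUV422x16(const unsigned char* u /* 8 bytes */,
                             const unsigned char* v /* 8 bytes */,
                             unsigned char* uv /* 32 bytes, U,V interleaved */) {
      int i;
      for (i = 0; i < 8; ++i) {
        uv[4 * i + 0] = u[i];  // even pixel
        uv[4 * i + 1] = v[i];
        uv[4 * i + 2] = u[i];  // odd pixel shares the same chroma sample
        uv[4 * i + 3] = v[i];
      }
    }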
1784 #if defined(HAS_I422TOBGRAROW_AVX2) | 1767 #if defined(HAS_I422TOBGRAROW_AVX2) |
1785 // 16 pixels | 1768 // 16 pixels |
1786 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). | 1769 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). |
1787 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, | 1770 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, |
1788 const uint8* u_buf, | 1771 const uint8* u_buf, |
1789 const uint8* v_buf, | 1772 const uint8* v_buf, |
1790 uint8* dst_bgra, | 1773 uint8* dst_bgra, |
| 1774 struct YuvConstants* yuvconstants, |
1791 int width) { | 1775 int width) { |
1792 asm volatile ( | 1776 asm volatile ( |
1793 "sub %[u_buf],%[v_buf] \n" | 1777 "sub %[u_buf],%[v_buf] \n" |
1794 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 1778 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
1795 LABELALIGN | 1779 LABELALIGN |
1796 "1: \n" | 1780 "1: \n" |
1797 READYUV422_AVX2 | 1781 READYUV422_AVX2 |
1798 YUVTORGB_AVX2(kYuvConstants) | 1782 YUVTORGB_AVX2(yuvconstants) |
1799 | 1783 |
1800 // Step 3: Weave into BGRA | 1784 // Step 3: Weave into BGRA |
1801 "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB | 1785 "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB |
1802 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | 1786 "vpermq $0xd8,%%ymm1,%%ymm1 \n" |
1803 "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR | 1787 "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR |
1804 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | 1788 "vpermq $0xd8,%%ymm2,%%ymm2 \n" |
1805 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels | 1789 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels |
1806 "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels | 1790 "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels |
1807 | 1791 |
1808 "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n" | 1792 "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n" |
1809 "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n" | 1793 "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n" |
1810 "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n" | 1794 "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n" |
1811 "sub $0x10,%[width] \n" | 1795 "sub $0x10,%[width] \n" |
1812 "jg 1b \n" | 1796 "jg 1b \n" |
1813 "vzeroupper \n" | 1797 "vzeroupper \n" |
1814 : [y_buf]"+r"(y_buf), // %[y_buf] | 1798 : [y_buf]"+r"(y_buf), // %[y_buf] |
1815 [u_buf]"+r"(u_buf), // %[u_buf] | 1799 [u_buf]"+r"(u_buf), // %[u_buf] |
1816 [v_buf]"+r"(v_buf), // %[v_buf] | 1800 [v_buf]"+r"(v_buf), // %[v_buf] |
1817 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] | 1801 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] |
1818 [width]"+rm"(width) // %[width] | 1802 [width]"+rm"(width) // %[width] |
1819 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | 1803 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1820 : "memory", "cc", NACL_R14 | 1804 : "memory", "cc", NACL_R14 |
1821 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1805 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
1822 ); | 1806 ); |
1823 } | 1807 } |
1824 #endif // HAS_I422TOBGRAROW_AVX2 | 1808 #endif // HAS_I422TOBGRAROW_AVX2 |
1825 | 1809 |
1826 #if defined(HAS_I422TOARGBMATRIXROW_AVX2) | 1810 #if defined(HAS_I422TOARGBROW_AVX2) |
1827 // 16 pixels | 1811 // 16 pixels |
1828 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 1812 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
1829 void OMITFP I422ToARGBMatrixRow_AVX2(const uint8* y_buf, | 1813 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, |
1830 const uint8* u_buf, | 1814 const uint8* u_buf, |
1831 const uint8* v_buf, | 1815 const uint8* v_buf, |
1832 uint8* dst_argb, | 1816 uint8* dst_argb, |
1833 struct YuvConstants* YuvConstants, | 1817 struct YuvConstants* yuvconstants, |
1834 int width) { | 1818 int width) { |
1835 asm volatile ( | 1819 asm volatile ( |
1836 "sub %[u_buf],%[v_buf] \n" | 1820 "sub %[u_buf],%[v_buf] \n" |
1837 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 1821 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
1838 LABELALIGN | 1822 LABELALIGN |
1839 "1: \n" | 1823 "1: \n" |
1840 READYUV422_AVX2 | 1824 READYUV422_AVX2 |
1841 YUVTORGB_AVX2(kYuvConstants) | 1825 YUVTORGB_AVX2(yuvconstants) |
1842 | 1826 |
1843 // Step 3: Weave into ARGB | 1827 // Step 3: Weave into ARGB |
1844 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG | 1828 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG |
1845 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | 1829 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
1846 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA | 1830 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA |
1847 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | 1831 "vpermq $0xd8,%%ymm2,%%ymm2 \n" |
1848 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels | 1832 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels |
1849 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels | 1833 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels |
1850 | 1834 |
1851 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" | 1835 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" |
1852 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" | 1836 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" |
1853 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | 1837 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" |
1854 "sub $0x10,%[width] \n" | 1838 "sub $0x10,%[width] \n" |
1855 "jg 1b \n" | 1839 "jg 1b \n" |
1856 "vzeroupper \n" | 1840 "vzeroupper \n" |
1857 : [y_buf]"+r"(y_buf), // %[y_buf] | 1841 : [y_buf]"+r"(y_buf), // %[y_buf] |
1858 [u_buf]"+r"(u_buf), // %[u_buf] | 1842 [u_buf]"+r"(u_buf), // %[u_buf] |
1859 [v_buf]"+r"(v_buf), // %[v_buf] | 1843 [v_buf]"+r"(v_buf), // %[v_buf] |
1860 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1844 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1861 [width]"+rm"(width) // %[width] | 1845 [width]"+rm"(width) // %[width] |
1862 : [kYuvConstants]"r"(YuvConstants) // %[kYuvConstants] | 1846 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1863 : "memory", "cc", NACL_R14 | 1847 : "memory", "cc", NACL_R14 |
1864 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1848 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
1865 ); | 1849 ); |
1866 } | 1850 } |
1867 #endif // HAS_I422TOARGBMATRIXROW_AVX2 | 1851 #endif // HAS_I422TOARGBROW_AVX2 |
1868 | 1852 |
1869 #if defined(HAS_I422TOABGRROW_AVX2) | 1853 #if defined(HAS_I422TOABGRROW_AVX2) |
1870 // 16 pixels | 1854 // 16 pixels |
1871 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). | 1855 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). |
1872 void OMITFP I422ToABGRMatrixRow_AVX2(const uint8* y_buf, | 1856 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, |
1873 const uint8* u_buf, | 1857 const uint8* u_buf, |
1874 const uint8* v_buf, | 1858 const uint8* v_buf, |
1875 uint8* dst_argb, | 1859 uint8* dst_argb, |
1876 struct YuvConstants* YuvConstants, | 1860 struct YuvConstants* yuvconstants, |
1877 int width) { | 1861 int width) { |
1878 asm volatile ( | 1862 asm volatile ( |
1879 "sub %[u_buf],%[v_buf] \n" | 1863 "sub %[u_buf],%[v_buf] \n" |
1880 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 1864 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
1881 LABELALIGN | 1865 LABELALIGN |
1882 "1: \n" | 1866 "1: \n" |
1883 READYUV422_AVX2 | 1867 READYUV422_AVX2 |
1884 YUVTORGB_AVX2(kYuvConstants) | 1868 YUVTORGB_AVX2(yuvconstants) |
1885 | 1869 |
1886 // Step 3: Weave into ABGR | 1870 // Step 3: Weave into ABGR |
1887 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG | 1871 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG |
1888 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | 1872 "vpermq $0xd8,%%ymm1,%%ymm1 \n" |
1889 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA | 1873 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA |
1890 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | 1874 "vpermq $0xd8,%%ymm2,%%ymm2 \n" |
1891 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels | 1875 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels |
1892 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels | 1876 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels |
1893 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" | 1877 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" |
1894 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" | 1878 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" |
1895 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | 1879 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" |
1896 "sub $0x10,%[width] \n" | 1880 "sub $0x10,%[width] \n" |
1897 "jg 1b \n" | 1881 "jg 1b \n" |
1898 "vzeroupper \n" | 1882 "vzeroupper \n" |
1899 : [y_buf]"+r"(y_buf), // %[y_buf] | 1883 : [y_buf]"+r"(y_buf), // %[y_buf] |
1900 [u_buf]"+r"(u_buf), // %[u_buf] | 1884 [u_buf]"+r"(u_buf), // %[u_buf] |
1901 [v_buf]"+r"(v_buf), // %[v_buf] | 1885 [v_buf]"+r"(v_buf), // %[v_buf] |
1902 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1886 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1903 [width]"+rm"(width) // %[width] | 1887 [width]"+rm"(width) // %[width] |
1904 : [kYuvConstants]"r"(YuvConstants) // %[kYuvConstants] | 1888 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1905 : "memory", "cc", NACL_R14 | 1889 : "memory", "cc", NACL_R14 |
1906 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1890 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
1907 ); | 1891 ); |
1908 } | 1892 } |
1909 #endif // HAS_I422TOABGRROW_AVX2 | 1893 #endif // HAS_I422TOABGRROW_AVX2 |
1910 | 1894 |
1911 #if defined(HAS_I422TORGBAROW_AVX2) | 1895 #if defined(HAS_I422TORGBAROW_AVX2) |
1912 // 16 pixels | 1896 // 16 pixels |
1913 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). | 1897 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). |
1914 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, | 1898 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, |
1915 const uint8* u_buf, | 1899 const uint8* u_buf, |
1916 const uint8* v_buf, | 1900 const uint8* v_buf, |
1917 uint8* dst_argb, | 1901 uint8* dst_argb, |
| 1902 struct YuvConstants* yuvconstants, |
1918 int width) { | 1903 int width) { |
1919 asm volatile ( | 1904 asm volatile ( |
1920 "sub %[u_buf],%[v_buf] \n" | 1905 "sub %[u_buf],%[v_buf] \n" |
1921 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 1906 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
1922 LABELALIGN | 1907 LABELALIGN |
1923 "1: \n" | 1908 "1: \n" |
1924 READYUV422_AVX2 | 1909 READYUV422_AVX2 |
1925 YUVTORGB_AVX2(kYuvConstants) | 1910 YUVTORGB_AVX2(yuvconstants) |
1926 | 1911 |
1927 // Step 3: Weave into RGBA | 1912 // Step 3: Weave into RGBA |
1928 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" | 1913 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" |
1929 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | 1914 "vpermq $0xd8,%%ymm1,%%ymm1 \n" |
1930 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" | 1915 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" |
1931 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | 1916 "vpermq $0xd8,%%ymm2,%%ymm2 \n" |
1932 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" | 1917 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" |
1933 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" | 1918 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" |
1934 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" | 1919 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" |
1935 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" | 1920 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" |
1936 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | 1921 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" |
1937 "sub $0x10,%[width] \n" | 1922 "sub $0x10,%[width] \n" |
1938 "jg 1b \n" | 1923 "jg 1b \n" |
1939 "vzeroupper \n" | 1924 "vzeroupper \n" |
1940 : [y_buf]"+r"(y_buf), // %[y_buf] | 1925 : [y_buf]"+r"(y_buf), // %[y_buf] |
1941 [u_buf]"+r"(u_buf), // %[u_buf] | 1926 [u_buf]"+r"(u_buf), // %[u_buf] |
1942 [v_buf]"+r"(v_buf), // %[v_buf] | 1927 [v_buf]"+r"(v_buf), // %[v_buf] |
1943 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1928 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1944 [width]"+rm"(width) // %[width] | 1929 [width]"+rm"(width) // %[width] |
1945 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | 1930 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1946 : "memory", "cc", NACL_R14 | 1931 : "memory", "cc", NACL_R14 |
1947 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1932 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
1948 ); | 1933 ); |
1949 } | 1934 } |
1950 #endif // HAS_I422TORGBAROW_AVX2 | 1935 #endif // HAS_I422TORGBAROW_AVX2 |
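Each loop above consumes a fixed step per iteration (8 pixels for the SSSE3 kernels, 16 for the AVX2 ones) and only subtracts that step from width, so these kernels expect a width that is a multiple of the step; ragged widths are handled by wrappers elsewhere in the library. A hypothetical caller-side helper for splitting the work, not taken from this file:

    // Hypothetical helper: round width down to the kernel's step; the caller
    // converts the reported leftover pixels with a scalar fallback.
    static int VectorWidth(int width, int step /* 8 = SSSE3, 16 = AVX2 */,
                           int* leftover) {
      int aligned = width - (width % step);
      *leftover = width - aligned;
      return aligned;
    }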
1951 | 1936 |
1952 #ifdef HAS_I400TOARGBROW_SSE2 | 1937 #ifdef HAS_I400TOARGBROW_SSE2 |
1953 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { | 1938 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { |
1954 asm volatile ( | 1939 asm volatile ( |
1955 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 | 1940 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 |
(...skipping 3260 matching lines...)
5216 ); | 5201 ); |
5217 } | 5202 } |
5218 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5203 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5219 | 5204 |
5220 #endif // defined(__x86_64__) || defined(__i386__) | 5205 #endif // defined(__x86_64__) || defined(__i386__) |
5221 | 5206 |
5222 #ifdef __cplusplus | 5207 #ifdef __cplusplus |
5223 } // extern "C" | 5208 } // extern "C" |
5224 } // namespace libyuv | 5209 } // namespace libyuv |
5225 #endif | 5210 #endif |