| OLD | NEW |
| 1 // VERSION 2 | 1 // VERSION 2 |
| 2 /* | 2 /* |
| 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 4 * | 4 * |
| 5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
| 6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
| 7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
| 8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
| 9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
| 10 */ | 10 */ |
| (...skipping 1332 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1343 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1343 "punpcklwd %%xmm0,%%xmm0 \n" \ |
| 1344 "punpckldq %%xmm0,%%xmm0 \n" | 1344 "punpckldq %%xmm0,%%xmm0 \n" |
| 1345 | 1345 |
| 1346 // Read 4 UV from NV12, upsample to 8 UV | 1346 // Read 4 UV from NV12, upsample to 8 UV |
| 1347 #define READNV12 \ | 1347 #define READNV12 \ |
| 1348 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ | 1348 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
| 1349 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ | 1349 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ |
| 1350 "punpcklwd %%xmm0,%%xmm0 \n" | 1350 "punpcklwd %%xmm0,%%xmm0 \n" |
| 1351 | 1351 |
| 1352 // Convert 8 pixels: 8 UV and 8 Y | 1352 // Convert 8 pixels: 8 UV and 8 Y |
| 1353 #define YUVTORGB(YuvConstants) \ | 1353 #define YUVTORGB(yuvconstants) \ |
| 1354 "movdqa %%xmm0,%%xmm1 \n" \ | 1354 "movdqa %%xmm0,%%xmm1 \n" \ |
| 1355 "movdqa %%xmm0,%%xmm2 \n" \ | 1355 "movdqa %%xmm0,%%xmm2 \n" \ |
| 1356 "movdqa %%xmm0,%%xmm3 \n" \ | 1356 "movdqa %%xmm0,%%xmm3 \n" \ |
| 1357 "movdqa " MEMACCESS2(96, [YuvConstants]) ",%%xmm0 \n" \ | 1357 "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \ |
| 1358 "pmaddubsw " MEMACCESS([YuvConstants]) ",%%xmm1 \n" \ | 1358 "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \ |
| 1359 "psubw %%xmm1,%%xmm0 \n" \ | 1359 "psubw %%xmm1,%%xmm0 \n" \ |
| 1360 "movdqa " MEMACCESS2(128, [YuvConstants]) ",%%xmm1 \n" \ | 1360 "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \ |
| 1361 "pmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%xmm2 \n" \ | 1361 "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \ |
| 1362 "psubw %%xmm2,%%xmm1 \n" \ | 1362 "psubw %%xmm2,%%xmm1 \n" \ |
| 1363 "movdqa " MEMACCESS2(160, [YuvConstants]) ",%%xmm2 \n" \ | 1363 "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \ |
| 1364 "pmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%xmm3 \n" \ | 1364 "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \ |
| 1365 "psubw %%xmm3,%%xmm2 \n" \ | 1365 "psubw %%xmm3,%%xmm2 \n" \ |
| 1366 "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ | 1366 "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ |
| 1367 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ | 1367 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ |
| 1368 "punpcklbw %%xmm3,%%xmm3 \n" \ | 1368 "punpcklbw %%xmm3,%%xmm3 \n" \ |
| 1369 "pmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%xmm3 \n" \ | 1369 "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm3 \n" \ |
| 1370 "paddsw %%xmm3,%%xmm0 \n" \ | 1370 "paddsw %%xmm3,%%xmm0 \n" \ |
| 1371 "paddsw %%xmm3,%%xmm1 \n" \ | 1371 "paddsw %%xmm3,%%xmm1 \n" \ |
| 1372 "paddsw %%xmm3,%%xmm2 \n" \ | 1372 "paddsw %%xmm3,%%xmm2 \n" \ |
| 1373 "psraw $0x6,%%xmm0 \n" \ | 1373 "psraw $0x6,%%xmm0 \n" \ |
| 1374 "psraw $0x6,%%xmm1 \n" \ | 1374 "psraw $0x6,%%xmm1 \n" \ |
| 1375 "psraw $0x6,%%xmm2 \n" \ | 1375 "psraw $0x6,%%xmm2 \n" \ |
| 1376 "packuswb %%xmm0,%%xmm0 \n" \ | 1376 "packuswb %%xmm0,%%xmm0 \n" \ |
| 1377 "packuswb %%xmm1,%%xmm1 \n" \ | 1377 "packuswb %%xmm1,%%xmm1 \n" \ |
| 1378 "packuswb %%xmm2,%%xmm2 \n" | 1378 "packuswb %%xmm2,%%xmm2 \n" |
| 1379 | 1379 |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1416 "pcmpeqb %%xmm5,%%xmm5 \n" \ | 1416 "pcmpeqb %%xmm5,%%xmm5 \n" \ |
| 1417 "punpcklbw %%xmm2,%%xmm1 \n" \ | 1417 "punpcklbw %%xmm2,%%xmm1 \n" \ |
| 1418 "punpcklbw %%xmm0,%%xmm5 \n" \ | 1418 "punpcklbw %%xmm0,%%xmm5 \n" \ |
| 1419 "movdqa %%xmm5,%%xmm0 \n" \ | 1419 "movdqa %%xmm5,%%xmm0 \n" \ |
| 1420 "punpcklwd %%xmm1,%%xmm5 \n" \ | 1420 "punpcklwd %%xmm1,%%xmm5 \n" \ |
| 1421 "punpckhwd %%xmm1,%%xmm0 \n" \ | 1421 "punpckhwd %%xmm1,%%xmm0 \n" \ |
| 1422 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ | 1422 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ |
| 1423 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ | 1423 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ |
| 1424 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" | 1424 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" |
| 1425 | 1425 |
| 1426 void OMITFP I444ToARGBMatrixRow_SSSE3(const uint8* y_buf, | 1426 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, |
| 1427 const uint8* u_buf, | 1427 const uint8* u_buf, |
| 1428 const uint8* v_buf, | 1428 const uint8* v_buf, |
| 1429 uint8* dst_argb, | 1429 uint8* dst_argb, |
| 1430 struct YuvConstants* YuvConstants, | 1430 struct YuvConstants* yuvconstants, |
| 1431 int width) { | 1431 int width) { |
| 1432 asm volatile ( | 1432 asm volatile ( |
| 1433 "sub %[u_buf],%[v_buf] \n" | 1433 "sub %[u_buf],%[v_buf] \n" |
| 1434 "pcmpeqb %%xmm5,%%xmm5 \n" | 1434 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 1435 LABELALIGN | 1435 LABELALIGN |
| 1436 "1: \n" | 1436 "1: \n" |
| 1437 READYUV444 | 1437 READYUV444 |
| 1438 YUVTORGB(YuvConstants) | 1438 YUVTORGB(yuvconstants) |
| 1439 STOREARGB | 1439 STOREARGB |
| 1440 "sub $0x8,%[width] \n" | 1440 "sub $0x8,%[width] \n" |
| 1441 "jg 1b \n" | 1441 "jg 1b \n" |
| 1442 : [y_buf]"+r"(y_buf), // %[y_buf] | 1442 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1443 [u_buf]"+r"(u_buf), // %[u_buf] | 1443 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1444 [v_buf]"+r"(v_buf), // %[v_buf] | 1444 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1445 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1445 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1446 [width]"+rm"(width) // %[width] | 1446 [width]"+rm"(width) // %[width] |
| 1447 : [YuvConstants]"r"(YuvConstants) // %[kYuvConstants] | 1447 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1448 : "memory", "cc", NACL_R14 | 1448 : "memory", "cc", NACL_R14 |
| 1449 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1449 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 1450 ); | 1450 ); |
| 1451 } | 1451 } |
| 1452 | 1452 |
| 1453 void OMITFP I444ToABGRMatrixRow_SSSE3(const uint8* y_buf, | 1453 void OMITFP I444ToABGRRow_SSSE3(const uint8* y_buf, |
| 1454 const uint8* u_buf, | 1454 const uint8* u_buf, |
| 1455 const uint8* v_buf, | 1455 const uint8* v_buf, |
| 1456 uint8* dst_abgr, | 1456 uint8* dst_abgr, |
| 1457 struct YuvConstants* YuvConstants, | 1457 struct YuvConstants* yuvconstants, |
| 1458 int width) { | 1458 int width) { |
| 1459 asm volatile ( | 1459 asm volatile ( |
| 1460 "sub %[u_buf],%[v_buf] \n" | 1460 "sub %[u_buf],%[v_buf] \n" |
| 1461 "pcmpeqb %%xmm5,%%xmm5 \n" | 1461 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 1462 LABELALIGN | 1462 LABELALIGN |
| 1463 "1: \n" | 1463 "1: \n" |
| 1464 READYUV444 | 1464 READYUV444 |
| 1465 YUVTORGB(YuvConstants) | 1465 YUVTORGB(yuvconstants) |
| 1466 STOREABGR | 1466 STOREABGR |
| 1467 "sub $0x8,%[width] \n" | 1467 "sub $0x8,%[width] \n" |
| 1468 "jg 1b \n" | 1468 "jg 1b \n" |
| 1469 : [y_buf]"+r"(y_buf), // %[y_buf] | 1469 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1470 [u_buf]"+r"(u_buf), // %[u_buf] | 1470 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1471 [v_buf]"+r"(v_buf), // %[v_buf] | 1471 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1472 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] | 1472 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
| 1473 [width]"+rm"(width) // %[width] | 1473 [width]"+rm"(width) // %[width] |
| 1474 : [YuvConstants]"r"(YuvConstants) // %[kYuvConstants] | 1474 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1475 : "memory", "cc", NACL_R14 | 1475 : "memory", "cc", NACL_R14 |
| 1476 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1476 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 1477 ); | 1477 ); |
| 1478 } | 1478 } |
| 1479 | 1479 |
| 1480 // TODO(fbarchard): Consider putting masks into constants. | 1480 // TODO(fbarchard): Consider putting masks into constants. |
| 1481 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, | 1481 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, |
| 1482 const uint8* u_buf, | 1482 const uint8* u_buf, |
| 1483 const uint8* v_buf, | 1483 const uint8* v_buf, |
| 1484 uint8* dst_rgb24, | 1484 uint8* dst_rgb24, |
| 1485 struct YuvConstants* yuvconstants, |
| 1485 int width) { | 1486 int width) { |
| 1486 asm volatile ( | 1487 asm volatile ( |
| 1487 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" | 1488 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" |
| 1488 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" | 1489 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" |
| 1489 "sub %[u_buf],%[v_buf] \n" | 1490 "sub %[u_buf],%[v_buf] \n" |
| 1490 LABELALIGN | 1491 LABELALIGN |
| 1491 "1: \n" | 1492 "1: \n" |
| 1492 READYUV422 | 1493 READYUV422 |
| 1493 YUVTORGB(kYuvConstants) | 1494 YUVTORGB(yuvconstants) |
| 1494 "punpcklbw %%xmm1,%%xmm0 \n" | 1495 "punpcklbw %%xmm1,%%xmm0 \n" |
| 1495 "punpcklbw %%xmm2,%%xmm2 \n" | 1496 "punpcklbw %%xmm2,%%xmm2 \n" |
| 1496 "movdqa %%xmm0,%%xmm1 \n" | 1497 "movdqa %%xmm0,%%xmm1 \n" |
| 1497 "punpcklwd %%xmm2,%%xmm0 \n" | 1498 "punpcklwd %%xmm2,%%xmm0 \n" |
| 1498 "punpckhwd %%xmm2,%%xmm1 \n" | 1499 "punpckhwd %%xmm2,%%xmm1 \n" |
| 1499 "pshufb %%xmm5,%%xmm0 \n" | 1500 "pshufb %%xmm5,%%xmm0 \n" |
| 1500 "pshufb %%xmm6,%%xmm1 \n" | 1501 "pshufb %%xmm6,%%xmm1 \n" |
| 1501 "palignr $0xc,%%xmm0,%%xmm1 \n" | 1502 "palignr $0xc,%%xmm0,%%xmm1 \n" |
| 1502 "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" | 1503 "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" |
| 1503 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" | 1504 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" |
| 1504 "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" | 1505 "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" |
| 1505 "subl $0x8,%[width] \n" | 1506 "subl $0x8,%[width] \n" |
| 1506 "jg 1b \n" | 1507 "jg 1b \n" |
| 1507 : [y_buf]"+r"(y_buf), // %[y_buf] | 1508 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1508 [u_buf]"+r"(u_buf), // %[u_buf] | 1509 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1509 [v_buf]"+r"(v_buf), // %[v_buf] | 1510 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1510 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] | 1511 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] |
| 1511 // TODO(fbarchard): Make width a register for 32 bit. | 1512 // TODO(fbarchard): Make width a register for 32 bit. |
| 1512 #if defined(__i386__) && defined(__pic__) | 1513 #if defined(__i386__) && defined(__pic__) |
| 1513 [width]"+m"(width) // %[width] | 1514 [width]"+m"(width) // %[width] |
| 1514 #else | 1515 #else |
| 1515 [width]"+rm"(width) // %[width] | 1516 [width]"+rm"(width) // %[width] |
| 1516 #endif | 1517 #endif |
| 1517 : [kYuvConstants]"r"(&kYuvConstants.kUVToB), | 1518 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
| 1518 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), | 1519 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), |
| 1519 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) | 1520 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) |
| 1520 : "memory", "cc", NACL_R14 | 1521 : "memory", "cc", NACL_R14 |
| 1521 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" | 1522 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" |
| 1522 ); | 1523 ); |
| 1523 } | 1524 } |
| 1524 | 1525 |
| 1525 void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, | 1526 void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, |
| 1526 const uint8* u_buf, | 1527 const uint8* u_buf, |
| 1527 const uint8* v_buf, | 1528 const uint8* v_buf, |
| 1528 uint8* dst_raw, | 1529 uint8* dst_raw, |
| 1530 struct YuvConstants* yuvconstants, |
| 1529 int width) { | 1531 int width) { |
| 1530 asm volatile ( | 1532 asm volatile ( |
| 1531 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" | 1533 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" |
| 1532 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" | 1534 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" |
| 1533 "sub %[u_buf],%[v_buf] \n" | 1535 "sub %[u_buf],%[v_buf] \n" |
| 1534 LABELALIGN | 1536 LABELALIGN |
| 1535 "1: \n" | 1537 "1: \n" |
| 1536 READYUV422 | 1538 READYUV422 |
| 1537 YUVTORGB(kYuvConstants) | 1539 YUVTORGB(yuvconstants) |
| 1538 "punpcklbw %%xmm1,%%xmm0 \n" | 1540 "punpcklbw %%xmm1,%%xmm0 \n" |
| 1539 "punpcklbw %%xmm2,%%xmm2 \n" | 1541 "punpcklbw %%xmm2,%%xmm2 \n" |
| 1540 "movdqa %%xmm0,%%xmm1 \n" | 1542 "movdqa %%xmm0,%%xmm1 \n" |
| 1541 "punpcklwd %%xmm2,%%xmm0 \n" | 1543 "punpcklwd %%xmm2,%%xmm0 \n" |
| 1542 "punpckhwd %%xmm2,%%xmm1 \n" | 1544 "punpckhwd %%xmm2,%%xmm1 \n" |
| 1543 "pshufb %%xmm5,%%xmm0 \n" | 1545 "pshufb %%xmm5,%%xmm0 \n" |
| 1544 "pshufb %%xmm6,%%xmm1 \n" | 1546 "pshufb %%xmm6,%%xmm1 \n" |
| 1545 "palignr $0xc,%%xmm0,%%xmm1 \n" | 1547 "palignr $0xc,%%xmm0,%%xmm1 \n" |
| 1546 "movq %%xmm0," MEMACCESS([dst_raw]) " \n" | 1548 "movq %%xmm0," MEMACCESS([dst_raw]) " \n" |
| 1547 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n" | 1549 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n" |
| 1548 "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n" | 1550 "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n" |
| 1549 "subl $0x8,%[width] \n" | 1551 "subl $0x8,%[width] \n" |
| 1550 "jg 1b \n" | 1552 "jg 1b \n" |
| 1551 : [y_buf]"+r"(y_buf), // %[y_buf] | 1553 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1552 [u_buf]"+r"(u_buf), // %[u_buf] | 1554 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1553 [v_buf]"+r"(v_buf), // %[v_buf] | 1555 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1554 [dst_raw]"+r"(dst_raw), // %[dst_raw] | 1556 [dst_raw]"+r"(dst_raw), // %[dst_raw] |
| 1555 // TODO(fbarchard): Make width a register for 32 bit. | 1557 // TODO(fbarchard): Make width a register for 32 bit. |
| 1556 #if defined(__i386__) && defined(__pic__) | 1558 #if defined(__i386__) && defined(__pic__) |
| 1557 [width]"+m"(width) // %[width] | 1559 [width]"+m"(width) // %[width] |
| 1558 #else | 1560 #else |
| 1559 [width]"+rm"(width) // %[width] | 1561 [width]"+rm"(width) // %[width] |
| 1560 #endif | 1562 #endif |
| 1561 : [kYuvConstants]"r"(&kYuvConstants.kUVToB), | 1563 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
| 1562 [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), | 1564 [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), |
| 1563 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) | 1565 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) |
| 1564 : "memory", "cc", NACL_R14 | 1566 : "memory", "cc", NACL_R14 |
| 1565 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" | 1567 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" |
| 1566 ); | 1568 ); |
| 1567 } | 1569 } |
| 1568 | 1570 |
| 1569 void OMITFP I422ToARGBMatrixRow_SSSE3(const uint8* y_buf, | 1571 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, |
| 1570 const uint8* u_buf, | 1572 const uint8* u_buf, |
| 1571 const uint8* v_buf, | 1573 const uint8* v_buf, |
| 1572 uint8* dst_argb, | 1574 uint8* dst_argb, |
| 1573 struct YuvConstants* YuvConstants, | 1575 struct YuvConstants* yuvconstants, |
| 1574 int width) { | 1576 int width) { |
| 1575 asm volatile ( | 1577 asm volatile ( |
| 1576 "sub %[u_buf],%[v_buf] \n" | 1578 "sub %[u_buf],%[v_buf] \n" |
| 1577 "pcmpeqb %%xmm5,%%xmm5 \n" | 1579 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 1578 LABELALIGN | 1580 LABELALIGN |
| 1579 "1: \n" | 1581 "1: \n" |
| 1580 READYUV422 | 1582 READYUV422 |
| 1581 YUVTORGB(YuvConstants) | 1583 YUVTORGB(yuvconstants) |
| 1582 STOREARGB | 1584 STOREARGB |
| 1583 "sub $0x8,%[width] \n" | 1585 "sub $0x8,%[width] \n" |
| 1584 "jg 1b \n" | 1586 "jg 1b \n" |
| 1585 : [y_buf]"+r"(y_buf), // %[y_buf] | 1587 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1586 [u_buf]"+r"(u_buf), // %[u_buf] | 1588 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1587 [v_buf]"+r"(v_buf), // %[v_buf] | 1589 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1588 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1590 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1589 [width]"+rm"(width) // %[width] | 1591 [width]"+rm"(width) // %[width] |
| 1590 : [YuvConstants]"r"(YuvConstants) // %[kYuvConstants] | 1592 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1591 : "memory", "cc", NACL_R14 | 1593 : "memory", "cc", NACL_R14 |
| 1592 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1594 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 1593 ); | 1595 ); |
| 1594 } | 1596 } |
| 1595 | 1597 |
| 1596 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, | 1598 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, |
| 1597 const uint8* u_buf, | 1599 const uint8* u_buf, |
| 1598 const uint8* v_buf, | 1600 const uint8* v_buf, |
| 1599 uint8* dst_argb, | 1601 uint8* dst_argb, |
| 1602 struct YuvConstants* yuvconstants, |
| 1600 int width) { | 1603 int width) { |
| 1601 asm volatile ( | 1604 asm volatile ( |
| 1602 "sub %[u_buf],%[v_buf] \n" | 1605 "sub %[u_buf],%[v_buf] \n" |
| 1603 "pcmpeqb %%xmm5,%%xmm5 \n" | 1606 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 1604 LABELALIGN | 1607 LABELALIGN |
| 1605 "1: \n" | 1608 "1: \n" |
| 1606 READYUV411 | 1609 READYUV411 |
| 1607 YUVTORGB(kYuvConstants) | 1610 YUVTORGB(yuvconstants) |
| 1608 STOREARGB | 1611 STOREARGB |
| 1609 "sub $0x8,%[width] \n" | 1612 "sub $0x8,%[width] \n" |
| 1610 "jg 1b \n" | 1613 "jg 1b \n" |
| 1611 : [y_buf]"+r"(y_buf), // %[y_buf] | 1614 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1612 [u_buf]"+r"(u_buf), // %[u_buf] | 1615 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1613 [v_buf]"+r"(v_buf), // %[v_buf] | 1616 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1614 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1617 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1615 [width]"+rm"(width) // %[width] | 1618 [width]"+rm"(width) // %[width] |
| 1616 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | 1619 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1617 : "memory", "cc", NACL_R14 | 1620 : "memory", "cc", NACL_R14 |
| 1618 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1621 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 1619 ); | 1622 ); |
| 1620 } | 1623 } |
| 1621 | 1624 |
| 1622 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, | 1625 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, |
| 1623 const uint8* uv_buf, | 1626 const uint8* uv_buf, |
| 1624 uint8* dst_argb, | 1627 uint8* dst_argb, |
| 1628 struct YuvConstants* yuvconstants, |
| 1625 int width) { | 1629 int width) { |
| 1626 asm volatile ( | 1630 asm volatile ( |
| 1627 "pcmpeqb %%xmm5,%%xmm5 \n" | 1631 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 1628 LABELALIGN | 1632 LABELALIGN |
| 1629 "1: \n" | 1633 "1: \n" |
| 1630 READNV12 | 1634 READNV12 |
| 1631 YUVTORGB(kYuvConstants) | 1635 YUVTORGB(yuvconstants) |
| 1632 STOREARGB | 1636 STOREARGB |
| 1633 "sub $0x8,%[width] \n" | 1637 "sub $0x8,%[width] \n" |
| 1634 "jg 1b \n" | 1638 "jg 1b \n" |
| 1635 : [y_buf]"+r"(y_buf), // %[y_buf] | |
| 1636 [uv_buf]"+r"(uv_buf), // %[uv_buf] | |
| 1637 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
| 1638 [width]"+rm"(width) // %[width] | |
| 1639 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
| 1640 // Does not use r14. | |
| 1641 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 1642 ); | |
| 1643 } | |
| 1644 | |
| 1645 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, | |
| 1646 const uint8* uv_buf, | |
| 1647 uint8* dst_argb, | |
| 1648 int width) { | |
| 1649 asm volatile ( | |
| 1650 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 1651 LABELALIGN | |
| 1652 "1: \n" | |
| 1653 READNV12 | |
| 1654 YUVTORGB(kYuvConstants) | |
| 1655 STOREARGB | |
| 1656 "sub $0x8,%[width] \n" | |
| 1657 "jg 1b \n" | |
| 1658 : [y_buf]"+r"(y_buf), // %[y_buf] | 1639 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1659 [uv_buf]"+r"(uv_buf), // %[uv_buf] | 1640 [uv_buf]"+r"(uv_buf), // %[uv_buf] |
| 1660 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1641 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1661 [width]"+rm"(width) // %[width] | 1642 [width]"+rm"(width) // %[width] |
| 1662 : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants] | 1643 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1663 // Does not use r14. | 1644 // Does not use r14. |
| 1664 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1645 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 1665 ); | 1646 ); |
| 1666 } | 1647 } |
| 1667 | 1648 |
| 1668 void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, | 1649 void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, |
| 1669 const uint8* u_buf, | 1650 const uint8* u_buf, |
| 1670 const uint8* v_buf, | 1651 const uint8* v_buf, |
| 1671 uint8* dst_bgra, | 1652 uint8* dst_bgra, |
| 1653 struct YuvConstants* yuvconstants, |
| 1672 int width) { | 1654 int width) { |
| 1673 asm volatile ( | 1655 asm volatile ( |
| 1674 "sub %[u_buf],%[v_buf] \n" | 1656 "sub %[u_buf],%[v_buf] \n" |
| 1675 "pcmpeqb %%xmm5,%%xmm5 \n" | 1657 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 1676 LABELALIGN | 1658 LABELALIGN |
| 1677 "1: \n" | 1659 "1: \n" |
| 1678 READYUV422 | 1660 READYUV422 |
| 1679 YUVTORGB(kYuvConstants) | 1661 YUVTORGB(yuvconstants) |
| 1680 STOREBGRA | 1662 STOREBGRA |
| 1681 "sub $0x8,%[width] \n" | 1663 "sub $0x8,%[width] \n" |
| 1682 "jg 1b \n" | 1664 "jg 1b \n" |
| 1683 : [y_buf]"+r"(y_buf), // %[y_buf] | 1665 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1684 [u_buf]"+r"(u_buf), // %[u_buf] | 1666 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1685 [v_buf]"+r"(v_buf), // %[v_buf] | 1667 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1686 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] | 1668 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] |
| 1687 [width]"+rm"(width) // %[width] | 1669 [width]"+rm"(width) // %[width] |
| 1688 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | 1670 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1689 : "memory", "cc", NACL_R14 | 1671 : "memory", "cc", NACL_R14 |
| 1690 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1672 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 1691 ); | 1673 ); |
| 1692 } | 1674 } |
| 1693 | 1675 |
| 1694 void OMITFP I422ToABGRMatrixRow_SSSE3(const uint8* y_buf, | 1676 void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, |
| 1695 const uint8* u_buf, | 1677 const uint8* u_buf, |
| 1696 const uint8* v_buf, | 1678 const uint8* v_buf, |
| 1697 uint8* dst_abgr, | 1679 uint8* dst_abgr, |
| 1698 struct YuvConstants* YuvConstants, | 1680 struct YuvConstants* yuvconstants, |
| 1699 int width) { | 1681 int width) { |
| 1700 asm volatile ( | 1682 asm volatile ( |
| 1701 "sub %[u_buf],%[v_buf] \n" | 1683 "sub %[u_buf],%[v_buf] \n" |
| 1702 "pcmpeqb %%xmm5,%%xmm5 \n" | 1684 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 1703 LABELALIGN | 1685 LABELALIGN |
| 1704 "1: \n" | 1686 "1: \n" |
| 1705 READYUV422 | 1687 READYUV422 |
| 1706 YUVTORGB(kYuvConstants) | 1688 YUVTORGB(yuvconstants) |
| 1707 STOREABGR | 1689 STOREABGR |
| 1708 "sub $0x8,%[width] \n" | 1690 "sub $0x8,%[width] \n" |
| 1709 "jg 1b \n" | 1691 "jg 1b \n" |
| 1710 : [y_buf]"+r"(y_buf), // %[y_buf] | 1692 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1711 [u_buf]"+r"(u_buf), // %[u_buf] | 1693 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1712 [v_buf]"+r"(v_buf), // %[v_buf] | 1694 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1713 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] | 1695 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
| 1714 [width]"+rm"(width) // %[width] | 1696 [width]"+rm"(width) // %[width] |
| 1715 : [kYuvConstants]"r"(YuvConstants) // %[kYuvConstants] | 1697 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1716 : "memory", "cc", NACL_R14 | 1698 : "memory", "cc", NACL_R14 |
| 1717 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1699 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 1718 ); | 1700 ); |
| 1719 } | 1701 } |
| 1720 | 1702 |
| 1721 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, | 1703 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, |
| 1722 const uint8* u_buf, | 1704 const uint8* u_buf, |
| 1723 const uint8* v_buf, | 1705 const uint8* v_buf, |
| 1724 uint8* dst_rgba, | 1706 uint8* dst_rgba, |
| 1707 struct YuvConstants* yuvconstants, |
| 1725 int width) { | 1708 int width) { |
| 1726 asm volatile ( | 1709 asm volatile ( |
| 1727 "sub %[u_buf],%[v_buf] \n" | 1710 "sub %[u_buf],%[v_buf] \n" |
| 1728 "pcmpeqb %%xmm5,%%xmm5 \n" | 1711 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 1729 LABELALIGN | 1712 LABELALIGN |
| 1730 "1: \n" | 1713 "1: \n" |
| 1731 READYUV422 | 1714 READYUV422 |
| 1732 YUVTORGB(kYuvConstants) | 1715 YUVTORGB(yuvconstants) |
| 1733 STORERGBA | 1716 STORERGBA |
| 1734 "sub $0x8,%[width] \n" | 1717 "sub $0x8,%[width] \n" |
| 1735 "jg 1b \n" | 1718 "jg 1b \n" |
| 1736 : [y_buf]"+r"(y_buf), // %[y_buf] | 1719 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1737 [u_buf]"+r"(u_buf), // %[u_buf] | 1720 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1738 [v_buf]"+r"(v_buf), // %[v_buf] | 1721 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1739 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] | 1722 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] |
| 1740 [width]"+rm"(width) // %[width] | 1723 [width]"+rm"(width) // %[width] |
| 1741 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | 1724 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1742 : "memory", "cc", NACL_R14 | 1725 : "memory", "cc", NACL_R14 |
| 1743 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1726 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 1744 ); | 1727 ); |
| 1745 } | 1728 } |
| 1746 | 1729 |
| 1747 #endif // HAS_I422TOARGBROW_SSSE3 | 1730 #endif // HAS_I422TOARGBROW_SSSE3 |
| 1748 | 1731 |
| 1749 // Read 8 UV from 422, upsample to 16 UV. | 1732 // Read 8 UV from 422, upsample to 16 UV. |
| 1750 #define READYUV422_AVX2 \ | 1733 #define READYUV422_AVX2 \ |
| 1751 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1734 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
| (...skipping 29 matching lines...) Expand all Loading... |
| 1781 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ | 1764 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ |
| 1782 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" | 1765 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" |
| 1783 | 1766 |
| 1784 #if defined(HAS_I422TOBGRAROW_AVX2) | 1767 #if defined(HAS_I422TOBGRAROW_AVX2) |
| 1785 // 16 pixels | 1768 // 16 pixels |
| 1786 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). | 1769 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). |
| 1787 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, | 1770 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, |
| 1788 const uint8* u_buf, | 1771 const uint8* u_buf, |
| 1789 const uint8* v_buf, | 1772 const uint8* v_buf, |
| 1790 uint8* dst_bgra, | 1773 uint8* dst_bgra, |
| 1774 struct YuvConstants* yuvconstants, |
| 1791 int width) { | 1775 int width) { |
| 1792 asm volatile ( | 1776 asm volatile ( |
| 1793 "sub %[u_buf],%[v_buf] \n" | 1777 "sub %[u_buf],%[v_buf] \n" |
| 1794 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 1778 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 1795 LABELALIGN | 1779 LABELALIGN |
| 1796 "1: \n" | 1780 "1: \n" |
| 1797 READYUV422_AVX2 | 1781 READYUV422_AVX2 |
| 1798 YUVTORGB_AVX2(kYuvConstants) | 1782 YUVTORGB_AVX2(yuvconstants) |
| 1799 | 1783 |
| 1800 // Step 3: Weave into BGRA | 1784 // Step 3: Weave into BGRA |
| 1801 "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB | 1785 "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB |
| 1802 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | 1786 "vpermq $0xd8,%%ymm1,%%ymm1 \n" |
| 1803 "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR | 1787 "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR |
| 1804 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | 1788 "vpermq $0xd8,%%ymm2,%%ymm2 \n" |
| 1805 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels | 1789 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels |
| 1806 "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels | 1790 "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels |
| 1807 | 1791 |
| 1808 "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n" | 1792 "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n" |
| 1809 "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n" | 1793 "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n" |
| 1810 "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n" | 1794 "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n" |
| 1811 "sub $0x10,%[width] \n" | 1795 "sub $0x10,%[width] \n" |
| 1812 "jg 1b \n" | 1796 "jg 1b \n" |
| 1813 "vzeroupper \n" | 1797 "vzeroupper \n" |
| 1814 : [y_buf]"+r"(y_buf), // %[y_buf] | 1798 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1815 [u_buf]"+r"(u_buf), // %[u_buf] | 1799 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1816 [v_buf]"+r"(v_buf), // %[v_buf] | 1800 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1817 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] | 1801 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] |
| 1818 [width]"+rm"(width) // %[width] | 1802 [width]"+rm"(width) // %[width] |
| 1819 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | 1803 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1820 : "memory", "cc", NACL_R14 | 1804 : "memory", "cc", NACL_R14 |
| 1821 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1805 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 1822 ); | 1806 ); |
| 1823 } | 1807 } |
| 1824 #endif // HAS_I422TOBGRAROW_AVX2 | 1808 #endif // HAS_I422TOBGRAROW_AVX2 |
| 1825 | 1809 |
| 1826 #if defined(HAS_I422TOARGBMATRIXROW_AVX2) | 1810 #if defined(HAS_I422TOARGBROW_AVX2) |
| 1827 // 16 pixels | 1811 // 16 pixels |
| 1828 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 1812 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| 1829 void OMITFP I422ToARGBMatrixRow_AVX2(const uint8* y_buf, | 1813 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, |
| 1830 const uint8* u_buf, | 1814 const uint8* u_buf, |
| 1831 const uint8* v_buf, | 1815 const uint8* v_buf, |
| 1832 uint8* dst_argb, | 1816 uint8* dst_argb, |
| 1833 struct YuvConstants* YuvConstants, | 1817 struct YuvConstants* yuvconstants, |
| 1834 int width) { | 1818 int width) { |
| 1835 asm volatile ( | 1819 asm volatile ( |
| 1836 "sub %[u_buf],%[v_buf] \n" | 1820 "sub %[u_buf],%[v_buf] \n" |
| 1837 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 1821 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 1838 LABELALIGN | 1822 LABELALIGN |
| 1839 "1: \n" | 1823 "1: \n" |
| 1840 READYUV422_AVX2 | 1824 READYUV422_AVX2 |
| 1841 YUVTORGB_AVX2(kYuvConstants) | 1825 YUVTORGB_AVX2(yuvconstants) |
| 1842 | 1826 |
| 1843 // Step 3: Weave into ARGB | 1827 // Step 3: Weave into ARGB |
| 1844 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG | 1828 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG |
| 1845 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | 1829 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
| 1846 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA | 1830 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA |
| 1847 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | 1831 "vpermq $0xd8,%%ymm2,%%ymm2 \n" |
| 1848 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels | 1832 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels |
| 1849 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels | 1833 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels |
| 1850 | 1834 |
| 1851 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" | 1835 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" |
| 1852 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" | 1836 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" |
| 1853 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | 1837 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" |
| 1854 "sub $0x10,%[width] \n" | 1838 "sub $0x10,%[width] \n" |
| 1855 "jg 1b \n" | 1839 "jg 1b \n" |
| 1856 "vzeroupper \n" | 1840 "vzeroupper \n" |
| 1857 : [y_buf]"+r"(y_buf), // %[y_buf] | 1841 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1858 [u_buf]"+r"(u_buf), // %[u_buf] | 1842 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1859 [v_buf]"+r"(v_buf), // %[v_buf] | 1843 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1860 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1844 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1861 [width]"+rm"(width) // %[width] | 1845 [width]"+rm"(width) // %[width] |
| 1862 : [kYuvConstants]"r"(YuvConstants) // %[kYuvConstants] | 1846 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1863 : "memory", "cc", NACL_R14 | 1847 : "memory", "cc", NACL_R14 |
| 1864 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1848 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 1865 ); | 1849 ); |
| 1866 } | 1850 } |
| 1867 #endif // HAS_I422TOARGBMATRIXROW_AVX2 | 1851 #endif // HAS_I422TOARGBROW_AVX2 |
| 1868 | 1852 |
| 1869 #if defined(HAS_I422TOABGRROW_AVX2) | 1853 #if defined(HAS_I422TOABGRROW_AVX2) |
| 1870 // 16 pixels | 1854 // 16 pixels |
| 1871 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). | 1855 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). |
| 1872 void OMITFP I422ToABGRMatrixRow_AVX2(const uint8* y_buf, | 1856 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, |
| 1873 const uint8* u_buf, | 1857 const uint8* u_buf, |
| 1874 const uint8* v_buf, | 1858 const uint8* v_buf, |
| 1875 uint8* dst_argb, | 1859 uint8* dst_argb, |
| 1876 struct YuvConstants* YuvConstants, | 1860 struct YuvConstants* yuvconstants, |
| 1877 int width) { | 1861 int width) { |
| 1878 asm volatile ( | 1862 asm volatile ( |
| 1879 "sub %[u_buf],%[v_buf] \n" | 1863 "sub %[u_buf],%[v_buf] \n" |
| 1880 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 1864 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 1881 LABELALIGN | 1865 LABELALIGN |
| 1882 "1: \n" | 1866 "1: \n" |
| 1883 READYUV422_AVX2 | 1867 READYUV422_AVX2 |
| 1884 YUVTORGB_AVX2(kYuvConstants) | 1868 YUVTORGB_AVX2(yuvconstants) |
| 1885 | 1869 |
| 1886 // Step 3: Weave into ABGR | 1870 // Step 3: Weave into ABGR |
| 1887 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG | 1871 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG |
| 1888 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | 1872 "vpermq $0xd8,%%ymm1,%%ymm1 \n" |
| 1889 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA | 1873 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA |
| 1890 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | 1874 "vpermq $0xd8,%%ymm2,%%ymm2 \n" |
| 1891 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels | 1875 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels |
| 1892 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels | 1876 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels |
| 1893 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" | 1877 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" |
| 1894 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" | 1878 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" |
| 1895 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | 1879 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" |
| 1896 "sub $0x10,%[width] \n" | 1880 "sub $0x10,%[width] \n" |
| 1897 "jg 1b \n" | 1881 "jg 1b \n" |
| 1898 "vzeroupper \n" | 1882 "vzeroupper \n" |
| 1899 : [y_buf]"+r"(y_buf), // %[y_buf] | 1883 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1900 [u_buf]"+r"(u_buf), // %[u_buf] | 1884 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1901 [v_buf]"+r"(v_buf), // %[v_buf] | 1885 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1902 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1886 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1903 [width]"+rm"(width) // %[width] | 1887 [width]"+rm"(width) // %[width] |
| 1904 : [kYuvConstants]"r"(YuvConstants) // %[kYuvConstants] | 1888 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1905 : "memory", "cc", NACL_R14 | 1889 : "memory", "cc", NACL_R14 |
| 1906 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1890 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 1907 ); | 1891 ); |
| 1908 } | 1892 } |
| 1909 #endif // HAS_I422TOABGRROW_AVX2 | 1893 #endif // HAS_I422TOABGRROW_AVX2 |
| 1910 | 1894 |
| 1911 #if defined(HAS_I422TORGBAROW_AVX2) | 1895 #if defined(HAS_I422TORGBAROW_AVX2) |
| 1912 // 16 pixels | 1896 // 16 pixels |
| 1913 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). | 1897 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). |
| 1914 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, | 1898 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, |
| 1915 const uint8* u_buf, | 1899 const uint8* u_buf, |
| 1916 const uint8* v_buf, | 1900 const uint8* v_buf, |
| 1917 uint8* dst_argb, | 1901 uint8* dst_argb, |
| 1902 struct YuvConstants* yuvconstants, |
| 1918 int width) { | 1903 int width) { |
| 1919 asm volatile ( | 1904 asm volatile ( |
| 1920 "sub %[u_buf],%[v_buf] \n" | 1905 "sub %[u_buf],%[v_buf] \n" |
| 1921 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 1906 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 1922 LABELALIGN | 1907 LABELALIGN |
| 1923 "1: \n" | 1908 "1: \n" |
| 1924 READYUV422_AVX2 | 1909 READYUV422_AVX2 |
| 1925 YUVTORGB_AVX2(kYuvConstants) | 1910 YUVTORGB_AVX2(yuvconstants) |
| 1926 | 1911 |
| 1927 // Step 3: Weave into RGBA | 1912 // Step 3: Weave into RGBA |
| 1928 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" | 1913 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" |
| 1929 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | 1914 "vpermq $0xd8,%%ymm1,%%ymm1 \n" |
| 1930 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" | 1915 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" |
| 1931 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | 1916 "vpermq $0xd8,%%ymm2,%%ymm2 \n" |
| 1932 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" | 1917 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" |
| 1933 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" | 1918 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" |
| 1934 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" | 1919 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" |
| 1935 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" | 1920 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" |
| 1936 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | 1921 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" |
| 1937 "sub $0x10,%[width] \n" | 1922 "sub $0x10,%[width] \n" |
| 1938 "jg 1b \n" | 1923 "jg 1b \n" |
| 1939 "vzeroupper \n" | 1924 "vzeroupper \n" |
| 1940 : [y_buf]"+r"(y_buf), // %[y_buf] | 1925 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1941 [u_buf]"+r"(u_buf), // %[u_buf] | 1926 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1942 [v_buf]"+r"(v_buf), // %[v_buf] | 1927 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1943 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1928 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1944 [width]"+rm"(width) // %[width] | 1929 [width]"+rm"(width) // %[width] |
| 1945 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | 1930 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1946 : "memory", "cc", NACL_R14 | 1931 : "memory", "cc", NACL_R14 |
| 1947 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1932 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 1948 ); | 1933 ); |
| 1949 } | 1934 } |
| 1950 #endif // HAS_I422TORGBAROW_AVX2 | 1935 #endif // HAS_I422TORGBAROW_AVX2 |
| 1951 | 1936 |
| 1952 #ifdef HAS_I400TOARGBROW_SSE2 | 1937 #ifdef HAS_I400TOARGBROW_SSE2 |
| 1953 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { | 1938 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { |
| 1954 asm volatile ( | 1939 asm volatile ( |
| 1955 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 | 1940 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 |
| (...skipping 3260 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5216 ); | 5201 ); |
| 5217 } | 5202 } |
| 5218 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5203 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 5219 | 5204 |
| 5220 #endif // defined(__x86_64__) || defined(__i386__) | 5205 #endif // defined(__x86_64__) || defined(__i386__) |
| 5221 | 5206 |
| 5222 #ifdef __cplusplus | 5207 #ifdef __cplusplus |
| 5223 } // extern "C" | 5208 } // extern "C" |
| 5224 } // namespace libyuv | 5209 } // namespace libyuv |
| 5225 #endif | 5210 #endif |
| OLD | NEW |