OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 617 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
628 | 628 |
629 do { | 629 do { |
630 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; | 630 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; |
631 uint16x8_t vres_a, vres_r, vres_g, vres_b; | 631 uint16x8_t vres_a, vres_r, vres_g, vres_b; |
632 uint8x8x4_t vsrc; | 632 uint8x8x4_t vsrc; |
633 | 633 |
634 // load pixels | 634 // load pixels |
635 vdst = vld1q_u16(dst); | 635 vdst = vld1q_u16(dst); |
636 #ifdef SK_CPU_ARM64 | 636 #ifdef SK_CPU_ARM64 |
637 vsrc = sk_vld4_u8_arm64_4(src); | 637 vsrc = sk_vld4_u8_arm64_4(src); |
638 #else | 638 #elif (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
639 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | |
640 asm ( | 639 asm ( |
641 "vld4.u8 %h[vsrc], [%[src]]!" | 640 "vld4.u8 %h[vsrc], [%[src]]!" |
642 : [vsrc] "=w" (vsrc), [src] "+&r" (src) | 641 : [vsrc] "=w" (vsrc), [src] "+&r" (src) |
643 : : | 642 : : |
644 ); | 643 ); |
645 #else | 644 #else |
646 register uint8x8_t d0 asm("d0"); | 645 register uint8x8_t d0 asm("d0"); |
647 register uint8x8_t d1 asm("d1"); | 646 register uint8x8_t d1 asm("d1"); |
648 register uint8x8_t d2 asm("d2"); | 647 register uint8x8_t d2 asm("d2"); |
649 register uint8x8_t d3 asm("d3"); | 648 register uint8x8_t d3 asm("d3"); |
650 | 649 |
651 asm volatile ( | 650 asm volatile ( |
652 "vld4.u8 {d0-d3},[%[src]]!;" | 651 "vld4.u8 {d0-d3},[%[src]]!;" |
653 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), | 652 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), |
654 [src] "+&r" (src) | 653 [src] "+&r" (src) |
655 : : | 654 : : |
656 ); | 655 ); |
657 vsrc.val[0] = d0; | 656 vsrc.val[0] = d0; |
658 vsrc.val[1] = d1; | 657 vsrc.val[1] = d1; |
659 vsrc.val[2] = d2; | 658 vsrc.val[2] = d2; |
660 vsrc.val[3] = d3; | 659 vsrc.val[3] = d3; |
661 #endif | 660 #endif |
662 #endif // #ifdef SK_CPU_ARM64 | |
663 | 661 |
664 | 662 |
665 // deinterleave dst | 663 // deinterleave dst |
666 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to to
p of lanes | 664 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to to
p of lanes |
667 vdst_b = vdst & vmask_blue; // extract blue | 665 vdst_b = vdst & vmask_blue; // extract blue |
668 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red | 666 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red |
669 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract
green | 667 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract
green |
670 | 668 |
671 // shift src to 565 | 669 // shift src to 565 |
672 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); | 670 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); |
(...skipping 631 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1304 | 1302 |
1305 src += 2; | 1303 src += 2; |
1306 dst += 2; | 1304 dst += 2; |
1307 count -= 2; | 1305 count -= 2; |
1308 } while(count); | 1306 } while(count); |
1309 } | 1307 } |
1310 } | 1308 } |
1311 | 1309 |
1312 /////////////////////////////////////////////////////////////////////////////// | 1310 /////////////////////////////////////////////////////////////////////////////// |
1313 | 1311 |
1314 #undef DEBUG_OPAQUE_DITHER | |
1315 | |
1316 #if defined(DEBUG_OPAQUE_DITHER) | |
1317 static void showme8(char *str, void *p, int len) | |
1318 { | |
1319 static char buf[256]; | |
1320 char tbuf[32]; | |
1321 int i; | |
1322 char *pc = (char*) p; | |
1323 sprintf(buf,"%8s:", str); | |
1324 for(i=0;i<len;i++) { | |
1325 sprintf(tbuf, " %02x", pc[i]); | |
1326 strcat(buf, tbuf); | |
1327 } | |
1328 SkDebugf("%s\n", buf); | |
1329 } | |
1330 static void showme16(char *str, void *p, int len) | |
1331 { | |
1332 static char buf[256]; | |
1333 char tbuf[32]; | |
1334 int i; | |
1335 uint16_t *pc = (uint16_t*) p; | |
1336 sprintf(buf,"%8s:", str); | |
1337 len = (len / sizeof(uint16_t)); /* passed as bytes */ | |
1338 for(i=0;i<len;i++) { | |
1339 sprintf(tbuf, " %04x", pc[i]); | |
1340 strcat(buf, tbuf); | |
1341 } | |
1342 SkDebugf("%s\n", buf); | |
1343 } | |
1344 #endif | |
1345 #endif // #ifdef SK_CPU_ARM32 | 1312 #endif // #ifdef SK_CPU_ARM32 |
1346 | 1313 |
1347 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, | 1314 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
1348 const SkPMColor* SK_RESTRICT src, | 1315 const SkPMColor* SK_RESTRICT src, |
1349 int count, U8CPU alpha, int x, int y) { | 1316 int count, U8CPU alpha, int x, int y) { |
1350 SkASSERT(255 == alpha); | 1317 SkASSERT(255 == alpha); |
1351 | 1318 |
1352 #define UNROLL 8 | 1319 #define UNROLL 8 |
1353 | 1320 |
1354 if (count >= UNROLL) { | 1321 if (count >= UNROLL) { |
1355 | 1322 |
1356 #if defined(DEBUG_OPAQUE_DITHER) | |
1357 uint16_t tmpbuf[UNROLL]; | |
1358 int td[UNROLL]; | |
1359 int tdv[UNROLL]; | |
1360 int ta[UNROLL]; | |
1361 int tap[UNROLL]; | |
1362 uint16_t in_dst[UNROLL]; | |
1363 int offset = 0; | |
1364 int noisy = 0; | |
1365 #endif | |
1366 | |
1367 uint8x8_t dbase; | 1323 uint8x8_t dbase; |
1368 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 1324 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
1369 dbase = vld1_u8(dstart); | 1325 dbase = vld1_u8(dstart); |
1370 | 1326 |
1371 do { | 1327 do { |
1372 uint8x8x4_t vsrc; | 1328 uint8x8x4_t vsrc; |
1373 uint8x8_t sr, sg, sb, sa, d; | 1329 uint8x8_t sr, sg, sb, sa, d; |
1374 uint16x8_t dst8, scale8, alpha8; | 1330 uint16x8_t dst8, scale8, alpha8; |
1375 uint16x8_t dst_r, dst_g, dst_b; | 1331 uint16x8_t dst_r, dst_g, dst_b; |
1376 | 1332 |
1377 #if defined(DEBUG_OPAQUE_DITHER) | |
1378 // calculate 8 elements worth into a temp buffer | |
1379 { | |
1380 int my_y = y; | |
1381 int my_x = x; | |
1382 SkPMColor* my_src = (SkPMColor*)src; | |
1383 uint16_t* my_dst = dst; | |
1384 int i; | |
1385 | |
1386 DITHER_565_SCAN(my_y); | |
1387 for(i = 0; i < UNROLL; i++) { | |
1388 SkPMColor c = *my_src++; | |
1389 SkPMColorAssert(c); | |
1390 if (c) { | |
1391 unsigned a = SkGetPackedA32(c); | |
1392 | |
1393 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); | |
1394 tdv[i] = DITHER_VALUE(my_x); | |
1395 ta[i] = a; | |
1396 tap[i] = SkAlpha255To256(a); | |
1397 td[i] = d; | |
1398 | |
1399 unsigned sr = SkGetPackedR32(c); | |
1400 unsigned sg = SkGetPackedG32(c); | |
1401 unsigned sb = SkGetPackedB32(c); | |
1402 sr = SkDITHER_R32_FOR_565(sr, d); | |
1403 sg = SkDITHER_G32_FOR_565(sg, d); | |
1404 sb = SkDITHER_B32_FOR_565(sb, d); | |
1405 | |
1406 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); | |
1407 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst); | |
1408 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); | |
1409 // now src and dst expanded are in g:11 r:10 x:1 b:10 | |
1410 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5)
; | |
1411 td[i] = d; | |
1412 } else { | |
1413 tmpbuf[i] = *my_dst; | |
1414 ta[i] = tdv[i] = td[i] = 0xbeef; | |
1415 } | |
1416 in_dst[i] = *my_dst; | |
1417 my_dst += 1; | |
1418 DITHER_INC_X(my_x); | |
1419 } | |
1420 } | |
1421 #endif | |
1422 | |
1423 #ifdef SK_CPU_ARM64 | 1333 #ifdef SK_CPU_ARM64 |
1424 vsrc = sk_vld4_u8_arm64_4(src); | 1334 vsrc = sk_vld4_u8_arm64_4(src); |
1425 #else | 1335 #else |
1426 { | 1336 { |
1427 register uint8x8_t d0 asm("d0"); | 1337 register uint8x8_t d0 asm("d0"); |
1428 register uint8x8_t d1 asm("d1"); | 1338 register uint8x8_t d1 asm("d1"); |
1429 register uint8x8_t d2 asm("d2"); | 1339 register uint8x8_t d2 asm("d2"); |
1430 register uint8x8_t d3 asm("d3"); | 1340 register uint8x8_t d3 asm("d3"); |
1431 | 1341 |
1432 asm ("vld4.8 {d0-d3},[%[src]]! " | 1342 asm ("vld4.8 {d0-d3},[%[src]]! " |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1482 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); | 1392 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); |
1483 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); | 1393 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); |
1484 | 1394 |
1485 // repack to store | 1395 // repack to store |
1486 dst8 = vshrq_n_u16(dst_b, 5); | 1396 dst8 = vshrq_n_u16(dst_b, 5); |
1487 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); | 1397 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); |
1488 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); | 1398 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); |
1489 | 1399 |
1490 vst1q_u16(dst, dst8); | 1400 vst1q_u16(dst, dst8); |
1491 | 1401 |
1492 #if defined(DEBUG_OPAQUE_DITHER) | |
1493 // verify my 8 elements match the temp buffer | |
1494 { | |
1495 int i, bad=0; | |
1496 static int invocation; | |
1497 | |
1498 for (i = 0; i < UNROLL; i++) { | |
1499 if (tmpbuf[i] != dst[i]) { | |
1500 bad=1; | |
1501 } | |
1502 } | |
1503 if (bad) { | |
1504 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %
d\n", | |
1505 invocation, offset); | |
1506 SkDebugf(" alpha 0x%x\n", alpha); | |
1507 for (i = 0; i < UNROLL; i++) | |
1508 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %0
4x\n", | |
1509 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[
i], | |
1510 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]); | |
1511 | |
1512 showme16("alpha8", &alpha8, sizeof(alpha8)); | |
1513 showme16("scale8", &scale8, sizeof(scale8)); | |
1514 showme8("d", &d, sizeof(d)); | |
1515 showme16("dst8", &dst8, sizeof(dst8)); | |
1516 showme16("dst_b", &dst_b, sizeof(dst_b)); | |
1517 showme16("dst_g", &dst_g, sizeof(dst_g)); | |
1518 showme16("dst_r", &dst_r, sizeof(dst_r)); | |
1519 showme8("sb", &sb, sizeof(sb)); | |
1520 showme8("sg", &sg, sizeof(sg)); | |
1521 showme8("sr", &sr, sizeof(sr)); | |
1522 | |
1523 return; | |
1524 } | |
1525 offset += UNROLL; | |
1526 invocation++; | |
1527 } | |
1528 #endif | |
1529 dst += UNROLL; | 1402 dst += UNROLL; |
1530 count -= UNROLL; | 1403 count -= UNROLL; |
1531 // skip x += UNROLL, since it's unchanged mod-4 | 1404 // skip x += UNROLL, since it's unchanged mod-4 |
1532 } while (count >= UNROLL); | 1405 } while (count >= UNROLL); |
1533 } | 1406 } |
1534 #undef UNROLL | 1407 #undef UNROLL |
1535 | 1408 |
1536 // residuals | 1409 // residuals |
1537 if (count > 0) { | 1410 if (count > 0) { |
1538 DITHER_565_SCAN(y); | 1411 DITHER_565_SCAN(y); |
(...skipping 23 matching lines...) Expand all Loading... |
1562 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); | 1435 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); |
1563 } | 1436 } |
1564 dst += 1; | 1437 dst += 1; |
1565 DITHER_INC_X(x); | 1438 DITHER_INC_X(x); |
1566 } while (--count != 0); | 1439 } while (--count != 0); |
1567 } | 1440 } |
1568 } | 1441 } |
1569 | 1442 |
1570 /////////////////////////////////////////////////////////////////////////////// | 1443 /////////////////////////////////////////////////////////////////////////////// |
1571 | 1444 |
1572 #undef DEBUG_S32_OPAQUE_DITHER | |
1573 | |
1574 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, | 1445 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, |
1575 const SkPMColor* SK_RESTRICT src, | 1446 const SkPMColor* SK_RESTRICT src, |
1576 int count, U8CPU alpha, int x, int y) { | 1447 int count, U8CPU alpha, int x, int y) { |
1577 SkASSERT(255 == alpha); | 1448 SkASSERT(255 == alpha); |
1578 | 1449 |
1579 #define UNROLL 8 | 1450 #define UNROLL 8 |
1580 if (count >= UNROLL) { | 1451 if (count >= UNROLL) { |
1581 uint8x8_t d; | 1452 uint8x8_t d; |
1582 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 1453 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
1583 d = vld1_u8(dstart); | 1454 d = vld1_u8(dstart); |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1630 dg = vaddl_u8(sg, vshr_n_u8(d, 1)); | 1501 dg = vaddl_u8(sg, vshr_n_u8(d, 1)); |
1631 | 1502 |
1632 // pack high bits of each into 565 format (rgb, b is lsb) | 1503 // pack high bits of each into 565 format (rgb, b is lsb) |
1633 dst8 = vshrq_n_u16(db, 3); | 1504 dst8 = vshrq_n_u16(db, 3); |
1634 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); | 1505 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); |
1635 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11); | 1506 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11); |
1636 | 1507 |
1637 // store it | 1508 // store it |
1638 vst1q_u16(dst, dst8); | 1509 vst1q_u16(dst, dst8); |
1639 | 1510 |
1640 #if defined(DEBUG_S32_OPAQUE_DITHER) | |
1641 // always good to know if we generated good results | |
1642 { | |
1643 int i, myx = x, myy = y; | |
1644 DITHER_565_SCAN(myy); | |
1645 for (i=0;i<UNROLL;i++) { | |
1646 // the '!' in the asm block above post-incremented src by the 8 pixe
ls it reads. | |
1647 SkPMColor c = src[i-8]; | |
1648 unsigned dither = DITHER_VALUE(myx); | |
1649 uint16_t val = SkDitherRGB32To565(c, dither); | |
1650 if (val != dst[i]) { | |
1651 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x
\n", | |
1652 c, dither, val, dst[i], dstart[i]); | |
1653 } | |
1654 DITHER_INC_X(myx); | |
1655 } | |
1656 } | |
1657 #endif | |
1658 | |
1659 dst += UNROLL; | 1511 dst += UNROLL; |
1660 // we don't need to increment src as the asm above has already done it | 1512 // we don't need to increment src as the asm above has already done it |
1661 count -= UNROLL; | 1513 count -= UNROLL; |
1662 x += UNROLL; // probably superfluous | 1514 x += UNROLL; // probably superfluous |
1663 } | 1515 } |
1664 } | 1516 } |
1665 #undef UNROLL | 1517 #undef UNROLL |
1666 | 1518 |
1667 // residuals | 1519 // residuals |
1668 if (count > 0) { | 1520 if (count > 0) { |
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1740 invA += invA >> 7; | 1592 invA += invA >> 7; |
1741 SkASSERT(invA < 256); // Our caller has already handled the alpha == 0 case
. | 1593 SkASSERT(invA < 256); // Our caller has already handled the alpha == 0 case
. |
1742 | 1594 |
1743 Sk16h colorHighAndRound = Sk4px::DupPMColor(color).widenHi() + Sk16h(128); | 1595 Sk16h colorHighAndRound = Sk4px::DupPMColor(color).widenHi() + Sk16h(128); |
1744 Sk16b invA_16x(invA); | 1596 Sk16b invA_16x(invA); |
1745 | 1597 |
1746 Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px { | 1598 Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px { |
1747 return (src4 * invA_16x).addNarrowHi(colorHighAndRound); | 1599 return (src4 * invA_16x).addNarrowHi(colorHighAndRound); |
1748 }); | 1600 }); |
1749 } | 1601 } |
OLD | NEW |