| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
| 9 | 9 |
| 10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
| (...skipping 617 matching lines...) |
| 628 | 628 |
| 629 do { | 629 do { |
| 630 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; | 630 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; |
| 631 uint16x8_t vres_a, vres_r, vres_g, vres_b; | 631 uint16x8_t vres_a, vres_r, vres_g, vres_b; |
| 632 uint8x8x4_t vsrc; | 632 uint8x8x4_t vsrc; |
| 633 | 633 |
| 634 // load pixels | 634 // load pixels |
| 635 vdst = vld1q_u16(dst); | 635 vdst = vld1q_u16(dst); |
| 636 #ifdef SK_CPU_ARM64 | 636 #ifdef SK_CPU_ARM64 |
| 637 vsrc = sk_vld4_u8_arm64_4(src); | 637 vsrc = sk_vld4_u8_arm64_4(src); |
| 638 #else | 638 #elif (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
| 639 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | |
| 640 asm ( | 639 asm ( |
| 641 "vld4.u8 %h[vsrc], [%[src]]!" | 640 "vld4.u8 %h[vsrc], [%[src]]!" |
| 642 : [vsrc] "=w" (vsrc), [src] "+&r" (src) | 641 : [vsrc] "=w" (vsrc), [src] "+&r" (src) |
| 643 : : | 642 : : |
| 644 ); | 643 ); |
| 645 #else | 644 #else |
| 646 register uint8x8_t d0 asm("d0"); | 645 register uint8x8_t d0 asm("d0"); |
| 647 register uint8x8_t d1 asm("d1"); | 646 register uint8x8_t d1 asm("d1"); |
| 648 register uint8x8_t d2 asm("d2"); | 647 register uint8x8_t d2 asm("d2"); |
| 649 register uint8x8_t d3 asm("d3"); | 648 register uint8x8_t d3 asm("d3"); |
| 650 | 649 |
| 651 asm volatile ( | 650 asm volatile ( |
| 652 "vld4.u8 {d0-d3},[%[src]]!;" | 651 "vld4.u8 {d0-d3},[%[src]]!;" |
| 653 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), | 652 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), |
| 654 [src] "+&r" (src) | 653 [src] "+&r" (src) |
| 655 : : | 654 : : |
| 656 ); | 655 ); |
| 657 vsrc.val[0] = d0; | 656 vsrc.val[0] = d0; |
| 658 vsrc.val[1] = d1; | 657 vsrc.val[1] = d1; |
| 659 vsrc.val[2] = d2; | 658 vsrc.val[2] = d2; |
| 660 vsrc.val[3] = d3; | 659 vsrc.val[3] = d3; |
| 661 #endif | 660 #endif |
| 662 #endif // #ifdef SK_CPU_ARM64 | |
| 663 | 661 |
| 664 | 662 |
| 665 // deinterleave dst | 663 // deinterleave dst |
| 666 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes | 664 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes |
| 667 vdst_b = vdst & vmask_blue; // extract blue | 665 vdst_b = vdst & vmask_blue; // extract blue |
| 668 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red | 666 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red |
| 669 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green | 667 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green |
| 670 | 668 |
| 671 // shift src to 565 | 669 // shift src to 565 |
| 672 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); | 670 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); |
| (...skipping 631 matching lines...) |
| 1304 | 1302 |
| 1305 src += 2; | 1303 src += 2; |
| 1306 dst += 2; | 1304 dst += 2; |
| 1307 count -= 2; | 1305 count -= 2; |
| 1308 } while(count); | 1306 } while(count); |
| 1309 } | 1307 } |
| 1310 } | 1308 } |
| 1311 | 1309 |
| 1312 /////////////////////////////////////////////////////////////////////////////// | 1310 /////////////////////////////////////////////////////////////////////////////// |
| 1313 | 1311 |
| 1314 #undef DEBUG_OPAQUE_DITHER | |
| 1315 | |
| 1316 #if defined(DEBUG_OPAQUE_DITHER) | |
| 1317 static void showme8(char *str, void *p, int len) | |
| 1318 { | |
| 1319 static char buf[256]; | |
| 1320 char tbuf[32]; | |
| 1321 int i; | |
| 1322 char *pc = (char*) p; | |
| 1323 sprintf(buf,"%8s:", str); | |
| 1324 for(i=0;i<len;i++) { | |
| 1325 sprintf(tbuf, " %02x", pc[i]); | |
| 1326 strcat(buf, tbuf); | |
| 1327 } | |
| 1328 SkDebugf("%s\n", buf); | |
| 1329 } | |
| 1330 static void showme16(char *str, void *p, int len) | |
| 1331 { | |
| 1332 static char buf[256]; | |
| 1333 char tbuf[32]; | |
| 1334 int i; | |
| 1335 uint16_t *pc = (uint16_t*) p; | |
| 1336 sprintf(buf,"%8s:", str); | |
| 1337 len = (len / sizeof(uint16_t)); /* passed as bytes */ | |
| 1338 for(i=0;i<len;i++) { | |
| 1339 sprintf(tbuf, " %04x", pc[i]); | |
| 1340 strcat(buf, tbuf); | |
| 1341 } | |
| 1342 SkDebugf("%s\n", buf); | |
| 1343 } | |
| 1344 #endif | |
| 1345 #endif // #ifdef SK_CPU_ARM32 | 1312 #endif // #ifdef SK_CPU_ARM32 |
| 1346 | 1313 |
| 1347 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, | 1314 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
| 1348 const SkPMColor* SK_RESTRICT src, | 1315 const SkPMColor* SK_RESTRICT src, |
| 1349 int count, U8CPU alpha, int x, int y) { | 1316 int count, U8CPU alpha, int x, int y) { |
| 1350 SkASSERT(255 == alpha); | 1317 SkASSERT(255 == alpha); |
| 1351 | 1318 |
| 1352 #define UNROLL 8 | 1319 #define UNROLL 8 |
| 1353 | 1320 |
| 1354 if (count >= UNROLL) { | 1321 if (count >= UNROLL) { |
| 1355 | 1322 |
| 1356 #if defined(DEBUG_OPAQUE_DITHER) | |
| 1357 uint16_t tmpbuf[UNROLL]; | |
| 1358 int td[UNROLL]; | |
| 1359 int tdv[UNROLL]; | |
| 1360 int ta[UNROLL]; | |
| 1361 int tap[UNROLL]; | |
| 1362 uint16_t in_dst[UNROLL]; | |
| 1363 int offset = 0; | |
| 1364 int noisy = 0; | |
| 1365 #endif | |
| 1366 | |
| 1367 uint8x8_t dbase; | 1323 uint8x8_t dbase; |
| 1368 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 1324 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
| 1369 dbase = vld1_u8(dstart); | 1325 dbase = vld1_u8(dstart); |
| 1370 | 1326 |
| 1371 do { | 1327 do { |
| 1372 uint8x8x4_t vsrc; | 1328 uint8x8x4_t vsrc; |
| 1373 uint8x8_t sr, sg, sb, sa, d; | 1329 uint8x8_t sr, sg, sb, sa, d; |
| 1374 uint16x8_t dst8, scale8, alpha8; | 1330 uint16x8_t dst8, scale8, alpha8; |
| 1375 uint16x8_t dst_r, dst_g, dst_b; | 1331 uint16x8_t dst_r, dst_g, dst_b; |
| 1376 | 1332 |
| 1377 #if defined(DEBUG_OPAQUE_DITHER) | |
| 1378 // calculate 8 elements worth into a temp buffer | |
| 1379 { | |
| 1380 int my_y = y; | |
| 1381 int my_x = x; | |
| 1382 SkPMColor* my_src = (SkPMColor*)src; | |
| 1383 uint16_t* my_dst = dst; | |
| 1384 int i; | |
| 1385 | |
| 1386 DITHER_565_SCAN(my_y); | |
| 1387 for(i = 0; i < UNROLL; i++) { | |
| 1388 SkPMColor c = *my_src++; | |
| 1389 SkPMColorAssert(c); | |
| 1390 if (c) { | |
| 1391 unsigned a = SkGetPackedA32(c); | |
| 1392 | |
| 1393 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); | |
| 1394 tdv[i] = DITHER_VALUE(my_x); | |
| 1395 ta[i] = a; | |
| 1396 tap[i] = SkAlpha255To256(a); | |
| 1397 td[i] = d; | |
| 1398 | |
| 1399 unsigned sr = SkGetPackedR32(c); | |
| 1400 unsigned sg = SkGetPackedG32(c); | |
| 1401 unsigned sb = SkGetPackedB32(c); | |
| 1402 sr = SkDITHER_R32_FOR_565(sr, d); | |
| 1403 sg = SkDITHER_G32_FOR_565(sg, d); | |
| 1404 sb = SkDITHER_B32_FOR_565(sb, d); | |
| 1405 | |
| 1406 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); | |
| 1407 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst); | |
| 1408 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); | |
| 1409 // now src and dst expanded are in g:11 r:10 x:1 b:10 | |
| 1410 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); | |
| 1411 td[i] = d; | |
| 1412 } else { | |
| 1413 tmpbuf[i] = *my_dst; | |
| 1414 ta[i] = tdv[i] = td[i] = 0xbeef; | |
| 1415 } | |
| 1416 in_dst[i] = *my_dst; | |
| 1417 my_dst += 1; | |
| 1418 DITHER_INC_X(my_x); | |
| 1419 } | |
| 1420 } | |
| 1421 #endif | |
| 1422 | |
| 1423 #ifdef SK_CPU_ARM64 | 1333 #ifdef SK_CPU_ARM64 |
| 1424 vsrc = sk_vld4_u8_arm64_4(src); | 1334 vsrc = sk_vld4_u8_arm64_4(src); |
| 1425 #else | 1335 #else |
| 1426 { | 1336 { |
| 1427 register uint8x8_t d0 asm("d0"); | 1337 register uint8x8_t d0 asm("d0"); |
| 1428 register uint8x8_t d1 asm("d1"); | 1338 register uint8x8_t d1 asm("d1"); |
| 1429 register uint8x8_t d2 asm("d2"); | 1339 register uint8x8_t d2 asm("d2"); |
| 1430 register uint8x8_t d3 asm("d3"); | 1340 register uint8x8_t d3 asm("d3"); |
| 1431 | 1341 |
| 1432 asm ("vld4.8 {d0-d3},[%[src]]! " | 1342 asm ("vld4.8 {d0-d3},[%[src]]! " |
| (...skipping 49 matching lines...) |
| 1482 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); | 1392 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); |
| 1483 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); | 1393 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); |
| 1484 | 1394 |
| 1485 // repack to store | 1395 // repack to store |
| 1486 dst8 = vshrq_n_u16(dst_b, 5); | 1396 dst8 = vshrq_n_u16(dst_b, 5); |
| 1487 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); | 1397 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); |
| 1488 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); | 1398 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); |
| 1489 | 1399 |
| 1490 vst1q_u16(dst, dst8); | 1400 vst1q_u16(dst, dst8); |
| 1491 | 1401 |
| 1492 #if defined(DEBUG_OPAQUE_DITHER) | |
| 1493 // verify my 8 elements match the temp buffer | |
| 1494 { | |
| 1495 int i, bad=0; | |
| 1496 static int invocation; | |
| 1497 | |
| 1498 for (i = 0; i < UNROLL; i++) { | |
| 1499 if (tmpbuf[i] != dst[i]) { | |
| 1500 bad=1; | |
| 1501 } | |
| 1502 } | |
| 1503 if (bad) { | |
| 1504 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %
d\n", | |
| 1505 invocation, offset); | |
| 1506 SkDebugf(" alpha 0x%x\n", alpha); | |
| 1507 for (i = 0; i < UNROLL; i++) | |
| 1508 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n", | |
| 1509 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i], | |
| 1510 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]); | |
| 1511 | |
| 1512 showme16("alpha8", &alpha8, sizeof(alpha8)); | |
| 1513 showme16("scale8", &scale8, sizeof(scale8)); | |
| 1514 showme8("d", &d, sizeof(d)); | |
| 1515 showme16("dst8", &dst8, sizeof(dst8)); | |
| 1516 showme16("dst_b", &dst_b, sizeof(dst_b)); | |
| 1517 showme16("dst_g", &dst_g, sizeof(dst_g)); | |
| 1518 showme16("dst_r", &dst_r, sizeof(dst_r)); | |
| 1519 showme8("sb", &sb, sizeof(sb)); | |
| 1520 showme8("sg", &sg, sizeof(sg)); | |
| 1521 showme8("sr", &sr, sizeof(sr)); | |
| 1522 | |
| 1523 return; | |
| 1524 } | |
| 1525 offset += UNROLL; | |
| 1526 invocation++; | |
| 1527 } | |
| 1528 #endif | |
| 1529 dst += UNROLL; | 1402 dst += UNROLL; |
| 1530 count -= UNROLL; | 1403 count -= UNROLL; |
| 1531 // skip x += UNROLL, since it's unchanged mod-4 | 1404 // skip x += UNROLL, since it's unchanged mod-4 |
| 1532 } while (count >= UNROLL); | 1405 } while (count >= UNROLL); |
| 1533 } | 1406 } |
| 1534 #undef UNROLL | 1407 #undef UNROLL |
| 1535 | 1408 |
| 1536 // residuals | 1409 // residuals |
| 1537 if (count > 0) { | 1410 if (count > 0) { |
| 1538 DITHER_565_SCAN(y); | 1411 DITHER_565_SCAN(y); |
| (...skipping 23 matching lines...) |
| 1562 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); | 1435 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); |
| 1563 } | 1436 } |
| 1564 dst += 1; | 1437 dst += 1; |
| 1565 DITHER_INC_X(x); | 1438 DITHER_INC_X(x); |
| 1566 } while (--count != 0); | 1439 } while (--count != 0); |
| 1567 } | 1440 } |
| 1568 } | 1441 } |
| 1569 | 1442 |
| 1570 /////////////////////////////////////////////////////////////////////////////// | 1443 /////////////////////////////////////////////////////////////////////////////// |
| 1571 | 1444 |
| 1572 #undef DEBUG_S32_OPAQUE_DITHER | |
| 1573 | |
| 1574 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, | 1445 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, |
| 1575 const SkPMColor* SK_RESTRICT src, | 1446 const SkPMColor* SK_RESTRICT src, |
| 1576 int count, U8CPU alpha, int x, int y) { | 1447 int count, U8CPU alpha, int x, int y) { |
| 1577 SkASSERT(255 == alpha); | 1448 SkASSERT(255 == alpha); |
| 1578 | 1449 |
| 1579 #define UNROLL 8 | 1450 #define UNROLL 8 |
| 1580 if (count >= UNROLL) { | 1451 if (count >= UNROLL) { |
| 1581 uint8x8_t d; | 1452 uint8x8_t d; |
| 1582 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 1453 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
| 1583 d = vld1_u8(dstart); | 1454 d = vld1_u8(dstart); |
| (...skipping 46 matching lines...) |
| 1630 dg = vaddl_u8(sg, vshr_n_u8(d, 1)); | 1501 dg = vaddl_u8(sg, vshr_n_u8(d, 1)); |
| 1631 | 1502 |
| 1632 // pack high bits of each into 565 format (rgb, b is lsb) | 1503 // pack high bits of each into 565 format (rgb, b is lsb) |
| 1633 dst8 = vshrq_n_u16(db, 3); | 1504 dst8 = vshrq_n_u16(db, 3); |
| 1634 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); | 1505 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); |
| 1635 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11); | 1506 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11); |
| 1636 | 1507 |
| 1637 // store it | 1508 // store it |
| 1638 vst1q_u16(dst, dst8); | 1509 vst1q_u16(dst, dst8); |
| 1639 | 1510 |
| 1640 #if defined(DEBUG_S32_OPAQUE_DITHER) | |
| 1641 // always good to know if we generated good results | |
| 1642 { | |
| 1643 int i, myx = x, myy = y; | |
| 1644 DITHER_565_SCAN(myy); | |
| 1645 for (i=0;i<UNROLL;i++) { | |
| 1646 // the '!' in the asm block above post-incremented src by the 8 pixels it reads. | |
| 1647 SkPMColor c = src[i-8]; | |
| 1648 unsigned dither = DITHER_VALUE(myx); | |
| 1649 uint16_t val = SkDitherRGB32To565(c, dither); | |
| 1650 if (val != dst[i]) { | |
| 1651 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x
\n", | |
| 1652 c, dither, val, dst[i], dstart[i]); | |
| 1653 } | |
| 1654 DITHER_INC_X(myx); | |
| 1655 } | |
| 1656 } | |
| 1657 #endif | |
| 1658 | |
| 1659 dst += UNROLL; | 1511 dst += UNROLL; |
| 1660 // we don't need to increment src as the asm above has already done it | 1512 // we don't need to increment src as the asm above has already done it |
| 1661 count -= UNROLL; | 1513 count -= UNROLL; |
| 1662 x += UNROLL; // probably superfluous | 1514 x += UNROLL; // probably superfluous |
| 1663 } | 1515 } |
| 1664 } | 1516 } |
| 1665 #undef UNROLL | 1517 #undef UNROLL |
| 1666 | 1518 |
| 1667 // residuals | 1519 // residuals |
| 1668 if (count > 0) { | 1520 if (count > 0) { |
| (...skipping 71 matching lines...) |
| 1740 invA += invA >> 7; | 1592 invA += invA >> 7; |
| 1741 SkASSERT(invA < 256); // Our caller has already handled the alpha == 0 case. | 1593 SkASSERT(invA < 256); // Our caller has already handled the alpha == 0 case. |
| 1742 | 1594 |
| 1743 Sk16h colorHighAndRound = Sk4px::DupPMColor(color).widenHi() + Sk16h(128); | 1595 Sk16h colorHighAndRound = Sk4px::DupPMColor(color).widenHi() + Sk16h(128); |
| 1744 Sk16b invA_16x(invA); | 1596 Sk16b invA_16x(invA); |
| 1745 | 1597 |
| 1746 Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px { | 1598 Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px { |
| 1747 return (src4 * invA_16x).addNarrowHi(colorHighAndRound); | 1599 return (src4 * invA_16x).addNarrowHi(colorHighAndRound); |
| 1748 }); | 1600 }); |
| 1749 } | 1601 } |
| OLD | NEW |
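
The vshrq_n_u16/vsliq_n_u16 sequences in the dither routines above pack per-lane red, green, and blue into RGB565 (5 bits red, 6 bits green, 5 bits blue, with blue in the low bits). Below is a minimal scalar sketch of that repack for 8-bit channel inputs; pack_rgb565 is a hypothetical helper for illustration only, not part of this patch.

```cpp
#include <cstdint>

// Scalar equivalent of the NEON RGB565 repack: keep the top 5 bits of red,
// the top 6 bits of green, and the top 5 bits of blue, blue in the LSBs.
static inline uint16_t pack_rgb565(uint8_t r, uint8_t g, uint8_t b) {
    return static_cast<uint16_t>(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}
```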