src/opts/SkBlitRow_opts_arm_neon.cpp - Issue 1277953002: Purge non-NEON ARM code.

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 1277953002: Purge non-NEON ARM code. (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 5 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBlitRow_opts_arm_neon.h"	8 #include "SkBlitRow_opts_arm_neon.h"

9	9

10 #include "SkBlitMask.h"	10 #include "SkBlitMask.h"

(...skipping 617 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
628	628

629 do {	629 do {

630 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;	630 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;

631 uint16x8_t vres_a, vres_r, vres_g, vres_b;	631 uint16x8_t vres_a, vres_r, vres_g, vres_b;

632 uint8x8x4_t vsrc;	632 uint8x8x4_t vsrc;

633	633

634 // load pixels	634 // load pixels

635 vdst = vld1q_u16(dst);	635 vdst = vld1q_u16(dst);

636 #ifdef SK_CPU_ARM64	636 #ifdef SK_CPU_ARM64

637 vsrc = sk_vld4_u8_arm64_4(src);	637 vsrc = sk_vld4_u8_arm64_4(src);

638 #else	638 #elif (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))

639 #if (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))

640 asm (	639 asm (

641 "vld4.u8 %h[vsrc], [%[src]]!"	640 "vld4.u8 %h[vsrc], [%[src]]!"

642 : [vsrc] "=w" (vsrc), [src] "+&r" (src)	641 : [vsrc] "=w" (vsrc), [src] "+&r" (src)

643 : :	642 : :

644 );	643 );

645 #else	644 #else

646 register uint8x8_t d0 asm("d0");	645 register uint8x8_t d0 asm("d0");

647 register uint8x8_t d1 asm("d1");	646 register uint8x8_t d1 asm("d1");

648 register uint8x8_t d2 asm("d2");	647 register uint8x8_t d2 asm("d2");

649 register uint8x8_t d3 asm("d3");	648 register uint8x8_t d3 asm("d3");

650	649

651 asm volatile (	650 asm volatile (

652 "vld4.u8 {d0-d3},[%[src]]!;"	651 "vld4.u8 {d0-d3},[%[src]]!;"

653 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),	652 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),

654 [src] "+&r" (src)	653 [src] "+&r" (src)

655 : :	654 : :

656 );	655 );

657 vsrc.val[0] = d0;	656 vsrc.val[0] = d0;

658 vsrc.val[1] = d1;	657 vsrc.val[1] = d1;

659 vsrc.val[2] = d2;	658 vsrc.val[2] = d2;

660 vsrc.val[3] = d3;	659 vsrc.val[3] = d3;

661 #endif	660 #endif

662 #endif // #ifdef SK_CPU_ARM64

663	661

664	662

665 // deinterleave dst	663 // deinterleave dst

666 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to to p of lanes	664 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to to p of lanes

667 vdst_b = vdst & vmask_blue; // extract blue	665 vdst_b = vdst & vmask_blue; // extract blue

668 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red	666 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red

669 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green	667 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

670	668

671 // shift src to 565	669 // shift src to 565

672 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);	670 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);

(...skipping 631 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1304	1302

1305 src += 2;	1303 src += 2;

1306 dst += 2;	1304 dst += 2;

1307 count -= 2;	1305 count -= 2;

1308 } while(count);	1306 } while(count);

1309 }	1307 }

1310 }	1308 }

1311	1309

1312 ///////////////////////////////////////////////////////////////////////////////	1310 ///////////////////////////////////////////////////////////////////////////////

1313	1311

1314 #undef DEBUG_OPAQUE_DITHER

1315

1316 #if defined(DEBUG_OPAQUE_DITHER)

1317 static void showme8(char str, void p, int len)

1318 {

1319 static char buf[256];

1320 char tbuf[32];

1321 int i;

1322 char pc = (char) p;

1323 sprintf(buf,"%8s:", str);

1324 for(i=0;i<len;i++) {

1325 sprintf(tbuf, " %02x", pc[i]);

1326 strcat(buf, tbuf);

1327 }

1328 SkDebugf("%s\n", buf);

1329 }

1330 static void showme16(char str, void p, int len)

1331 {

1332 static char buf[256];

1333 char tbuf[32];

1334 int i;

1335 uint16_t pc = (uint16_t) p;

1336 sprintf(buf,"%8s:", str);

1337 len = (len / sizeof(uint16_t)); /* passed as bytes */

1338 for(i=0;i<len;i++) {

1339 sprintf(tbuf, " %04x", pc[i]);

1340 strcat(buf, tbuf);

1341 }

1342 SkDebugf("%s\n", buf);

1343 }

1344 #endif

1345 #endif // #ifdef SK_CPU_ARM32	1312 #endif // #ifdef SK_CPU_ARM32

1346	1313

1347 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,	1314 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,

1348 const SkPMColor* SK_RESTRICT src,	1315 const SkPMColor* SK_RESTRICT src,

1349 int count, U8CPU alpha, int x, int y) {	1316 int count, U8CPU alpha, int x, int y) {

1350 SkASSERT(255 == alpha);	1317 SkASSERT(255 == alpha);

1351	1318

1352 #define UNROLL 8	1319 #define UNROLL 8

1353	1320

1354 if (count >= UNROLL) {	1321 if (count >= UNROLL) {

1355	1322

1356 #if defined(DEBUG_OPAQUE_DITHER)

1357 uint16_t tmpbuf[UNROLL];

1358 int td[UNROLL];

1359 int tdv[UNROLL];

1360 int ta[UNROLL];

1361 int tap[UNROLL];

1362 uint16_t in_dst[UNROLL];

1363 int offset = 0;

1364 int noisy = 0;

1365 #endif

1366

1367 uint8x8_t dbase;	1323 uint8x8_t dbase;

1368 const uint8_t dstart = &gDitherMatrix_Neon[(y&3)12 + (x&3)];	1324 const uint8_t dstart = &gDitherMatrix_Neon[(y&3)12 + (x&3)];

1369 dbase = vld1_u8(dstart);	1325 dbase = vld1_u8(dstart);

1370	1326

1371 do {	1327 do {

1372 uint8x8x4_t vsrc;	1328 uint8x8x4_t vsrc;

1373 uint8x8_t sr, sg, sb, sa, d;	1329 uint8x8_t sr, sg, sb, sa, d;

1374 uint16x8_t dst8, scale8, alpha8;	1330 uint16x8_t dst8, scale8, alpha8;

1375 uint16x8_t dst_r, dst_g, dst_b;	1331 uint16x8_t dst_r, dst_g, dst_b;

1376	1332

1377 #if defined(DEBUG_OPAQUE_DITHER)

1378 // calculate 8 elements worth into a temp buffer

1379 {

1380 int my_y = y;

1381 int my_x = x;

1382 SkPMColor* my_src = (SkPMColor*)src;

1383 uint16_t* my_dst = dst;

1384 int i;

1385

1386 DITHER_565_SCAN(my_y);

1387 for(i = 0; i < UNROLL; i++) {

1388 SkPMColor c = *my_src++;

1389 SkPMColorAssert(c);

1390 if (c) {

1391 unsigned a = SkGetPackedA32(c);

1392

1393 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));

1394 tdv[i] = DITHER_VALUE(my_x);

1395 ta[i] = a;

1396 tap[i] = SkAlpha255To256(a);

1397 td[i] = d;

1398

1399 unsigned sr = SkGetPackedR32(c);

1400 unsigned sg = SkGetPackedG32(c);

1401 unsigned sb = SkGetPackedB32(c);

1402 sr = SkDITHER_R32_FOR_565(sr, d);

1403 sg = SkDITHER_G32_FOR_565(sg, d);

1404 sb = SkDITHER_B32_FOR_565(sb, d);

1405

1406 uint32_t src_expanded = (sg << 24) \| (sr << 13) \| (sb << 2);

1407 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);

1408 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);

1409 // now src and dst expanded are in g:11 r:10 x:1 b:10

1410 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5) ;

1411 td[i] = d;

1412 } else {

1413 tmpbuf[i] = *my_dst;

1414 ta[i] = tdv[i] = td[i] = 0xbeef;

1415 }

1416 in_dst[i] = *my_dst;

1417 my_dst += 1;

1418 DITHER_INC_X(my_x);

1419 }

1420 }

1421 #endif

1422

1423 #ifdef SK_CPU_ARM64	1333 #ifdef SK_CPU_ARM64

1424 vsrc = sk_vld4_u8_arm64_4(src);	1334 vsrc = sk_vld4_u8_arm64_4(src);

1425 #else	1335 #else

1426 {	1336 {

1427 register uint8x8_t d0 asm("d0");	1337 register uint8x8_t d0 asm("d0");

1428 register uint8x8_t d1 asm("d1");	1338 register uint8x8_t d1 asm("d1");

1429 register uint8x8_t d2 asm("d2");	1339 register uint8x8_t d2 asm("d2");

1430 register uint8x8_t d3 asm("d3");	1340 register uint8x8_t d3 asm("d3");

1431	1341

1432 asm ("vld4.8 {d0-d3},[%[src]]! "	1342 asm ("vld4.8 {d0-d3},[%[src]]! "

(...skipping 49 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1482 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);	1392 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);

1483 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);	1393 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);

1484	1394

1485 // repack to store	1395 // repack to store

1486 dst8 = vshrq_n_u16(dst_b, 5);	1396 dst8 = vshrq_n_u16(dst_b, 5);

1487 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);	1397 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);

1488 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);	1398 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

1489	1399

1490 vst1q_u16(dst, dst8);	1400 vst1q_u16(dst, dst8);

1491	1401

1492 #if defined(DEBUG_OPAQUE_DITHER)

1493 // verify my 8 elements match the temp buffer

1494 {

1495 int i, bad=0;

1496 static int invocation;

1497

1498 for (i = 0; i < UNROLL; i++) {

1499 if (tmpbuf[i] != dst[i]) {

1500 bad=1;

1501 }

1502 }

1503 if (bad) {

1504 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset % d\n",

1505 invocation, offset);

1506 SkDebugf(" alpha 0x%x\n", alpha);

1507 for (i = 0; i < UNROLL; i++)

1508 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %0 4x\n",

1509 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[ i],

1510 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);

1511

1512 showme16("alpha8", &alpha8, sizeof(alpha8));

1513 showme16("scale8", &scale8, sizeof(scale8));

1514 showme8("d", &d, sizeof(d));

1515 showme16("dst8", &dst8, sizeof(dst8));

1516 showme16("dst_b", &dst_b, sizeof(dst_b));

1517 showme16("dst_g", &dst_g, sizeof(dst_g));

1518 showme16("dst_r", &dst_r, sizeof(dst_r));

1519 showme8("sb", &sb, sizeof(sb));

1520 showme8("sg", &sg, sizeof(sg));

1521 showme8("sr", &sr, sizeof(sr));

1522

1523 return;

1524 }

1525 offset += UNROLL;

1526 invocation++;

1527 }

1528 #endif

1529 dst += UNROLL;	1402 dst += UNROLL;

1530 count -= UNROLL;	1403 count -= UNROLL;

1531 // skip x += UNROLL, since it's unchanged mod-4	1404 // skip x += UNROLL, since it's unchanged mod-4

1532 } while (count >= UNROLL);	1405 } while (count >= UNROLL);

1533 }	1406 }

1534 #undef UNROLL	1407 #undef UNROLL

1535	1408

1536 // residuals	1409 // residuals

1537 if (count > 0) {	1410 if (count > 0) {

1538 DITHER_565_SCAN(y);	1411 DITHER_565_SCAN(y);

(...skipping 23 matching lines...) Expand all Loading...
1562 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);	1435 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);

1563 }	1436 }

1564 dst += 1;	1437 dst += 1;

1565 DITHER_INC_X(x);	1438 DITHER_INC_X(x);

1566 } while (--count != 0);	1439 } while (--count != 0);

1567 }	1440 }

1568 }	1441 }

1569	1442

1570 ///////////////////////////////////////////////////////////////////////////////	1443 ///////////////////////////////////////////////////////////////////////////////

1571	1444

1572 #undef DEBUG_S32_OPAQUE_DITHER

1573

1574 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,	1445 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,

1575 const SkPMColor* SK_RESTRICT src,	1446 const SkPMColor* SK_RESTRICT src,

1576 int count, U8CPU alpha, int x, int y) {	1447 int count, U8CPU alpha, int x, int y) {

1577 SkASSERT(255 == alpha);	1448 SkASSERT(255 == alpha);

1578	1449

1579 #define UNROLL 8	1450 #define UNROLL 8

1580 if (count >= UNROLL) {	1451 if (count >= UNROLL) {

1581 uint8x8_t d;	1452 uint8x8_t d;

1582 const uint8_t dstart = &gDitherMatrix_Neon[(y&3)12 + (x&3)];	1453 const uint8_t dstart = &gDitherMatrix_Neon[(y&3)12 + (x&3)];

1583 d = vld1_u8(dstart);	1454 d = vld1_u8(dstart);

(...skipping 46 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1630 dg = vaddl_u8(sg, vshr_n_u8(d, 1));	1501 dg = vaddl_u8(sg, vshr_n_u8(d, 1));

1631	1502

1632 // pack high bits of each into 565 format (rgb, b is lsb)	1503 // pack high bits of each into 565 format (rgb, b is lsb)

1633 dst8 = vshrq_n_u16(db, 3);	1504 dst8 = vshrq_n_u16(db, 3);

1634 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);	1505 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);

1635 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);	1506 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

1636	1507

1637 // store it	1508 // store it

1638 vst1q_u16(dst, dst8);	1509 vst1q_u16(dst, dst8);

1639	1510

1640 #if defined(DEBUG_S32_OPAQUE_DITHER)

1641 // always good to know if we generated good results

1642 {

1643 int i, myx = x, myy = y;

1644 DITHER_565_SCAN(myy);

1645 for (i=0;i<UNROLL;i++) {

1646 // the '!' in the asm block above post-incremented src by the 8 pixe ls it reads.

1647 SkPMColor c = src[i-8];

1648 unsigned dither = DITHER_VALUE(myx);

1649 uint16_t val = SkDitherRGB32To565(c, dither);

1650 if (val != dst[i]) {

1651 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x \n",

1652 c, dither, val, dst[i], dstart[i]);

1653 }

1654 DITHER_INC_X(myx);

1655 }

1656 }

1657 #endif

1658

1659 dst += UNROLL;	1511 dst += UNROLL;

1660 // we don't need to increment src as the asm above has already done it	1512 // we don't need to increment src as the asm above has already done it

1661 count -= UNROLL;	1513 count -= UNROLL;

1662 x += UNROLL; // probably superfluous	1514 x += UNROLL; // probably superfluous

1663 }	1515 }

1664 }	1516 }

1665 #undef UNROLL	1517 #undef UNROLL

1666	1518

1667 // residuals	1519 // residuals

1668 if (count > 0) {	1520 if (count > 0) {

(...skipping 71 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1740 invA += invA >> 7;	1592 invA += invA >> 7;

1741 SkASSERT(invA < 256); // Our caller has already handled the alpha == 0 case .	1593 SkASSERT(invA < 256); // Our caller has already handled the alpha == 0 case .

1742	1594

1743 Sk16h colorHighAndRound = Sk4px::DupPMColor(color).widenHi() + Sk16h(128);	1595 Sk16h colorHighAndRound = Sk4px::DupPMColor(color).widenHi() + Sk16h(128);

1744 Sk16b invA_16x(invA);	1596 Sk16b invA_16x(invA);

1745	1597

1746 Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px {	1598 Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px {

1747 return (src4 * invA_16x).addNarrowHi(colorHighAndRound);	1599 return (src4 * invA_16x).addNarrowHi(colorHighAndRound);

1748 });	1600 });

1749 }	1601 }

OLD	NEW

« no previous file with comments | « src/opts/SkBlitRow_opts_arm.cpp ('k') | no next file » | no next file with comments »