src/opts/SkBlitRow_opts_arm_neon.cpp - Issue 302283003: ARM Skia NEON patches - 38 - arm64 8888 blitters

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 302283003: ARM Skia NEON patches - 38 - arm64 8888 blitters (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Comments Created 6 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBlitRow_opts_arm_neon.h"	8 #include "SkBlitRow_opts_arm_neon.h"

9	9

10 #include "SkBlitMask.h"	10 #include "SkBlitMask.h"

11 #include "SkBlitRow.h"	11 #include "SkBlitRow.h"

12 #include "SkColorPriv.h"	12 #include "SkColorPriv.h"

13 #include "SkDither.h"	13 #include "SkDither.h"

14 #include "SkMathPriv.h"	14 #include "SkMathPriv.h"

15 #include "SkUtils.h"	15 #include "SkUtils.h"

16	16

17 #include "SkColor_opts_neon.h"	17 #include "SkColor_opts_neon.h"

18 #include <arm_neon.h>	18 #include <arm_neon.h>

19	19

	20 #ifdef SK_CPU_ARM

20 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,	21 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,

21 const SkPMColor* SK_RESTRICT src, int count,	22 const SkPMColor* SK_RESTRICT src, int count,

22 U8CPU alpha, int /x/, int /y/) {	23 U8CPU alpha, int /x/, int /y/) {

23 SkASSERT(255 == alpha);	24 SkASSERT(255 == alpha);

24	25

25 while (count >= 8) {	26 while (count >= 8) {

26 uint8x8x4_t vsrc;	27 uint8x8x4_t vsrc;

27 uint16x8_t vdst;	28 uint16x8_t vdst;

28	29

29 // Load	30 // Load

(...skipping 538 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
568 sb = SkDITHER_B32To565(sb, dither);	569 sb = SkDITHER_B32To565(sb, dither);

569	570

570 uint16_t d = *dst;	571 uint16_t d = *dst;

571 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),	572 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),

572 SkAlphaBlend(sg, SkGetPackedG16(d), scale),	573 SkAlphaBlend(sg, SkGetPackedG16(d), scale),

573 SkAlphaBlend(sb, SkGetPackedB16(d), scale));	574 SkAlphaBlend(sb, SkGetPackedB16(d), scale));

574 DITHER_INC_X(x);	575 DITHER_INC_X(x);

575 } while (--count != 0);	576 } while (--count != 0);

576 }	577 }

577 }	578 }

	579 #endif

578	580

579 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,	581 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,

580 const SkPMColor* SK_RESTRICT src,	582 const SkPMColor* SK_RESTRICT src,

581 int count, U8CPU alpha) {	583 int count, U8CPU alpha) {

582	584

583 SkASSERT(255 == alpha);	585 SkASSERT(255 == alpha);

584 if (count > 0) {	586 if (count > 0) {

585	587

586	588

587 uint8x8_t alpha_mask;	589 uint8x8_t alpha_mask;

(...skipping 324 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
912 vsrc_wide = vmovl_u8(vsrc);	914 vsrc_wide = vmovl_u8(vsrc);

913 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));	915 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

914 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));	916 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

915 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);	917 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

916	918

917 // Store	919 // Store

918 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);	920 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);

919 }	921 }

920 }	922 }

921	923

	924 #ifdef SK_CPU_ARM

922 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,	925 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,

923 const SkPMColor* SK_RESTRICT src,	926 const SkPMColor* SK_RESTRICT src,

924 int count, U8CPU alpha) {	927 int count, U8CPU alpha) {

925	928

926 SkASSERT(255 >= alpha);	929 SkASSERT(255 >= alpha);

927	930

928 if (count <= 0) {	931 if (count <= 0) {

929 return;	932 return;

930 }	933 }

931	934

(...skipping 427 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1359 SkPMColor c = *src++;	1362 SkPMColor c = *src++;

1360 SkPMColorAssert(c);	1363 SkPMColorAssert(c);

1361 SkASSERT(SkGetPackedA32(c) == 255);	1364 SkASSERT(SkGetPackedA32(c) == 255);

1362	1365

1363 unsigned dither = DITHER_VALUE(x);	1366 unsigned dither = DITHER_VALUE(x);

1364 *dst++ = SkDitherRGB32To565(c, dither);	1367 *dst++ = SkDitherRGB32To565(c, dither);

1365 DITHER_INC_X(x);	1368 DITHER_INC_X(x);

1366 } while (--count != 0);	1369 } while (--count != 0);

1367 }	1370 }

1368 }	1371 }

	1372 #endif

1369	1373

1370 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,	1374 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,

1371 SkPMColor color) {	1375 SkPMColor color) {

1372 if (count <= 0) {	1376 if (count <= 0) {

1373 return;	1377 return;

1374 }	1378 }

1375	1379

1376 if (0 == color) {	1380 if (0 == color) {

1377 if (src != dst) {	1381 if (src != dst) {

1378 memcpy(dst, src, count * sizeof(SkPMColor));	1382 memcpy(dst, src, count * sizeof(SkPMColor));

(...skipping 15 matching lines...) Expand all Loading...
1394	1398

1395 vcolor = vdupq_n_u32(color);	1399 vcolor = vdupq_n_u32(color);

1396	1400

1397 // scale numerical interval [0-255], so load as 8 bits	1401 // scale numerical interval [0-255], so load as 8 bits

1398 vscale = vdup_n_u8(scale);	1402 vscale = vdup_n_u8(scale);

1399	1403

1400 do {	1404 do {

1401 // load src color, 8 pixels, 4 64 bit registers	1405 // load src color, 8 pixels, 4 64 bit registers

1402 // (and increment src).	1406 // (and increment src).

1403 uint32x2x4_t vsrc;	1407 uint32x2x4_t vsrc;

1404 #if (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))	1408 #if defined(SK_CPU_ARM) && ((__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR_ _ > 6)))

1405 asm (	1409 asm (

1406 "vld1.32 %h[vsrc], [%[src]]!"	1410 "vld1.32 %h[vsrc], [%[src]]!"

1407 : [vsrc] "=w" (vsrc), [src] "+r" (src)	1411 : [vsrc] "=w" (vsrc), [src] "+r" (src)

1408 : :	1412 : :

1409 );	1413 );

1410 #else // (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))	1414 #else // 64bit targets and Clang

1411 vsrc.val[0] = vld1_u32(src);	1415 vsrc.val[0] = vld1_u32(src);

1412 vsrc.val[1] = vld1_u32(src+2);	1416 vsrc.val[1] = vld1_u32(src+2);

1413 vsrc.val[2] = vld1_u32(src+4);	1417 vsrc.val[2] = vld1_u32(src+4);

1414 vsrc.val[3] = vld1_u32(src+6);	1418 vsrc.val[3] = vld1_u32(src+6);

1415 src += 8;	1419 src += 8;

1416 #endif	1420 #endif

1417	1421

1418 // multiply long by scale, 64 bits at a time,	1422 // multiply long by scale, 64 bits at a time,

1419 // destination into a 128 bit register.	1423 // destination into a 128 bit register.

1420 uint16x8x4_t vtmp;	1424 uint16x8x4_t vtmp;

(...skipping 15 matching lines...) Expand all Loading...
1436	1440

1437 // adding back the color, using 128 bit registers.	1441 // adding back the color, using 128 bit registers.

1438 uint32x4x2_t vdst;	1442 uint32x4x2_t vdst;

1439 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +	1443 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +

1440 vreinterpretq_u8_u32(vcolor));	1444 vreinterpretq_u8_u32(vcolor));

1441 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +	1445 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +

1442 vreinterpretq_u8_u32(vcolor));	1446 vreinterpretq_u8_u32(vcolor));

1443	1447

1444 // store back the 8 calculated pixels (2 128 bit	1448 // store back the 8 calculated pixels (2 128 bit

1445 // registers), and increment dst.	1449 // registers), and increment dst.

1446 #if (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))	1450 #if defined(SK_CPU_ARM) && ((__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR_ _ > 6)))

1447 asm (	1451 asm (

1448 "vst1.32 %h[vdst], [%[dst]]!"	1452 "vst1.32 %h[vdst], [%[dst]]!"

1449 : [dst] "+r" (dst)	1453 : [dst] "+r" (dst)

1450 : [vdst] "w" (vdst)	1454 : [vdst] "w" (vdst)

1451 : "memory"	1455 : "memory"

1452 );	1456 );

1453 #else // (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))	1457 #else // 64bit targets and Clang

1454 vst1q_u32(dst, vdst.val[0]);	1458 vst1q_u32(dst, vdst.val[0]);

1455 vst1q_u32(dst+4, vdst.val[1]);	1459 vst1q_u32(dst+4, vdst.val[1]);

1456 dst += 8;	1460 dst += 8;

1457 #endif	1461 #endif

1458 count -= 8;	1462 count -= 8;

1459	1463

1460 } while (count >= 8);	1464 } while (count >= 8);

1461 }	1465 }

1462	1466

1463 while (count > 0) {	1467 while (count > 0) {

1464 dst = color + SkAlphaMulQ(src, scale);	1468 dst = color + SkAlphaMulQ(src, scale);

1465 src += 1;	1469 src += 1;

1466 dst += 1;	1470 dst += 1;

1467 count--;	1471 count--;

1468 }	1472 }

1469 }	1473 }

1470	1474

1471 ///////////////////////////////////////////////////////////////////////////////	1475 ///////////////////////////////////////////////////////////////////////////////

1472	1476

1473 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {	1477 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {

	1478 #ifdef SK_CPU_ARM

1474 // no dither	1479 // no dither

1475 S32_D565_Opaque_neon,	1480 S32_D565_Opaque_neon,

1476 S32_D565_Blend_neon,	1481 S32_D565_Blend_neon,

1477 S32A_D565_Opaque_neon,	1482 S32A_D565_Opaque_neon,

1478 S32A_D565_Blend_neon,	1483 S32A_D565_Blend_neon,

1479	1484

1480 // dither	1485 // dither

1481 S32_D565_Opaque_Dither_neon,	1486 S32_D565_Opaque_Dither_neon,

1482 S32_D565_Blend_Dither_neon,	1487 S32_D565_Blend_Dither_neon,

1483 S32A_D565_Opaque_Dither_neon,	1488 S32A_D565_Opaque_Dither_neon,

1484 NULL, // S32A_D565_Blend_Dither	1489 NULL, // S32A_D565_Blend_Dither

	1490 #else

	1491 NULL, NULL, NULL, NULL,

	1492 NULL, NULL, NULL, NULL

	1493 #endif

1485 };	1494 };

1486	1495

1487 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {	1496 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {

1488 NULL, // S32_Opaque,	1497 NULL, // S32_Opaque,

1489 S32_Blend_BlitRow32_neon, // S32_Blend,	1498 S32_Blend_BlitRow32_neon, // S32_Blend,

1490 /*	1499 /*

1491 * We have two choices for S32A_Opaque procs. The one reads the src alpha	1500 * We have two choices for S32A_Opaque procs. The one reads the src alpha

1492 * value and attempts to optimize accordingly. The optimization is	1501 * value and attempts to optimize accordingly. The optimization is

1493 * sensitive to the source content and is not a win in all cases. For	1502 * sensitive to the source content and is not a win in all cases. For

1494 * example, if there are a lot of transitions between the alpha states,	1503 * example, if there are a lot of transitions between the alpha states,

1495 * the performance will almost certainly be worse. However, for many	1504 * the performance will almost certainly be worse. However, for many

1496 * common cases the performance is equivalent or better than the standard	1505 * common cases the performance is equivalent or better than the standard

1497 * case where we do not inspect the src alpha.	1506 * case where we do not inspect the src alpha.

1498 */	1507 */

1499 #if SK_A32_SHIFT == 24	1508 #if SK_A32_SHIFT == 24

1500 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor	1509 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor

1501 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,	1510 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,

1502 #else	1511 #else

1503 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,	1512 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,

1504 #endif	1513 #endif

	1514 #ifdef SK_CPU_ARM

1505 S32A_Blend_BlitRow32_neon // S32A_Blend	1515 S32A_Blend_BlitRow32_neon // S32A_Blend

	1516 #else

	1517 NULL

	1518 #endif

1506 };	1519 };

OLD	NEW

« no previous file with comments | « gyp/opts.gyp ('k') | no next file » | no next file with comments »