OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
11 #include "SkBlitRow.h" | 11 #include "SkBlitRow.h" |
12 #include "SkColorPriv.h" | 12 #include "SkColorPriv.h" |
13 #include "SkDither.h" | 13 #include "SkDither.h" |
14 #include "SkMathPriv.h" | 14 #include "SkMathPriv.h" |
15 #include "SkUtils.h" | 15 #include "SkUtils.h" |
16 | 16 |
17 #include "SkColor_opts_neon.h" | 17 #include "SkColor_opts_neon.h" |
18 #include <arm_neon.h> | 18 #include <arm_neon.h> |
19 | 19 |
| 20 #ifdef SK_CPU_ARM |
20 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, | 21 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
21 const SkPMColor* SK_RESTRICT src, int count, | 22 const SkPMColor* SK_RESTRICT src, int count, |
22 U8CPU alpha, int /*x*/, int /*y*/) { | 23 U8CPU alpha, int /*x*/, int /*y*/) { |
23 SkASSERT(255 == alpha); | 24 SkASSERT(255 == alpha); |
24 | 25 |
25 while (count >= 8) { | 26 while (count >= 8) { |
26 uint8x8x4_t vsrc; | 27 uint8x8x4_t vsrc; |
27 uint16x8_t vdst; | 28 uint16x8_t vdst; |
28 | 29 |
29 // Load | 30 // Load |
(...skipping 538 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
568 sb = SkDITHER_B32To565(sb, dither); | 569 sb = SkDITHER_B32To565(sb, dither); |
569 | 570 |
570 uint16_t d = *dst; | 571 uint16_t d = *dst; |
571 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), | 572 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), |
572 SkAlphaBlend(sg, SkGetPackedG16(d), scale), | 573 SkAlphaBlend(sg, SkGetPackedG16(d), scale), |
573 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); | 574 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); |
574 DITHER_INC_X(x); | 575 DITHER_INC_X(x); |
575 } while (--count != 0); | 576 } while (--count != 0); |
576 } | 577 } |
577 } | 578 } |
| 579 #endif |
578 | 580 |
579 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 581 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
580 const SkPMColor* SK_RESTRICT src, | 582 const SkPMColor* SK_RESTRICT src, |
581 int count, U8CPU alpha) { | 583 int count, U8CPU alpha) { |
582 | 584 |
583 SkASSERT(255 == alpha); | 585 SkASSERT(255 == alpha); |
584 if (count > 0) { | 586 if (count > 0) { |
585 | 587 |
586 | 588 |
587 uint8x8_t alpha_mask; | 589 uint8x8_t alpha_mask; |
(...skipping 324 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
912 vsrc_wide = vmovl_u8(vsrc); | 914 vsrc_wide = vmovl_u8(vsrc); |
913 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); | 915 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); |
914 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); | 916 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); |
915 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); | 917 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); |
916 | 918 |
917 // Store | 919 // Store |
918 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); | 920 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); |
919 } | 921 } |
920 } | 922 } |
921 | 923 |
| 924 #ifdef SK_CPU_ARM |
922 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 925 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
923 const SkPMColor* SK_RESTRICT src, | 926 const SkPMColor* SK_RESTRICT src, |
924 int count, U8CPU alpha) { | 927 int count, U8CPU alpha) { |
925 | 928 |
926 SkASSERT(255 >= alpha); | 929 SkASSERT(255 >= alpha); |
927 | 930 |
928 if (count <= 0) { | 931 if (count <= 0) { |
929 return; | 932 return; |
930 } | 933 } |
931 | 934 |
(...skipping 427 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1359 SkPMColor c = *src++; | 1362 SkPMColor c = *src++; |
1360 SkPMColorAssert(c); | 1363 SkPMColorAssert(c); |
1361 SkASSERT(SkGetPackedA32(c) == 255); | 1364 SkASSERT(SkGetPackedA32(c) == 255); |
1362 | 1365 |
1363 unsigned dither = DITHER_VALUE(x); | 1366 unsigned dither = DITHER_VALUE(x); |
1364 *dst++ = SkDitherRGB32To565(c, dither); | 1367 *dst++ = SkDitherRGB32To565(c, dither); |
1365 DITHER_INC_X(x); | 1368 DITHER_INC_X(x); |
1366 } while (--count != 0); | 1369 } while (--count != 0); |
1367 } | 1370 } |
1368 } | 1371 } |
| 1372 #endif |
1369 | 1373 |
1370 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, | 1374 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, |
1371 SkPMColor color) { | 1375 SkPMColor color) { |
1372 if (count <= 0) { | 1376 if (count <= 0) { |
1373 return; | 1377 return; |
1374 } | 1378 } |
1375 | 1379 |
1376 if (0 == color) { | 1380 if (0 == color) { |
1377 if (src != dst) { | 1381 if (src != dst) { |
1378 memcpy(dst, src, count * sizeof(SkPMColor)); | 1382 memcpy(dst, src, count * sizeof(SkPMColor)); |
(...skipping 15 matching lines...) Expand all Loading... |
1394 | 1398 |
1395 vcolor = vdupq_n_u32(color); | 1399 vcolor = vdupq_n_u32(color); |
1396 | 1400 |
1397 // scale numerical interval [0-255], so load as 8 bits | 1401 // scale numerical interval [0-255], so load as 8 bits |
1398 vscale = vdup_n_u8(scale); | 1402 vscale = vdup_n_u8(scale); |
1399 | 1403 |
1400 do { | 1404 do { |
1401 // load src color, 8 pixels, 4 64 bit registers | 1405 // load src color, 8 pixels, 4 64 bit registers |
1402 // (and increment src). | 1406 // (and increment src). |
1403 uint32x2x4_t vsrc; | 1407 uint32x2x4_t vsrc; |
1404 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 1408 #if defined(SK_CPU_ARM) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR_
_ > 6))) |
1405 asm ( | 1409 asm ( |
1406 "vld1.32 %h[vsrc], [%[src]]!" | 1410 "vld1.32 %h[vsrc], [%[src]]!" |
1407 : [vsrc] "=w" (vsrc), [src] "+r" (src) | 1411 : [vsrc] "=w" (vsrc), [src] "+r" (src) |
1408 : : | 1412 : : |
1409 ); | 1413 ); |
1410 #else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 1414 #else // 64bit targets and Clang |
1411 vsrc.val[0] = vld1_u32(src); | 1415 vsrc.val[0] = vld1_u32(src); |
1412 vsrc.val[1] = vld1_u32(src+2); | 1416 vsrc.val[1] = vld1_u32(src+2); |
1413 vsrc.val[2] = vld1_u32(src+4); | 1417 vsrc.val[2] = vld1_u32(src+4); |
1414 vsrc.val[3] = vld1_u32(src+6); | 1418 vsrc.val[3] = vld1_u32(src+6); |
1415 src += 8; | 1419 src += 8; |
1416 #endif | 1420 #endif |
1417 | 1421 |
1418 // multiply long by scale, 64 bits at a time, | 1422 // multiply long by scale, 64 bits at a time, |
1419 // destination into a 128 bit register. | 1423 // destination into a 128 bit register. |
1420 uint16x8x4_t vtmp; | 1424 uint16x8x4_t vtmp; |
(...skipping 15 matching lines...) Expand all Loading... |
1436 | 1440 |
1437 // adding back the color, using 128 bit registers. | 1441 // adding back the color, using 128 bit registers. |
1438 uint32x4x2_t vdst; | 1442 uint32x4x2_t vdst; |
1439 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] + | 1443 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] + |
1440 vreinterpretq_u8_u32(vcolor)); | 1444 vreinterpretq_u8_u32(vcolor)); |
1441 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] + | 1445 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] + |
1442 vreinterpretq_u8_u32(vcolor)); | 1446 vreinterpretq_u8_u32(vcolor)); |
1443 | 1447 |
1444 // store back the 8 calculated pixels (2 128 bit | 1448 // store back the 8 calculated pixels (2 128 bit |
1445 // registers), and increment dst. | 1449 // registers), and increment dst. |
1446 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 1450 #if defined(SK_CPU_ARM) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR_
_ > 6))) |
1447 asm ( | 1451 asm ( |
1448 "vst1.32 %h[vdst], [%[dst]]!" | 1452 "vst1.32 %h[vdst], [%[dst]]!" |
1449 : [dst] "+r" (dst) | 1453 : [dst] "+r" (dst) |
1450 : [vdst] "w" (vdst) | 1454 : [vdst] "w" (vdst) |
1451 : "memory" | 1455 : "memory" |
1452 ); | 1456 ); |
1453 #else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 1457 #else // 64bit targets and Clang |
1454 vst1q_u32(dst, vdst.val[0]); | 1458 vst1q_u32(dst, vdst.val[0]); |
1455 vst1q_u32(dst+4, vdst.val[1]); | 1459 vst1q_u32(dst+4, vdst.val[1]); |
1456 dst += 8; | 1460 dst += 8; |
1457 #endif | 1461 #endif |
1458 count -= 8; | 1462 count -= 8; |
1459 | 1463 |
1460 } while (count >= 8); | 1464 } while (count >= 8); |
1461 } | 1465 } |
1462 | 1466 |
1463 while (count > 0) { | 1467 while (count > 0) { |
1464 *dst = color + SkAlphaMulQ(*src, scale); | 1468 *dst = color + SkAlphaMulQ(*src, scale); |
1465 src += 1; | 1469 src += 1; |
1466 dst += 1; | 1470 dst += 1; |
1467 count--; | 1471 count--; |
1468 } | 1472 } |
1469 } | 1473 } |
1470 | 1474 |
1471 /////////////////////////////////////////////////////////////////////////////// | 1475 /////////////////////////////////////////////////////////////////////////////// |
1472 | 1476 |
1473 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { | 1477 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { |
| 1478 #ifdef SK_CPU_ARM |
1474 // no dither | 1479 // no dither |
1475 S32_D565_Opaque_neon, | 1480 S32_D565_Opaque_neon, |
1476 S32_D565_Blend_neon, | 1481 S32_D565_Blend_neon, |
1477 S32A_D565_Opaque_neon, | 1482 S32A_D565_Opaque_neon, |
1478 S32A_D565_Blend_neon, | 1483 S32A_D565_Blend_neon, |
1479 | 1484 |
1480 // dither | 1485 // dither |
1481 S32_D565_Opaque_Dither_neon, | 1486 S32_D565_Opaque_Dither_neon, |
1482 S32_D565_Blend_Dither_neon, | 1487 S32_D565_Blend_Dither_neon, |
1483 S32A_D565_Opaque_Dither_neon, | 1488 S32A_D565_Opaque_Dither_neon, |
1484 NULL, // S32A_D565_Blend_Dither | 1489 NULL, // S32A_D565_Blend_Dither |
| 1490 #else |
| 1491 NULL, NULL, NULL, NULL, |
| 1492 NULL, NULL, NULL, NULL |
| 1493 #endif |
1485 }; | 1494 }; |
1486 | 1495 |
1487 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { | 1496 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { |
1488 NULL, // S32_Opaque, | 1497 NULL, // S32_Opaque, |
1489 S32_Blend_BlitRow32_neon, // S32_Blend, | 1498 S32_Blend_BlitRow32_neon, // S32_Blend, |
1490 /* | 1499 /* |
1491 * We have two choices for S32A_Opaque procs. The one reads the src alpha | 1500 * We have two choices for S32A_Opaque procs. The one reads the src alpha |
1492 * value and attempts to optimize accordingly. The optimization is | 1501 * value and attempts to optimize accordingly. The optimization is |
1493 * sensitive to the source content and is not a win in all cases. For | 1502 * sensitive to the source content and is not a win in all cases. For |
1494 * example, if there are a lot of transitions between the alpha states, | 1503 * example, if there are a lot of transitions between the alpha states, |
1495 * the performance will almost certainly be worse. However, for many | 1504 * the performance will almost certainly be worse. However, for many |
1496 * common cases the performance is equivalent or better than the standard | 1505 * common cases the performance is equivalent or better than the standard |
1497 * case where we do not inspect the src alpha. | 1506 * case where we do not inspect the src alpha. |
1498 */ | 1507 */ |
1499 #if SK_A32_SHIFT == 24 | 1508 #if SK_A32_SHIFT == 24 |
1500 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1509 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
1501 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1510 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1502 #else | 1511 #else |
1503 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1512 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1504 #endif | 1513 #endif |
| 1514 #ifdef SK_CPU_ARM |
1505 S32A_Blend_BlitRow32_neon // S32A_Blend | 1515 S32A_Blend_BlitRow32_neon // S32A_Blend |
| 1516 #else |
| 1517 NULL |
| 1518 #endif |
1506 }; | 1519 }; |
OLD | NEW |