| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
| 9 | 9 |
| 10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
| 11 #include "SkBlitRow.h" | 11 #include "SkBlitRow.h" |
| 12 #include "SkColorPriv.h" | 12 #include "SkColorPriv.h" |
| 13 #include "SkDither.h" | 13 #include "SkDither.h" |
| 14 #include "SkMathPriv.h" | 14 #include "SkMathPriv.h" |
| 15 #include "SkUtils.h" | 15 #include "SkUtils.h" |
| 16 | 16 |
| 17 #include "SkColor_opts_neon.h" | 17 #include "SkColor_opts_neon.h" |
| 18 #include <arm_neon.h> | 18 #include <arm_neon.h> |
| 19 | 19 |
| 20 #ifdef SK_CPU_ARM |
| 20 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, | 21 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
| 21 const SkPMColor* SK_RESTRICT src, int count, | 22 const SkPMColor* SK_RESTRICT src, int count, |
| 22 U8CPU alpha, int /*x*/, int /*y*/) { | 23 U8CPU alpha, int /*x*/, int /*y*/) { |
| 23 SkASSERT(255 == alpha); | 24 SkASSERT(255 == alpha); |
| 24 | 25 |
| 25 while (count >= 8) { | 26 while (count >= 8) { |
| 26 uint8x8x4_t vsrc; | 27 uint8x8x4_t vsrc; |
| 27 uint16x8_t vdst; | 28 uint16x8_t vdst; |
| 28 | 29 |
| 29 // Load | 30 // Load |
| (...skipping 538 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 568 sb = SkDITHER_B32To565(sb, dither); | 569 sb = SkDITHER_B32To565(sb, dither); |
| 569 | 570 |
| 570 uint16_t d = *dst; | 571 uint16_t d = *dst; |
| 571 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), | 572 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), |
| 572 SkAlphaBlend(sg, SkGetPackedG16(d), scale), | 573 SkAlphaBlend(sg, SkGetPackedG16(d), scale), |
| 573 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); | 574 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); |
| 574 DITHER_INC_X(x); | 575 DITHER_INC_X(x); |
| 575 } while (--count != 0); | 576 } while (--count != 0); |
| 576 } | 577 } |
| 577 } | 578 } |
| 579 #endif |
| 578 | 580 |
| 579 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 581 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
| 580 const SkPMColor* SK_RESTRICT src, | 582 const SkPMColor* SK_RESTRICT src, |
| 581 int count, U8CPU alpha) { | 583 int count, U8CPU alpha) { |
| 582 | 584 |
| 583 SkASSERT(255 == alpha); | 585 SkASSERT(255 == alpha); |
| 584 if (count > 0) { | 586 if (count > 0) { |
| 585 | 587 |
| 586 | 588 |
| 587 uint8x8_t alpha_mask; | 589 uint8x8_t alpha_mask; |
| (...skipping 324 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 912 vsrc_wide = vmovl_u8(vsrc); | 914 vsrc_wide = vmovl_u8(vsrc); |
| 913 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); | 915 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); |
| 914 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); | 916 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); |
| 915 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); | 917 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); |
| 916 | 918 |
| 917 // Store | 919 // Store |
| 918 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); | 920 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); |
| 919 } | 921 } |
| 920 } | 922 } |
| 921 | 923 |
| 924 #ifdef SK_CPU_ARM |
| 922 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 925 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
| 923 const SkPMColor* SK_RESTRICT src, | 926 const SkPMColor* SK_RESTRICT src, |
| 924 int count, U8CPU alpha) { | 927 int count, U8CPU alpha) { |
| 925 | 928 |
| 926 SkASSERT(255 >= alpha); | 929 SkASSERT(255 >= alpha); |
| 927 | 930 |
| 928 if (count <= 0) { | 931 if (count <= 0) { |
| 929 return; | 932 return; |
| 930 } | 933 } |
| 931 | 934 |
| (...skipping 427 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1359 SkPMColor c = *src++; | 1362 SkPMColor c = *src++; |
| 1360 SkPMColorAssert(c); | 1363 SkPMColorAssert(c); |
| 1361 SkASSERT(SkGetPackedA32(c) == 255); | 1364 SkASSERT(SkGetPackedA32(c) == 255); |
| 1362 | 1365 |
| 1363 unsigned dither = DITHER_VALUE(x); | 1366 unsigned dither = DITHER_VALUE(x); |
| 1364 *dst++ = SkDitherRGB32To565(c, dither); | 1367 *dst++ = SkDitherRGB32To565(c, dither); |
| 1365 DITHER_INC_X(x); | 1368 DITHER_INC_X(x); |
| 1366 } while (--count != 0); | 1369 } while (--count != 0); |
| 1367 } | 1370 } |
| 1368 } | 1371 } |
| 1372 #endif |
| 1369 | 1373 |
| 1370 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, | 1374 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, |
| 1371 SkPMColor color) { | 1375 SkPMColor color) { |
| 1372 if (count <= 0) { | 1376 if (count <= 0) { |
| 1373 return; | 1377 return; |
| 1374 } | 1378 } |
| 1375 | 1379 |
| 1376 if (0 == color) { | 1380 if (0 == color) { |
| 1377 if (src != dst) { | 1381 if (src != dst) { |
| 1378 memcpy(dst, src, count * sizeof(SkPMColor)); | 1382 memcpy(dst, src, count * sizeof(SkPMColor)); |
| (...skipping 15 matching lines...) Expand all Loading... |
| 1394 | 1398 |
| 1395 vcolor = vdupq_n_u32(color); | 1399 vcolor = vdupq_n_u32(color); |
| 1396 | 1400 |
| 1397 // scale numerical interval [0-255], so load as 8 bits | 1401 // scale numerical interval [0-255], so load as 8 bits |
| 1398 vscale = vdup_n_u8(scale); | 1402 vscale = vdup_n_u8(scale); |
| 1399 | 1403 |
| 1400 do { | 1404 do { |
| 1401 // load src color, 8 pixels, 4 64 bit registers | 1405 // load src color, 8 pixels, 4 64 bit registers |
| 1402 // (and increment src). | 1406 // (and increment src). |
| 1403 uint32x2x4_t vsrc; | 1407 uint32x2x4_t vsrc; |
| 1404 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 1408 #if defined(SK_CPU_ARM) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR_
_ > 6))) |
| 1405 asm ( | 1409 asm ( |
| 1406 "vld1.32 %h[vsrc], [%[src]]!" | 1410 "vld1.32 %h[vsrc], [%[src]]!" |
| 1407 : [vsrc] "=w" (vsrc), [src] "+r" (src) | 1411 : [vsrc] "=w" (vsrc), [src] "+r" (src) |
| 1408 : : | 1412 : : |
| 1409 ); | 1413 ); |
| 1410 #else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 1414 #else // 64bit targets and Clang |
| 1411 vsrc.val[0] = vld1_u32(src); | 1415 vsrc.val[0] = vld1_u32(src); |
| 1412 vsrc.val[1] = vld1_u32(src+2); | 1416 vsrc.val[1] = vld1_u32(src+2); |
| 1413 vsrc.val[2] = vld1_u32(src+4); | 1417 vsrc.val[2] = vld1_u32(src+4); |
| 1414 vsrc.val[3] = vld1_u32(src+6); | 1418 vsrc.val[3] = vld1_u32(src+6); |
| 1415 src += 8; | 1419 src += 8; |
| 1416 #endif | 1420 #endif |
| 1417 | 1421 |
| 1418 // multiply long by scale, 64 bits at a time, | 1422 // multiply long by scale, 64 bits at a time, |
| 1419 // destination into a 128 bit register. | 1423 // destination into a 128 bit register. |
| 1420 uint16x8x4_t vtmp; | 1424 uint16x8x4_t vtmp; |
| (...skipping 15 matching lines...) Expand all Loading... |
| 1436 | 1440 |
| 1437 // adding back the color, using 128 bit registers. | 1441 // adding back the color, using 128 bit registers. |
| 1438 uint32x4x2_t vdst; | 1442 uint32x4x2_t vdst; |
| 1439 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] + | 1443 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] + |
| 1440 vreinterpretq_u8_u32(vcolor)); | 1444 vreinterpretq_u8_u32(vcolor)); |
| 1441 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] + | 1445 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] + |
| 1442 vreinterpretq_u8_u32(vcolor)); | 1446 vreinterpretq_u8_u32(vcolor)); |
| 1443 | 1447 |
| 1444 // store back the 8 calculated pixels (2 128 bit | 1448 // store back the 8 calculated pixels (2 128 bit |
| 1445 // registers), and increment dst. | 1449 // registers), and increment dst. |
| 1446 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 1450 #if defined(SK_CPU_ARM) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR_
_ > 6))) |
| 1447 asm ( | 1451 asm ( |
| 1448 "vst1.32 %h[vdst], [%[dst]]!" | 1452 "vst1.32 %h[vdst], [%[dst]]!" |
| 1449 : [dst] "+r" (dst) | 1453 : [dst] "+r" (dst) |
| 1450 : [vdst] "w" (vdst) | 1454 : [vdst] "w" (vdst) |
| 1451 : "memory" | 1455 : "memory" |
| 1452 ); | 1456 ); |
| 1453 #else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 1457 #else // 64bit targets and Clang |
| 1454 vst1q_u32(dst, vdst.val[0]); | 1458 vst1q_u32(dst, vdst.val[0]); |
| 1455 vst1q_u32(dst+4, vdst.val[1]); | 1459 vst1q_u32(dst+4, vdst.val[1]); |
| 1456 dst += 8; | 1460 dst += 8; |
| 1457 #endif | 1461 #endif |
| 1458 count -= 8; | 1462 count -= 8; |
| 1459 | 1463 |
| 1460 } while (count >= 8); | 1464 } while (count >= 8); |
| 1461 } | 1465 } |
| 1462 | 1466 |
| 1463 while (count > 0) { | 1467 while (count > 0) { |
| 1464 *dst = color + SkAlphaMulQ(*src, scale); | 1468 *dst = color + SkAlphaMulQ(*src, scale); |
| 1465 src += 1; | 1469 src += 1; |
| 1466 dst += 1; | 1470 dst += 1; |
| 1467 count--; | 1471 count--; |
| 1468 } | 1472 } |
| 1469 } | 1473 } |
| 1470 | 1474 |
| 1471 /////////////////////////////////////////////////////////////////////////////// | 1475 /////////////////////////////////////////////////////////////////////////////// |
| 1472 | 1476 |
| 1473 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { | 1477 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { |
| 1478 #ifdef SK_CPU_ARM |
| 1474 // no dither | 1479 // no dither |
| 1475 S32_D565_Opaque_neon, | 1480 S32_D565_Opaque_neon, |
| 1476 S32_D565_Blend_neon, | 1481 S32_D565_Blend_neon, |
| 1477 S32A_D565_Opaque_neon, | 1482 S32A_D565_Opaque_neon, |
| 1478 S32A_D565_Blend_neon, | 1483 S32A_D565_Blend_neon, |
| 1479 | 1484 |
| 1480 // dither | 1485 // dither |
| 1481 S32_D565_Opaque_Dither_neon, | 1486 S32_D565_Opaque_Dither_neon, |
| 1482 S32_D565_Blend_Dither_neon, | 1487 S32_D565_Blend_Dither_neon, |
| 1483 S32A_D565_Opaque_Dither_neon, | 1488 S32A_D565_Opaque_Dither_neon, |
| 1484 NULL, // S32A_D565_Blend_Dither | 1489 NULL, // S32A_D565_Blend_Dither |
| 1490 #else |
| 1491 NULL, NULL, NULL, NULL, |
| 1492 NULL, NULL, NULL, NULL |
| 1493 #endif |
| 1485 }; | 1494 }; |
| 1486 | 1495 |
| 1487 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { | 1496 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { |
| 1488 NULL, // S32_Opaque, | 1497 NULL, // S32_Opaque, |
| 1489 S32_Blend_BlitRow32_neon, // S32_Blend, | 1498 S32_Blend_BlitRow32_neon, // S32_Blend, |
| 1490 /* | 1499 /* |
| 1491 * We have two choices for S32A_Opaque procs. The one reads the src alpha | 1500 * We have two choices for S32A_Opaque procs. The one reads the src alpha |
| 1492 * value and attempts to optimize accordingly. The optimization is | 1501 * value and attempts to optimize accordingly. The optimization is |
| 1493 * sensitive to the source content and is not a win in all cases. For | 1502 * sensitive to the source content and is not a win in all cases. For |
| 1494 * example, if there are a lot of transitions between the alpha states, | 1503 * example, if there are a lot of transitions between the alpha states, |
| 1495 * the performance will almost certainly be worse. However, for many | 1504 * the performance will almost certainly be worse. However, for many |
| 1496 * common cases the performance is equivalent or better than the standard | 1505 * common cases the performance is equivalent or better than the standard |
| 1497 * case where we do not inspect the src alpha. | 1506 * case where we do not inspect the src alpha. |
| 1498 */ | 1507 */ |
| 1499 #if SK_A32_SHIFT == 24 | 1508 #if SK_A32_SHIFT == 24 |
| 1500 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1509 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
| 1501 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1510 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
| 1502 #else | 1511 #else |
| 1503 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1512 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
| 1504 #endif | 1513 #endif |
| 1514 #ifdef SK_CPU_ARM |
| 1505 S32A_Blend_BlitRow32_neon // S32A_Blend | 1515 S32A_Blend_BlitRow32_neon // S32A_Blend |
| 1516 #else |
| 1517 NULL |
| 1518 #endif |
| 1506 }; | 1519 }; |
| OLD | NEW |