Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
| 9 | 9 |
| 10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
| (...skipping 447 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 458 SkPMColorAssert(c); | 458 SkPMColorAssert(c); |
| 459 if (c) { | 459 if (c) { |
| 460 *dst = SkSrcOver32To16(c, *dst); | 460 *dst = SkSrcOver32To16(c, *dst); |
| 461 } | 461 } |
| 462 dst += 1; | 462 dst += 1; |
| 463 } while (--count != 0); | 463 } while (--count != 0); |
| 464 } | 464 } |
| 465 } | 465 } |
| 466 #endif // #ifdef SK_CPU_ARM32 | 466 #endif // #ifdef SK_CPU_ARM32 |
| 467 | 467 |
| 468 static uint32_t pmcolor_to_expand16(SkPMColor c) { | |
| 469 unsigned r = SkGetPackedR32(c); | |
| 470 unsigned g = SkGetPackedG32(c); | |
| 471 unsigned b = SkGetPackedB32(c); | |
| 472 return (g << 24) | (r << 13) | (b << 2); | |
| 473 } | |
| 474 | |
| 475 void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) { | |
| 476 uint32_t src_expand; | |
| 477 unsigned scale; | |
| 478 uint16x8_t vmask_blue; | |
| 479 | |
| 480 if (count <= 0) return; | |
| 481 SkASSERT(((size_t)dst & 0x01) == 0); | |
| 482 | |
| 483 /* | |
| 484 * This preamble code is in order to make dst aligned to 8 bytes | |
| 485 * in the next mutiple bytes read & write access. | |
| 486 */ | |
| 487 src_expand = pmcolor_to_expand16(src); | |
| 488 scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; | |
| 489 | |
| 490 #define DST_ALIGN 8 | |
| 491 | |
| 492 /* | |
| 493 * preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 by tes at a time. | |
| 494 */ | |
| 495 int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1); | |
| 496 | |
| 497 for (int i = 0; i < preamble_size; i+=2, dst++) { | |
| 498 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale; | |
| 499 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); | |
| 500 if (--count == 0) | |
| 501 break; | |
| 502 } | |
| 503 | |
| 504 int count16 = 0; | |
| 505 count16 = count >> 4; | |
| 506 vmask_blue = vmovq_n_u16(SK_B16_MASK); | |
| 507 | |
| 508 if (count16) { | |
| 509 uint16x8_t wide_sr; | |
| 510 uint16x8_t wide_sg; | |
| 511 uint16x8_t wide_sb; | |
| 512 uint16x8_t wide_256_sa; | |
| 513 | |
| 514 unsigned sr = SkGetPackedR32(src); | |
| 515 unsigned sg = SkGetPackedG32(src); | |
| 516 unsigned sb = SkGetPackedB32(src); | |
| 517 unsigned sa = SkGetPackedA32(src); | |
| 518 | |
| 519 // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb | |
| 520 // sr: 8-bit based, dr: 5-bit based, with dr x ((256-sa)>>3), 5-bit left shifted, | |
| 521 //thus, for sr, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5) | |
| 522 wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift | |
| 523 | |
| 524 // sg: 8-bit based, dg: 6-bit based, with dg x ((256-sa)>>3), 5-bit left shifted, | |
| 525 //thus, for sg, do 3-bit left shift to match MSB : (8 + 3 = 6 + 5) | |
| 526 wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift | |
| 527 | |
| 528 // sb: 8-bit based, db: 5-bit based, with db x ((256-sa)>>3), 5-bit left shifted, | |
| 529 //thus, for sb, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5) | |
| 530 wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift | |
| 531 | |
| 532 wide_256_sa = | |
| 533 vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3 | |
| 534 | |
| 535 while (count16-- > 0) { | |
| 536 uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b; | |
| 537 uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b; | |
| 538 vdst1 = vld1q_u16(dst); | |
| 539 dst += 8; | |
| 540 vdst2 = vld1q_u16(dst); | |
| 541 dst -= 8; //to store dst again. | |
| 542 | |
| 543 vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS); // shift green to top of lanes | |
| 544 vdst1_b = vdst1 & vmask_blue; // extrac t blue | |
| 545 vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT); // extrac t red | |
| 546 vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extrac t green | |
| 547 | |
| 548 vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS); // shift green to top of lanes | |
| 549 vdst2_b = vdst2 & vmask_blue; // extrac t blue | |
| 550 vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT); // extrac t red | |
| 551 vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extrac t green | |
| 552 | |
| 553 vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r); // sr + ( 256-sa) x dr1 | |
| 554 vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g); // sg + ( 256-sa) x dg1 | |
| 555 vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b); // sb + ( 256-sa) x db1 | |
| 556 | |
| 557 vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r); // sr + ( 256-sa) x dr2 | |
| 558 vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g); // sg + ( 256-sa) x dg2 | |
| 559 vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b); // sb + ( 256-sa) x db2 | |
| 560 | |
| 561 vdst1_r = vshrq_n_u16(vdst1_r, 5); // 5-bit right shift for 5-bit red | |
| 562 vdst1_g = vshrq_n_u16(vdst1_g, 5); // 5-bit right shift for 6-bit green | |
| 563 vdst1_b = vshrq_n_u16(vdst1_b, 5); // 5-bit right shift for 5-bit blue | |
| 564 | |
| 565 vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT); // insert green into blue | |
| 566 vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT); // insert red into green/blue | |
| 567 | |
| 568 vdst2_r = vshrq_n_u16(vdst2_r, 5); // 5-bit right shift for 5-bit red | |
| 569 vdst2_g = vshrq_n_u16(vdst2_g, 5); // 5-bit right shift for 6-bit green | |
| 570 vdst2_b = vshrq_n_u16(vdst2_b, 5); // 5-bit right shift for 5-bit blue | |
| 571 | |
| 572 vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT); // insert green into blue | |
| 573 vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT); // insert red into green/blue | |
| 574 | |
| 575 vst1q_u16(dst, vdst1); | |
| 576 dst += 8; | |
| 577 vst1q_u16(dst, vdst2); | |
| 578 dst += 8; | |
| 579 } | |
| 580 } | |
| 581 | |
| 582 count &= 0xF; | |
| 583 if (count > 0) { | |
| 584 do { | |
| 585 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale; | |
| 586 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); | |
| 587 dst += 1; | |
| 588 } while (--count != 0); | |
| 589 } | |
| 590 } | |
| 591 | |
| 468 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { | 592 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { |
| 469 prod += vdupq_n_u16(128); | 593 prod += vdupq_n_u16(128); |
| 470 prod += vshrq_n_u16(prod, 8); | 594 prod += vshrq_n_u16(prod, 8); |
| 471 return vshrq_n_u16(prod, 8); | 595 return vshrq_n_u16(prod, 8); |
| 472 } | 596 } |
| 473 | 597 |
| 474 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, | 598 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
| 475 const SkPMColor* SK_RESTRICT src, int count, | 599 const SkPMColor* SK_RESTRICT src, int count, |
| 476 U8CPU alpha, int /*x*/, int /*y*/) { | 600 U8CPU alpha, int /*x*/, int /*y*/) { |
| 477 SkASSERT(255 > alpha); | 601 SkASSERT(255 > alpha); |
| (...skipping 1180 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1658 // https://code.google.com/p/skia/issues/detail?id=2797 | 1782 // https://code.google.com/p/skia/issues/detail?id=2797 |
| 1659 #endif | 1783 #endif |
| 1660 | 1784 |
| 1661 // dither | 1785 // dither |
| 1662 S32_D565_Opaque_Dither_neon, | 1786 S32_D565_Opaque_Dither_neon, |
| 1663 S32_D565_Blend_Dither_neon, | 1787 S32_D565_Blend_Dither_neon, |
| 1664 S32A_D565_Opaque_Dither_neon, | 1788 S32A_D565_Opaque_Dither_neon, |
| 1665 NULL, // S32A_D565_Blend_Dither | 1789 NULL, // S32A_D565_Blend_Dither |
| 1666 }; | 1790 }; |
| 1667 | 1791 |
| 1792 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = { | |
| 1793 #if 0 | |
|
djsollen
2015/01/21 14:36:53
remove the #if 0 block and replace with...
Color
mlee
2015/01/22 12:19:35
Done.
| |
| 1794 Color32_D565_neon, | |
| 1795 Color32A_D565_neon, | |
| 1796 Color32_D565_Dither_neon, | |
| 1797 Color32A_D565_Dither_neon | |
| 1798 #else | |
| 1799 // TODO: stop cheating and fill in the above specializations! | |
| 1800 Color32A_D565_neon, | |
| 1801 Color32A_D565_neon, | |
| 1802 Color32A_D565_neon, | |
| 1803 Color32A_D565_neon, | |
| 1804 #endif | |
| 1805 }; | |
| 1806 | |
| 1668 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { | 1807 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { |
| 1669 NULL, // S32_Opaque, | 1808 NULL, // S32_Opaque, |
| 1670 S32_Blend_BlitRow32_neon, // S32_Blend, | 1809 S32_Blend_BlitRow32_neon, // S32_Blend, |
| 1671 /* | 1810 /* |
| 1672 * We have two choices for S32A_Opaque procs. The one reads the src alpha | 1811 * We have two choices for S32A_Opaque procs. The one reads the src alpha |
| 1673 * value and attempts to optimize accordingly. The optimization is | 1812 * value and attempts to optimize accordingly. The optimization is |
| 1674 * sensitive to the source content and is not a win in all cases. For | 1813 * sensitive to the source content and is not a win in all cases. For |
| 1675 * example, if there are a lot of transitions between the alpha states, | 1814 * example, if there are a lot of transitions between the alpha states, |
| 1676 * the performance will almost certainly be worse. However, for many | 1815 * the performance will almost certainly be worse. However, for many |
| 1677 * common cases the performance is equivalent or better than the standard | 1816 * common cases the performance is equivalent or better than the standard |
| 1678 * case where we do not inspect the src alpha. | 1817 * case where we do not inspect the src alpha. |
| 1679 */ | 1818 */ |
| 1680 #if SK_A32_SHIFT == 24 | 1819 #if SK_A32_SHIFT == 24 |
| 1681 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1820 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
| 1682 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1821 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
| 1683 #else | 1822 #else |
| 1684 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1823 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
| 1685 #endif | 1824 #endif |
| 1686 #ifdef SK_CPU_ARM32 | 1825 #ifdef SK_CPU_ARM32 |
| 1687 S32A_Blend_BlitRow32_neon // S32A_Blend | 1826 S32A_Blend_BlitRow32_neon // S32A_Blend |
| 1688 #else | 1827 #else |
| 1689 NULL | 1828 NULL |
| 1690 #endif | 1829 #endif |
| 1691 }; | 1830 }; |
| OLD | NEW |