Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(12)

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 847363002: skia: blend32_16_row for neon version (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: skia: blend32_16_row for neon version Created 5 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/opts/SkBlitRow_opts_arm_neon.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBlitRow_opts_arm_neon.h" 8 #include "SkBlitRow_opts_arm_neon.h"
9 9
10 #include "SkBlitMask.h" 10 #include "SkBlitMask.h"
(...skipping 447 matching lines...) Expand 10 before | Expand all | Expand 10 after
458 SkPMColorAssert(c); 458 SkPMColorAssert(c);
459 if (c) { 459 if (c) {
460 *dst = SkSrcOver32To16(c, *dst); 460 *dst = SkSrcOver32To16(c, *dst);
461 } 461 }
462 dst += 1; 462 dst += 1;
463 } while (--count != 0); 463 } while (--count != 0);
464 } 464 }
465 } 465 }
466 #endif // #ifdef SK_CPU_ARM32 466 #endif // #ifdef SK_CPU_ARM32
467 467
468 static uint32_t pmcolor_to_expand16(SkPMColor c) {
469 unsigned r = SkGetPackedR32(c);
470 unsigned g = SkGetPackedG32(c);
471 unsigned b = SkGetPackedB32(c);
472 return (g << 24) | (r << 13) | (b << 2);
473 }
474
475 void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {
476 uint32_t src_expand;
477 unsigned scale;
478 uint16x8_t vmask_blue;
479
480 if (count <= 0) return;
481 SkASSERT(((size_t)dst & 0x01) == 0);
482
483 /*
484 * This preamble code is in order to make dst aligned to 8 bytes
485 * in the next mutiple bytes read & write access.
486 */
487 src_expand = pmcolor_to_expand16(src);
488 scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
489
490 #define DST_ALIGN 8
491
492 /*
493 * preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 by tes at a time.
494 */
495 int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);
496
497 for (int i = 0; i < preamble_size; i+=2, dst++) {
498 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
499 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
500 if (--count == 0)
501 break;
502 }
503
504 int count16 = 0;
505 count16 = count >> 4;
506 vmask_blue = vmovq_n_u16(SK_B16_MASK);
507
508 if (count16) {
509 uint16x8_t wide_sr;
510 uint16x8_t wide_sg;
511 uint16x8_t wide_sb;
512 uint16x8_t wide_256_sa;
513
514 unsigned sr = SkGetPackedR32(src);
515 unsigned sg = SkGetPackedG32(src);
516 unsigned sb = SkGetPackedB32(src);
517 unsigned sa = SkGetPackedA32(src);
518
519 // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb
520 // sr: 8-bit based, dr: 5-bit based, with dr x ((256-sa)>>3), 5-bit left shifted,
521 //thus, for sr, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
522 wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift
523
524 // sg: 8-bit based, dg: 6-bit based, with dg x ((256-sa)>>3), 5-bit left shifted,
525 //thus, for sg, do 3-bit left shift to match MSB : (8 + 3 = 6 + 5)
526 wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift
527
528 // sb: 8-bit based, db: 5-bit based, with db x ((256-sa)>>3), 5-bit left shifted,
529 //thus, for sb, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
530 wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift
531
532 wide_256_sa =
533 vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3
534
535 while (count16-- > 0) {
536 uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;
537 uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;
538 vdst1 = vld1q_u16(dst);
539 dst += 8;
540 vdst2 = vld1q_u16(dst);
541 dst -= 8; //to store dst again.
542
543 vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS); // shift green to top of lanes
544 vdst1_b = vdst1 & vmask_blue; // extrac t blue
545 vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT); // extrac t red
546 vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extrac t green
547
548 vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS); // shift green to top of lanes
549 vdst2_b = vdst2 & vmask_blue; // extrac t blue
550 vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT); // extrac t red
551 vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extrac t green
552
553 vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r); // sr + ( 256-sa) x dr1
554 vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g); // sg + ( 256-sa) x dg1
555 vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b); // sb + ( 256-sa) x db1
556
557 vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r); // sr + ( 256-sa) x dr2
558 vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g); // sg + ( 256-sa) x dg2
559 vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b); // sb + ( 256-sa) x db2
560
561 vdst1_r = vshrq_n_u16(vdst1_r, 5); // 5-bit right shift for 5-bit red
562 vdst1_g = vshrq_n_u16(vdst1_g, 5); // 5-bit right shift for 6-bit green
563 vdst1_b = vshrq_n_u16(vdst1_b, 5); // 5-bit right shift for 5-bit blue
564
565 vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT); // insert green into blue
566 vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT); // insert red into green/blue
567
568 vdst2_r = vshrq_n_u16(vdst2_r, 5); // 5-bit right shift for 5-bit red
569 vdst2_g = vshrq_n_u16(vdst2_g, 5); // 5-bit right shift for 6-bit green
570 vdst2_b = vshrq_n_u16(vdst2_b, 5); // 5-bit right shift for 5-bit blue
571
572 vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT); // insert green into blue
573 vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT); // insert red into green/blue
574
575 vst1q_u16(dst, vdst1);
576 dst += 8;
577 vst1q_u16(dst, vdst2);
578 dst += 8;
579 }
580 }
581
582 count &= 0xF;
583 if (count > 0) {
584 do {
585 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
586 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
587 dst += 1;
588 } while (--count != 0);
589 }
590 }
591
468 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { 592 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
469 prod += vdupq_n_u16(128); 593 prod += vdupq_n_u16(128);
470 prod += vshrq_n_u16(prod, 8); 594 prod += vshrq_n_u16(prod, 8);
471 return vshrq_n_u16(prod, 8); 595 return vshrq_n_u16(prod, 8);
472 } 596 }
473 597
474 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 598 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
475 const SkPMColor* SK_RESTRICT src, int count, 599 const SkPMColor* SK_RESTRICT src, int count,
476 U8CPU alpha, int /*x*/, int /*y*/) { 600 U8CPU alpha, int /*x*/, int /*y*/) {
477 SkASSERT(255 > alpha); 601 SkASSERT(255 > alpha);
(...skipping 1180 matching lines...) Expand 10 before | Expand all | Expand 10 after
1658 // https://code.google.com/p/skia/issues/detail?id=2797 1782 // https://code.google.com/p/skia/issues/detail?id=2797
1659 #endif 1783 #endif
1660 1784
1661 // dither 1785 // dither
1662 S32_D565_Opaque_Dither_neon, 1786 S32_D565_Opaque_Dither_neon,
1663 S32_D565_Blend_Dither_neon, 1787 S32_D565_Blend_Dither_neon,
1664 S32A_D565_Opaque_Dither_neon, 1788 S32A_D565_Opaque_Dither_neon,
1665 NULL, // S32A_D565_Blend_Dither 1789 NULL, // S32A_D565_Blend_Dither
1666 }; 1790 };
1667 1791
1792 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
1793 Color32A_D565_neon, // Color32_D565 (every slot in this table uses the same NEON routine)
1794 Color32A_D565_neon, // Color32A_D565
1795 Color32A_D565_neon, // Color32_D565_Dither -- NOTE(review): Color32A_D565_neon never reads x/y, so no dithering appears to be applied; confirm this is intended
1796 Color32A_D565_neon, // Color32A_D565_Dither -- NOTE(review): same routine as the non-dither slot; confirm this is intended
1797 };
1798
1668 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { 1799 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1669 NULL, // S32_Opaque, 1800 NULL, // S32_Opaque,
1670 S32_Blend_BlitRow32_neon, // S32_Blend, 1801 S32_Blend_BlitRow32_neon, // S32_Blend,
1671 /* 1802 /*
1672 * We have two choices for S32A_Opaque procs. The one reads the src alpha 1803 * We have two choices for S32A_Opaque procs. The one reads the src alpha
1673 * value and attempts to optimize accordingly. The optimization is 1804 * value and attempts to optimize accordingly. The optimization is
1674 * sensitive to the source content and is not a win in all cases. For 1805 * sensitive to the source content and is not a win in all cases. For
1675 * example, if there are a lot of transitions between the alpha states, 1806 * example, if there are a lot of transitions between the alpha states,
1676 * the performance will almost certainly be worse. However, for many 1807 * the performance will almost certainly be worse. However, for many
1677 * common cases the performance is equivalent or better than the standard 1808 * common cases the performance is equivalent or better than the standard
1678 * case where we do not inspect the src alpha. 1809 * case where we do not inspect the src alpha.
1679 */ 1810 */
1680 #if SK_A32_SHIFT == 24 1811 #if SK_A32_SHIFT == 24
1681 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor 1812 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
1682 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1813 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1683 #else 1814 #else
1684 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1815 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1685 #endif 1816 #endif
1686 #ifdef SK_CPU_ARM32 1817 #ifdef SK_CPU_ARM32
1687 S32A_Blend_BlitRow32_neon // S32A_Blend 1818 S32A_Blend_BlitRow32_neon // S32A_Blend
1688 #else 1819 #else
1689 NULL 1820 NULL
1690 #endif 1821 #endif
1691 }; 1822 };
OLDNEW
« no previous file with comments | « src/opts/SkBlitRow_opts_arm_neon.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698