OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm.h" | 8 #include "SkBlitRow_opts_arm.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 499 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
510 | 510 |
511 /* do any residual iterations */ | 511 /* do any residual iterations */ |
512 while (--count >= 0) { | 512 while (--count >= 0) { |
513 *dst = SkPMSrcOver(*src, *dst); | 513 *dst = SkPMSrcOver(*src, *dst); |
514 src += 1; | 514 src += 1; |
515 dst += 1; | 515 dst += 1; |
516 } | 516 } |
517 } | 517 } |
518 } | 518 } |
519 | 519 |
/*
 * S32A_Opaque row blit that inspects the source alpha and switches between
 * three states: ALPHA_0 (transparent run: skip), ALPHA_255 (opaque run: copy)
 * and ALPHA_1_TO_254 (NEON src-over blend, UNROLL pixels per iteration).
 * Transparency/opacity is judged by comparing whole pixels against
 * ALPHA_TRANS / ALPHA_OPAQ, which only works when alpha is the top byte.
 *
 * dst   - destination row, read and written in place
 * src   - source row (NOTE(review): presumably premultiplied, given the
 *         SkPMColor type name -- confirm)
 * count - number of pixels; the function returns immediately when <= 0
 * alpha - must be 255 (asserted)
 */
| 520 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
| 521 const SkPMColor* SK_RESTRICT src,
| 522 int count, U8CPU alpha) {
| 523 SkASSERT(255 == alpha);
| 524 /* alpha is only checked by the assert above; it is not used again in this function */
| 525 if (count <= 0)
| 526 return;
| 527
| 528 /* Use these to check if src is transparent or opaque: a pixel <= ALPHA_TRANS has a zero top byte, a pixel >= ALPHA_OPAQ has a 0xFF top byte */
| 529 const unsigned int ALPHA_OPAQ = 0xFF000000;
| 530 const unsigned int ALPHA_TRANS = 0x00FFFFFF;
| 531 /* src_end stops UNROLL + 1 pixels short of the real end; the loops below read ahead of src (up to src[3]) and TAIL restores the real end for the leftovers */
| 532 #define UNROLL 4
| 533 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
| 534 const SkPMColor* SK_RESTRICT src_temp = src;
| 535
| 536 /* set up the NEON variables: alpha_mask holds table indices selecting byte 3 for lanes 0-3 and byte 7 for lanes 4-7 (NOTE(review): these are the alpha bytes of the two pixels only for a little-endian load with alpha in the top byte -- confirm) */
| 537 uint8x8_t alpha_mask;
| 538 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
| 539 alpha_mask = vld1_u8(alpha_mask_setup);
| 540
| 541 uint8x8_t src_raw, dst_raw, dst_final;
| 542 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
| 543 uint8x8_t dst_cooked;
| 544 uint16x8_t dst_wide;
| 545 uint8x8_t alpha_narrow;
| 546 uint16x8_t alpha_wide;
| 547
| 548 /* choose the first processing type from the first pixel; when src is already at or past src_end, skip the unrolled loops and go straight to TAIL */
| 549 if( src >= src_end)
| 550 goto TAIL;
| 551 if(*src <= ALPHA_TRANS)
| 552 goto ALPHA_0;
| 553 if(*src >= ALPHA_OPAQ)
| 554 goto ALPHA_255;
| 555 /* fall-thru */
| 556
| 557 ALPHA_1_TO_254:
| 558 do {
| 559 /* blend UNROLL (4) pixels per iteration, handled as two 2-pixel (64-bit) halves */
| 560 /* get the source */
| 561 src_raw = vreinterpret_u8_u32(vld1_u32(src));
| 562 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
| 563
| 564 /* get and hold the dst too */
| 565 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
| 566 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
| 567
| 568
| 569 /* get the alphas spread out properly (first pixel pair) */
| 570 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
| 571 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
| 572 /* we collapsed (255-a)+1 ... */
| 573 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
| 574
| 575 /* spread the dest */
| 576 dst_wide = vmovl_u8(dst_raw);
| 577
| 578 /* alpha mul the dest */
| 579 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
| 580 dst_cooked = vshrn_n_u16(dst_wide, 8);
| 581
| 582 /* sum -- ignoring any byte lane overflows */
| 583 dst_final = vadd_u8(src_raw, dst_cooked);
| 584 /* same steps again for the second pixel pair */
| 585 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
| 586 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
| 587 /* we collapsed (255-a)+1 ... */
| 588 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
| 589
| 590 /* spread the dest */
| 591 dst_wide = vmovl_u8(dst_raw_2);
| 592
| 593 /* alpha mul the dest */
| 594 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
| 595 dst_cooked = vshrn_n_u16(dst_wide, 8);
| 596
| 597 /* sum -- ignoring any byte lane overflows */
| 598 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
| 599
| 600 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
| 601 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
| 602
| 603 src += UNROLL;
| 604 dst += UNROLL;
| 605
| 606 /* if 2 of the next pixels aren't between 1 and 254
| 607 it might make sense to go to the optimized loops. NOTE(review): the condition below is split in the middle of the ALPHA_OPAQ identifier in this listing; it looks like a line-wrap artifact -- confirm against the real file */
| 608 if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_
OPAQ && src[1] >= ALPHA_OPAQ))
| 609 break;
| 610
| 611 } while(src < src_end);
| 612 /* the loop ends either because src reached src_end or because the next two pixels are both transparent or both opaque */
| 613 if (src >= src_end)
| 614 goto TAIL;
| 615
| 616 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
| 617 goto ALPHA_255;
| 618
| 619 /* fall-thru: not at src_end and not an opaque pair, so the loop broke on a transparent pair */
| 620
| 621 ALPHA_0:
| 622
| 623 /*In this state, we know the current alpha is 0 and
| 624 we optimize for the next alpha also being zero. */
| 625 src_temp = src; //remember where the run started; dst is advanced once, after the loop
| 626 do {
| 627 if(*(++src) > ALPHA_TRANS)
| 628 break;
| 629 if(*(++src) > ALPHA_TRANS)
| 630 break;
| 631 if(*(++src) > ALPHA_TRANS)
| 632 break;
| 633 if(*(++src) > ALPHA_TRANS)
| 634 break;
| 635 } while(src < src_end);
| 636 /* move dst past the transparent run without writing to it */
| 637 dst += (src - src_temp);
| 638
| 639 /* no longer alpha 0, so determine where to go next. */
| 640 if( src >= src_end)
| 641 goto TAIL;
| 642 if(*src >= ALPHA_OPAQ)
| 643 goto ALPHA_255;
| 644 else
| 645 goto ALPHA_1_TO_254;
| 646 /* copy state: src is stored straight to dst, UNROLL pixels at a time while all four are opaque (the AND keeps a 0xFF top byte only if every pixel has one) */
| 647 ALPHA_255:
| 648 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
| 649 dst[0]=src[0];
| 650 dst[1]=src[1];
| 651 dst[2]=src[2];
| 652 dst[3]=src[3];
| 653 src+=UNROLL;
| 654 dst+=UNROLL;
| 655 if(src >= src_end)
| 656 goto TAIL;
| 657 }
| 658
| 659 //Handle remainder: copy up to three more leading opaque pixels.
| 660 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
| 661 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
| 662 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
| 663 }
| 664 }
| 665
| 666 if( src >= src_end)
| 667 goto TAIL;
| 668 if(*src <= ALPHA_TRANS)
| 669 goto ALPHA_0;
| 670 else
| 671 goto ALPHA_1_TO_254;
| 672
| 673 TAIL:
| 674 /* do any residual iterations one pixel at a time: all-zero pixels are skipped, opaque pixels are copied, the rest are blended with SkPMSrcOver */
| 675 src_end += UNROLL + 1; //goto the real end
| 676 while(src != src_end) {
| 677 if( *src != 0 ) {
| 678 if( *src >= ALPHA_OPAQ ) {
| 679 *dst = *src;
| 680 }
| 681 else {
| 682 *dst = SkPMSrcOver(*src, *dst);
| 683 }
| 684 }
| 685 src++;
| 686 dst++;
| 687 }
| 688
| 689 #undef UNROLL
| 690 return;
| 691 }
520 | 692 |
521 /* Neon version of S32_Blend_BlitRow32() | 693 /* Neon version of S32_Blend_BlitRow32() |
522 * portable version is in src/core/SkBlitRow_D32.cpp | 694 * portable version is in src/core/SkBlitRow_D32.cpp |
523 */ | 695 */ |
524 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 696 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
525 const SkPMColor* SK_RESTRICT src, | 697 const SkPMColor* SK_RESTRICT src, |
526 int count, U8CPU alpha) { | 698 int count, U8CPU alpha) { |
527 SkASSERT(alpha <= 255); | 699 SkASSERT(alpha <= 255); |
528 if (count > 0) { | 700 if (count > 0) { |
529 uint16_t src_scale = SkAlpha255To256(alpha); | 701 uint16_t src_scale = SkAlpha255To256(alpha); |
(...skipping 570 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1100 // dither | 1272 // dither |
1101 NULL, // S32_D4444_Opaque_Dither, | 1273 NULL, // S32_D4444_Opaque_Dither, |
1102 NULL, // S32_D4444_Blend_Dither, | 1274 NULL, // S32_D4444_Blend_Dither, |
1103 NULL, // S32A_D4444_Opaque_Dither, | 1275 NULL, // S32A_D4444_Opaque_Dither, |
1104 NULL, // S32A_D4444_Blend_Dither | 1276 NULL, // S32A_D4444_Blend_Dither |
1105 }; | 1277 }; |
1106 | 1278 |
1107 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { | 1279 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1108 NULL, // S32_Opaque, | 1280 NULL, // S32_Opaque,
1109 S32_Blend_BlitRow32_neon, // S32_Blend, | 1281 S32_Blend_BlitRow32_neon, // S32_Blend,
1110 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1282 /*
| 1283 * We have two choices for S32A_Opaque procs. The one reads the src alpha
| 1284 * value and attempts to optimize accordingly. The optimization is
| 1285 * sensitive to the source content and is not a win in all cases. For
| 1286 * example, if there are a lot of transitions between the alpha states,
| 1287 * the performance will almost certainly be worse. However, for many
| 1288 * common cases the performance is equivalent or better than the standard
| 1289 * case where we do not inspect the src alpha.
| 1290 */
| 1291 #if SK_A32_SHIFT == 24
| 1292 // This proc compares whole pixels against 0xFF000000 / 0x00FFFFFF, so it assumes the alpha value occupies the top byte (bits 24-31) of each SkPMColor
| 1293 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
| 1294 #else
| 1295 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
| 1296 #endif
1111 S32A_Blend_BlitRow32_arm // S32A_Blend | 1297 S32A_Blend_BlitRow32_arm // S32A_Blend
1112 }; | 1298 };
OLD | NEW |