| OLD | NEW | 
|---|
| 1 /* | 1 /* | 
| 2  * Copyright 2012 The Android Open Source Project | 2  * Copyright 2012 The Android Open Source Project | 
| 3  * | 3  * | 
| 4  * Use of this source code is governed by a BSD-style license that can be | 4  * Use of this source code is governed by a BSD-style license that can be | 
| 5  * found in the LICENSE file. | 5  * found in the LICENSE file. | 
| 6  */ | 6  */ | 
| 7 | 7 | 
| 8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" | 
| 9 | 9 | 
| 10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" | 
| (...skipping 853 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 864 | 864 | 
| 865             uint16_t d = *dst; | 865             uint16_t d = *dst; | 
| 866             *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), | 866             *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), | 
| 867                                  SkAlphaBlend(sg, SkGetPackedG16(d), scale), | 867                                  SkAlphaBlend(sg, SkGetPackedG16(d), scale), | 
| 868                                  SkAlphaBlend(sb, SkGetPackedB16(d), scale)); | 868                                  SkAlphaBlend(sb, SkGetPackedB16(d), scale)); | 
| 869             DITHER_INC_X(x); | 869             DITHER_INC_X(x); | 
| 870         } while (--count != 0); | 870         } while (--count != 0); | 
| 871     } | 871     } | 
| 872 } | 872 } | 
| 873 | 873 | 
| 874 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |  | 
| 875                                 const SkPMColor* SK_RESTRICT src, |  | 
| 876                                 int count, U8CPU alpha) { |  | 
| 877 |  | 
| 878     SkASSERT(255 == alpha); |  | 
| 879     if (count > 0) { |  | 
| 880 |  | 
| 881 |  | 
| 882     uint8x8_t alpha_mask; |  | 
| 883 |  | 
| 884     static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; |  | 
| 885     alpha_mask = vld1_u8(alpha_mask_setup); |  | 
| 886 |  | 
| 887     /* do the NEON unrolled code */ |  | 
| 888 #define    UNROLL    4 |  | 
| 889     while (count >= UNROLL) { |  | 
| 890         uint8x8_t src_raw, dst_raw, dst_final; |  | 
| 891         uint8x8_t src_raw_2, dst_raw_2, dst_final_2; |  | 
| 892 |  | 
| 893         /* The two prefetches below may make the code slighlty |  | 
| 894          * slower for small values of count but are worth having |  | 
| 895          * in the general case. |  | 
| 896          */ |  | 
| 897         __builtin_prefetch(src+32); |  | 
| 898         __builtin_prefetch(dst+32); |  | 
| 899 |  | 
| 900         /* get the source */ |  | 
| 901         src_raw = vreinterpret_u8_u32(vld1_u32(src)); |  | 
| 902 #if    UNROLL > 2 |  | 
| 903         src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); |  | 
| 904 #endif |  | 
| 905 |  | 
| 906         /* get and hold the dst too */ |  | 
| 907         dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); |  | 
| 908 #if    UNROLL > 2 |  | 
| 909         dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); |  | 
| 910 #endif |  | 
| 911 |  | 
| 912     /* 1st and 2nd bits of the unrolling */ |  | 
| 913     { |  | 
| 914         uint8x8_t dst_cooked; |  | 
| 915         uint16x8_t dst_wide; |  | 
| 916         uint8x8_t alpha_narrow; |  | 
| 917         uint16x8_t alpha_wide; |  | 
| 918 |  | 
| 919         /* get the alphas spread out properly */ |  | 
| 920         alpha_narrow = vtbl1_u8(src_raw, alpha_mask); |  | 
| 921         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); |  | 
| 922 |  | 
| 923         /* spread the dest */ |  | 
| 924         dst_wide = vmovl_u8(dst_raw); |  | 
| 925 |  | 
| 926         /* alpha mul the dest */ |  | 
| 927         dst_wide = vmulq_u16 (dst_wide, alpha_wide); |  | 
| 928         dst_cooked = vshrn_n_u16(dst_wide, 8); |  | 
| 929 |  | 
| 930         /* sum -- ignoring any byte lane overflows */ |  | 
| 931         dst_final = vadd_u8(src_raw, dst_cooked); |  | 
| 932     } |  | 
| 933 |  | 
| 934 #if    UNROLL > 2 |  | 
| 935     /* the 3rd and 4th bits of our unrolling */ |  | 
| 936     { |  | 
| 937         uint8x8_t dst_cooked; |  | 
| 938         uint16x8_t dst_wide; |  | 
| 939         uint8x8_t alpha_narrow; |  | 
| 940         uint16x8_t alpha_wide; |  | 
| 941 |  | 
| 942         alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); |  | 
| 943         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); |  | 
| 944 |  | 
| 945         /* spread the dest */ |  | 
| 946         dst_wide = vmovl_u8(dst_raw_2); |  | 
| 947 |  | 
| 948         /* alpha mul the dest */ |  | 
| 949         dst_wide = vmulq_u16 (dst_wide, alpha_wide); |  | 
| 950         dst_cooked = vshrn_n_u16(dst_wide, 8); |  | 
| 951 |  | 
| 952         /* sum -- ignoring any byte lane overflows */ |  | 
| 953         dst_final_2 = vadd_u8(src_raw_2, dst_cooked); |  | 
| 954     } |  | 
| 955 #endif |  | 
| 956 |  | 
| 957         vst1_u32(dst, vreinterpret_u32_u8(dst_final)); |  | 
| 958 #if    UNROLL > 2 |  | 
| 959         vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); |  | 
| 960 #endif |  | 
| 961 |  | 
| 962         src += UNROLL; |  | 
| 963         dst += UNROLL; |  | 
| 964         count -= UNROLL; |  | 
| 965     } |  | 
| 966 #undef    UNROLL |  | 
| 967 |  | 
| 968     /* do any residual iterations */ |  | 
| 969         while (--count >= 0) { |  | 
| 970             *dst = SkPMSrcOver(*src, *dst); |  | 
| 971             src += 1; |  | 
| 972             dst += 1; |  | 
| 973         } |  | 
| 974     } |  | 
| 975 } |  | 
| 976 |  | 
| 977 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst, |  | 
| 978                                 const SkPMColor* SK_RESTRICT src, |  | 
| 979                                 int count, U8CPU alpha) { |  | 
| 980     SkASSERT(255 == alpha); |  | 
| 981 |  | 
| 982     if (count <= 0) |  | 
| 983     return; |  | 
| 984 |  | 
| 985     /* Use these to check if src is transparent or opaque */ |  | 
| 986     const unsigned int ALPHA_OPAQ  = 0xFF000000; |  | 
| 987     const unsigned int ALPHA_TRANS = 0x00FFFFFF; |  | 
| 988 |  | 
| 989 #define UNROLL  4 |  | 
| 990     const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1); |  | 
| 991     const SkPMColor* SK_RESTRICT src_temp = src; |  | 
| 992 |  | 
| 993     /* set up the NEON variables */ |  | 
| 994     uint8x8_t alpha_mask; |  | 
| 995     static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; |  | 
| 996     alpha_mask = vld1_u8(alpha_mask_setup); |  | 
| 997 |  | 
| 998     uint8x8_t src_raw, dst_raw, dst_final; |  | 
| 999     uint8x8_t src_raw_2, dst_raw_2, dst_final_2; |  | 
| 1000     uint8x8_t dst_cooked; |  | 
| 1001     uint16x8_t dst_wide; |  | 
| 1002     uint8x8_t alpha_narrow; |  | 
| 1003     uint16x8_t alpha_wide; |  | 
| 1004 |  | 
| 1005     /* choose the first processing type */ |  | 
| 1006     if( src >= src_end) |  | 
| 1007         goto TAIL; |  | 
| 1008     if(*src <= ALPHA_TRANS) |  | 
| 1009         goto ALPHA_0; |  | 
| 1010     if(*src >= ALPHA_OPAQ) |  | 
| 1011         goto ALPHA_255; |  | 
| 1012     /* fall-thru */ |  | 
| 1013 |  | 
| 1014 ALPHA_1_TO_254: |  | 
| 1015     do { |  | 
| 1016 |  | 
| 1017         /* get the source */ |  | 
| 1018         src_raw = vreinterpret_u8_u32(vld1_u32(src)); |  | 
| 1019         src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); |  | 
| 1020 |  | 
| 1021         /* get and hold the dst too */ |  | 
| 1022         dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); |  | 
| 1023         dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); |  | 
| 1024 |  | 
| 1025 |  | 
| 1026         /* get the alphas spread out properly */ |  | 
| 1027         alpha_narrow = vtbl1_u8(src_raw, alpha_mask); |  | 
| 1028         /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ |  | 
| 1029         /* we collapsed (255-a)+1 ... */ |  | 
| 1030         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); |  | 
| 1031 |  | 
| 1032         /* spread the dest */ |  | 
| 1033         dst_wide = vmovl_u8(dst_raw); |  | 
| 1034 |  | 
| 1035         /* alpha mul the dest */ |  | 
| 1036         dst_wide = vmulq_u16 (dst_wide, alpha_wide); |  | 
| 1037         dst_cooked = vshrn_n_u16(dst_wide, 8); |  | 
| 1038 |  | 
| 1039         /* sum -- ignoring any byte lane overflows */ |  | 
| 1040         dst_final = vadd_u8(src_raw, dst_cooked); |  | 
| 1041 |  | 
| 1042         alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); |  | 
| 1043         /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ |  | 
| 1044         /* we collapsed (255-a)+1 ... */ |  | 
| 1045         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); |  | 
| 1046 |  | 
| 1047         /* spread the dest */ |  | 
| 1048         dst_wide = vmovl_u8(dst_raw_2); |  | 
| 1049 |  | 
| 1050         /* alpha mul the dest */ |  | 
| 1051         dst_wide = vmulq_u16 (dst_wide, alpha_wide); |  | 
| 1052         dst_cooked = vshrn_n_u16(dst_wide, 8); |  | 
| 1053 |  | 
| 1054         /* sum -- ignoring any byte lane overflows */ |  | 
| 1055         dst_final_2 = vadd_u8(src_raw_2, dst_cooked); |  | 
| 1056 |  | 
| 1057         vst1_u32(dst, vreinterpret_u32_u8(dst_final)); |  | 
| 1058         vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); |  | 
| 1059 |  | 
| 1060         src += UNROLL; |  | 
| 1061         dst += UNROLL; |  | 
| 1062 |  | 
| 1063         /* if 2 of the next pixels aren't between 1 and 254 |  | 
| 1064         it might make sense to go to the optimized loops */ |  | 
| 1065         if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_
      OPAQ && src[1] >= ALPHA_OPAQ)) |  | 
| 1066             break; |  | 
| 1067 |  | 
| 1068     } while(src < src_end); |  | 
| 1069 |  | 
| 1070     if (src >= src_end) |  | 
| 1071         goto TAIL; |  | 
| 1072 |  | 
| 1073     if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ) |  | 
| 1074         goto ALPHA_255; |  | 
| 1075 |  | 
| 1076     /*fall-thru*/ |  | 
| 1077 |  | 
| 1078 ALPHA_0: |  | 
| 1079 |  | 
| 1080     /*In this state, we know the current alpha is 0 and |  | 
| 1081      we optimize for the next alpha also being zero. */ |  | 
| 1082     src_temp = src;  //so we don't have to increment dst every time |  | 
| 1083     do { |  | 
| 1084         if(*(++src) > ALPHA_TRANS) |  | 
| 1085             break; |  | 
| 1086         if(*(++src) > ALPHA_TRANS) |  | 
| 1087             break; |  | 
| 1088         if(*(++src) > ALPHA_TRANS) |  | 
| 1089             break; |  | 
| 1090         if(*(++src) > ALPHA_TRANS) |  | 
| 1091             break; |  | 
| 1092     } while(src < src_end); |  | 
| 1093 |  | 
| 1094     dst += (src - src_temp); |  | 
| 1095 |  | 
| 1096     /* no longer alpha 0, so determine where to go next. */ |  | 
| 1097     if( src >= src_end) |  | 
| 1098         goto TAIL; |  | 
| 1099     if(*src >= ALPHA_OPAQ) |  | 
| 1100         goto ALPHA_255; |  | 
| 1101     else |  | 
| 1102         goto ALPHA_1_TO_254; |  | 
| 1103 |  | 
| 1104 ALPHA_255: |  | 
| 1105     while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) { |  | 
| 1106         dst[0]=src[0]; |  | 
| 1107         dst[1]=src[1]; |  | 
| 1108         dst[2]=src[2]; |  | 
| 1109         dst[3]=src[3]; |  | 
| 1110         src+=UNROLL; |  | 
| 1111         dst+=UNROLL; |  | 
| 1112         if(src >= src_end) |  | 
| 1113             goto TAIL; |  | 
| 1114     } |  | 
| 1115 |  | 
| 1116     //Handle remainder. |  | 
| 1117     if(*src >= ALPHA_OPAQ) { *dst++ = *src++; |  | 
| 1118         if(*src >= ALPHA_OPAQ) { *dst++ = *src++; |  | 
| 1119             if(*src >= ALPHA_OPAQ) { *dst++ = *src++; } |  | 
| 1120         } |  | 
| 1121     } |  | 
| 1122 |  | 
| 1123     if( src >= src_end) |  | 
| 1124         goto TAIL; |  | 
| 1125     if(*src <= ALPHA_TRANS) |  | 
| 1126         goto ALPHA_0; |  | 
| 1127     else |  | 
| 1128         goto ALPHA_1_TO_254; |  | 
| 1129 |  | 
| 1130 TAIL: |  | 
| 1131     /* do any residual iterations */ |  | 
| 1132     src_end += UNROLL + 1;  //goto the real end |  | 
| 1133     while(src != src_end) { |  | 
| 1134         if( *src != 0 ) { |  | 
| 1135             if( *src >= ALPHA_OPAQ ) { |  | 
| 1136                 *dst = *src; |  | 
| 1137             } |  | 
| 1138             else { |  | 
| 1139                 *dst = SkPMSrcOver(*src, *dst); |  | 
| 1140             } |  | 
| 1141         } |  | 
| 1142         src++; |  | 
| 1143         dst++; |  | 
| 1144     } |  | 
| 1145 |  | 
| 1146 #undef    UNROLL |  | 
| 1147     return; |  | 
| 1148 } |  | 
| 1149 |  | 
| 1150 /* Neon version of S32_Blend_BlitRow32() | 874 /* Neon version of S32_Blend_BlitRow32() | 
| 1151  * portable version is in src/core/SkBlitRow_D32.cpp | 875  * portable version is in src/core/SkBlitRow_D32.cpp | 
| 1152  */ | 876  */ | 
| 1153 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 877 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 
| 1154                               const SkPMColor* SK_RESTRICT src, | 878                               const SkPMColor* SK_RESTRICT src, | 
| 1155                               int count, U8CPU alpha) { | 879                               int count, U8CPU alpha) { | 
| 1156     SkASSERT(alpha <= 255); | 880     SkASSERT(alpha <= 255); | 
| 1157 | 881 | 
| 1158     if (count <= 0) { | 882     if (count <= 0) { | 
| 1159         return; | 883         return; | 
| (...skipping 394 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 1554 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = { | 1278 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = { | 
| 1555     Color32A_D565_neon,    // Color32_D565, | 1279     Color32A_D565_neon,    // Color32_D565, | 
| 1556     Color32A_D565_neon,    // Color32A_D565, | 1280     Color32A_D565_neon,    // Color32A_D565, | 
| 1557     Color32A_D565_neon,    // Color32_D565_Dither, | 1281     Color32A_D565_neon,    // Color32_D565_Dither, | 
| 1558     Color32A_D565_neon,    // Color32A_D565_Dither | 1282     Color32A_D565_neon,    // Color32A_D565_Dither | 
| 1559 }; | 1283 }; | 
| 1560 | 1284 | 
| 1561 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { | 1285 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { | 
| 1562     nullptr,   // S32_Opaque, | 1286     nullptr,   // S32_Opaque, | 
| 1563     S32_Blend_BlitRow32_neon,        // S32_Blend, | 1287     S32_Blend_BlitRow32_neon,        // S32_Blend, | 
| 1564     /* | 1288     nullptr,  // Ported to SkOpts | 
| 1565      * We have two choices for S32A_Opaque procs. The one reads the src alpha |  | 
| 1566      * value and attempts to optimize accordingly.  The optimization is |  | 
| 1567      * sensitive to the source content and is not a win in all cases. For |  | 
| 1568      * example, if there are a lot of transitions between the alpha states, |  | 
| 1569      * the performance will almost certainly be worse.  However, for many |  | 
| 1570      * common cases the performance is equivalent or better than the standard |  | 
| 1571      * case where we do not inspect the src alpha. |  | 
| 1572      */ |  | 
| 1573 #if SK_A32_SHIFT == 24 |  | 
| 1574     // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |  | 
| 1575     S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque, |  | 
| 1576 #else |  | 
| 1577     S32A_Opaque_BlitRow32_neon,     // S32A_Opaque, |  | 
| 1578 #endif |  | 
| 1579 #ifdef SK_CPU_ARM32 | 1289 #ifdef SK_CPU_ARM32 | 
| 1580     S32A_Blend_BlitRow32_neon        // S32A_Blend | 1290     S32A_Blend_BlitRow32_neon        // S32A_Blend | 
| 1581 #else | 1291 #else | 
| 1582     nullptr | 1292     nullptr | 
| 1583 #endif | 1293 #endif | 
| 1584 }; | 1294 }; | 
| OLD | NEW | 
|---|