OLD | NEW |
(Empty) | |
| 1 |
| 2 #include "SkBlitMask.h" |
| 3 #include "SkColor_opts_neon.h" |
| 4 |
| 5 static void D32_A8_Black_neon(void* SK_RESTRICT dst, size_t dstRB, |
| 6 const void* SK_RESTRICT maskPtr, size_t maskRB, |
| 7 SkColor, int width, int height) { |
| 8 SkPMColor* SK_RESTRICT device = (SkPMColor*)dst; |
| 9 const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr; |
| 10 |
| 11 maskRB -= width; |
| 12 dstRB -= (width << 2); |
| 13 do { |
| 14 int w = width; |
| 15 while (w >= 8) { |
| 16 uint8x8_t vmask = vld1_u8(mask); |
| 17 uint16x8_t vscale = vsubw_u8(vdupq_n_u16(256), vmask); |
| 18 uint8x8x4_t vdevice = vld4_u8((uint8_t*)device); |
| 19 |
| 20 vdevice = SkAlphaMulQ_neon8(vdevice, vscale); |
| 21 vdevice.val[NEON_A] += vmask; |
| 22 |
| 23 vst4_u8((uint8_t*)device, vdevice); |
| 24 |
| 25 mask += 8; |
| 26 device += 8; |
| 27 w -= 8; |
| 28 } |
| 29 while (w-- > 0) { |
| 30 unsigned aa = *mask++; |
| 31 *device = (aa << SK_A32_SHIFT) |
| 32 + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa)); |
| 33 device += 1; |
| 34 }; |
| 35 device = (uint32_t*)((char*)device + dstRB); |
| 36 mask += maskRB; |
| 37 } while (--height != 0); |
| 38 } |
| 39 |
| 40 static void D32_A8_Opaque_neon(void* SK_RESTRICT dst, size_t dstRB, |
| 41 const void* SK_RESTRICT maskPtr, size_t maskRB, |
| 42 SkColor color, int width, int height) { |
| 43 SkPMColor pmc = SkPreMultiplyColor(color); |
| 44 SkPMColor* SK_RESTRICT device = (SkPMColor*)dst; |
| 45 const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr; |
| 46 uint8x8x4_t vpmc; |
| 47 |
| 48 maskRB -= width; |
| 49 dstRB -= (width << 2); |
| 50 if (width >= 8) { |
| 51 vpmc.val[NEON_A] = vdup_n_u8(SkGetPackedA32(pmc)); |
| 52 vpmc.val[NEON_R] = vdup_n_u8(SkGetPackedR32(pmc)); |
| 53 vpmc.val[NEON_G] = vdup_n_u8(SkGetPackedG32(pmc)); |
| 54 vpmc.val[NEON_B] = vdup_n_u8(SkGetPackedB32(pmc)); |
| 55 } |
| 56 do { |
| 57 int w = width; |
| 58 while (w >= 8) { |
| 59 uint8x8_t vmask = vld1_u8(mask); |
| 60 uint16x8_t vmask256 = SkAlpha255To256_neon8(vmask); |
| 61 uint16x8_t vscale = vsubw_u8(vdupq_n_u16(256), vmask); |
| 62 uint8x8x4_t vdev = vld4_u8((uint8_t*)device); |
| 63 |
| 64 vdev.val[NEON_A] = SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256) |
| 65 + SkAlphaMul_neon8(vdev.val[NEON_A], vscale); |
| 66 vdev.val[NEON_R] = SkAlphaMul_neon8(vpmc.val[NEON_R], vmask256) |
| 67 + SkAlphaMul_neon8(vdev.val[NEON_R], vscale); |
| 68 vdev.val[NEON_G] = SkAlphaMul_neon8(vpmc.val[NEON_G], vmask256) |
| 69 + SkAlphaMul_neon8(vdev.val[NEON_G], vscale); |
| 70 vdev.val[NEON_B] = SkAlphaMul_neon8(vpmc.val[NEON_B], vmask256) |
| 71 + SkAlphaMul_neon8(vdev.val[NEON_B], vscale); |
| 72 |
| 73 vst4_u8((uint8_t*)device, vdev); |
| 74 |
| 75 mask += 8; |
| 76 device += 8; |
| 77 w -= 8; |
| 78 } |
| 79 while (w-- > 0) { |
| 80 unsigned aa = *mask++; |
| 81 *device = SkAlphaMulQ(pmc, SkAlpha255To256(aa)) |
| 82 + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa)); |
| 83 device += 1; |
| 84 }; |
| 85 device = (uint32_t*)((char*)device + dstRB); |
| 86 mask += maskRB; |
| 87 } while (--height != 0); |
| 88 } |
| 89 |
| 90 static void D32_A8_Color_neon(void* SK_RESTRICT dst, size_t dstRB, |
| 91 const void* SK_RESTRICT maskPtr, size_t maskRB, |
| 92 SkColor color, int width, int height) { |
| 93 SkPMColor pmc = SkPreMultiplyColor(color); |
| 94 size_t dstOffset = dstRB - (width << 2); |
| 95 size_t maskOffset = maskRB - width; |
| 96 SkPMColor* SK_RESTRICT device = (SkPMColor *)dst; |
| 97 const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr; |
| 98 uint8x8x4_t vpmc; |
| 99 |
| 100 if (width >= 8) { |
| 101 vpmc.val[NEON_A] = vdup_n_u8(SkGetPackedA32(pmc)); |
| 102 vpmc.val[NEON_R] = vdup_n_u8(SkGetPackedR32(pmc)); |
| 103 vpmc.val[NEON_G] = vdup_n_u8(SkGetPackedG32(pmc)); |
| 104 vpmc.val[NEON_B] = vdup_n_u8(SkGetPackedB32(pmc)); |
| 105 } |
| 106 do { |
| 107 int w = width; |
| 108 while (w >= 8) { |
| 109 uint8x8_t vmask = vld1_u8(mask); |
| 110 uint16x8_t vscale, vmask256 = SkAlpha255To256_neon8(vmask); |
| 111 uint8x8x4_t vdev = vld4_u8((uint8_t*)device); |
| 112 vscale = vsubw_u8(vdupq_n_u16(256), |
| 113 SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256)); |
| 114 |
| 115 vdev.val[NEON_A] = SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256) |
| 116 + SkAlphaMul_neon8(vdev.val[NEON_A], vscale); |
| 117 vdev.val[NEON_R] = SkAlphaMul_neon8(vpmc.val[NEON_R], vmask256) |
| 118 + SkAlphaMul_neon8(vdev.val[NEON_R], vscale); |
| 119 vdev.val[NEON_G] = SkAlphaMul_neon8(vpmc.val[NEON_G], vmask256) |
| 120 + SkAlphaMul_neon8(vdev.val[NEON_G], vscale); |
| 121 vdev.val[NEON_B] = SkAlphaMul_neon8(vpmc.val[NEON_B], vmask256) |
| 122 + SkAlphaMul_neon8(vdev.val[NEON_B], vscale); |
| 123 |
| 124 vst4_u8((uint8_t*)device, vdev); |
| 125 |
| 126 mask += 8; |
| 127 device += 8; |
| 128 w -= 8; |
| 129 } |
| 130 while (w--) { |
| 131 unsigned aa = *mask++; |
| 132 *device = SkBlendARGB32(pmc, *device, aa); |
| 133 device += 1; |
| 134 }; |
| 135 device = (uint32_t*)((char*)device + dstOffset); |
| 136 mask += maskOffset; |
| 137 } while (--height != 0); |
| 138 } |
| 139 |
| 140 SkBlitMask::ColorProc D32_A8_Factory_neon(SkColor color) { |
| 141 if (SK_ColorBLACK == color) { |
| 142 return D32_A8_Black_neon; |
| 143 } else if (0xFF == SkColorGetA(color)) { |
| 144 return D32_A8_Opaque_neon; |
| 145 } else { |
| 146 return D32_A8_Color_neon; |
| 147 } |
| 148 } |
| 149 |
| 150 //////////////////////////////////////////////////////////////////////////////// |
| 151 |
| 152 void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[], |
| 153 SkColor color, int width, |
| 154 SkPMColor opaqueDst) { |
| 155 int colR = SkColorGetR(color); |
| 156 int colG = SkColorGetG(color); |
| 157 int colB = SkColorGetB(color); |
| 158 |
| 159 uint8x8_t vcolR, vcolG, vcolB; |
| 160 uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB; |
| 161 |
| 162 if (width >= 8) { |
| 163 vcolR = vdup_n_u8(colR); |
| 164 vcolG = vdup_n_u8(colG); |
| 165 vcolB = vdup_n_u8(colB); |
| 166 vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst)); |
| 167 vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst)); |
| 168 vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst)); |
| 169 vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst)); |
| 170 } |
| 171 |
| 172 while (width >= 8) { |
| 173 uint8x8x4_t vdst; |
| 174 uint16x8_t vmask; |
| 175 uint16x8_t vmaskR, vmaskG, vmaskB; |
| 176 uint8x8_t vsel_trans, vsel_opq; |
| 177 |
| 178 vdst = vld4_u8((uint8_t*)dst); |
| 179 vmask = vld1q_u16(src); |
| 180 |
| 181 // Prepare compare masks |
| 182 vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0))); |
| 183 vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF))); |
| 184 |
| 185 // Get all the color masks on 5 bits |
| 186 vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); |
| 187 vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), |
| 188 SK_B16_BITS + SK_R16_BITS + 1); |
| 189 vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); |
| 190 |
| 191 // Upscale to 0..32 |
| 192 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); |
| 193 vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); |
| 194 vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); |
| 195 |
| 196 vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF)
); |
| 197 vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]); |
| 198 |
| 199 vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); |
| 200 vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); |
| 201 vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); |
| 202 |
| 203 vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]); |
| 204 vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]); |
| 205 vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]); |
| 206 |
| 207 vst4_u8((uint8_t*)dst, vdst); |
| 208 |
| 209 dst += 8; |
| 210 src += 8; |
| 211 width -= 8; |
| 212 } |
| 213 |
| 214 // Leftovers |
| 215 for (int i = 0; i < width; i++) { |
| 216 dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i], |
| 217 opaqueDst); |
| 218 } |
| 219 } |
| 220 |
| 221 void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[], |
| 222 SkColor color, int width, SkPMColor) { |
| 223 int colA = SkColorGetA(color); |
| 224 int colR = SkColorGetR(color); |
| 225 int colG = SkColorGetG(color); |
| 226 int colB = SkColorGetB(color); |
| 227 |
| 228 colA = SkAlpha255To256(colA); |
| 229 |
| 230 uint8x8_t vcolR, vcolG, vcolB; |
| 231 uint16x8_t vcolA; |
| 232 |
| 233 if (width >= 8) { |
| 234 vcolA = vdupq_n_u16(colA); |
| 235 vcolR = vdup_n_u8(colR); |
| 236 vcolG = vdup_n_u8(colG); |
| 237 vcolB = vdup_n_u8(colB); |
| 238 } |
| 239 |
| 240 while (width >= 8) { |
| 241 uint8x8x4_t vdst; |
| 242 uint16x8_t vmask; |
| 243 uint16x8_t vmaskR, vmaskG, vmaskB; |
| 244 |
| 245 vdst = vld4_u8((uint8_t*)dst); |
| 246 vmask = vld1q_u16(src); |
| 247 |
| 248 // Get all the color masks on 5 bits |
| 249 vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); |
| 250 vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), |
| 251 SK_B16_BITS + SK_R16_BITS + 1); |
| 252 vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); |
| 253 |
| 254 // Upscale to 0..32 |
| 255 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); |
| 256 vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); |
| 257 vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); |
| 258 |
| 259 vmaskR = vshrq_n_u16(vmaskR * vcolA, 8); |
| 260 vmaskG = vshrq_n_u16(vmaskG * vcolA, 8); |
| 261 vmaskB = vshrq_n_u16(vmaskB * vcolA, 8); |
| 262 |
| 263 vdst.val[NEON_A] = vdup_n_u8(0xFF); |
| 264 vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); |
| 265 vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); |
| 266 vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); |
| 267 |
| 268 vst4_u8((uint8_t*)dst, vdst); |
| 269 |
| 270 dst += 8; |
| 271 src += 8; |
| 272 width -= 8; |
| 273 } |
| 274 |
| 275 for (int i = 0; i < width; i++) { |
| 276 dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]); |
| 277 } |
| 278 } |
| 279 |
OLD | NEW |