Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
| 9 | 9 |
| 10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
| 11 #include "SkBlitRow.h" | 11 #include "SkBlitRow.h" |
| 12 #include "SkColorPriv.h" | 12 #include "SkColorPriv.h" |
| 13 #include "SkDither.h" | 13 #include "SkDither.h" |
| 14 #include "SkMathPriv.h" | 14 #include "SkMathPriv.h" |
| 15 #include "SkUtils.h" | 15 #include "SkUtils.h" |
| 16 | 16 |
| 17 #include "SkColor_opts_neon.h" | 17 #include "SkColor_opts_neon.h" |
| 18 #include <arm_neon.h> | 18 #include <arm_neon.h> |
| 19 | 19 |
| 20 #ifdef SK_CPU_ARM32 | 20 #ifdef SK_CPU_ARM64 |
| 21 static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) { | |
| 22 uint8x8x4_t vsrc; | |
| 23 uint8x8_t vsrc_0, vsrc_1, vsrc_2; | |
| 24 | |
| 25 asm ( | |
| 26 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n" | |
| 27 "mov %[vsrc0].8b, v0.8b \t\n" | |
| 28 "mov %[vsrc1].8b, v1.8b \t\n" | |
| 29 "mov %[vsrc2].8b, v2.8b \t\n" | |
| 30 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
| 31 [vsrc2] "=w" (vsrc_2), [src] "+&r" (src) | |
| 32 : : "v0", "v1", "v2", "v3" | |
| 33 ); | |
| 34 | |
| 35 vsrc.val[0] = vsrc_0; | |
| 36 vsrc.val[1] = vsrc_1; | |
| 37 vsrc.val[2] = vsrc_2; | |
| 38 | |
| 39 return vsrc; | |
| 40 } | |
| 41 | |
| 42 static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) { | |
| 43 uint8x8x4_t vsrc; | |
| 44 uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3; | |
| 45 | |
| 46 asm ( | |
| 47 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n" | |
| 48 "mov %[vsrc0].8b, v0.8b \t\n" | |
| 49 "mov %[vsrc1].8b, v1.8b \t\n" | |
| 50 "mov %[vsrc2].8b, v2.8b \t\n" | |
| 51 "mov %[vsrc3].8b, v3.8b \t\n" | |
| 52 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
| 53 [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3), | |
| 54 [src] "+&r" (src) | |
| 55 : : "v0", "v1", "v2", "v3" | |
| 56 ); | |
| 57 | |
| 58 vsrc.val[0] = vsrc_0; | |
| 59 vsrc.val[1] = vsrc_1; | |
| 60 vsrc.val[2] = vsrc_2; | |
| 61 vsrc.val[3] = vsrc_3; | |
| 62 | |
| 63 return vsrc; | |
| 64 } | |
| 65 #endif | |
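
The new helpers above de-interleave eight 32-bit pixels into four planar 8-byte registers and advance the source pointer, which is also what the `vld4_u8` intrinsic expresses. A minimal sketch of the equivalent, assuming a toolchain where the intrinsic compiles well on AArch64 (the patch uses inline asm instead, presumably to pin the codegen and fold the pointer post-increment into the `ld4`); the function name is illustrative, not part of the patch:

```cpp
#include <arm_neon.h>
#include <stdint.h>

// De-interleave 8 RGBA pixels (32 bytes) into four 8-byte planes:
// val[0] = byte 0 of every pixel, val[1] = byte 1, and so on.
static inline uint8x8x4_t load_8_pixels_deinterleaved(const uint32_t*& src) {
    uint8x8x4_t v = vld4_u8(reinterpret_cast<const uint8_t*>(src));
    src += 8;  // the asm helpers fold this into "ld4 ..., [%[src]], #32"
    return v;
}
```
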
| 66 | |
| 21 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, | 67 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
| 22 const SkPMColor* SK_RESTRICT src, int count, | 68 const SkPMColor* SK_RESTRICT src, int count, |
| 23 U8CPU alpha, int /*x*/, int /*y*/) { | 69 U8CPU alpha, int /*x*/, int /*y*/) { |
| 24 SkASSERT(255 == alpha); | 70 SkASSERT(255 == alpha); |
| 25 | 71 |
| 26 while (count >= 8) { | 72 while (count >= 8) { |
| 27 uint8x8x4_t vsrc; | 73 uint8x8x4_t vsrc; |
| 28 uint16x8_t vdst; | 74 uint16x8_t vdst; |
| 29 | 75 |
| 30 // Load | 76 // Load |
| 77 #ifdef SK_CPU_ARM64 | |
| 78 vsrc = sk_vld4_u8_arm64_3(src); | |
| 79 #else | |
| 31 vsrc = vld4_u8((uint8_t*)src); | 80 vsrc = vld4_u8((uint8_t*)src); |
| 81 src += 8; | |
| 82 #endif | |
| 32 | 83 |
| 33 // Convert src to 565 | 84 // Convert src to 565 |
| 34 vdst = SkPixel32ToPixel16_neon8(vsrc); | 85 vdst = SkPixel32ToPixel16_neon8(vsrc); |
| 35 | 86 |
| 36 // Store | 87 // Store |
| 37 vst1q_u16(dst, vdst); | 88 vst1q_u16(dst, vdst); |
| 38 | 89 |
| 39 // Prepare next iteration | 90 // Prepare next iteration |
| 40 dst += 8; | 91 dst += 8; |
| 41 src += 8; | |
| 42 count -= 8; | 92 count -= 8; |
| 43 }; | 93 }; |
| 44 | 94 |
| 45 // Leftovers | 95 // Leftovers |
| 46 while (count > 0) { | 96 while (count > 0) { |
| 47 SkPMColor c = *src++; | 97 SkPMColor c = *src++; |
| 48 SkPMColorAssert(c); | 98 SkPMColorAssert(c); |
| 49 *dst = SkPixel32ToPixel16_ToU16(c); | 99 *dst = SkPixel32ToPixel16_ToU16(c); |
| 50 dst++; | 100 dst++; |
| 51 count--; | 101 count--; |
| 52 }; | 102 }; |
| 53 } | 103 } |
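
The vector loop above is eight copies of a simple bit-packing step. A scalar model of what `SkPixel32ToPixel16_neon8` computes per pixel, keeping the top 5/6/5 bits of each channel (channel extraction order is illustrative; the real code selects lanes via `NEON_R`/`NEON_G`/`NEON_B` according to `SK_PMCOLOR_BYTE_ORDER`):

```cpp
#include <stdint.h>

// Pack 8-bit R/G/B into a 16-bit 565 pixel by truncating each channel.
static inline uint16_t pack_8888_to_565(uint8_t r, uint8_t g, uint8_t b) {
    return (uint16_t)(((r >> 3) << 11) |  // 5 bits of red at the top
                      ((g >> 2) << 5)  |  // 6 bits of green in the middle
                      (b >> 3));          // 5 bits of blue at the bottom
}
```
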
| 54 | 104 |
| 55 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst, | 105 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
| 56 const SkPMColor* SK_RESTRICT src, int count, | 106 const SkPMColor* SK_RESTRICT src, int count, |
| 57 U8CPU alpha, int /*x*/, int /*y*/) { | 107 U8CPU alpha, int /*x*/, int /*y*/) { |
| 58 SkASSERT(255 > alpha); | 108 SkASSERT(255 > alpha); |
| 59 | 109 |
| 60 uint16x8_t vmask_blue, vscale; | 110 uint16x8_t vmask_blue, vscale; |
| 61 | 111 |
| 62 // prepare constants | 112 // prepare constants |
| 63 vscale = vdupq_n_u16(SkAlpha255To256(alpha)); | 113 vscale = vdupq_n_u16(SkAlpha255To256(alpha)); |
| 64 vmask_blue = vmovq_n_u16(0x1F); | 114 vmask_blue = vmovq_n_u16(0x1F); |
| 65 | 115 |
| 66 while (count >= 8) { | 116 while (count >= 8) { |
| 117 uint8x8x4_t vsrc; | |
| 67 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; | 118 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; |
| 68 uint16x8_t vres_r, vres_g, vres_b; | 119 uint16x8_t vres_r, vres_g, vres_b; |
| 69 uint8x8_t vsrc_r, vsrc_g, vsrc_b; | |
| 70 | 120 |
| 71 // Load src | 121 // Load src |
| 122 #ifdef SK_CPU_ARM64 | |
| 123 vsrc = sk_vld4_u8_arm64_3(src); | |
| 124 #else | |
| 72 { | 125 { |
| 73 register uint8x8_t d0 asm("d0"); | 126 register uint8x8_t d0 asm("d0"); |
| 74 register uint8x8_t d1 asm("d1"); | 127 register uint8x8_t d1 asm("d1"); |
| 75 register uint8x8_t d2 asm("d2"); | 128 register uint8x8_t d2 asm("d2"); |
| 76 register uint8x8_t d3 asm("d3"); | 129 register uint8x8_t d3 asm("d3"); |
| 77 | 130 |
| 78 asm ( | 131 asm ( |
| 79 "vld4.8 {d0-d3},[%[src]]!" | 132 "vld4.8 {d0-d3},[%[src]]!" |
| 80 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) | 133 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
| 81 : | 134 : |
| 82 ); | 135 ); |
| 83 vsrc_g = d1; | 136 vsrc.val[0] = d0; |
| 84 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 137 vsrc.val[1] = d1; |
| 85 vsrc_r = d2; vsrc_b = d0; | 138 vsrc.val[2] = d2; |
| 86 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 139 } |
| 87 vsrc_r = d0; vsrc_b = d2; | |
| 88 #endif | 140 #endif |
| 89 } | |
| 90 | 141 |
| 91 // Load and unpack dst | 142 // Load and unpack dst |
| 92 vdst = vld1q_u16(dst); | 143 vdst = vld1q_u16(dst); |
| 93 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes | 144 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes |
| 94 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue | 145 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue |
| 95 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red | 146 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red |
| 96 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green | 147 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green |
| 97 | 148 |
| 98 // Shift src to 565 | 149 // Shift src to 565 range |
| 99 vsrc_r = vshr_n_u8(vsrc_r, 3); // shift red to 565 range | 150 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3); |
| 100 vsrc_g = vshr_n_u8(vsrc_g, 2); // shift green to 565 range | 151 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2); |
| 101 vsrc_b = vshr_n_u8(vsrc_b, 3); // shift blue to 565 range | 152 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3); |
| 102 | 153 |
| 103 // Scale src - dst | 154 // Scale src - dst |
| 104 vres_r = vmovl_u8(vsrc_r) - vdst_r; | 155 vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r; |
| 105 vres_g = vmovl_u8(vsrc_g) - vdst_g; | 156 vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g; |
| 106 vres_b = vmovl_u8(vsrc_b) - vdst_b; | 157 vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b; |
| 107 | 158 |
| 108 vres_r = vshrq_n_u16(vres_r * vscale, 8); | 159 vres_r = vshrq_n_u16(vres_r * vscale, 8); |
| 109 vres_g = vshrq_n_u16(vres_g * vscale, 8); | 160 vres_g = vshrq_n_u16(vres_g * vscale, 8); |
| 110 vres_b = vshrq_n_u16(vres_b * vscale, 8); | 161 vres_b = vshrq_n_u16(vres_b * vscale, 8); |
| 111 | 162 |
| 112 vres_r += vdst_r; | 163 vres_r += vdst_r; |
| 113 vres_g += vdst_g; | 164 vres_g += vdst_g; |
| 114 vres_b += vdst_b; | 165 vres_b += vdst_b; |
| 115 | 166 |
| 116 // Combine | 167 // Combine |
| (...skipping 12 matching lines...) | |
| 129 SkPMColorAssert(c); | 180 SkPMColorAssert(c); |
| 130 uint16_t d = *dst; | 181 uint16_t d = *dst; |
| 131 *dst++ = SkPackRGB16( | 182 *dst++ = SkPackRGB16( |
| 132 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), | 183 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), |
| 133 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), | 184 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), |
| 134 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); | 185 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); |
| 135 } while (--count != 0); | 186 } while (--count != 0); |
| 136 } | 187 } |
| 137 } | 188 } |
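
Both the vector body and the scalar leftovers of `S32_D565_Blend_neon` compute the same per-channel lerp. A minimal scalar sketch: with `scale = SkAlpha255To256(alpha) = alpha + 1` (range 1..256), each 565-width channel moves from dst toward src. The NEON code reaches the identical result through modular uint16 arithmetic; the signed form below is just easier to read:

```cpp
// src and dst are already reduced to the same 565 channel width.
static inline unsigned blend_565_channel(unsigned src, unsigned dst,
                                         unsigned scale /* 1..256 */) {
    int diff = (int)src - (int)dst;           // may be negative
    return dst + ((diff * (int)scale) >> 8);  // dst + (src - dst) * scale / 256
}
```
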
| 138 | 189 |
| 190 #ifdef SK_CPU_ARM32 | |
| 139 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, | 191 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
| 140 const SkPMColor* SK_RESTRICT src, int count, | 192 const SkPMColor* SK_RESTRICT src, int count, |
| 141 U8CPU alpha, int /*x*/, int /*y*/) { | 193 U8CPU alpha, int /*x*/, int /*y*/) { |
| 142 SkASSERT(255 == alpha); | 194 SkASSERT(255 == alpha); |
| 143 | 195 |
| 144 if (count >= 8) { | 196 if (count >= 8) { |
| 145 uint16_t* SK_RESTRICT keep_dst = 0; | 197 uint16_t* SK_RESTRICT keep_dst = 0; |
| 146 | 198 |
| 147 asm volatile ( | 199 asm volatile ( |
| 148 "ands ip, %[count], #7 \n\t" | 200 "ands ip, %[count], #7 \n\t" |
| (...skipping 157 matching lines...) | |
| 306 | 358 |
| 307 "21: \n\t" | 359 "21: \n\t" |
| 308 : [count] "+r" (count) | 360 : [count] "+r" (count) |
| 309 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) | 361 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) |
| 310 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", | 362 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", |
| 311 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", | 363 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", |
| 312 "d30","d31" | 364 "d30","d31" |
| 313 ); | 365 ); |
| 314 } | 366 } |
| 315 } | 367 } |
| 368 #endif | |
| 316 | 369 |
| 317 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { | 370 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { |
| 318 prod += vdupq_n_u16(128); | 371 prod += vdupq_n_u16(128); |
| 319 prod += vshrq_n_u16(prod, 8); | 372 prod += vshrq_n_u16(prod, 8); |
| 320 return vshrq_n_u16(prod, 8); | 373 return vshrq_n_u16(prod, 8); |
| 321 } | 374 } |
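
`SkDiv255Round_neon8` is the standard branch-free rounded divide by 255, applied to eight lanes at once. A scalar equivalent for one product of two 8-bit values:

```cpp
// Exact round(x / 255) for x in [0, 255*255]: adding 128 rounds, and the
// extra (x >> 8) term corrects the x/256 approximation up to x/255.
static inline unsigned div255_round(unsigned prod) {
    prod += 128;
    return (prod + (prod >> 8)) >> 8;
}
```
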
| 322 | 375 |
| 323 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, | 376 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
| 324 const SkPMColor* SK_RESTRICT src, int count, | 377 const SkPMColor* SK_RESTRICT src, int count, |
| 325 U8CPU alpha, int /*x*/, int /*y*/) { | 378 U8CPU alpha, int /*x*/, int /*y*/) { |
| (...skipping 13 matching lines...) | |
| 339 valpha = vdup_n_u8(alpha); | 392 valpha = vdup_n_u8(alpha); |
| 340 vmask_blue = vmovq_n_u16(SK_B16_MASK); | 393 vmask_blue = vmovq_n_u16(SK_B16_MASK); |
| 341 | 394 |
| 342 do { | 395 do { |
| 343 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; | 396 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; |
| 344 uint16x8_t vres_a, vres_r, vres_g, vres_b; | 397 uint16x8_t vres_a, vres_r, vres_g, vres_b; |
| 345 uint8x8x4_t vsrc; | 398 uint8x8x4_t vsrc; |
| 346 | 399 |
| 347 // load pixels | 400 // load pixels |
| 348 vdst = vld1q_u16(dst); | 401 vdst = vld1q_u16(dst); |
| 402 #ifdef SK_CPU_ARM64 | |
| 403 vsrc = sk_vld4_u8_arm64_4(src); | |
| 404 #else | |
| 349 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 405 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
> **mtklein** 2014/06/06 14:41:17
> Think it makes sense to follow up and do the same
| 350 asm ( | 406 asm ( |
| 351 "vld4.u8 %h[vsrc], [%[src]]!" | 407 "vld4.u8 %h[vsrc], [%[src]]!" |
| 352 : [vsrc] "=w" (vsrc), [src] "+&r" (src) | 408 : [vsrc] "=w" (vsrc), [src] "+&r" (src) |
| 353 : : | 409 : : |
| 354 ); | 410 ); |
| 355 #else | 411 #else |
| 356 register uint8x8_t d0 asm("d0"); | 412 register uint8x8_t d0 asm("d0"); |
| 357 register uint8x8_t d1 asm("d1"); | 413 register uint8x8_t d1 asm("d1"); |
| 358 register uint8x8_t d2 asm("d2"); | 414 register uint8x8_t d2 asm("d2"); |
| 359 register uint8x8_t d3 asm("d3"); | 415 register uint8x8_t d3 asm("d3"); |
| 360 | 416 |
| 361 asm volatile ( | 417 asm volatile ( |
| 362 "vld4.u8 {d0-d3},[%[src]]!;" | 418 "vld4.u8 {d0-d3},[%[src]]!;" |
| 363 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), | 419 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), |
| 364 [src] "+&r" (src) | 420 [src] "+&r" (src) |
| 365 : : | 421 : : |
| 366 ); | 422 ); |
| 367 vsrc.val[0] = d0; | 423 vsrc.val[0] = d0; |
| 368 vsrc.val[1] = d1; | 424 vsrc.val[1] = d1; |
| 369 vsrc.val[2] = d2; | 425 vsrc.val[2] = d2; |
| 370 vsrc.val[3] = d3; | 426 vsrc.val[3] = d3; |
| 371 #endif | 427 #endif |
| 428 #endif // #ifdef SK_CPU_ARM64 | |
| 372 | 429 |
| 373 | 430 |
| 374 // deinterleave dst | 431 // deinterleave dst |
| 375 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes | 432 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes |
| 376 vdst_b = vdst & vmask_blue; // extract blue | 433 vdst_b = vdst & vmask_blue; // extract blue |
| 377 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red | 434 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red |
| 378 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green | 435 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green |
| 379 | 436 |
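
The three-shift sequence above isolates the 565 fields; green is shifted to the top of the lane first so a single right shift can extract it without a mask. A scalar model using the constants from SkColorPriv.h, which this file already includes (SK_R16_BITS=5, SK_G16_BITS=6, SK_B16_BITS=5):

```cpp
#include "SkColorPriv.h"  // SK_R16_SHIFT, SK_G16_SHIFT, SK_G16_MASK, SK_B16_MASK

static inline void unpack_565(uint16_t pix, unsigned* r, unsigned* g, unsigned* b) {
    *r = pix >> SK_R16_SHIFT;                  // top 5 bits
    *g = (pix >> SK_G16_SHIFT) & SK_G16_MASK;  // middle 6 bits
    *b = pix & SK_B16_MASK;                    // bottom 5 bits
}
```
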
| 380 // shift src to 565 | 437 // shift src to 565 |
| 381 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); | 438 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); |
| (...skipping 79 matching lines...) | |
| 461 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 518 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
| 462 | 519 |
| 463 uint8x8_t vdither = vld1_u8(dstart); // load dither values | 520 uint8x8_t vdither = vld1_u8(dstart); // load dither values |
| 464 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values | 521 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values |
| 465 | 522 |
| 466 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg | 523 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg |
| 467 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask | 524 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask |
| 468 | 525 |
| 469 do { | 526 do { |
| 470 | 527 |
| 528 uint8x8x4_t vsrc; | |
| 471 uint8x8_t vsrc_r, vsrc_g, vsrc_b; | 529 uint8x8_t vsrc_r, vsrc_g, vsrc_b; |
| 472 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; | 530 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; |
| 473 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; | 531 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; |
| 474 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b; | 532 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b; |
| 475 uint16x8_t vdst; | 533 uint16x8_t vdst; |
| 476 uint16x8_t vdst_r, vdst_g, vdst_b; | 534 uint16x8_t vdst_r, vdst_g, vdst_b; |
| 477 int16x8_t vres_r, vres_g, vres_b; | 535 int16x8_t vres_r, vres_g, vres_b; |
| 478 int8x8_t vres8_r, vres8_g, vres8_b; | 536 int8x8_t vres8_r, vres8_g, vres8_b; |
| 479 | 537 |
| 480 // Load source and add dither | 538 // Load source and add dither |
| 539 #ifdef SK_CPU_ARM64 | |
| 540 vsrc = sk_vld4_u8_arm64_3(src); | |
| 541 #else | |
| 481 { | 542 { |
| 482 register uint8x8_t d0 asm("d0"); | 543 register uint8x8_t d0 asm("d0"); |
| 483 register uint8x8_t d1 asm("d1"); | 544 register uint8x8_t d1 asm("d1"); |
| 484 register uint8x8_t d2 asm("d2"); | 545 register uint8x8_t d2 asm("d2"); |
| 485 register uint8x8_t d3 asm("d3"); | 546 register uint8x8_t d3 asm("d3"); |
| 486 | 547 |
| 487 asm ( | 548 asm ( |
| 488 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 549 "vld4.8 {d0-d3},[%[src]]! " |
| 489 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) | 550 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
| 490 : | 551 : |
| 491 ); | 552 ); |
| 492 vsrc_g = d1; | 553 vsrc.val[0] = d0; |
| 493 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 554 vsrc.val[1] = d1; |
| 494 vsrc_r = d2; vsrc_b = d0; | 555 vsrc.val[2] = d2; |
| 495 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 556 } |
| 496 vsrc_r = d0; vsrc_b = d2; | |
| 497 #endif | 557 #endif |
| 498 } | 558 vsrc_r = vsrc.val[NEON_R]; |
| 559 vsrc_g = vsrc.val[NEON_G]; | |
| 560 vsrc_b = vsrc.val[NEON_B]; | |
| 499 | 561 |
| 500 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 | 562 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 |
| 501 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 | 563 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 |
| 502 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5 | 564 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5 |
| 503 | 565 |
| 504 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen | 566 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen |
| 505 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen | 567 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen |
| 506 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen | 568 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen |
| 507 | 569 |
| 508 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result | 570 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result |
| (...skipping 60 matching lines...) | |
| 569 sb = SkDITHER_B32To565(sb, dither); | 631 sb = SkDITHER_B32To565(sb, dither); |
| 570 | 632 |
| 571 uint16_t d = *dst; | 633 uint16_t d = *dst; |
| 572 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), | 634 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), |
| 573 SkAlphaBlend(sg, SkGetPackedG16(d), scale), | 635 SkAlphaBlend(sg, SkGetPackedG16(d), scale), |
| 574 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); | 636 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); |
| 575 DITHER_INC_X(x); | 637 DITHER_INC_X(x); |
| 576 } while (--count != 0); | 638 } while (--count != 0); |
| 577 } | 639 } |
| 578 } | 640 } |
| 579 #endif | |
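
The dither step this function vectorizes matches Skia's scalar SkDITHER_*32To565 macros: nudge each 8-bit channel by the matrix value d (0..7; green uses d>>1 since it keeps 6 bits), subtract the channel's top-bit echo so the sum cannot overflow 8 significant bits, then truncate to 565 width. A sketch of the per-channel formula:

```cpp
// bits = 5 for red/blue, 6 for green; pass d >> 1 for green.
static inline unsigned dither_channel_to_565(unsigned c, unsigned d, int bits) {
    return (c + d - (c >> bits)) >> (8 - bits);
}
```
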
| 580 | 641 |
| 581 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 642 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
| 582 const SkPMColor* SK_RESTRICT src, | 643 const SkPMColor* SK_RESTRICT src, |
| 583 int count, U8CPU alpha) { | 644 int count, U8CPU alpha) { |
| 584 | 645 |
| 585 SkASSERT(255 == alpha); | 646 SkASSERT(255 == alpha); |
| 586 if (count > 0) { | 647 if (count > 0) { |
| 587 | 648 |
| 588 | 649 |
| 589 uint8x8_t alpha_mask; | 650 uint8x8_t alpha_mask; |
| (...skipping 450 matching lines...) | |
| 1040 uint16_t *pc = (uint16_t*) p; | 1101 uint16_t *pc = (uint16_t*) p; |
| 1041 sprintf(buf,"%8s:", str); | 1102 sprintf(buf,"%8s:", str); |
| 1042 len = (len / sizeof(uint16_t)); /* passed as bytes */ | 1103 len = (len / sizeof(uint16_t)); /* passed as bytes */ |
| 1043 for(i=0;i<len;i++) { | 1104 for(i=0;i<len;i++) { |
| 1044 sprintf(tbuf, " %04x", pc[i]); | 1105 sprintf(tbuf, " %04x", pc[i]); |
| 1045 strcat(buf, tbuf); | 1106 strcat(buf, tbuf); |
| 1046 } | 1107 } |
| 1047 SkDebugf("%s\n", buf); | 1108 SkDebugf("%s\n", buf); |
| 1048 } | 1109 } |
| 1049 #endif | 1110 #endif |
| 1111 #endif // #ifdef SK_CPU_ARM32 | |
| 1050 | 1112 |
| 1051 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, | 1113 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
| 1052 const SkPMColor* SK_RESTRICT src, | 1114 const SkPMColor* SK_RESTRICT src, |
| 1053 int count, U8CPU alpha, int x, int y) { | 1115 int count, U8CPU alpha, int x, int y) { |
| 1054 SkASSERT(255 == alpha); | 1116 SkASSERT(255 == alpha); |
| 1055 | 1117 |
| 1056 #define UNROLL 8 | 1118 #define UNROLL 8 |
| 1057 | 1119 |
| 1058 if (count >= UNROLL) { | 1120 if (count >= UNROLL) { |
| 1059 | 1121 |
| 1060 #if defined(DEBUG_OPAQUE_DITHER) | 1122 #if defined(DEBUG_OPAQUE_DITHER) |
| 1061 uint16_t tmpbuf[UNROLL]; | 1123 uint16_t tmpbuf[UNROLL]; |
| 1062 int td[UNROLL]; | 1124 int td[UNROLL]; |
| 1063 int tdv[UNROLL]; | 1125 int tdv[UNROLL]; |
| 1064 int ta[UNROLL]; | 1126 int ta[UNROLL]; |
| 1065 int tap[UNROLL]; | 1127 int tap[UNROLL]; |
| 1066 uint16_t in_dst[UNROLL]; | 1128 uint16_t in_dst[UNROLL]; |
| 1067 int offset = 0; | 1129 int offset = 0; |
| 1068 int noisy = 0; | 1130 int noisy = 0; |
| 1069 #endif | 1131 #endif |
| 1070 | 1132 |
| 1071 uint8x8_t dbase; | 1133 uint8x8_t dbase; |
| 1072 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 1134 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
| 1073 dbase = vld1_u8(dstart); | 1135 dbase = vld1_u8(dstart); |
| 1074 | 1136 |
| 1075 do { | 1137 do { |
| 1138 uint8x8x4_t vsrc; | |
| 1076 uint8x8_t sr, sg, sb, sa, d; | 1139 uint8x8_t sr, sg, sb, sa, d; |
| 1077 uint16x8_t dst8, scale8, alpha8; | 1140 uint16x8_t dst8, scale8, alpha8; |
| 1078 uint16x8_t dst_r, dst_g, dst_b; | 1141 uint16x8_t dst_r, dst_g, dst_b; |
| 1079 | 1142 |
| 1080 #if defined(DEBUG_OPAQUE_DITHER) | 1143 #if defined(DEBUG_OPAQUE_DITHER) |
| 1081 // calculate 8 elements worth into a temp buffer | 1144 // calculate 8 elements worth into a temp buffer |
| 1082 { | 1145 { |
| 1083 int my_y = y; | 1146 int my_y = y; |
| 1084 int my_x = x; | 1147 int my_x = x; |
| 1085 SkPMColor* my_src = (SkPMColor*)src; | 1148 SkPMColor* my_src = (SkPMColor*)src; |
| (...skipping 30 matching lines...) | |
| 1116 tmpbuf[i] = *my_dst; | 1179 tmpbuf[i] = *my_dst; |
| 1117 ta[i] = tdv[i] = td[i] = 0xbeef; | 1180 ta[i] = tdv[i] = td[i] = 0xbeef; |
| 1118 } | 1181 } |
| 1119 in_dst[i] = *my_dst; | 1182 in_dst[i] = *my_dst; |
| 1120 my_dst += 1; | 1183 my_dst += 1; |
| 1121 DITHER_INC_X(my_x); | 1184 DITHER_INC_X(my_x); |
| 1122 } | 1185 } |
| 1123 } | 1186 } |
| 1124 #endif | 1187 #endif |
| 1125 | 1188 |
| 1126 | 1189 #ifdef SK_CPU_ARM64 |
| 1190 vsrc = sk_vld4_u8_arm64_4(src); | |
| 1191 #else | |
| 1127 { | 1192 { |
| 1128 register uint8x8_t d0 asm("d0"); | 1193 register uint8x8_t d0 asm("d0"); |
| 1129 register uint8x8_t d1 asm("d1"); | 1194 register uint8x8_t d1 asm("d1"); |
| 1130 register uint8x8_t d2 asm("d2"); | 1195 register uint8x8_t d2 asm("d2"); |
| 1131 register uint8x8_t d3 asm("d3"); | 1196 register uint8x8_t d3 asm("d3"); |
| 1132 | 1197 |
| 1133 asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 1198 asm ("vld4.8 {d0-d3},[%[src]]! " |
| 1134 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) | 1199 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) |
| 1135 : | 1200 : |
| 1136 ); | 1201 ); |
| 1137 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 1202 vsrc.val[0] = d0; |
| 1138 sr = d2; sg = d1; sb = d0; sa = d3; | 1203 vsrc.val[1] = d1; |
| 1139 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 1204 vsrc.val[2] = d2; |
| 1140 sr = d0; sg = d1; sb = d2; sa = d3; | 1205 vsrc.val[3] = d3; |
| 1206 } | |
| 1141 #endif | 1207 #endif |
| 1142 } | 1208 sa = vsrc.val[NEON_A]; |
| 1209 sr = vsrc.val[NEON_R]; | |
| 1210 sg = vsrc.val[NEON_G]; | |
| 1211 sb = vsrc.val[NEON_B]; | |
| 1143 | 1212 |
| 1144 /* calculate 'd', which will be 0..7 | 1213 /* calculate 'd', which will be 0..7 |
| 1145 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice | 1214 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice |
| 1146 */ | 1215 */ |
| 1147 alpha8 = vmovl_u8(dbase); | 1216 alpha8 = vmovl_u8(dbase); |
| 1148 alpha8 = vmlal_u8(alpha8, sa, dbase); | 1217 alpha8 = vmlal_u8(alpha8, sa, dbase); |
| 1149 d = vshrn_n_u16(alpha8, 8); // narrowing too | 1218 d = vshrn_n_u16(alpha8, 8); // narrowing too |
| 1150 | 1219 |
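
The `vmovl_u8`/`vmlal_u8`/`vshrn_n_u16` triple above scales the dither value by source alpha, so fully transparent pixels receive no dither. Scalar equivalent: with d in 0..7 from `dbase` and the 8-bit alpha `sa`, the accumulator holds d + sa*d = d*(sa+1), then narrows by 8:

```cpp
// Result stays in 0..7: sa = 255 keeps d unchanged, sa = 0 yields 0.
static inline unsigned alpha_scaled_dither(unsigned d, unsigned sa) {
    return (d * (sa + 1)) >> 8;
}
```
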
| 1151 // sr = sr - (sr>>5) + d | 1220 // sr = sr - (sr>>5) + d |
| 1152 /* watching for 8-bit overflow. d is 0..7; risky range of | 1221 /* watching for 8-bit overflow. d is 0..7; risky range of |
| (...skipping 123 matching lines...) | |
| 1276 #define UNROLL 8 | 1345 #define UNROLL 8 |
| 1277 if (count >= UNROLL) { | 1346 if (count >= UNROLL) { |
| 1278 uint8x8_t d; | 1347 uint8x8_t d; |
| 1279 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 1348 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
| 1280 d = vld1_u8(dstart); | 1349 d = vld1_u8(dstart); |
| 1281 | 1350 |
| 1282 while (count >= UNROLL) { | 1351 while (count >= UNROLL) { |
| 1283 uint8x8_t sr, sg, sb; | 1352 uint8x8_t sr, sg, sb; |
| 1284 uint16x8_t dr, dg, db; | 1353 uint16x8_t dr, dg, db; |
| 1285 uint16x8_t dst8; | 1354 uint16x8_t dst8; |
| 1355 uint8x8x4_t vsrc; | |
| 1286 | 1356 |
| 1357 #ifdef SK_CPU_ARM64 | |
| 1358 vsrc = sk_vld4_u8_arm64_3(src); | |
| 1359 #else | |
| 1287 { | 1360 { |
| 1288 register uint8x8_t d0 asm("d0"); | 1361 register uint8x8_t d0 asm("d0"); |
| 1289 register uint8x8_t d1 asm("d1"); | 1362 register uint8x8_t d1 asm("d1"); |
| 1290 register uint8x8_t d2 asm("d2"); | 1363 register uint8x8_t d2 asm("d2"); |
| 1291 register uint8x8_t d3 asm("d3"); | 1364 register uint8x8_t d3 asm("d3"); |
| 1292 | 1365 |
| 1293 asm ( | 1366 asm ( |
| 1294 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 1367 "vld4.8 {d0-d3},[%[src]]! " |
| 1295 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) | 1368 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
| 1296 : | 1369 : |
| 1297 ); | 1370 ); |
| 1298 sg = d1; | 1371 vsrc.val[0] = d0; |
| 1299 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 1372 vsrc.val[1] = d1; |
| 1300 sr = d2; sb = d0; | 1373 vsrc.val[2] = d2; |
| 1301 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 1374 } |
| 1302 sr = d0; sb = d2; | |
| 1303 #endif | 1375 #endif |
| 1304 } | 1376 sr = vsrc.val[NEON_R]; |
| 1377 sg = vsrc.val[NEON_G]; | |
| 1378 sb = vsrc.val[NEON_B]; | |
| 1379 | |
| 1305 /* XXX: if we want to prefetch, hide it in the above asm() | 1380 /* XXX: if we want to prefetch, hide it in the above asm() |
| 1306 * using the gcc __builtin_prefetch(), the prefetch will | 1381 * using the gcc __builtin_prefetch(), the prefetch will |
| 1307 * fall to the bottom of the loop -- it won't stick up | 1382 * fall to the bottom of the loop -- it won't stick up |
| 1308 * at the top of the loop, just after the vld4. | 1383 * at the top of the loop, just after the vld4. |
| 1309 */ | 1384 */ |
| 1310 | 1385 |
| 1311 // sr = sr - (sr>>5) + d | 1386 // sr = sr - (sr>>5) + d |
| 1312 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); | 1387 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); |
| 1313 dr = vaddl_u8(sr, d); | 1388 dr = vaddl_u8(sr, d); |
| 1314 | 1389 |
| (...skipping 47 matching lines...) | |
| 1362 SkPMColor c = *src++; | 1437 SkPMColor c = *src++; |
| 1363 SkPMColorAssert(c); | 1438 SkPMColorAssert(c); |
| 1364 SkASSERT(SkGetPackedA32(c) == 255); | 1439 SkASSERT(SkGetPackedA32(c) == 255); |
| 1365 | 1440 |
| 1366 unsigned dither = DITHER_VALUE(x); | 1441 unsigned dither = DITHER_VALUE(x); |
| 1367 *dst++ = SkDitherRGB32To565(c, dither); | 1442 *dst++ = SkDitherRGB32To565(c, dither); |
| 1368 DITHER_INC_X(x); | 1443 DITHER_INC_X(x); |
| 1369 } while (--count != 0); | 1444 } while (--count != 0); |
| 1370 } | 1445 } |
| 1371 } | 1446 } |
| 1372 #endif | |
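
The leftover loop relies on `SkDitherRGB32To565`; a scalar model of the full per-pixel conversion, built from the same per-channel formula sketched earlier (channel extraction order illustrative):

```cpp
#include <stdint.h>

static inline uint16_t dither_rgb32_to_565(uint8_t r, uint8_t g, uint8_t b,
                                           unsigned d /* 0..7 */) {
    unsigned dr = (r + d        - (r >> 5)) >> 3;  // 5-bit red
    unsigned dg = (g + (d >> 1) - (g >> 6)) >> 2;  // 6-bit green, half dither
    unsigned db = (b + d        - (b >> 5)) >> 3;  // 5-bit blue
    return (uint16_t)((dr << 11) | (dg << 5) | db);
}
```
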
| 1373 | 1447 |
| 1374 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, | 1448 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, |
| 1375 SkPMColor color) { | 1449 SkPMColor color) { |
| 1376 if (count <= 0) { | 1450 if (count <= 0) { |
| 1377 return; | 1451 return; |
| 1378 } | 1452 } |
| 1379 | 1453 |
| 1380 if (0 == color) { | 1454 if (0 == color) { |
| 1381 if (src != dst) { | 1455 if (src != dst) { |
| 1382 memcpy(dst, src, count * sizeof(SkPMColor)); | 1456 memcpy(dst, src, count * sizeof(SkPMColor)); |
| (...skipping 85 matching lines...) | |
| 1468 *dst = color + SkAlphaMulQ(*src, scale); | 1542 *dst = color + SkAlphaMulQ(*src, scale); |
| 1469 src += 1; | 1543 src += 1; |
| 1470 dst += 1; | 1544 dst += 1; |
| 1471 count--; | 1545 count--; |
| 1472 } | 1546 } |
| 1473 } | 1547 } |
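
`Color32_arm_neon`'s general case is src-over with a constant premultiplied color: `scale = SkAlpha255To256(255 - alpha(color))`, and `SkAlphaMulQ` multiplies all four channels of `src` by scale/256 in two paired 16-bit lanes. A scalar sketch of one pixel:

```cpp
#include <stdint.h>

static inline uint32_t color32_one_pixel(uint32_t src, uint32_t color,
                                         unsigned scale /* 0..256 */) {
    // Multiply the two channel pairs separately so each product has
    // headroom, then recombine; this is SkAlphaMulQ's trick.
    uint32_t rb = ((src & 0x00FF00FF) * scale) >> 8;
    uint32_t ag = ((src >> 8) & 0x00FF00FF) * scale;
    return color + ((rb & 0x00FF00FF) | (ag & 0xFF00FF00));
}
```
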
| 1474 | 1548 |
| 1475 /////////////////////////////////////////////////////////////////////////////// | 1549 /////////////////////////////////////////////////////////////////////////////// |
| 1476 | 1550 |
| 1477 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { | 1551 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { |
| 1478 #ifdef SK_CPU_ARM32 | |
| 1479 // no dither | 1552 // no dither |
| 1480 S32_D565_Opaque_neon, | 1553 S32_D565_Opaque_neon, |
| 1481 S32_D565_Blend_neon, | 1554 S32_D565_Blend_neon, |
| 1555 #ifdef SK_CPU_ARM32 | |
| 1482 S32A_D565_Opaque_neon, | 1556 S32A_D565_Opaque_neon, |
| 1557 #else | |
| 1558 NULL, | |
| 1559 #endif | |
| 1483 S32A_D565_Blend_neon, | 1560 S32A_D565_Blend_neon, |
| 1484 | 1561 |
| 1485 // dither | 1562 // dither |
| 1486 S32_D565_Opaque_Dither_neon, | 1563 S32_D565_Opaque_Dither_neon, |
| 1487 S32_D565_Blend_Dither_neon, | 1564 S32_D565_Blend_Dither_neon, |
| 1488 S32A_D565_Opaque_Dither_neon, | 1565 S32A_D565_Opaque_Dither_neon, |
| 1489 NULL, // S32A_D565_Blend_Dither | 1566 NULL, // S32A_D565_Blend_Dither |
| 1490 #else | |
| 1491 NULL, NULL, NULL, NULL, | |
| 1492 NULL, NULL, NULL, NULL | |
| 1493 #endif | |
| 1494 }; | 1567 }; |
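
On the ARM64 side the table now supplies real procs for the first two slots and NULL where only the ARM32 asm version exists. A hypothetical sketch (function and table names invented for illustration, not Skia's actual factory code) of why a NULL slot is safe; callers check the platform table first and fall back to the portable routine:

```cpp
// Illustrative only: "platform_procs" and "portable_procs" are stand-in names.
static SkBlitRow::Proc choose_proc(const SkBlitRow::Proc* platform_procs,
                                   const SkBlitRow::Proc* portable_procs,
                                   unsigned flags) {
    SkBlitRow::Proc proc = platform_procs[flags];
    return proc ? proc : portable_procs[flags];  // NULL slot => portable fallback
}
```
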
| 1495 | 1568 |
| 1496 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { | 1569 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { |
| 1497 NULL, // S32_Opaque, | 1570 NULL, // S32_Opaque, |
| 1498 S32_Blend_BlitRow32_neon, // S32_Blend, | 1571 S32_Blend_BlitRow32_neon, // S32_Blend, |
| 1499 /* | 1572 /* |
| 1500 * We have two choices for S32A_Opaque procs. The one reads the src alpha | 1573 * We have two choices for S32A_Opaque procs. The one reads the src alpha |
| 1501 * value and attempts to optimize accordingly. The optimization is | 1574 * value and attempts to optimize accordingly. The optimization is |
| 1502 * sensitive to the source content and is not a win in all cases. For | 1575 * sensitive to the source content and is not a win in all cases. For |
| 1503 * example, if there are a lot of transitions between the alpha states, | 1576 * example, if there are a lot of transitions between the alpha states, |
| 1504 * the performance will almost certainly be worse. However, for many | 1577 * the performance will almost certainly be worse. However, for many |
| 1505 * common cases the performance is equivalent or better than the standard | 1578 * common cases the performance is equivalent or better than the standard |
| 1506 * case where we do not inspect the src alpha. | 1579 * case where we do not inspect the src alpha. |
| 1507 */ | 1580 */ |
| 1508 #if SK_A32_SHIFT == 24 | 1581 #if SK_A32_SHIFT == 24 |
| 1509 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1582 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
| 1510 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1583 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
| 1511 #else | 1584 #else |
| 1512 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1585 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
| 1513 #endif | 1586 #endif |
| 1514 #ifdef SK_CPU_ARM32 | 1587 #ifdef SK_CPU_ARM32 |
| 1515 S32A_Blend_BlitRow32_neon // S32A_Blend | 1588 S32A_Blend_BlitRow32_neon // S32A_Blend |
| 1516 #else | 1589 #else |
| 1517 NULL | 1590 NULL |
| 1518 #endif | 1591 #endif |
| 1519 }; | 1592 }; |