OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
45 // Leftovers | 45 // Leftovers |
46 while (count > 0) { | 46 while (count > 0) { |
47 SkPMColor c = *src++; | 47 SkPMColor c = *src++; |
48 SkPMColorAssert(c); | 48 SkPMColorAssert(c); |
49 *dst = SkPixel32ToPixel16_ToU16(c); | 49 *dst = SkPixel32ToPixel16_ToU16(c); |
50 dst++; | 50 dst++; |
51 count--; | 51 count--; |
52 }; | 52 }; |
53 } | 53 } |
54 | 54 |
// S32_D565_Blend_neon (present only in the NEW column of this diff).
// Blends `count` 32-bit source pixels into a row of 16-bit 565 destination
// pixels using one global `alpha`, asserted to be below 255. The fourth
// source channel is loaded but never read, so per-pixel source alpha does not
// influence the result. Eight pixels are handled per iteration of the vector
// loop; the remaining (fewer than eight) pixels go through the scalar loop at
// the end. The x and y parameters are unused.
| 55 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
| 56 const SkPMColor* SK_RESTRICT src, int count, |
| 57 U8CPU alpha, int /*x*/, int /*y*/) { |
| 58 SkASSERT(255 > alpha); |
| 59 |
| 60 uint16x8_t vmask_blue, vscale; |
| 61 |
| 62 // prepare constants |
// vscale: the result of SkAlpha255To256(alpha) broadcast to all eight lanes.
// vmask_blue: 0x1F keeps the low five bits of each 565 pixel.
| 63 vscale = vdupq_n_u16(SkAlpha255To256(alpha)); |
| 64 vmask_blue = vmovq_n_u16(0x1F); |
| 65 |
| 66 while (count >= 8) { |
| 67 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; |
| 68 uint16x8_t vres_r, vres_g, vres_b; |
| 69 uint8x8_t vsrc_r, vsrc_g, vsrc_b; |
| 70 |
| 71 // Load src |
// The four locals are pinned to d0-d3 so that the single vld4.8 can
// de-interleave eight 4-byte pixels into one register per byte channel.
// The trailing "!" post-increments src, hence the read-write constraint.
// d3 (the fourth channel) is written by the load but never read afterwards.
// NOTE(review): the asm reads memory through src but declares no memory
// input or "memory" clobber - confirm the compiler cannot reorder it
// against other memory accesses.
| 72 { |
| 73 register uint8x8_t d0 asm("d0"); |
| 74 register uint8x8_t d1 asm("d1"); |
| 75 register uint8x8_t d2 asm("d2"); |
| 76 register uint8x8_t d3 asm("d3"); |
| 77 |
| 78 asm ( |
| 79 "vld4.8 {d0-d3},[%[src]]!" |
| 80 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
| 81 : |
| 82 ); |
| 83 vsrc_g = d1; |
// Red and blue swap places depending on the configured pixel byte order.
// NOTE(review): there is no #else branch, so vsrc_r and vsrc_b stay
// uninitialized if neither byte order matches - confirm one always does.
| 84 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) |
| 85 vsrc_r = d2; vsrc_b = d0; |
| 86 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) |
| 87 vsrc_r = d0; vsrc_b = d2; |
| 88 #endif |
| 89 } |
| 90 |
| 91 // Load and unpack dst |
| 92 vdst = vld1q_u16(dst); |
| 93 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes |
| 94 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue |
| 95 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red |
| 96 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green |
| 97 |
| 98 // Shift src to 565 |
| 99 vsrc_r = vshr_n_u8(vsrc_r, 3); // shift red to 565 range |
| 100 vsrc_g = vshr_n_u8(vsrc_g, 2); // shift green to 565 range |
| 101 vsrc_b = vshr_n_u8(vsrc_b, 3); // shift blue to 565 range |
| 102 |
| 103 // Scale src - dst |
// Per channel this computes dst + (((src - dst) * scale) >> 8) in unsigned
// 16-bit lanes. When src < dst the subtraction wraps; the shift-left-inserts
// in the "Combine" step keep only the low 5/6/5 bits of each channel, so the
// wrapped upper bits are dropped there.
// NOTE(review): presumably bit-identical to the scalar SkAlphaBlend path
// used for the leftovers - confirm against its definition.
| 104 vres_r = vmovl_u8(vsrc_r) - vdst_r; |
| 105 vres_g = vmovl_u8(vsrc_g) - vdst_g; |
| 106 vres_b = vmovl_u8(vsrc_b) - vdst_b; |
| 107 |
| 108 vres_r = vshrq_n_u16(vres_r * vscale, 8); |
| 109 vres_g = vshrq_n_u16(vres_g * vscale, 8); |
| 110 vres_b = vshrq_n_u16(vres_b * vscale, 8); |
| 111 |
| 112 vres_r += vdst_r; |
| 113 vres_g += vdst_g; |
| 114 vres_b += vdst_b; |
| 115 |
| 116 // Combine |
| 117 vres_b = vsliq_n_u16(vres_b, vres_g, 5); // insert green into blue |
| 118 vres_b = vsliq_n_u16(vres_b, vres_r, 6+5); // insert red into green/blu
e |
| 119 |
| 120 // Store |
| 121 vst1q_u16(dst, vres_b); |
| 122 dst += 8; |
| 123 count -= 8; |
| 124 } |
// Scalar tail: at most seven pixels remain here. Each one is blended per
// channel with SkAlphaBlend using the same scale as the vector loop.
| 125 if (count > 0) { |
| 126 int scale = SkAlpha255To256(alpha); |
| 127 do { |
| 128 SkPMColor c = *src++; |
| 129 SkPMColorAssert(c); |
| 130 uint16_t d = *dst; |
| 131 *dst++ = SkPackRGB16( |
| 132 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), |
| 133 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), |
| 134 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); |
| 135 } while (--count != 0); |
| 136 } |
| 137 } |
| 138 |
55 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, | 139 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
56 const SkPMColor* SK_RESTRICT src, int count, | 140 const SkPMColor* SK_RESTRICT src, int count, |
57 U8CPU alpha, int /*x*/, int /*y*/) { | 141 U8CPU alpha, int /*x*/, int /*y*/) { |
58 SkASSERT(255 == alpha); | 142 SkASSERT(255 == alpha); |
59 | 143 |
60 if (count >= 8) { | 144 if (count >= 8) { |
61 uint16_t* SK_RESTRICT keep_dst = 0; | 145 uint16_t* SK_RESTRICT keep_dst = 0; |
62 | 146 |
63 asm volatile ( | 147 asm volatile ( |
64 "ands ip, %[count], #7 \n\t" | 148 "ands ip, %[count], #7 \n\t" |
(...skipping 1313 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1378 dst += 1; | 1462 dst += 1; |
1379 count--; | 1463 count--; |
1380 } | 1464 } |
1381 } | 1465 } |
1382 } | 1466 } |
1383 | 1467 |
1384 /////////////////////////////////////////////////////////////////////////////// | 1468 /////////////////////////////////////////////////////////////////////////////// |
1385 | 1469 |
// Table of NEON blit-row procs for 565 destinations: four "no dither" entries
// followed by four "dither" entries, in the order S32 Opaque, S32 Blend,
// S32A Opaque, S32A Blend. The last slot (S32A_D565_Blend_Dither) is NULL,
// i.e. no NEON proc is listed for it here.
// NOTE(review): presumably the caller falls back to a default proc on NULL -
// confirm.
// In this diff the second slot switches from reusing S32A_D565_Blend_neon
// (OLD) to S32_D565_Blend_neon (NEW), which is why the OLD-side NOTE comment
// has no counterpart in the NEW column.
1386 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { | 1470 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { |
1387 // no dither | 1471 // no dither |
1388 // NOTE: For the S32_D565_Blend function below, we don't have a special | |
1389 // version that assumes that each source pixel is opaque. But our | |
1390 // S32A is still faster than the default, so use it. | |
1391 S32_D565_Opaque_neon, | 1472 S32_D565_Opaque_neon, |
1392 S32A_D565_Blend_neon, // really S32_D565_Blend | 1473 S32_D565_Blend_neon, |
1393 S32A_D565_Opaque_neon, | 1474 S32A_D565_Opaque_neon, |
1394 S32A_D565_Blend_neon, | 1475 S32A_D565_Blend_neon, |
1395 | 1476 |
1396 // dither | 1477 // dither |
1397 S32_D565_Opaque_Dither_neon, | 1478 S32_D565_Opaque_Dither_neon, |
1398 S32_D565_Blend_Dither_neon, | 1479 S32_D565_Blend_Dither_neon, |
1399 S32A_D565_Opaque_Dither_neon, | 1480 S32A_D565_Opaque_Dither_neon, |
1400 NULL, // S32A_D565_Blend_Dither | 1481 NULL, // S32A_D565_Blend_Dither |
1401 }; | 1482 }; |
1402 | 1483 |
(...skipping 10 matching lines...) Expand all Loading... |
1413 * case where we do not inspect the src alpha. | 1494 * case where we do not inspect the src alpha. |
1414 */ | 1495 */ |
1415 #if SK_A32_SHIFT == 24 | 1496 #if SK_A32_SHIFT == 24 |
1416 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor | 1497 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor |
1417 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1498 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1418 #else | 1499 #else |
1419 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1500 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1420 #endif | 1501 #endif |
1421 S32A_Blend_BlitRow32_neon // S32A_Blend | 1502 S32A_Blend_BlitRow32_neon // S32A_Blend |
1422 }; | 1503 }; |
OLD | NEW |