| OLD | NEW |
| 1 | 1 |
| 2 /* | 2 /* |
| 3 * Copyright 2012 The Android Open Source Project | 3 * Copyright 2012 The Android Open Source Project |
| 4 * | 4 * |
| 5 * Use of this source code is governed by a BSD-style license that can be | 5 * Use of this source code is governed by a BSD-style license that can be |
| 6 * found in the LICENSE file. | 6 * found in the LICENSE file. |
| 7 */ | 7 */ |
| 8 | 8 |
| 9 | 9 |
| 10 #include <arm_neon.h> | 10 #include <arm_neon.h> |
| 11 #include "SkColorPriv.h" | 11 #include "SkColorPriv.h" |
| 12 | 12 |
| 13 /* | 13 /* |
| 14 * Filter_32_opaque | 14 * Filter_32_opaque |
| 15 * | 15 * |
| 16 * There is no hard-n-fast rule that the filtering must produce | 16 * There is no hard-n-fast rule that the filtering must produce |
| 17 * exact results for the color components, but if the 4 incoming colors are | 17 * exact results for the color components, but if the 4 incoming colors are |
| 18 * all opaque, then the output color must also be opaque. Subsequent parts of | 18 * all opaque, then the output color must also be opaque. Subsequent parts of |
| 19 * the drawing pipeline may rely on this (e.g. which blitrow proc to use). | 19 * the drawing pipeline may rely on this (e.g. which blitrow proc to use). |
| | 20 * |
| 20 */ | 21 */ |
| 21 | 22 // Chrome on Android uses -Os so we need to force these inline. Otherwise |
| 22 static inline void Filter_32_opaque_neon(unsigned x, unsigned y, | 23 // calling the function in the inner loops will cause significant overhead on |
| 23 SkPMColor a00, SkPMColor a01, | 24 // some platforms. |
| 24 SkPMColor a10, SkPMColor a11, | 25 static SK_ALWAYS_INLINE void Filter_32_opaque_neon(unsigned x, unsigned y, |
| 25 SkPMColor *dst) { | 26 SkPMColor a00, SkPMColor a01, |
| | 27 SkPMColor a10, SkPMColor a11, |
| | 28 SkPMColor *dst) { |
| 26 uint8x8_t vy, vconst16_8, v16_y, vres; | 29 uint8x8_t vy, vconst16_8, v16_y, vres; |
| 27 uint16x4_t vx, vconst16_16, v16_x, tmp; | 30 uint16x4_t vx, vconst16_16, v16_x, tmp; |
| 28 uint32x2_t va0, va1; | 31 uint32x2_t va0, va1; |
| 29 uint16x8_t tmp1, tmp2; | 32 uint16x8_t tmp1, tmp2; |
| 30 | 33 |
| 31 vy = vdup_n_u8(y); // duplicate y into vy | 34 vy = vdup_n_u8(y); // duplicate y into vy |
| 32 vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8 | 35 vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8 |
| 33 v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y | 36 v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y |
| 34 | 37 |
| 35 va0 = vdup_n_u32(a00); // duplicate a00 | 38 va0 = vdup_n_u32(a00); // duplicate a00 |
| (...skipping 10 matching lines...) |
| 46 | 49 |
| 47 tmp = vmul_u16(vget_high_u16(tmp1), vx); // tmp = a01 * x | 50 tmp = vmul_u16(vget_high_u16(tmp1), vx); // tmp = a01 * x |
| 48 tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx); // tmp += a11 * x | 51 tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx); // tmp += a11 * x |
| 49 tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x) | 52 tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x) |
| 50 tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x) | 53 tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x) |
| 51 | 54 |
| 52 vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8 | 55 vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8 |
| 53 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result | 56 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result |
| 54 } | 57 } |
| 55 | 58 |
| 56 static inline void Filter_32_alpha_neon(unsigned x, unsigned y, | 59 static SK_ALWAYS_INLINE void Filter_32_alpha_neon(unsigned x, unsigned y, |
| 57 SkPMColor a00, SkPMColor a01, | 60 SkPMColor a00, SkPMColor a01, |
| 58 SkPMColor a10, SkPMColor a11, | 61 SkPMColor a10, SkPMColor a11, |
| 59 SkPMColor *dst, uint16_t scale) { | 62 SkPMColor *dst, |
| | 63 uint16_t scale) { |
| 60 uint8x8_t vy, vconst16_8, v16_y, vres; | 64 uint8x8_t vy, vconst16_8, v16_y, vres; |
| 61 uint16x4_t vx, vconst16_16, v16_x, tmp, vscale; | 65 uint16x4_t vx, vconst16_16, v16_x, tmp, vscale; |
| 62 uint32x2_t va0, va1; | 66 uint32x2_t va0, va1; |
| 63 uint16x8_t tmp1, tmp2; | 67 uint16x8_t tmp1, tmp2; |
| 64 | 68 |
| 65 vy = vdup_n_u8(y); // duplicate y into vy | 69 vy = vdup_n_u8(y); // duplicate y into vy |
| 66 vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8 | 70 vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8 |
| 67 v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y | 71 v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y |
| 68 | 72 |
| 69 va0 = vdup_n_u32(a00); // duplicate a00 | 73 va0 = vdup_n_u32(a00); // duplicate a00 |
| (...skipping 13 matching lines...) |
| 83 tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x) | 87 tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x) |
| 84 tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x) | 88 tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x) |
| 85 | 89 |
| 86 vscale = vdup_n_u16(scale); // duplicate scale | 90 vscale = vdup_n_u16(scale); // duplicate scale |
| 87 tmp = vshr_n_u16(tmp, 8); // shift down result by 8 | 91 tmp = vshr_n_u16(tmp, 8); // shift down result by 8 |
| 88 tmp = vmul_u16(tmp, vscale); // multiply result by scale | 92 tmp = vmul_u16(tmp, vscale); // multiply result by scale |
| 89 | 93 |
| 90 vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8 | 94 vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8 |
| 91 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result | 95 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result |
| 92 } | 96 } |
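
For reviewers tracing the arithmetic: both functions compute a 4x4 fixed-point bilinear blend, where x and y are subpixel fractions in [0, 16] and the four weights (16-x)(16-y), x(16-y), (16-x)y, and xy sum to 256, so the final >> 8 renormalizes. The following scalar sketch shows the math the NEON lanes compute per 8-bit channel; the helper name filter_32_opaque_ref is hypothetical and only illustrative, not part of this CL.

#include <stdint.h>

/* Illustrative scalar equivalent of Filter_32_opaque_neon (assumption: not
 * part of this CL). Each 8-bit channel of the packed 32-bit premultiplied
 * color is blended independently with weights that sum to 256. */
static uint32_t filter_32_opaque_ref(unsigned x, unsigned y,
                                     uint32_t a00, uint32_t a01,
                                     uint32_t a10, uint32_t a11) {
    uint32_t result = 0;
    for (int shift = 0; shift < 32; shift += 8) {
        uint32_t c00 = (a00 >> shift) & 0xFF;   // channel from top-left
        uint32_t c01 = (a01 >> shift) & 0xFF;   // channel from top-right
        uint32_t c10 = (a10 >> shift) & 0xFF;   // channel from bottom-left
        uint32_t c11 = (a11 >> shift) & 0xFF;   // channel from bottom-right
        uint32_t c = (c00 * (16 - x) * (16 - y) +
                      c01 * x        * (16 - y) +
                      c10 * (16 - x) * y        +
                      c11 * x        * y) >> 8; // weights sum to 256
        result |= c << shift;
    }
    return result;
}

Filter_32_alpha_neon is the same blend followed by a per-channel scale, ((sum >> 8) * scale) >> 8, matching the vshr_n_u16/vmul_u16 pair above. Note that when all four inputs are opaque (alpha = 0xFF), the blended alpha is (0xFF * 256) >> 8 = 0xFF, which is exactly the opaqueness invariant the header comment requires.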