OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright 2013 The Android Open Source Project | |
3 * | |
4 * Use of this source code is governed by a BSD-style license that can be | |
5 * found in the LICENSE file. | |
6 */ | |
7 | |
8 | |
9 #include "SkBitmap.h" | |
10 #include "SkColorPriv.h" | |
11 #include "SkBlurImage_opts.h" | |
12 #include "SkRect.h" | |
13 | |
14 #include <arm_neon.h> | |
15 | |
16 namespace { | |
17 | |
18 enum BlurDirection { | |
19 kX, kY | |
20 }; | |
21 | |
22 /** | |
23 * Helper function to load 2 pixels from diffent rows to a 8x8 NEON register | |
24 * and also pre-load pixels for future read | |
25 */ | |
26 template<BlurDirection srcDirection> | |
27 inline uint8x8_t load_2_pixels(const SkPMColor* src, int srcStride) { | |
28 if (srcDirection == kX) { | |
29 uint32x2_t temp = vdup_n_u32(0); | |
30 // 10% faster by adding these 2 prefetches | |
31 SK_PREFETCH(src + 16); | |
32 SK_PREFETCH(src + srcStride + 16); | |
33 return vreinterpret_u8_u32(vld1_lane_u32(src + srcStride, vld1_lane_u32(
src, temp, 0), 1)); | |
34 } else { | |
35 return vld1_u8((uint8_t*)src); | |
36 } | |
37 } | |
38 | |
39 /** | |
40 * Helper function to store the low 8-bits from a 16x8 NEON register to 2 rows | |
41 */ | |
42 template<BlurDirection dstDirection> | |
43 inline void store_2_pixels(uint16x8_t result16x8, SkPMColor* dst, int dstStride)
{ | |
44 if (dstDirection == kX) { | |
45 uint32x2_t temp = vreinterpret_u32_u8(vmovn_u16(result16x8)); | |
46 vst1_lane_u32(dst, temp, 0); | |
47 vst1_lane_u32(dst + dstStride, temp, 1); | |
48 } else { | |
49 uint8x8_t temp = vmovn_u16(result16x8); | |
50 vst1_u8((uint8_t*)dst, temp); | |
51 } | |
52 } | |
53 | |
54 /** | |
55 * fast path for kernel size less than 128 | |
56 */ | |
57 template<BlurDirection srcDirection, BlurDirection dstDirection> | |
58 void SkDoubleRowBoxBlur_NEON(const SkPMColor** src, int srcStride, SkPMColor** d
st, int kernelSize, | |
59 int leftOffset, int rightOffset, int width, int* height) | |
60 { | |
61 const int rightBorder = SkMin32(rightOffset + 1, width); | |
62 const int srcStrideX = srcDirection == kX ? 1 : srcStride; | |
63 const int dstStrideX = dstDirection == kX ? 1 : *height; | |
64 const int srcStrideY = srcDirection == kX ? srcStride : 1; | |
65 const int dstStrideY = dstDirection == kX ? width : 1; | |
66 const uint16x8_t scale = vdupq_n_u16((1 << 15) / kernelSize); | |
67 | |
68 for (; *height >= 2; *height -= 2) { | |
69 uint16x8_t sum = vdupq_n_u16(0); | |
70 const SkPMColor* p = *src; | |
71 for (int i = 0; i < rightBorder; i++) { | |
72 sum = vaddw_u8(sum, | |
73 load_2_pixels<srcDirection>(p, srcStride)); | |
74 p += srcStrideX; | |
75 } | |
76 | |
77 const SkPMColor* sptr = *src; | |
78 SkPMColor* dptr = *dst; | |
79 for (int x = 0; x < width; x++) { | |
80 // val = (sum * scale * 2 + 0x8000) >> 16 | |
81 uint16x8_t resultPixels = vreinterpretq_u16_s16(vqrdmulhq_s16( | |
82 vreinterpretq_s16_u16(sum), vreinterpretq_s16_u16(scale))); | |
83 store_2_pixels<dstDirection>(resultPixels, dptr, width); | |
84 | |
85 if (x >= leftOffset) { | |
86 sum = vsubw_u8(sum, | |
87 load_2_pixels<srcDirection>(sptr - leftOffset * srcStrideX,
srcStride)); | |
88 } | |
89 if (x + rightOffset + 1 < width) { | |
90 sum = vaddw_u8(sum, | |
91 load_2_pixels<srcDirection>(sptr + (rightOffset + 1) * srcSt
rideX, srcStride)); | |
92 } | |
93 sptr += srcStrideX; | |
94 dptr += dstStrideX; | |
95 } | |
96 *src += srcStrideY * 2; | |
97 *dst += dstStrideY * 2; | |
98 } | |
99 } | |
100 | |
101 | |
102 /** | |
103 * Helper function to spread the components of a 32-bit integer into the | |
104 * lower 8 bits of each 16-bit element of a NEON register. | |
105 */ | |
106 | |
107 static inline uint16x4_t expand(uint32_t a) { | |
108 // ( ARGB ) -> ( ARGB ARGB ) -> ( A R G B A R G B ) | |
109 uint8x8_t v8 = vreinterpret_u8_u32(vdup_n_u32(a)); | |
110 // ( A R G B A R G B ) -> ( 0A 0R 0G 0B 0A 0R 0G 0B ) -> ( 0A 0R 0G 0B ) | |
111 return vget_low_u16(vmovl_u8(v8)); | |
112 } | |
113 | |
114 template<BlurDirection srcDirection, BlurDirection dstDirection> | |
115 void SkBoxBlur_NEON(const SkPMColor* src, int srcStride, SkPMColor* dst, int ker
nelSize, | |
116 int leftOffset, int rightOffset, int width, int height) | |
117 { | |
118 const int rightBorder = SkMin32(rightOffset + 1, width); | |
119 const int srcStrideX = srcDirection == kX ? 1 : srcStride; | |
120 const int dstStrideX = dstDirection == kX ? 1 : height; | |
121 const int srcStrideY = srcDirection == kX ? srcStride : 1; | |
122 const int dstStrideY = dstDirection == kX ? width : 1; | |
123 const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize); | |
124 const uint32x4_t half = vdupq_n_u32(1 << 23); | |
125 | |
126 if (1 < kernelSize && kernelSize < 128) | |
127 { | |
128 SkDoubleRowBoxBlur_NEON<srcDirection, dstDirection>(&src, srcStride, &ds
t, kernelSize, | |
129 leftOffset, rightOffset, width, &height); | |
130 } | |
131 | |
132 for (; height > 0; height--) { | |
133 uint32x4_t sum = vdupq_n_u32(0); | |
134 const SkPMColor* p = src; | |
135 for (int i = 0; i < rightBorder; ++i) { | |
136 sum = vaddw_u16(sum, expand(*p)); | |
137 p += srcStrideX; | |
138 } | |
139 | |
140 const SkPMColor* sptr = src; | |
141 SkPMColor* dptr = dst; | |
142 for (int x = 0; x < width; ++x) { | |
143 // ( half+sumA*scale half+sumR*scale half+sumG*scale half+sumB*scale
) | |
144 uint32x4_t result = vmlaq_u32(half, sum, scale); | |
145 | |
146 // Saturated conversion to 16-bit. | |
147 // ( AAAA RRRR GGGG BBBB ) -> ( 0A 0R 0G 0B ) | |
148 uint16x4_t result16 = vqshrn_n_u32(result, 16); | |
149 | |
150 // Saturated conversion to 8-bit. | |
151 // ( 0A 0R 0G 0B ) -> ( 0A 0R 0G 0B 0A 0R 0G 0B ) -> ( A R G B A R G
B ) | |
152 uint8x8_t result8 = vqshrn_n_u16(vcombine_u16(result16, result16), 8
); | |
153 | |
154 // ( A R G B A R G B ) -> ( ARGB ARGB ) -> ( ARGB ) | |
155 // Store low 32 bits to destination. | |
156 vst1_lane_u32(dptr, vreinterpret_u32_u8(result8), 0); | |
157 | |
158 if (x >= leftOffset) { | |
159 const SkPMColor* l = sptr - leftOffset * srcStrideX; | |
160 sum = vsubw_u16(sum, expand(*l)); | |
161 } | |
162 if (x + rightOffset + 1 < width) { | |
163 const SkPMColor* r = sptr + (rightOffset + 1) * srcStrideX; | |
164 sum = vaddw_u16(sum, expand(*r)); | |
165 } | |
166 sptr += srcStrideX; | |
167 if (srcDirection == kX) { | |
168 SK_PREFETCH(sptr + (rightOffset + 16) * srcStrideX); | |
169 } | |
170 dptr += dstStrideX; | |
171 } | |
172 src += srcStrideY; | |
173 dst += dstStrideY; | |
174 } | |
175 } | |
176 | |
177 } // namespace | |
178 | |
179 bool SkBoxBlurGetPlatformProcs_NEON(SkBoxBlurProc* boxBlurX, | |
180 SkBoxBlurProc* boxBlurXY, | |
181 SkBoxBlurProc* boxBlurYX) { | |
182 *boxBlurX = SkBoxBlur_NEON<kX, kX>; | |
183 *boxBlurXY = SkBoxBlur_NEON<kX, kY>; | |
184 *boxBlurYX = SkBoxBlur_NEON<kY, kX>; | |
185 return true; | |
186 } | |
OLD | NEW |