OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 Google Inc. | 2 * Copyright 2012 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBitmapProcState.h" | 8 #include "SkBitmapProcState.h" |
9 #include "SkBitmapProcState_filter.h" | 9 #include "SkBitmapProcState_filter.h" |
10 #include "SkColorPriv.h" | 10 #include "SkColorPriv.h" |
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
70 // todo: possibly specialize on opaqueness | 70 // todo: possibly specialize on opaqueness |
71 SG8_alpha_D32_nofilter_DXDY_neon, | 71 SG8_alpha_D32_nofilter_DXDY_neon, |
72 SG8_alpha_D32_nofilter_DXDY_neon, | 72 SG8_alpha_D32_nofilter_DXDY_neon, |
73 SG8_alpha_D32_nofilter_DX_neon, | 73 SG8_alpha_D32_nofilter_DX_neon, |
74 SG8_alpha_D32_nofilter_DX_neon, | 74 SG8_alpha_D32_nofilter_DX_neon, |
75 SG8_alpha_D32_filter_DXDY_neon, | 75 SG8_alpha_D32_filter_DXDY_neon, |
76 SG8_alpha_D32_filter_DXDY_neon, | 76 SG8_alpha_D32_filter_DXDY_neon, |
77 SG8_alpha_D32_filter_DX_neon, | 77 SG8_alpha_D32_filter_DX_neon, |
78 SG8_alpha_D32_filter_DX_neon, | 78 SG8_alpha_D32_filter_DX_neon, |
79 }; | 79 }; |
80 | |
81 /////////////////////////////////////////////////////////////////////////////// | |
82 | |
83 #include <arm_neon.h> | |
84 #include "SkConvolver.h" | |
85 | |
86 static SK_ALWAYS_INLINE void accum_remainder(const unsigned char* pixels_left, | |
87 const SkConvolutionFilter1D::ConvolutionFixed* filter_values, int32x4_t&
accum, int r) { | |
88 int remainder[4] = {0}; | |
89 for (int i = 0; i < r; i++) { | |
90 SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i]; | |
91 remainder[0] += coeff * pixels_left[i * 4 + 0]; | |
92 remainder[1] += coeff * pixels_left[i * 4 + 1]; | |
93 remainder[2] += coeff * pixels_left[i * 4 + 2]; | |
94 remainder[3] += coeff * pixels_left[i * 4 + 3]; | |
95 } | |
96 int32x4_t t = {remainder[0], remainder[1], remainder[2], remainder[3]}; | |
97 accum += t; | |
98 } | |
99 | |
100 // Convolves horizontally along a single row. The row data is given in | |
101 // |srcData| and continues for the numValues() of the filter. | |
102 void convolveHorizontally_neon(const unsigned char* srcData, | |
103 const SkConvolutionFilter1D& filter, | |
104 unsigned char* outRow, | |
105 bool hasAlpha) { | |
106 // Loop over each pixel on this row in the output image. | |
107 int numValues = filter.numValues(); | |
108 for (int outX = 0; outX < numValues; outX++) { | |
109 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); | |
110 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); | |
111 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); | |
112 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); | |
113 // Get the filter that determines the current output pixel. | |
114 int filterOffset, filterLength; | |
115 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = | |
116 filter.FilterForValue(outX, &filterOffset, &filterLength); | |
117 | |
118 // Compute the first pixel in this row that the filter affects. It will | |
119 // touch |filterLength| pixels (4 bytes each) after this. | |
120 const unsigned char* rowToFilter = &srcData[filterOffset * 4]; | |
121 | |
122 // Apply the filter to the row to get the destination pixel in |accum|. | |
123 int32x4_t accum = vdupq_n_s32(0); | |
124 for (int filterX = 0; filterX < filterLength >> 2; filterX++) { | |
125 // Load 4 coefficients | |
126 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3; | |
127 coeffs = vld1_s16(filterValues); | |
128 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask0)); | |
129 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask1)); | |
130 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask2)); | |
131 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask3)); | |
132 | |
133 // Load pixels and calc | |
134 uint8x16_t pixels = vld1q_u8(rowToFilter); | |
135 int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels
))); | |
136 int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixel
s))); | |
137 | |
138 int16x4_t p0_src = vget_low_s16(p01_16); | |
139 int16x4_t p1_src = vget_high_s16(p01_16); | |
140 int16x4_t p2_src = vget_low_s16(p23_16); | |
141 int16x4_t p3_src = vget_high_s16(p23_16); | |
142 | |
143 int32x4_t p0 = vmull_s16(p0_src, coeff0); | |
144 int32x4_t p1 = vmull_s16(p1_src, coeff1); | |
145 int32x4_t p2 = vmull_s16(p2_src, coeff2); | |
146 int32x4_t p3 = vmull_s16(p3_src, coeff3); | |
147 | |
148 accum += p0; | |
149 accum += p1; | |
150 accum += p2; | |
151 accum += p3; | |
152 | |
153 // Advance the pointers | |
154 rowToFilter += 16; | |
155 filterValues += 4; | |
156 } | |
157 | |
158 int r = filterLength & 3; | |
159 if (r) { | |
160 int remainder_offset = (filterOffset + filterLength - r) * 4; | |
161 accum_remainder(srcData + remainder_offset, filterValues, accum, r); | |
162 } | |
163 | |
164 // Bring this value back in range. All of the filter scaling factors | |
165 // are in fixed point with kShiftBits bits of fractional part. | |
166 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); | |
167 | |
168 // Pack and store the new pixel. | |
169 int16x4_t accum16 = vqmovn_s32(accum); | |
170 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16)); | |
171 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(a
ccum8), 0); | |
172 outRow += 4; | |
173 } | |
174 } | |
175 | |
176 // Does vertical convolution to produce one output row. The filter values and | |
177 // length are given in the first two parameters. These are applied to each | |
178 // of the rows pointed to in the |sourceDataRows| array, with each row | |
179 // being |pixelWidth| wide. | |
180 // | |
181 // The output must have room for |pixelWidth * 4| bytes. | |
182 template<bool hasAlpha> | |
183 void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filt
erValues, | |
184 int filterLength, | |
185 unsigned char* const* sourceDataRows, | |
186 int pixelWidth, | |
187 unsigned char* outRow) { | |
188 int width = pixelWidth & ~3; | |
189 | |
190 int32x4_t accum0, accum1, accum2, accum3; | |
191 int16x4_t coeff16; | |
192 | |
193 // Output four pixels per iteration (16 bytes). | |
194 for (int outX = 0; outX < width; outX += 4) { | |
195 | |
196 // Accumulated result for each pixel. 32 bits per RGBA channel. | |
197 accum0 = accum1 = accum2 = accum3 = vdupq_n_s32(0); | |
198 | |
199 // Convolve with one filter coefficient per iteration. | |
200 for (int filterY = 0; filterY < filterLength; filterY++) { | |
201 | |
202 // Duplicate the filter coefficient 4 times. | |
203 // [16] cj cj cj cj | |
204 coeff16 = vdup_n_s16(filterValues[filterY]); | |
205 | |
206 // Load four pixels (16 bytes) together. | |
207 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
208 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]); | |
209 | |
210 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8
))); | |
211 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src
8))); | |
212 int16x4_t src16_0 = vget_low_s16(src16_01); | |
213 int16x4_t src16_1 = vget_high_s16(src16_01); | |
214 int16x4_t src16_2 = vget_low_s16(src16_23); | |
215 int16x4_t src16_3 = vget_high_s16(src16_23); | |
216 | |
217 accum0 += vmull_s16(src16_0, coeff16); | |
218 accum1 += vmull_s16(src16_1, coeff16); | |
219 accum2 += vmull_s16(src16_2, coeff16); | |
220 accum3 += vmull_s16(src16_3, coeff16); | |
221 } | |
222 | |
223 // Shift right for fixed point implementation. | |
224 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); | |
225 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); | |
226 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); | |
227 accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits); | |
228 | |
229 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). | |
230 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
231 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1
)); | |
232 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
233 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum3
)); | |
234 | |
235 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). | |
236 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
237 uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accu
m16_1)); | |
238 | |
239 if (hasAlpha) { | |
240 // Compute the max(ri, gi, bi) for each pixel. | |
241 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
242 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8
(accum8), 8)); | |
243 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
244 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g | |
245 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
246 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 1
6)); | |
247 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
248 b = vmaxq_u8(a, b); // Max of r and g and b. | |
249 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
250 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24)); | |
251 | |
252 // Make sure the value of alpha channel is always larger than maximu
m | |
253 // value of color channels. | |
254 accum8 = vmaxq_u8(b, accum8); | |
255 } else { | |
256 // Set value of alpha channels to 0xFF. | |
257 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n
_u32(0xFF000000)); | |
258 } | |
259 | |
260 // Store the convolution result (16 bytes) and advance the pixel pointer
s. | |
261 vst1q_u8(outRow, accum8); | |
262 outRow += 16; | |
263 } | |
264 | |
265 // Process the leftovers when the width of the output is not divisible | |
266 // by 4, that is at most 3 pixels. | |
267 int r = pixelWidth & 3; | |
268 if (r) { | |
269 | |
270 accum0 = accum1 = accum2 = vdupq_n_s32(0); | |
271 | |
272 for (int filterY = 0; filterY < filterLength; ++filterY) { | |
273 coeff16 = vdup_n_s16(filterValues[filterY]); | |
274 | |
275 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
276 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]); | |
277 | |
278 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8
))); | |
279 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src
8))); | |
280 int16x4_t src16_0 = vget_low_s16(src16_01); | |
281 int16x4_t src16_1 = vget_high_s16(src16_01); | |
282 int16x4_t src16_2 = vget_low_s16(src16_23); | |
283 | |
284 accum0 += vmull_s16(src16_0, coeff16); | |
285 accum1 += vmull_s16(src16_1, coeff16); | |
286 accum2 += vmull_s16(src16_2, coeff16); | |
287 } | |
288 | |
289 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); | |
290 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); | |
291 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); | |
292 | |
293 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1
)); | |
294 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum2
)); | |
295 | |
296 uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accu
m16_1)); | |
297 | |
298 if (hasAlpha) { | |
299 // Compute the max(ri, gi, bi) for each pixel. | |
300 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
301 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8
(accum8), 8)); | |
302 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
303 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g | |
304 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
305 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 1
6)); | |
306 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
307 b = vmaxq_u8(a, b); // Max of r and g and b. | |
308 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
309 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24)); | |
310 | |
311 // Make sure the value of alpha channel is always larger than maximu
m | |
312 // value of color channels. | |
313 accum8 = vmaxq_u8(b, accum8); | |
314 } else { | |
315 // Set value of alpha channels to 0xFF. | |
316 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n
_u32(0xFF000000)); | |
317 } | |
318 | |
319 switch(r) { | |
320 case 1: | |
321 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpretq_u3
2_u8(accum8), 0); | |
322 break; | |
323 case 2: | |
324 vst1_u32(reinterpret_cast<uint32_t*>(outRow), | |
325 vreinterpret_u32_u8(vget_low_u8(accum8))); | |
326 break; | |
327 case 3: | |
328 vst1_u32(reinterpret_cast<uint32_t*>(outRow), | |
329 vreinterpret_u32_u8(vget_low_u8(accum8))); | |
330 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow+8), vreinterpretq_
u32_u8(accum8), 2); | |
331 break; | |
332 } | |
333 } | |
334 } | |
335 | |
336 void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filt
erValues, | |
337 int filterLength, | |
338 unsigned char* const* sourceDataRows, | |
339 int pixelWidth, | |
340 unsigned char* outRow, | |
341 bool sourceHasAlpha) { | |
342 if (sourceHasAlpha) { | |
343 convolveVertically_neon<true>(filterValues, filterLength, | |
344 sourceDataRows, pixelWidth, | |
345 outRow); | |
346 } else { | |
347 convolveVertically_neon<false>(filterValues, filterLength, | |
348 sourceDataRows, pixelWidth, | |
349 outRow); | |
350 } | |
351 } | |
352 | |
353 // Convolves horizontally along four rows. The row data is given in | |
354 // |src_data| and continues for the num_values() of the filter. | |
355 // The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please | |
356 // refer to that function for detailed comments. | |
357 void convolve4RowsHorizontally_neon(const unsigned char* srcData[4], | |
358 const SkConvolutionFilter1D& filter, | |
359 unsigned char* outRow[4], | |
360 size_t outRowBytes) { | |
361 | |
362 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); | |
363 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); | |
364 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); | |
365 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); | |
366 int num_values = filter.numValues(); | |
367 | |
368 int filterOffset, filterLength; | |
369 | |
370 // Output one pixel each iteration, calculating all channels (RGBA) together
. | |
371 for (int outX = 0; outX < num_values; outX++) { | |
372 | |
373 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = | |
374 filter.FilterForValue(outX, &filterOffset, &filterLength); | |
375 | |
376 // four pixels in a column per iteration. | |
377 int32x4_t accum0 = vdupq_n_s32(0); | |
378 int32x4_t accum1 = vdupq_n_s32(0); | |
379 int32x4_t accum2 = vdupq_n_s32(0); | |
380 int32x4_t accum3 = vdupq_n_s32(0); | |
381 | |
382 int start = (filterOffset<<2); | |
383 | |
384 // We will load and accumulate with four coefficients per iteration. | |
385 for (int filter_x = 0; filter_x < (filterLength >> 2); filter_x++) { | |
386 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3; | |
387 | |
388 coeffs = vld1_s16(filterValues); | |
389 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask0)); | |
390 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask1)); | |
391 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask2)); | |
392 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask3)); | |
393 | |
394 uint8x16_t pixels; | |
395 int16x8_t p01_16, p23_16; | |
396 int32x4_t p0, p1, p2, p3; | |
397 | |
398 | |
399 #define ITERATION(src, accum) \ | |
400 pixels = vld1q_u8(src); \ | |
401 p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels))); \ | |
402 p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); \ | |
403 p0 = vmull_s16(vget_low_s16(p01_16), coeff0); \ | |
404 p1 = vmull_s16(vget_high_s16(p01_16), coeff1); \ | |
405 p2 = vmull_s16(vget_low_s16(p23_16), coeff2); \ | |
406 p3 = vmull_s16(vget_high_s16(p23_16), coeff3); \ | |
407 accum += p0; \ | |
408 accum += p1; \ | |
409 accum += p2; \ | |
410 accum += p3 | |
411 | |
412 ITERATION(srcData[0] + start, accum0); | |
413 ITERATION(srcData[1] + start, accum1); | |
414 ITERATION(srcData[2] + start, accum2); | |
415 ITERATION(srcData[3] + start, accum3); | |
416 | |
417 start += 16; | |
418 filterValues += 4; | |
419 } | |
420 | |
421 int r = filterLength & 3; | |
422 if (r) { | |
423 int remainder_offset = (filterOffset + filterLength - r) * 4; | |
424 accum_remainder(srcData[0] + remainder_offset, filterValues, accum0,
r); | |
425 accum_remainder(srcData[1] + remainder_offset, filterValues, accum1,
r); | |
426 accum_remainder(srcData[2] + remainder_offset, filterValues, accum2,
r); | |
427 accum_remainder(srcData[3] + remainder_offset, filterValues, accum3,
r); | |
428 } | |
429 | |
430 int16x4_t accum16; | |
431 uint8x8_t res0, res1, res2, res3; | |
432 | |
433 #define PACK_RESULT(accum, res) \ | |
434 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \ | |
435 accum16 = vqmovn_s32(accum); \ | |
436 res = vqmovun_s16(vcombine_s16(accum16, accum16)); | |
437 | |
438 PACK_RESULT(accum0, res0); | |
439 PACK_RESULT(accum1, res1); | |
440 PACK_RESULT(accum2, res2); | |
441 PACK_RESULT(accum3, res3); | |
442 | |
443 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u
8(res0), 0); | |
444 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u
8(res1), 0); | |
445 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u
8(res2), 0); | |
446 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u
8(res3), 0); | |
447 outRow[0] += 4; | |
448 outRow[1] += 4; | |
449 outRow[2] += 4; | |
450 outRow[3] += 4; | |
451 } | |
452 } | |
453 | |
454 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) { | |
455 procs->fConvolveVertically = &convolveVertically_neon; | |
456 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon; | |
457 procs->fConvolveHorizontally = &convolveHorizontally_neon; | |
458 } | |
OLD | NEW |