Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(89)

Side by Side Diff: src/opts/SkBitmapProcState_arm_neon.cpp

Issue 2500113004: Port convolve functions to SkOpts (Closed)
Patch Set: Fix typo Created 4 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/opts/SkBitmapFilter_opts_SSE2.cpp ('k') | src/opts/SkBitmapProcState_opts_arm.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 Google Inc. 2 * Copyright 2012 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBitmapProcState.h" 8 #include "SkBitmapProcState.h"
9 #include "SkBitmapProcState_filter.h" 9 #include "SkBitmapProcState_filter.h"
10 #include "SkColorPriv.h" 10 #include "SkColorPriv.h"
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after
70 // todo: possibly specialize on opaqueness 70 // todo: possibly specialize on opaqueness
71 SG8_alpha_D32_nofilter_DXDY_neon, 71 SG8_alpha_D32_nofilter_DXDY_neon,
72 SG8_alpha_D32_nofilter_DXDY_neon, 72 SG8_alpha_D32_nofilter_DXDY_neon,
73 SG8_alpha_D32_nofilter_DX_neon, 73 SG8_alpha_D32_nofilter_DX_neon,
74 SG8_alpha_D32_nofilter_DX_neon, 74 SG8_alpha_D32_nofilter_DX_neon,
75 SG8_alpha_D32_filter_DXDY_neon, 75 SG8_alpha_D32_filter_DXDY_neon,
76 SG8_alpha_D32_filter_DXDY_neon, 76 SG8_alpha_D32_filter_DXDY_neon,
77 SG8_alpha_D32_filter_DX_neon, 77 SG8_alpha_D32_filter_DX_neon,
78 SG8_alpha_D32_filter_DX_neon, 78 SG8_alpha_D32_filter_DX_neon,
79 }; 79 };
80
81 ///////////////////////////////////////////////////////////////////////////////
82
83 #include <arm_neon.h>
84 #include "SkConvolver.h"
85
86 static SK_ALWAYS_INLINE void accum_remainder(const unsigned char* pixels_left,
87 const SkConvolutionFilter1D::ConvolutionFixed* filter_values, int32x4_t& accum, int r) {
88 int remainder[4] = {0};
89 for (int i = 0; i < r; i++) {
90 SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i];
91 remainder[0] += coeff * pixels_left[i * 4 + 0];
92 remainder[1] += coeff * pixels_left[i * 4 + 1];
93 remainder[2] += coeff * pixels_left[i * 4 + 2];
94 remainder[3] += coeff * pixels_left[i * 4 + 3];
95 }
96 int32x4_t t = {remainder[0], remainder[1], remainder[2], remainder[3]};
97 accum += t;
98 }
99
100 // Convolves horizontally along a single row. The row data is given in
101 // |srcData| and continues for the numValues() of the filter.
102 void convolveHorizontally_neon(const unsigned char* srcData,
103 const SkConvolutionFilter1D& filter,
104 unsigned char* outRow,
105 bool hasAlpha) {
106 // Loop over each pixel on this row in the output image.
107 int numValues = filter.numValues();
108 for (int outX = 0; outX < numValues; outX++) {
109 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
110 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
111 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
112 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);
113 // Get the filter that determines the current output pixel.
114 int filterOffset, filterLength;
115 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
116 filter.FilterForValue(outX, &filterOffset, &filterLength);
117
118 // Compute the first pixel in this row that the filter affects. It will
119 // touch |filterLength| pixels (4 bytes each) after this.
120 const unsigned char* rowToFilter = &srcData[filterOffset * 4];
121
122 // Apply the filter to the row to get the destination pixel in |accum|.
123 int32x4_t accum = vdupq_n_s32(0);
124 for (int filterX = 0; filterX < filterLength >> 2; filterX++) {
125 // Load 4 coefficients
126 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;
127 coeffs = vld1_s16(filterValues);
128 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask0));
129 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask1));
130 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask2));
131 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask3));
132
133 // Load pixels and calc
134 uint8x16_t pixels = vld1q_u8(rowToFilter);
135 int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels )));
136 int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixel s)));
137
138 int16x4_t p0_src = vget_low_s16(p01_16);
139 int16x4_t p1_src = vget_high_s16(p01_16);
140 int16x4_t p2_src = vget_low_s16(p23_16);
141 int16x4_t p3_src = vget_high_s16(p23_16);
142
143 int32x4_t p0 = vmull_s16(p0_src, coeff0);
144 int32x4_t p1 = vmull_s16(p1_src, coeff1);
145 int32x4_t p2 = vmull_s16(p2_src, coeff2);
146 int32x4_t p3 = vmull_s16(p3_src, coeff3);
147
148 accum += p0;
149 accum += p1;
150 accum += p2;
151 accum += p3;
152
153 // Advance the pointers
154 rowToFilter += 16;
155 filterValues += 4;
156 }
157
158 int r = filterLength & 3;
159 if (r) {
160 int remainder_offset = (filterOffset + filterLength - r) * 4;
161 accum_remainder(srcData + remainder_offset, filterValues, accum, r);
162 }
163
164 // Bring this value back in range. All of the filter scaling factors
165 // are in fixed point with kShiftBits bits of fractional part.
166 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);
167
168 // Pack and store the new pixel.
169 int16x4_t accum16 = vqmovn_s32(accum);
170 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
171 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(a ccum8), 0);
172 outRow += 4;
173 }
174 }
175
176 // Does vertical convolution to produce one output row. The filter values and
177 // length are given in the first two parameters. These are applied to each
178 // of the rows pointed to in the |sourceDataRows| array, with each row
179 // being |pixelWidth| wide.
180 //
181 // The output must have room for |pixelWidth * 4| bytes.
182 template<bool hasAlpha>
183 void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filt erValues,
184 int filterLength,
185 unsigned char* const* sourceDataRows,
186 int pixelWidth,
187 unsigned char* outRow) {
188 int width = pixelWidth & ~3;
189
190 int32x4_t accum0, accum1, accum2, accum3;
191 int16x4_t coeff16;
192
193 // Output four pixels per iteration (16 bytes).
194 for (int outX = 0; outX < width; outX += 4) {
195
196 // Accumulated result for each pixel. 32 bits per RGBA channel.
197 accum0 = accum1 = accum2 = accum3 = vdupq_n_s32(0);
198
199 // Convolve with one filter coefficient per iteration.
200 for (int filterY = 0; filterY < filterLength; filterY++) {
201
202 // Duplicate the filter coefficient 4 times.
203 // [16] cj cj cj cj
204 coeff16 = vdup_n_s16(filterValues[filterY]);
205
206 // Load four pixels (16 bytes) together.
207 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
208 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]);
209
210 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8 )));
211 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src 8)));
212 int16x4_t src16_0 = vget_low_s16(src16_01);
213 int16x4_t src16_1 = vget_high_s16(src16_01);
214 int16x4_t src16_2 = vget_low_s16(src16_23);
215 int16x4_t src16_3 = vget_high_s16(src16_23);
216
217 accum0 += vmull_s16(src16_0, coeff16);
218 accum1 += vmull_s16(src16_1, coeff16);
219 accum2 += vmull_s16(src16_2, coeff16);
220 accum3 += vmull_s16(src16_3, coeff16);
221 }
222
223 // Shift right for fixed point implementation.
224 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);
225 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);
226 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);
227 accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits);
228
229 // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
230 // [16] a1 b1 g1 r1 a0 b0 g0 r0
231 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1 ));
232 // [16] a3 b3 g3 r3 a2 b2 g2 r2
233 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum3 ));
234
235 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
236 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
237 uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accu m16_1));
238
239 if (hasAlpha) {
240 // Compute the max(ri, gi, bi) for each pixel.
241 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
242 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8 (accum8), 8));
243 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
244 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g
245 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
246 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 1 6));
247 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
248 b = vmaxq_u8(a, b); // Max of r and g and b.
249 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
250 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));
251
252 // Make sure the value of alpha channel is always larger than maximu m
253 // value of color channels.
254 accum8 = vmaxq_u8(b, accum8);
255 } else {
256 // Set value of alpha channels to 0xFF.
257 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n _u32(0xFF000000));
258 }
259
260 // Store the convolution result (16 bytes) and advance the pixel pointer s.
261 vst1q_u8(outRow, accum8);
262 outRow += 16;
263 }
264
265 // Process the leftovers when the width of the output is not divisible
266 // by 4, that is at most 3 pixels.
267 int r = pixelWidth & 3;
268 if (r) {
269
270 accum0 = accum1 = accum2 = vdupq_n_s32(0);
271
272 for (int filterY = 0; filterY < filterLength; ++filterY) {
273 coeff16 = vdup_n_s16(filterValues[filterY]);
274
275 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
276 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]);
277
278 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8 )));
279 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src 8)));
280 int16x4_t src16_0 = vget_low_s16(src16_01);
281 int16x4_t src16_1 = vget_high_s16(src16_01);
282 int16x4_t src16_2 = vget_low_s16(src16_23);
283
284 accum0 += vmull_s16(src16_0, coeff16);
285 accum1 += vmull_s16(src16_1, coeff16);
286 accum2 += vmull_s16(src16_2, coeff16);
287 }
288
289 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);
290 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);
291 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);
292
293 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1 ));
294 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum2 ));
295
296 uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accu m16_1));
297
298 if (hasAlpha) {
299 // Compute the max(ri, gi, bi) for each pixel.
300 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
301 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8 (accum8), 8));
302 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
303 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g
304 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
305 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 1 6));
306 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
307 b = vmaxq_u8(a, b); // Max of r and g and b.
308 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
309 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));
310
311 // Make sure the value of alpha channel is always larger than maximu m
312 // value of color channels.
313 accum8 = vmaxq_u8(b, accum8);
314 } else {
315 // Set value of alpha channels to 0xFF.
316 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n _u32(0xFF000000));
317 }
318
319 switch(r) {
320 case 1:
321 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpretq_u3 2_u8(accum8), 0);
322 break;
323 case 2:
324 vst1_u32(reinterpret_cast<uint32_t*>(outRow),
325 vreinterpret_u32_u8(vget_low_u8(accum8)));
326 break;
327 case 3:
328 vst1_u32(reinterpret_cast<uint32_t*>(outRow),
329 vreinterpret_u32_u8(vget_low_u8(accum8)));
330 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow+8), vreinterpretq_ u32_u8(accum8), 2);
331 break;
332 }
333 }
334 }
335
336 void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filt erValues,
337 int filterLength,
338 unsigned char* const* sourceDataRows,
339 int pixelWidth,
340 unsigned char* outRow,
341 bool sourceHasAlpha) {
342 if (sourceHasAlpha) {
343 convolveVertically_neon<true>(filterValues, filterLength,
344 sourceDataRows, pixelWidth,
345 outRow);
346 } else {
347 convolveVertically_neon<false>(filterValues, filterLength,
348 sourceDataRows, pixelWidth,
349 outRow);
350 }
351 }
352
353 // Convolves horizontally along four rows. The row data is given in
354 // |src_data| and continues for the num_values() of the filter.
355 // The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
356 // refer to that function for detailed comments.
357 void convolve4RowsHorizontally_neon(const unsigned char* srcData[4],
358 const SkConvolutionFilter1D& filter,
359 unsigned char* outRow[4],
360 size_t outRowBytes) {
361
362 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
363 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
364 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
365 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);
366 int num_values = filter.numValues();
367
368 int filterOffset, filterLength;
369
370 // Output one pixel each iteration, calculating all channels (RGBA) together .
371 for (int outX = 0; outX < num_values; outX++) {
372
373 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
374 filter.FilterForValue(outX, &filterOffset, &filterLength);
375
376 // four pixels in a column per iteration.
377 int32x4_t accum0 = vdupq_n_s32(0);
378 int32x4_t accum1 = vdupq_n_s32(0);
379 int32x4_t accum2 = vdupq_n_s32(0);
380 int32x4_t accum3 = vdupq_n_s32(0);
381
382 int start = (filterOffset<<2);
383
384 // We will load and accumulate with four coefficients per iteration.
385 for (int filter_x = 0; filter_x < (filterLength >> 2); filter_x++) {
386 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;
387
388 coeffs = vld1_s16(filterValues);
389 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask0));
390 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask1));
391 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask2));
392 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask3));
393
394 uint8x16_t pixels;
395 int16x8_t p01_16, p23_16;
396 int32x4_t p0, p1, p2, p3;
397
398
399 #define ITERATION(src, accum) \
400 pixels = vld1q_u8(src); \
401 p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels))); \
402 p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); \
403 p0 = vmull_s16(vget_low_s16(p01_16), coeff0); \
404 p1 = vmull_s16(vget_high_s16(p01_16), coeff1); \
405 p2 = vmull_s16(vget_low_s16(p23_16), coeff2); \
406 p3 = vmull_s16(vget_high_s16(p23_16), coeff3); \
407 accum += p0; \
408 accum += p1; \
409 accum += p2; \
410 accum += p3
411
412 ITERATION(srcData[0] + start, accum0);
413 ITERATION(srcData[1] + start, accum1);
414 ITERATION(srcData[2] + start, accum2);
415 ITERATION(srcData[3] + start, accum3);
416
417 start += 16;
418 filterValues += 4;
419 }
420
421 int r = filterLength & 3;
422 if (r) {
423 int remainder_offset = (filterOffset + filterLength - r) * 4;
424 accum_remainder(srcData[0] + remainder_offset, filterValues, accum0, r);
425 accum_remainder(srcData[1] + remainder_offset, filterValues, accum1, r);
426 accum_remainder(srcData[2] + remainder_offset, filterValues, accum2, r);
427 accum_remainder(srcData[3] + remainder_offset, filterValues, accum3, r);
428 }
429
430 int16x4_t accum16;
431 uint8x8_t res0, res1, res2, res3;
432
433 #define PACK_RESULT(accum, res) \
434 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \
435 accum16 = vqmovn_s32(accum); \
436 res = vqmovun_s16(vcombine_s16(accum16, accum16));
437
438 PACK_RESULT(accum0, res0);
439 PACK_RESULT(accum1, res1);
440 PACK_RESULT(accum2, res2);
441 PACK_RESULT(accum3, res3);
442
443 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u 8(res0), 0);
444 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u 8(res1), 0);
445 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u 8(res2), 0);
446 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u 8(res3), 0);
447 outRow[0] += 4;
448 outRow[1] += 4;
449 outRow[2] += 4;
450 outRow[3] += 4;
451 }
452 }
453
454 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {
455 procs->fConvolveVertically = &convolveVertically_neon;
456 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;
457 procs->fConvolveHorizontally = &convolveHorizontally_neon;
458 }
OLDNEW
« no previous file with comments | « src/opts/SkBitmapFilter_opts_SSE2.cpp ('k') | src/opts/SkBitmapProcState_opts_arm.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698