OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 Google Inc. | 2 * Copyright 2012 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBitmapProcState.h" | 8 #include "SkBitmapProcState.h" |
9 #include "SkBitmapProcState_filter.h" | 9 #include "SkBitmapProcState_filter.h" |
10 #include "SkColorPriv.h" | 10 #include "SkColorPriv.h" |
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
76 SG8_alpha_D32_filter_DXDY_neon, | 76 SG8_alpha_D32_filter_DXDY_neon, |
77 SG8_alpha_D32_filter_DX_neon, | 77 SG8_alpha_D32_filter_DX_neon, |
78 SG8_alpha_D32_filter_DX_neon, | 78 SG8_alpha_D32_filter_DX_neon, |
79 }; | 79 }; |
80 | 80 |
81 /////////////////////////////////////////////////////////////////////////////// | 81 /////////////////////////////////////////////////////////////////////////////// |
82 | 82 |
83 #include <arm_neon.h> | 83 #include <arm_neon.h> |
84 #include "SkConvolver.h" | 84 #include "SkConvolver.h" |
85 | 85 |
| 86 static SK_ALWAYS_INLINE void accum_remainder(const unsigned char* pixels_left, |
| 87 const SkConvolutionFilter1D::ConvolutionFixed* filter_values, int32x4_t&
accum, int r) { |
| 88 int remainder[4] = {0}; |
| 89 for (int i = 0; i < r; i++) { |
| 90 SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i]; |
| 91 remainder[0] += coeff * pixels_left[i * 4 + 0]; |
| 92 remainder[1] += coeff * pixels_left[i * 4 + 1]; |
| 93 remainder[2] += coeff * pixels_left[i * 4 + 2]; |
| 94 remainder[3] += coeff * pixels_left[i * 4 + 3]; |
| 95 } |
| 96 int32x4_t t = {remainder[0], remainder[1], remainder[2], remainder[3]}; |
| 97 accum += t; |
| 98 } |
| 99 |
86 // Convolves horizontally along a single row. The row data is given in | 100 // Convolves horizontally along a single row. The row data is given in |
87 // |srcData| and continues for the numValues() of the filter. | 101 // |srcData| and continues for the numValues() of the filter. |
88 void convolveHorizontally_neon(const unsigned char* srcData, | 102 void convolveHorizontally_neon(const unsigned char* srcData, |
89 const SkConvolutionFilter1D& filter, | 103 const SkConvolutionFilter1D& filter, |
90 unsigned char* outRow, | 104 unsigned char* outRow, |
91 bool hasAlpha) { | 105 bool hasAlpha) { |
92 // Loop over each pixel on this row in the output image. | 106 // Loop over each pixel on this row in the output image. |
93 int numValues = filter.numValues(); | 107 int numValues = filter.numValues(); |
94 for (int outX = 0; outX < numValues; outX++) { | 108 for (int outX = 0; outX < numValues; outX++) { |
95 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); | 109 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); |
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
133 | 147 |
134 accum += p0; | 148 accum += p0; |
135 accum += p1; | 149 accum += p1; |
136 accum += p2; | 150 accum += p2; |
137 accum += p3; | 151 accum += p3; |
138 | 152 |
139 // Advance the pointers | 153 // Advance the pointers |
140 rowToFilter += 16; | 154 rowToFilter += 16; |
141 filterValues += 4; | 155 filterValues += 4; |
142 } | 156 } |
| 157 |
143 int r = filterLength & 3; | 158 int r = filterLength & 3; |
144 if (r) { | 159 if (r) { |
145 const uint16_t mask[4][4] = { | 160 int remainder_offset = (filterOffset + filterLength - r) * 4; |
146 {0, 0, 0, 0}, | 161 accum_remainder(srcData + remainder_offset, filterValues, accum, r); |
147 {0xFFFF, 0, 0, 0}, | |
148 {0xFFFF, 0xFFFF, 0, 0}, | |
149 {0xFFFF, 0xFFFF, 0xFFFF, 0} | |
150 }; | |
151 uint16x4_t coeffs; | |
152 int16x4_t coeff0, coeff1, coeff2; | |
153 coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues)); | |
154 coeffs &= vld1_u16(&mask[r][0]); | |
155 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c
oeff_mask0)); | |
156 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c
oeff_mask1)); | |
157 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c
oeff_mask2)); | |
158 | |
159 // Load pixels and calc | |
160 uint8x16_t pixels = vld1q_u8(rowToFilter); | |
161 int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels
))); | |
162 int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixel
s))); | |
163 int32x4_t p0 = vmull_s16(vget_low_s16(p01_16), coeff0); | |
164 int32x4_t p1 = vmull_s16(vget_high_s16(p01_16), coeff1); | |
165 int32x4_t p2 = vmull_s16(vget_low_s16(p23_16), coeff2); | |
166 | |
167 accum += p0; | |
168 accum += p1; | |
169 accum += p2; | |
170 } | 162 } |
171 | 163 |
172 // Bring this value back in range. All of the filter scaling factors | 164 // Bring this value back in range. All of the filter scaling factors |
173 // are in fixed point with kShiftBits bits of fractional part. | 165 // are in fixed point with kShiftBits bits of fractional part. |
174 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); | 166 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); |
175 | 167 |
176 // Pack and store the new pixel. | 168 // Pack and store the new pixel. |
177 int16x4_t accum16 = vqmovn_s32(accum); | 169 int16x4_t accum16 = vqmovn_s32(accum); |
178 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16)); | 170 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16)); |
179 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(a
ccum8), 0); | 171 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(a
ccum8), 0); |
(...skipping 187 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
367 unsigned char* outRow[4], | 359 unsigned char* outRow[4], |
368 size_t outRowBytes) { | 360 size_t outRowBytes) { |
369 | 361 |
370 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); | 362 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); |
371 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); | 363 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); |
372 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); | 364 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); |
373 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); | 365 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); |
374 int num_values = filter.numValues(); | 366 int num_values = filter.numValues(); |
375 | 367 |
376 int filterOffset, filterLength; | 368 int filterOffset, filterLength; |
377 // |mask| will be used to decimate all extra filter coefficients that are | |
378 // loaded by SIMD when |filter_length| is not divisible by 4. | |
379 // mask[0] is not used in following algorithm. | |
380 const uint16_t mask[4][4] = { | |
381 {0, 0, 0, 0}, | |
382 {0xFFFF, 0, 0, 0}, | |
383 {0xFFFF, 0xFFFF, 0, 0}, | |
384 {0xFFFF, 0xFFFF, 0xFFFF, 0} | |
385 }; | |
386 | 369 |
387 // Output one pixel each iteration, calculating all channels (RGBA) together
. | 370 // Output one pixel each iteration, calculating all channels (RGBA) together
. |
388 for (int outX = 0; outX < num_values; outX++) { | 371 for (int outX = 0; outX < num_values; outX++) { |
389 | 372 |
390 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = | 373 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = |
391 filter.FilterForValue(outX, &filterOffset, &filterLength); | 374 filter.FilterForValue(outX, &filterOffset, &filterLength); |
392 | 375 |
393 // four pixels in a column per iteration. | 376 // four pixels in a column per iteration. |
394 int32x4_t accum0 = vdupq_n_s32(0); | 377 int32x4_t accum0 = vdupq_n_s32(0); |
395 int32x4_t accum1 = vdupq_n_s32(0); | 378 int32x4_t accum1 = vdupq_n_s32(0); |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
430 ITERATION(srcData[1] + start, accum1); | 413 ITERATION(srcData[1] + start, accum1); |
431 ITERATION(srcData[2] + start, accum2); | 414 ITERATION(srcData[2] + start, accum2); |
432 ITERATION(srcData[3] + start, accum3); | 415 ITERATION(srcData[3] + start, accum3); |
433 | 416 |
434 start += 16; | 417 start += 16; |
435 filterValues += 4; | 418 filterValues += 4; |
436 } | 419 } |
437 | 420 |
438 int r = filterLength & 3; | 421 int r = filterLength & 3; |
439 if (r) { | 422 if (r) { |
440 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3; | 423 int remainder_offset = (filterOffset + filterLength - r) * 4; |
441 coeffs = vld1_s16(filterValues); | 424 accum_remainder(srcData[0] + remainder_offset, filterValues, accum0,
r); |
442 coeffs &= vreinterpret_s16_u16(vld1_u16(&mask[r][0])); | 425 accum_remainder(srcData[1] + remainder_offset, filterValues, accum1,
r); |
443 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask0)); | 426 accum_remainder(srcData[2] + remainder_offset, filterValues, accum2,
r); |
444 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask1)); | 427 accum_remainder(srcData[3] + remainder_offset, filterValues, accum3,
r); |
445 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask2)); | |
446 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask3)); | |
447 | |
448 uint8x16_t pixels; | |
449 int16x8_t p01_16, p23_16; | |
450 int32x4_t p0, p1, p2, p3; | |
451 | |
452 ITERATION(srcData[0] + start, accum0); | |
453 ITERATION(srcData[1] + start, accum1); | |
454 ITERATION(srcData[2] + start, accum2); | |
455 ITERATION(srcData[3] + start, accum3); | |
456 } | 428 } |
457 | 429 |
458 int16x4_t accum16; | 430 int16x4_t accum16; |
459 uint8x8_t res0, res1, res2, res3; | 431 uint8x8_t res0, res1, res2, res3; |
460 | 432 |
461 #define PACK_RESULT(accum, res) \ | 433 #define PACK_RESULT(accum, res) \ |
462 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \ | 434 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \ |
463 accum16 = vqmovn_s32(accum); \ | 435 accum16 = vqmovn_s32(accum); \ |
464 res = vqmovun_s16(vcombine_s16(accum16, accum16)); | 436 res = vqmovun_s16(vcombine_s16(accum16, accum16)); |
465 | 437 |
466 PACK_RESULT(accum0, res0); | 438 PACK_RESULT(accum0, res0); |
467 PACK_RESULT(accum1, res1); | 439 PACK_RESULT(accum1, res1); |
468 PACK_RESULT(accum2, res2); | 440 PACK_RESULT(accum2, res2); |
469 PACK_RESULT(accum3, res3); | 441 PACK_RESULT(accum3, res3); |
470 | 442 |
471 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u
8(res0), 0); | 443 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u
8(res0), 0); |
472 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u
8(res1), 0); | 444 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u
8(res1), 0); |
473 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u
8(res2), 0); | 445 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u
8(res2), 0); |
474 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u
8(res3), 0); | 446 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u
8(res3), 0); |
475 outRow[0] += 4; | 447 outRow[0] += 4; |
476 outRow[1] += 4; | 448 outRow[1] += 4; |
477 outRow[2] += 4; | 449 outRow[2] += 4; |
478 outRow[3] += 4; | 450 outRow[3] += 4; |
479 } | 451 } |
480 } | 452 } |
481 | 453 |
482 void applySIMDPadding_neon(SkConvolutionFilter1D *filter) { | |
483 // Padding |paddingCount| of more dummy coefficients after the coefficients | |
484 // of last filter to prevent SIMD instructions which load 8 or 16 bytes | |
485 // together to access invalid memory areas. We are not trying to align the | |
486 // coefficients right now due to the opaqueness of <vector> implementation. | |
487 // This has to be done after all |AddFilter| calls. | |
488 for (int i = 0; i < 8; ++i) { | |
489 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix
ed>(0)); | |
490 } | |
491 } | |
492 | |
493 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) { | 454 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) { |
494 procs->fExtraHorizontalReads = 3; | |
495 procs->fConvolveVertically = &convolveVertically_neon; | 455 procs->fConvolveVertically = &convolveVertically_neon; |
496 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon; | 456 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon; |
497 procs->fConvolveHorizontally = &convolveHorizontally_neon; | 457 procs->fConvolveHorizontally = &convolveHorizontally_neon; |
498 procs->fApplySIMDPadding = &applySIMDPadding_neon; | |
499 } | 458 } |
OLD | NEW |