OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 Google Inc. | 2 * Copyright 2012 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBitmapProcState.h" | 8 #include "SkBitmapProcState.h" |
9 #include "SkBitmapProcState_filter.h" | 9 #include "SkBitmapProcState_filter.h" |
10 #include "SkColorPriv.h" | 10 #include "SkColorPriv.h" |
(...skipping 124 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
135 accum += p1; | 135 accum += p1; |
136 accum += p2; | 136 accum += p2; |
137 accum += p3; | 137 accum += p3; |
138 | 138 |
139 // Advance the pointers | 139 // Advance the pointers |
140 rowToFilter += 16; | 140 rowToFilter += 16; |
141 filterValues += 4; | 141 filterValues += 4; |
142 } | 142 } |
143 int r = filterLength & 3; | 143 int r = filterLength & 3; |
144 if (r) { | 144 if (r) { |
145 const uint16_t mask[4][4] = { | 145 #define ACCUM_REMAINDER(src, accum) {
\ |
146 {0, 0, 0, 0}, | 146 int remainder[4] = {0};
\ |
147 {0xFFFF, 0, 0, 0}, | 147 const unsigned char* pixels_left = src + (filterOffset + filterLengt
h - r) * 4;\ |
148 {0xFFFF, 0xFFFF, 0, 0}, | 148 for (int i = 0; i < r; i++) {
\ |
149 {0xFFFF, 0xFFFF, 0xFFFF, 0} | 149 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i];
\ |
150 }; | 150 remainder[0] += coeff * pixels_left[i * 4 + 0];
\ |
151 uint16x4_t coeffs; | 151 remainder[1] += coeff * pixels_left[i * 4 + 1];
\ |
152 int16x4_t coeff0, coeff1, coeff2; | 152 remainder[2] += coeff * pixels_left[i * 4 + 2];
\ |
153 coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues)); | 153 remainder[3] += coeff * pixels_left[i * 4 + 3];
\ |
154 coeffs &= vld1_u16(&mask[r][0]); | 154 }
\ |
155 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c
oeff_mask0)); | 155 int32x4_t t{remainder[0], remainder[1], remainder[2], remainder[3]};
\ |
156 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c
oeff_mask1)); | 156 accum += t; } |
157 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c
oeff_mask2)); | 157 ACCUM_REMAINDER(srcData, accum); |
158 | |
159 // Load pixels and calc | |
160 uint8x16_t pixels = vld1q_u8(rowToFilter); | |
161 int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels
))); | |
162 int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixel
s))); | |
163 int32x4_t p0 = vmull_s16(vget_low_s16(p01_16), coeff0); | |
164 int32x4_t p1 = vmull_s16(vget_high_s16(p01_16), coeff1); | |
165 int32x4_t p2 = vmull_s16(vget_low_s16(p23_16), coeff2); | |
166 | |
167 accum += p0; | |
168 accum += p1; | |
169 accum += p2; | |
170 } | 158 } |
171 | 159 |
172 // Bring this value back in range. All of the filter scaling factors | 160 // Bring this value back in range. All of the filter scaling factors |
173 // are in fixed point with kShiftBits bits of fractional part. | 161 // are in fixed point with kShiftBits bits of fractional part. |
174 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); | 162 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); |
175 | 163 |
176 // Pack and store the new pixel. | 164 // Pack and store the new pixel. |
177 int16x4_t accum16 = vqmovn_s32(accum); | 165 int16x4_t accum16 = vqmovn_s32(accum); |
178 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16)); | 166 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16)); |
179 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(a
ccum8), 0); | 167 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(a
ccum8), 0); |
(...skipping 187 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
367 unsigned char* outRow[4], | 355 unsigned char* outRow[4], |
368 size_t outRowBytes) { | 356 size_t outRowBytes) { |
369 | 357 |
370 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); | 358 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); |
371 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); | 359 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); |
372 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); | 360 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); |
373 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); | 361 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); |
374 int num_values = filter.numValues(); | 362 int num_values = filter.numValues(); |
375 | 363 |
376 int filterOffset, filterLength; | 364 int filterOffset, filterLength; |
377 // |mask| will be used to decimate all extra filter coefficients that are | |
378 // loaded by SIMD when |filter_length| is not divisible by 4. | |
379 // mask[0] is not used in following algorithm. | |
380 const uint16_t mask[4][4] = { | |
381 {0, 0, 0, 0}, | |
382 {0xFFFF, 0, 0, 0}, | |
383 {0xFFFF, 0xFFFF, 0, 0}, | |
384 {0xFFFF, 0xFFFF, 0xFFFF, 0} | |
385 }; | |
386 | 365 |
387 // Output one pixel each iteration, calculating all channels (RGBA) together
. | 366 // Output one pixel each iteration, calculating all channels (RGBA) together
. |
388 for (int outX = 0; outX < num_values; outX++) { | 367 for (int outX = 0; outX < num_values; outX++) { |
389 | 368 |
390 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = | 369 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = |
391 filter.FilterForValue(outX, &filterOffset, &filterLength); | 370 filter.FilterForValue(outX, &filterOffset, &filterLength); |
392 | 371 |
393 // four pixels in a column per iteration. | 372 // four pixels in a column per iteration. |
394 int32x4_t accum0 = vdupq_n_s32(0); | 373 int32x4_t accum0 = vdupq_n_s32(0); |
395 int32x4_t accum1 = vdupq_n_s32(0); | 374 int32x4_t accum1 = vdupq_n_s32(0); |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
430 ITERATION(srcData[1] + start, accum1); | 409 ITERATION(srcData[1] + start, accum1); |
431 ITERATION(srcData[2] + start, accum2); | 410 ITERATION(srcData[2] + start, accum2); |
432 ITERATION(srcData[3] + start, accum3); | 411 ITERATION(srcData[3] + start, accum3); |
433 | 412 |
434 start += 16; | 413 start += 16; |
435 filterValues += 4; | 414 filterValues += 4; |
436 } | 415 } |
437 | 416 |
438 int r = filterLength & 3; | 417 int r = filterLength & 3; |
439 if (r) { | 418 if (r) { |
440 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3; | 419 ACCUM_REMAINDER(srcData[0], accum0); |
441 coeffs = vld1_s16(filterValues); | 420 ACCUM_REMAINDER(srcData[1], accum1); |
442 coeffs &= vreinterpret_s16_u16(vld1_u16(&mask[r][0])); | 421 ACCUM_REMAINDER(srcData[2], accum2); |
443 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask0)); | 422 ACCUM_REMAINDER(srcData[3], accum3); |
444 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask1)); | |
445 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask2)); | |
446 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask3)); | |
447 | |
448 uint8x16_t pixels; | |
449 int16x8_t p01_16, p23_16; | |
450 int32x4_t p0, p1, p2, p3; | |
451 | |
452 ITERATION(srcData[0] + start, accum0); | |
453 ITERATION(srcData[1] + start, accum1); | |
454 ITERATION(srcData[2] + start, accum2); | |
455 ITERATION(srcData[3] + start, accum3); | |
456 } | 423 } |
457 | 424 |
458 int16x4_t accum16; | 425 int16x4_t accum16; |
459 uint8x8_t res0, res1, res2, res3; | 426 uint8x8_t res0, res1, res2, res3; |
460 | 427 |
461 #define PACK_RESULT(accum, res) \ | 428 #define PACK_RESULT(accum, res) \ |
462 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \ | 429 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \ |
463 accum16 = vqmovn_s32(accum); \ | 430 accum16 = vqmovn_s32(accum); \ |
464 res = vqmovun_s16(vcombine_s16(accum16, accum16)); | 431 res = vqmovun_s16(vcombine_s16(accum16, accum16)); |
465 | 432 |
466 PACK_RESULT(accum0, res0); | 433 PACK_RESULT(accum0, res0); |
467 PACK_RESULT(accum1, res1); | 434 PACK_RESULT(accum1, res1); |
468 PACK_RESULT(accum2, res2); | 435 PACK_RESULT(accum2, res2); |
469 PACK_RESULT(accum3, res3); | 436 PACK_RESULT(accum3, res3); |
470 | 437 |
471 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u
8(res0), 0); | 438 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u
8(res0), 0); |
472 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u
8(res1), 0); | 439 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u
8(res1), 0); |
473 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u
8(res2), 0); | 440 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u
8(res2), 0); |
474 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u
8(res3), 0); | 441 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u
8(res3), 0); |
475 outRow[0] += 4; | 442 outRow[0] += 4; |
476 outRow[1] += 4; | 443 outRow[1] += 4; |
477 outRow[2] += 4; | 444 outRow[2] += 4; |
478 outRow[3] += 4; | 445 outRow[3] += 4; |
479 } | 446 } |
480 } | 447 } |
481 | 448 |
482 void applySIMDPadding_neon(SkConvolutionFilter1D *filter) { | |
483 // Padding |paddingCount| of more dummy coefficients after the coefficients | |
484 // of last filter to prevent SIMD instructions which load 8 or 16 bytes | |
485 // together to access invalid memory areas. We are not trying to align the | |
486 // coefficients right now due to the opaqueness of <vector> implementation. | |
487 // This has to be done after all |AddFilter| calls. | |
488 for (int i = 0; i < 8; ++i) { | |
489 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix
ed>(0)); | |
490 } | |
491 } | |
492 | |
493 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) { | 449 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) { |
494 procs->fExtraHorizontalReads = 3; | |
495 procs->fConvolveVertically = &convolveVertically_neon; | 450 procs->fConvolveVertically = &convolveVertically_neon; |
496 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon; | 451 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon; |
497 procs->fConvolveHorizontally = &convolveHorizontally_neon; | 452 procs->fConvolveHorizontally = &convolveHorizontally_neon; |
498 procs->fApplySIMDPadding = &applySIMDPadding_neon; | |
499 } | 453 } |
OLD | NEW |