| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2012 Google Inc. | 2 * Copyright 2012 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "SkBitmapProcState.h" | 8 #include "SkBitmapProcState.h" |
| 9 #include "SkBitmapProcState_filter.h" | 9 #include "SkBitmapProcState_filter.h" |
| 10 #include "SkColorPriv.h" | 10 #include "SkColorPriv.h" |
| (...skipping 124 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 135 accum += p1; | 135 accum += p1; |
| 136 accum += p2; | 136 accum += p2; |
| 137 accum += p3; | 137 accum += p3; |
| 138 | 138 |
| 139 // Advance the pointers | 139 // Advance the pointers |
| 140 rowToFilter += 16; | 140 rowToFilter += 16; |
| 141 filterValues += 4; | 141 filterValues += 4; |
| 142 } | 142 } |
| 143 int r = filterLength & 3; | 143 int r = filterLength & 3; |
| 144 if (r) { | 144 if (r) { |
| 145 const uint16_t mask[4][4] = { | 145 #define ACCUM_REMAINDER(src, accum) {
\ |
| 146 {0, 0, 0, 0}, | 146 int remainder[4] = {0};
\ |
| 147 {0xFFFF, 0, 0, 0}, | 147 const unsigned char* pixels_left = src + (filterOffset + filterLengt
h - r) * 4;\ |
| 148 {0xFFFF, 0xFFFF, 0, 0}, | 148 for (int i = 0; i < r; i++) {
\ |
| 149 {0xFFFF, 0xFFFF, 0xFFFF, 0} | 149 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i];
\ |
| 150 }; | 150 remainder[0] += coeff * pixels_left[i * 4 + 0];
\ |
| 151 uint16x4_t coeffs; | 151 remainder[1] += coeff * pixels_left[i * 4 + 1];
\ |
| 152 int16x4_t coeff0, coeff1, coeff2; | 152 remainder[2] += coeff * pixels_left[i * 4 + 2];
\ |
| 153 coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues)); | 153 remainder[3] += coeff * pixels_left[i * 4 + 3];
\ |
| 154 coeffs &= vld1_u16(&mask[r][0]); | 154 }
\ |
| 155 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c
oeff_mask0)); | 155 int32x4_t t{remainder[0], remainder[1], remainder[2], remainder[3]};
\ |
| 156 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c
oeff_mask1)); | 156 accum += t; } |
| 157 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c
oeff_mask2)); | 157 ACCUM_REMAINDER(srcData, accum); |
| 158 | |
| 159 // Load pixels and calc | |
| 160 uint8x16_t pixels = vld1q_u8(rowToFilter); | |
| 161 int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels
))); | |
| 162 int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixel
s))); | |
| 163 int32x4_t p0 = vmull_s16(vget_low_s16(p01_16), coeff0); | |
| 164 int32x4_t p1 = vmull_s16(vget_high_s16(p01_16), coeff1); | |
| 165 int32x4_t p2 = vmull_s16(vget_low_s16(p23_16), coeff2); | |
| 166 | |
| 167 accum += p0; | |
| 168 accum += p1; | |
| 169 accum += p2; | |
| 170 } | 158 } |
| 171 | 159 |
| 172 // Bring this value back in range. All of the filter scaling factors | 160 // Bring this value back in range. All of the filter scaling factors |
| 173 // are in fixed point with kShiftBits bits of fractional part. | 161 // are in fixed point with kShiftBits bits of fractional part. |
| 174 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); | 162 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); |
| 175 | 163 |
| 176 // Pack and store the new pixel. | 164 // Pack and store the new pixel. |
| 177 int16x4_t accum16 = vqmovn_s32(accum); | 165 int16x4_t accum16 = vqmovn_s32(accum); |
| 178 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16)); | 166 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16)); |
| 179 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(a
ccum8), 0); | 167 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(a
ccum8), 0); |
| (...skipping 187 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 367 unsigned char* outRow[4], | 355 unsigned char* outRow[4], |
| 368 size_t outRowBytes) { | 356 size_t outRowBytes) { |
| 369 | 357 |
| 370 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); | 358 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); |
| 371 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); | 359 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); |
| 372 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); | 360 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); |
| 373 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); | 361 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); |
| 374 int num_values = filter.numValues(); | 362 int num_values = filter.numValues(); |
| 375 | 363 |
| 376 int filterOffset, filterLength; | 364 int filterOffset, filterLength; |
| 377 // |mask| will be used to decimate all extra filter coefficients that are | |
| 378 // loaded by SIMD when |filter_length| is not divisible by 4. | |
| 379 // mask[0] is not used in following algorithm. | |
| 380 const uint16_t mask[4][4] = { | |
| 381 {0, 0, 0, 0}, | |
| 382 {0xFFFF, 0, 0, 0}, | |
| 383 {0xFFFF, 0xFFFF, 0, 0}, | |
| 384 {0xFFFF, 0xFFFF, 0xFFFF, 0} | |
| 385 }; | |
| 386 | 365 |
| 387 // Output one pixel each iteration, calculating all channels (RGBA) together
. | 366 // Output one pixel each iteration, calculating all channels (RGBA) together
. |
| 388 for (int outX = 0; outX < num_values; outX++) { | 367 for (int outX = 0; outX < num_values; outX++) { |
| 389 | 368 |
| 390 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = | 369 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = |
| 391 filter.FilterForValue(outX, &filterOffset, &filterLength); | 370 filter.FilterForValue(outX, &filterOffset, &filterLength); |
| 392 | 371 |
| 393 // four pixels in a column per iteration. | 372 // four pixels in a column per iteration. |
| 394 int32x4_t accum0 = vdupq_n_s32(0); | 373 int32x4_t accum0 = vdupq_n_s32(0); |
| 395 int32x4_t accum1 = vdupq_n_s32(0); | 374 int32x4_t accum1 = vdupq_n_s32(0); |
| (...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 430 ITERATION(srcData[1] + start, accum1); | 409 ITERATION(srcData[1] + start, accum1); |
| 431 ITERATION(srcData[2] + start, accum2); | 410 ITERATION(srcData[2] + start, accum2); |
| 432 ITERATION(srcData[3] + start, accum3); | 411 ITERATION(srcData[3] + start, accum3); |
| 433 | 412 |
| 434 start += 16; | 413 start += 16; |
| 435 filterValues += 4; | 414 filterValues += 4; |
| 436 } | 415 } |
| 437 | 416 |
| 438 int r = filterLength & 3; | 417 int r = filterLength & 3; |
| 439 if (r) { | 418 if (r) { |
| 440 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3; | 419 ACCUM_REMAINDER(srcData[0], accum0); |
| 441 coeffs = vld1_s16(filterValues); | 420 ACCUM_REMAINDER(srcData[1], accum1); |
| 442 coeffs &= vreinterpret_s16_u16(vld1_u16(&mask[r][0])); | 421 ACCUM_REMAINDER(srcData[2], accum2); |
| 443 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask0)); | 422 ACCUM_REMAINDER(srcData[3], accum3); |
| 444 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask1)); | |
| 445 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask2)); | |
| 446 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c
oeff_mask3)); | |
| 447 | |
| 448 uint8x16_t pixels; | |
| 449 int16x8_t p01_16, p23_16; | |
| 450 int32x4_t p0, p1, p2, p3; | |
| 451 | |
| 452 ITERATION(srcData[0] + start, accum0); | |
| 453 ITERATION(srcData[1] + start, accum1); | |
| 454 ITERATION(srcData[2] + start, accum2); | |
| 455 ITERATION(srcData[3] + start, accum3); | |
| 456 } | 423 } |
| 457 | 424 |
| 458 int16x4_t accum16; | 425 int16x4_t accum16; |
| 459 uint8x8_t res0, res1, res2, res3; | 426 uint8x8_t res0, res1, res2, res3; |
| 460 | 427 |
| 461 #define PACK_RESULT(accum, res) \ | 428 #define PACK_RESULT(accum, res) \ |
| 462 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \ | 429 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \ |
| 463 accum16 = vqmovn_s32(accum); \ | 430 accum16 = vqmovn_s32(accum); \ |
| 464 res = vqmovun_s16(vcombine_s16(accum16, accum16)); | 431 res = vqmovun_s16(vcombine_s16(accum16, accum16)); |
| 465 | 432 |
| 466 PACK_RESULT(accum0, res0); | 433 PACK_RESULT(accum0, res0); |
| 467 PACK_RESULT(accum1, res1); | 434 PACK_RESULT(accum1, res1); |
| 468 PACK_RESULT(accum2, res2); | 435 PACK_RESULT(accum2, res2); |
| 469 PACK_RESULT(accum3, res3); | 436 PACK_RESULT(accum3, res3); |
| 470 | 437 |
| 471 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u
8(res0), 0); | 438 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u
8(res0), 0); |
| 472 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u
8(res1), 0); | 439 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u
8(res1), 0); |
| 473 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u
8(res2), 0); | 440 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u
8(res2), 0); |
| 474 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u
8(res3), 0); | 441 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u
8(res3), 0); |
| 475 outRow[0] += 4; | 442 outRow[0] += 4; |
| 476 outRow[1] += 4; | 443 outRow[1] += 4; |
| 477 outRow[2] += 4; | 444 outRow[2] += 4; |
| 478 outRow[3] += 4; | 445 outRow[3] += 4; |
| 479 } | 446 } |
| 480 } | 447 } |
| 481 | 448 |
| 482 void applySIMDPadding_neon(SkConvolutionFilter1D *filter) { | |
| 483 // Padding |paddingCount| of more dummy coefficients after the coefficients | |
| 484 // of last filter to prevent SIMD instructions which load 8 or 16 bytes | |
| 485 // together to access invalid memory areas. We are not trying to align the | |
| 486 // coefficients right now due to the opaqueness of <vector> implementation. | |
| 487 // This has to be done after all |AddFilter| calls. | |
| 488 for (int i = 0; i < 8; ++i) { | |
| 489 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix
ed>(0)); | |
| 490 } | |
| 491 } | |
| 492 | |
| 493 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) { | 449 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) { |
| 494 procs->fExtraHorizontalReads = 3; | |
| 495 procs->fConvolveVertically = &convolveVertically_neon; | 450 procs->fConvolveVertically = &convolveVertically_neon; |
| 496 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon; | 451 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon; |
| 497 procs->fConvolveHorizontally = &convolveHorizontally_neon; | 452 procs->fConvolveHorizontally = &convolveHorizontally_neon; |
| 498 procs->fApplySIMDPadding = &applySIMDPadding_neon; | |
| 499 } | 453 } |
| OLD | NEW |