Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(454)

Side by Side Diff: src/opts/SkBitmapProcState_arm_neon.cpp

Issue 2481733003: Make SSE2/Neon convolution functions not to read extra bytes (Closed)
Patch Set: improve neon performance Created 4 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright 2012 Google Inc. 2 * Copyright 2012 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBitmapProcState.h" 8 #include "SkBitmapProcState.h"
9 #include "SkBitmapProcState_filter.h" 9 #include "SkBitmapProcState_filter.h"
10 #include "SkColorPriv.h" 10 #include "SkColorPriv.h"
(...skipping 124 matching lines...) Expand 10 before | Expand all | Expand 10 after
135 accum += p1; 135 accum += p1;
136 accum += p2; 136 accum += p2;
137 accum += p3; 137 accum += p3;
138 138
139 // Advance the pointers 139 // Advance the pointers
140 rowToFilter += 16; 140 rowToFilter += 16;
141 filterValues += 4; 141 filterValues += 4;
142 } 142 }
143 int r = filterLength & 3; 143 int r = filterLength & 3;
144 if (r) { 144 if (r) {
145 const uint16_t mask[4][4] = { 145 #define ACCUM_REMAINDER(src, accum) { \
146 {0, 0, 0, 0}, 146 int remainder[4] = {0}; \
147 {0xFFFF, 0, 0, 0}, 147 const unsigned char* pixels_left = src + (filterOffset + filterLengt h - r) * 4;\
148 {0xFFFF, 0xFFFF, 0, 0}, 148 for (int i = 0; i < r; i++) { \
149 {0xFFFF, 0xFFFF, 0xFFFF, 0} 149 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i]; \
150 }; 150 remainder[0] += coeff * pixels_left[i * 4 + 0]; \
151 uint16x4_t coeffs; 151 remainder[1] += coeff * pixels_left[i * 4 + 1]; \
152 int16x4_t coeff0, coeff1, coeff2; 152 remainder[2] += coeff * pixels_left[i * 4 + 2]; \
153 coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues)); 153 remainder[3] += coeff * pixels_left[i * 4 + 3]; \
154 coeffs &= vld1_u16(&mask[r][0]); 154 } \
155 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c oeff_mask0)); 155 int32x4_t t{remainder[0], remainder[1], remainder[2], remainder[3]}; \
156 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c oeff_mask1)); 156 accum += t; }
157 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c oeff_mask2)); 157 ACCUM_REMAINDER(srcData, accum);
158
159 // Load pixels and calc
160 uint8x16_t pixels = vld1q_u8(rowToFilter);
161 int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels )));
162 int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixel s)));
163 int32x4_t p0 = vmull_s16(vget_low_s16(p01_16), coeff0);
164 int32x4_t p1 = vmull_s16(vget_high_s16(p01_16), coeff1);
165 int32x4_t p2 = vmull_s16(vget_low_s16(p23_16), coeff2);
166
167 accum += p0;
168 accum += p1;
169 accum += p2;
170 } 158 }
171 159
172 // Bring this value back in range. All of the filter scaling factors 160 // Bring this value back in range. All of the filter scaling factors
173 // are in fixed point with kShiftBits bits of fractional part. 161 // are in fixed point with kShiftBits bits of fractional part.
174 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); 162 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);
175 163
176 // Pack and store the new pixel. 164 // Pack and store the new pixel.
177 int16x4_t accum16 = vqmovn_s32(accum); 165 int16x4_t accum16 = vqmovn_s32(accum);
178 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16)); 166 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
179 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(a ccum8), 0); 167 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(a ccum8), 0);
(...skipping 187 matching lines...) Expand 10 before | Expand all | Expand 10 after
367 unsigned char* outRow[4], 355 unsigned char* outRow[4],
368 size_t outRowBytes) { 356 size_t outRowBytes) {
369 357
370 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); 358 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
371 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); 359 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
372 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); 360 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
373 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); 361 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);
374 int num_values = filter.numValues(); 362 int num_values = filter.numValues();
375 363
376 int filterOffset, filterLength; 364 int filterOffset, filterLength;
377 // |mask| will be used to decimate all extra filter coefficients that are
378 // loaded by SIMD when |filter_length| is not divisible by 4.
379 // mask[0] is not used in following algorithm.
380 const uint16_t mask[4][4] = {
381 {0, 0, 0, 0},
382 {0xFFFF, 0, 0, 0},
383 {0xFFFF, 0xFFFF, 0, 0},
384 {0xFFFF, 0xFFFF, 0xFFFF, 0}
385 };
386 365
387 // Output one pixel each iteration, calculating all channels (RGBA) together . 366 // Output one pixel each iteration, calculating all channels (RGBA) together .
388 for (int outX = 0; outX < num_values; outX++) { 367 for (int outX = 0; outX < num_values; outX++) {
389 368
390 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = 369 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
391 filter.FilterForValue(outX, &filterOffset, &filterLength); 370 filter.FilterForValue(outX, &filterOffset, &filterLength);
392 371
393 // four pixels in a column per iteration. 372 // four pixels in a column per iteration.
394 int32x4_t accum0 = vdupq_n_s32(0); 373 int32x4_t accum0 = vdupq_n_s32(0);
395 int32x4_t accum1 = vdupq_n_s32(0); 374 int32x4_t accum1 = vdupq_n_s32(0);
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
430 ITERATION(srcData[1] + start, accum1); 409 ITERATION(srcData[1] + start, accum1);
431 ITERATION(srcData[2] + start, accum2); 410 ITERATION(srcData[2] + start, accum2);
432 ITERATION(srcData[3] + start, accum3); 411 ITERATION(srcData[3] + start, accum3);
433 412
434 start += 16; 413 start += 16;
435 filterValues += 4; 414 filterValues += 4;
436 } 415 }
437 416
438 int r = filterLength & 3; 417 int r = filterLength & 3;
439 if (r) { 418 if (r) {
440 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3; 419 ACCUM_REMAINDER(srcData[0], accum0);
441 coeffs = vld1_s16(filterValues); 420 ACCUM_REMAINDER(srcData[1], accum1);
442 coeffs &= vreinterpret_s16_u16(vld1_u16(&mask[r][0])); 421 ACCUM_REMAINDER(srcData[2], accum2);
443 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask0)); 422 ACCUM_REMAINDER(srcData[3], accum3);
444 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask1));
445 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask2));
446 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask3));
447
448 uint8x16_t pixels;
449 int16x8_t p01_16, p23_16;
450 int32x4_t p0, p1, p2, p3;
451
452 ITERATION(srcData[0] + start, accum0);
453 ITERATION(srcData[1] + start, accum1);
454 ITERATION(srcData[2] + start, accum2);
455 ITERATION(srcData[3] + start, accum3);
456 } 423 }
457 424
458 int16x4_t accum16; 425 int16x4_t accum16;
459 uint8x8_t res0, res1, res2, res3; 426 uint8x8_t res0, res1, res2, res3;
460 427
461 #define PACK_RESULT(accum, res) \ 428 #define PACK_RESULT(accum, res) \
462 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \ 429 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \
463 accum16 = vqmovn_s32(accum); \ 430 accum16 = vqmovn_s32(accum); \
464 res = vqmovun_s16(vcombine_s16(accum16, accum16)); 431 res = vqmovun_s16(vcombine_s16(accum16, accum16));
465 432
466 PACK_RESULT(accum0, res0); 433 PACK_RESULT(accum0, res0);
467 PACK_RESULT(accum1, res1); 434 PACK_RESULT(accum1, res1);
468 PACK_RESULT(accum2, res2); 435 PACK_RESULT(accum2, res2);
469 PACK_RESULT(accum3, res3); 436 PACK_RESULT(accum3, res3);
470 437
471 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u 8(res0), 0); 438 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u 8(res0), 0);
472 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u 8(res1), 0); 439 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u 8(res1), 0);
473 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u 8(res2), 0); 440 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u 8(res2), 0);
474 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u 8(res3), 0); 441 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u 8(res3), 0);
475 outRow[0] += 4; 442 outRow[0] += 4;
476 outRow[1] += 4; 443 outRow[1] += 4;
477 outRow[2] += 4; 444 outRow[2] += 4;
478 outRow[3] += 4; 445 outRow[3] += 4;
479 } 446 }
480 } 447 }
481 448
482 void applySIMDPadding_neon(SkConvolutionFilter1D *filter) {
483 // Padding |paddingCount| of more dummy coefficients after the coefficients
484 // of last filter to prevent SIMD instructions which load 8 or 16 bytes
485 // together to access invalid memory areas. We are not trying to align the
486 // coefficients right now due to the opaqueness of <vector> implementation.
487 // This has to be done after all |AddFilter| calls.
488 for (int i = 0; i < 8; ++i) {
489 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix ed>(0));
490 }
491 }
492
493 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) { 449 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {
494 procs->fExtraHorizontalReads = 3;
495 procs->fConvolveVertically = &convolveVertically_neon; 450 procs->fConvolveVertically = &convolveVertically_neon;
496 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon; 451 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;
497 procs->fConvolveHorizontally = &convolveHorizontally_neon; 452 procs->fConvolveHorizontally = &convolveHorizontally_neon;
498 procs->fApplySIMDPadding = &applySIMDPadding_neon;
499 } 453 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698