Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(70)

Side by Side Diff: src/opts/SkBitmapProcState_arm_neon.cpp

Issue 2481733003: Make SSE2/Neon convolution functions not to read extra bytes (Closed)
Patch Set: Change macros to functions Created 4 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/opts/SkBitmapFilter_opts_SSE2.cpp ('k') | src/opts/opts_check_x86.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 Google Inc. 2 * Copyright 2012 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBitmapProcState.h" 8 #include "SkBitmapProcState.h"
9 #include "SkBitmapProcState_filter.h" 9 #include "SkBitmapProcState_filter.h"
10 #include "SkColorPriv.h" 10 #include "SkColorPriv.h"
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after
76 SG8_alpha_D32_filter_DXDY_neon, 76 SG8_alpha_D32_filter_DXDY_neon,
77 SG8_alpha_D32_filter_DX_neon, 77 SG8_alpha_D32_filter_DX_neon,
78 SG8_alpha_D32_filter_DX_neon, 78 SG8_alpha_D32_filter_DX_neon,
79 }; 79 };
80 80
81 /////////////////////////////////////////////////////////////////////////////// 81 ///////////////////////////////////////////////////////////////////////////////
82 82
83 #include <arm_neon.h> 83 #include <arm_neon.h>
84 #include "SkConvolver.h" 84 #include "SkConvolver.h"
85 85
86 static SK_ALWAYS_INLINE void accum_remainder(const unsigned char* pixels_left,
87 const SkConvolutionFilter1D::ConvolutionFixed* filter_values, int32x4_t& accum, int r) {
88 int remainder[4] = {0};
89 for (int i = 0; i < r; i++) {
90 SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i];
91 remainder[0] += coeff * pixels_left[i * 4 + 0];
92 remainder[1] += coeff * pixels_left[i * 4 + 1];
93 remainder[2] += coeff * pixels_left[i * 4 + 2];
94 remainder[3] += coeff * pixels_left[i * 4 + 3];
95 }
96 int32x4_t t = {remainder[0], remainder[1], remainder[2], remainder[3]};
97 accum += t;
98 }
99
86 // Convolves horizontally along a single row. The row data is given in 100 // Convolves horizontally along a single row. The row data is given in
87 // |srcData| and continues for the numValues() of the filter. 101 // |srcData| and continues for the numValues() of the filter.
88 void convolveHorizontally_neon(const unsigned char* srcData, 102 void convolveHorizontally_neon(const unsigned char* srcData,
89 const SkConvolutionFilter1D& filter, 103 const SkConvolutionFilter1D& filter,
90 unsigned char* outRow, 104 unsigned char* outRow,
91 bool hasAlpha) { 105 bool hasAlpha) {
92 // Loop over each pixel on this row in the output image. 106 // Loop over each pixel on this row in the output image.
93 int numValues = filter.numValues(); 107 int numValues = filter.numValues();
94 for (int outX = 0; outX < numValues; outX++) { 108 for (int outX = 0; outX < numValues; outX++) {
95 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); 109 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
133 147
134 accum += p0; 148 accum += p0;
135 accum += p1; 149 accum += p1;
136 accum += p2; 150 accum += p2;
137 accum += p3; 151 accum += p3;
138 152
139 // Advance the pointers 153 // Advance the pointers
140 rowToFilter += 16; 154 rowToFilter += 16;
141 filterValues += 4; 155 filterValues += 4;
142 } 156 }
157
143 int r = filterLength & 3; 158 int r = filterLength & 3;
144 if (r) { 159 if (r) {
145 const uint16_t mask[4][4] = { 160 int remainder_offset = (filterOffset + filterLength - r) * 4;
146 {0, 0, 0, 0}, 161 accum_remainder(srcData + remainder_offset, filterValues, accum, r);
147 {0xFFFF, 0, 0, 0},
148 {0xFFFF, 0xFFFF, 0, 0},
149 {0xFFFF, 0xFFFF, 0xFFFF, 0}
150 };
151 uint16x4_t coeffs;
152 int16x4_t coeff0, coeff1, coeff2;
153 coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues));
154 coeffs &= vld1_u16(&mask[r][0]);
155 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c oeff_mask0));
156 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c oeff_mask1));
157 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), c oeff_mask2));
158
159 // Load pixels and calc
160 uint8x16_t pixels = vld1q_u8(rowToFilter);
161 int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels )));
162 int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixel s)));
163 int32x4_t p0 = vmull_s16(vget_low_s16(p01_16), coeff0);
164 int32x4_t p1 = vmull_s16(vget_high_s16(p01_16), coeff1);
165 int32x4_t p2 = vmull_s16(vget_low_s16(p23_16), coeff2);
166
167 accum += p0;
168 accum += p1;
169 accum += p2;
170 } 162 }
171 163
172 // Bring this value back in range. All of the filter scaling factors 164 // Bring this value back in range. All of the filter scaling factors
173 // are in fixed point with kShiftBits bits of fractional part. 165 // are in fixed point with kShiftBits bits of fractional part.
174 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); 166 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);
175 167
176 // Pack and store the new pixel. 168 // Pack and store the new pixel.
177 int16x4_t accum16 = vqmovn_s32(accum); 169 int16x4_t accum16 = vqmovn_s32(accum);
178 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16)); 170 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
179 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(a ccum8), 0); 171 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(a ccum8), 0);
(...skipping 187 matching lines...) Expand 10 before | Expand all | Expand 10 after
367 unsigned char* outRow[4], 359 unsigned char* outRow[4],
368 size_t outRowBytes) { 360 size_t outRowBytes) {
369 361
370 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); 362 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
371 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); 363 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
372 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); 364 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
373 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); 365 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);
374 int num_values = filter.numValues(); 366 int num_values = filter.numValues();
375 367
376 int filterOffset, filterLength; 368 int filterOffset, filterLength;
377 // |mask| will be used to decimate all extra filter coefficients that are
378 // loaded by SIMD when |filter_length| is not divisible by 4.
379 // mask[0] is not used in following algorithm.
380 const uint16_t mask[4][4] = {
381 {0, 0, 0, 0},
382 {0xFFFF, 0, 0, 0},
383 {0xFFFF, 0xFFFF, 0, 0},
384 {0xFFFF, 0xFFFF, 0xFFFF, 0}
385 };
386 369
387 // Output one pixel each iteration, calculating all channels (RGBA) together . 370 // Output one pixel each iteration, calculating all channels (RGBA) together .
388 for (int outX = 0; outX < num_values; outX++) { 371 for (int outX = 0; outX < num_values; outX++) {
389 372
390 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = 373 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
391 filter.FilterForValue(outX, &filterOffset, &filterLength); 374 filter.FilterForValue(outX, &filterOffset, &filterLength);
392 375
393 // four pixels in a column per iteration. 376 // four pixels in a column per iteration.
394 int32x4_t accum0 = vdupq_n_s32(0); 377 int32x4_t accum0 = vdupq_n_s32(0);
395 int32x4_t accum1 = vdupq_n_s32(0); 378 int32x4_t accum1 = vdupq_n_s32(0);
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
430 ITERATION(srcData[1] + start, accum1); 413 ITERATION(srcData[1] + start, accum1);
431 ITERATION(srcData[2] + start, accum2); 414 ITERATION(srcData[2] + start, accum2);
432 ITERATION(srcData[3] + start, accum3); 415 ITERATION(srcData[3] + start, accum3);
433 416
434 start += 16; 417 start += 16;
435 filterValues += 4; 418 filterValues += 4;
436 } 419 }
437 420
438 int r = filterLength & 3; 421 int r = filterLength & 3;
439 if (r) { 422 if (r) {
440 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3; 423 int remainder_offset = (filterOffset + filterLength - r) * 4;
441 coeffs = vld1_s16(filterValues); 424 accum_remainder(srcData[0] + remainder_offset, filterValues, accum0, r);
442 coeffs &= vreinterpret_s16_u16(vld1_u16(&mask[r][0])); 425 accum_remainder(srcData[1] + remainder_offset, filterValues, accum1, r);
443 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask0)); 426 accum_remainder(srcData[2] + remainder_offset, filterValues, accum2, r);
444 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask1)); 427 accum_remainder(srcData[3] + remainder_offset, filterValues, accum3, r);
445 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask2));
446 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), c oeff_mask3));
447
448 uint8x16_t pixels;
449 int16x8_t p01_16, p23_16;
450 int32x4_t p0, p1, p2, p3;
451
452 ITERATION(srcData[0] + start, accum0);
453 ITERATION(srcData[1] + start, accum1);
454 ITERATION(srcData[2] + start, accum2);
455 ITERATION(srcData[3] + start, accum3);
456 } 428 }
457 429
458 int16x4_t accum16; 430 int16x4_t accum16;
459 uint8x8_t res0, res1, res2, res3; 431 uint8x8_t res0, res1, res2, res3;
460 432
461 #define PACK_RESULT(accum, res) \ 433 #define PACK_RESULT(accum, res) \
462 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \ 434 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \
463 accum16 = vqmovn_s32(accum); \ 435 accum16 = vqmovn_s32(accum); \
464 res = vqmovun_s16(vcombine_s16(accum16, accum16)); 436 res = vqmovun_s16(vcombine_s16(accum16, accum16));
465 437
466 PACK_RESULT(accum0, res0); 438 PACK_RESULT(accum0, res0);
467 PACK_RESULT(accum1, res1); 439 PACK_RESULT(accum1, res1);
468 PACK_RESULT(accum2, res2); 440 PACK_RESULT(accum2, res2);
469 PACK_RESULT(accum3, res3); 441 PACK_RESULT(accum3, res3);
470 442
471 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u 8(res0), 0); 443 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u 8(res0), 0);
472 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u 8(res1), 0); 444 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u 8(res1), 0);
473 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u 8(res2), 0); 445 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u 8(res2), 0);
474 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u 8(res3), 0); 446 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u 8(res3), 0);
475 outRow[0] += 4; 447 outRow[0] += 4;
476 outRow[1] += 4; 448 outRow[1] += 4;
477 outRow[2] += 4; 449 outRow[2] += 4;
478 outRow[3] += 4; 450 outRow[3] += 4;
479 } 451 }
480 } 452 }
481 453
482 void applySIMDPadding_neon(SkConvolutionFilter1D *filter) {
483 // Padding |paddingCount| of more dummy coefficients after the coefficients
484 // of last filter to prevent SIMD instructions which load 8 or 16 bytes
485 // together to access invalid memory areas. We are not trying to align the
486 // coefficients right now due to the opaqueness of <vector> implementation.
487 // This has to be done after all |AddFilter| calls.
488 for (int i = 0; i < 8; ++i) {
489 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix ed>(0));
490 }
491 }
492
493 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) { 454 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {
494 procs->fExtraHorizontalReads = 3;
495 procs->fConvolveVertically = &convolveVertically_neon; 455 procs->fConvolveVertically = &convolveVertically_neon;
496 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon; 456 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;
497 procs->fConvolveHorizontally = &convolveHorizontally_neon; 457 procs->fConvolveHorizontally = &convolveHorizontally_neon;
498 procs->fApplySIMDPadding = &applySIMDPadding_neon;
499 } 458 }
OLDNEW
« no previous file with comments | « src/opts/SkBitmapFilter_opts_SSE2.cpp ('k') | src/opts/opts_check_x86.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698