src/opts/SkBitmapFilter_opts_SSE2.cpp - Issue 2481733003: Make SSE2/Neon convolution functions not to read extra bytes

Side by Side Diff: src/opts/SkBitmapFilter_opts_SSE2.cpp

Issue 2481733003: Make SSE2/Neon convolution functions not to read extra bytes (Closed)

Patch Set: improve neon performance Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2013 Google Inc.	2 * Copyright 2013 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include <emmintrin.h>	8 #include <emmintrin.h>

9 #include "SkBitmap.h"	9 #include "SkBitmap.h"

10 #include "SkBitmapFilter_opts_SSE2.h"	10 #include "SkBitmapFilter_opts_SSE2.h"

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
43 // Convolves horizontally along a single row. The row data is given in	43 // Convolves horizontally along a single row. The row data is given in

44 // \|src_data\| and continues for the num_values() of the filter.	44 // \|src_data\| and continues for the num_values() of the filter.

45 void convolveHorizontally_SSE2(const unsigned char* src_data,	45 void convolveHorizontally_SSE2(const unsigned char* src_data,

46 const SkConvolutionFilter1D& filter,	46 const SkConvolutionFilter1D& filter,

47 unsigned char* out_row,	47 unsigned char* out_row,

48 bool /*has_alpha*/) {	48 bool /*has_alpha*/) {

49 int num_values = filter.numValues();	49 int num_values = filter.numValues();

50	50

51 int filter_offset, filter_length;	51 int filter_offset, filter_length;

52 __m128i zero = _mm_setzero_si128();	52 __m128i zero = _mm_setzero_si128();

53 __m128i mask[4];

54 // \|mask\| will be used to decimate all extra filter coefficients that are

55 // loaded by SIMD when \|filter_length\| is not divisible by 4.

56 // mask[0] is not used in following algorithm.

57 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);

58 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);

59 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

60	53

61 // Output one pixel each iteration, calculating all channels (RGBA) together .	54 // Output one pixel each iteration, calculating all channels (RGBA) together .

62 for (int out_x = 0; out_x < num_values; out_x++) {	55 for (int out_x = 0; out_x < num_values; out_x++) {

63 const SkConvolutionFilter1D::ConvolutionFixed* filter_values =	56 const SkConvolutionFilter1D::ConvolutionFixed* filter_values =

64 filter.FilterForValue(out_x, &filter_offset, &filter_length);	57 filter.FilterForValue(out_x, &filter_offset, &filter_length);

65	58

66 __m128i accum = _mm_setzero_si128();	59 __m128i accum = _mm_setzero_si128();

67	60

68 // Compute the first pixel in this row that the filter affects. It will	61 // Compute the first pixel in this row that the filter affects. It will

69 // touch \|filter_length\| pixels (4 bytes each) after this.	62 // touch \|filter_length\| pixels (4 bytes each) after this.

(...skipping 43 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
113 accum = _mm_add_epi32(accum, t);	106 accum = _mm_add_epi32(accum, t);

114 // [32] a3c3 b3c3 g3c3 r3c3	107 // [32] a3c3 b3c3 g3c3 r3c3

115 t = _mm_unpackhi_epi16(mul_lo, mul_hi);	108 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

116 accum = _mm_add_epi32(accum, t);	109 accum = _mm_add_epi32(accum, t);

117	110

118 // Advance the pixel and coefficients pointers.	111 // Advance the pixel and coefficients pointers.

119 row_to_filter += 1;	112 row_to_filter += 1;

120 filter_values += 4;	113 filter_values += 4;

121 }	114 }

122	115

123 // When \|filter_length\| is not divisible by 4, we need to decimate some of	116 // When \|filter_length\| is not divisible by 4, we accumulate the last 1 - 3

124 // the filter coefficient that was loaded incorrectly to zero; Other than	117 // coefficients one at a time.

125 // that the algorithm is same with above, exceot that the 4th pixel will be	118 int r = filter_length & 3;

126 // always absent.

127 int r = filter_length&3;

128 if (r) {	119 if (r) {

129 // Note: filter_values must be padded to align_up(filter_offset, 8).	120 #define ACCUM_REMAINDER(src, accum) { \
	mtklein_C 2016/11/09 09:25:04 Can we make these ACCUM_REMAINDERs lambdas or static functions instead of macros? I don't see anything too sneaky going on here that really requires a macro. If needed, SK_ALWAYS_INLINE is pretty reliable at forcing inlining. (Same question in the NEON code of course.) xiangze.zhang 2016/11/10 03:16:56 On 2016/11/09 09:25:04, mtklein_C wrote: > Can we make these ACCUM_REMAINDERs lambdas or static functions instead of > macros? I don't see anything too sneaky going on here that really requires a > macro. If needed, SK_ALWAYS_INLINE is pretty reliable at forcing inlining. > > (Same question in the NEON code of course.) Done.
130 __m128i coeff, coeff16;	121 int remainder[4] = {0}; \

131 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));	122 const unsigned char* pixels_left = src + (filter_offset + filter_length - r) * 4; \

132 // Mask out extra filter taps.	123 for (int i = 0; i < r; i++) { \

133 coeff = _mm_and_si128(coeff, mask[r]);	124 SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i] ; \

134 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));	125 remainder[0] += coeff * pixels_left[i * 4 + 0]; \

135 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);	126 remainder[1] += coeff * pixels_left[i * 4 + 1]; \

136	127 remainder[2] += coeff * pixels_left[i * 4 + 2]; \

137 // Note: line buffer must be padded to align_up(filter_offset, 16).	128 remainder[3] += coeff * pixels_left[i * 4 + 3]; \

138 // We resolve this by use C-version for the last horizontal line.	129 } \

139 __m128i src8 = _mm_loadu_si128(row_to_filter);	130 __m128i t = _mm_set_epi32(remainder[3], remainder[2], remainder[1], remainder[0]); \
	mtklein_C 2016/11/09 09:25:04 This is not a big deal, but we have been trending towards using _mm_setr versions of these functions, mostly to keep things reading consistently when you switch over to NEON or think about how the vector would be stored in memory. xiangze.zhang 2016/11/10 03:16:56 On 2016/11/09 09:25:04, mtklein_C wrote: > This is not a big deal, but we have been trending towards using _mm_setr > versions of these functions, mostly to keep things reading consistently when you > switch over to NEON or think about how the vector would be stored in memory. Done.
140 __m128i src16 = _mm_unpacklo_epi8(src8, zero);	131 accum = _mm_add_epi32(accum, t); }
	mtklein_C 2016/11/09 09:25:04 If we do need to use macros, it's probably best to add another \ and put the } on its own line. It's sort of easy to overlook here.
141 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);	132 ACCUM_REMAINDER(src_data, accum);

142 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

143 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

144 accum = _mm_add_epi32(accum, t);

145 t = _mm_unpackhi_epi16(mul_lo, mul_hi);

146 accum = _mm_add_epi32(accum, t);

147

148 src16 = _mm_unpackhi_epi8(src8, zero);

149 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

150 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

151 mul_hi = _mm_mulhi_epi16(src16, coeff16);

152 mul_lo = _mm_mullo_epi16(src16, coeff16);

153 t = _mm_unpacklo_epi16(mul_lo, mul_hi);

154 accum = _mm_add_epi32(accum, t);

155 }	133 }

156	134

157 // Shift right for fixed point implementation.	135 // Shift right for fixed point implementation.

158 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);	136 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);

159	137

160 // Packing 32 bits \|accum\| to 16 bits per channel (signed saturation).	138 // Packing 32 bits \|accum\| to 16 bits per channel (signed saturation).

161 accum = _mm_packs_epi32(accum, zero);	139 accum = _mm_packs_epi32(accum, zero);

162 // Packing 16 bits \|accum\| to 8 bits per channel (unsigned saturation).	140 // Packing 16 bits \|accum\| to 8 bits per channel (unsigned saturation).

163 accum = _mm_packus_epi16(accum, zero);	141 accum = _mm_packus_epi16(accum, zero);

164	142

(...skipping 10 matching lines...) Expand all Loading...
175 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],	153 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],

176 const SkConvolutionFilter1D& filter,	154 const SkConvolutionFilter1D& filter,

177 unsigned char* out_row[4],	155 unsigned char* out_row[4],

178 size_t outRowBytes) {	156 size_t outRowBytes) {

179 SkDEBUGCODE(const unsigned char* out_row_0_start = out_row[0];)	157 SkDEBUGCODE(const unsigned char* out_row_0_start = out_row[0];)

180	158

181 int num_values = filter.numValues();	159 int num_values = filter.numValues();

182	160

183 int filter_offset, filter_length;	161 int filter_offset, filter_length;

184 __m128i zero = _mm_setzero_si128();	162 __m128i zero = _mm_setzero_si128();

185 __m128i mask[4];

186 // \|mask\| will be used to decimate all extra filter coefficients that are

187 // loaded by SIMD when \|filter_length\| is not divisible by 4.

188 // mask[0] is not used in following algorithm.

189 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);

190 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);

191 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

192	163

193 // Output one pixel each iteration, calculating all channels (RGBA) together .	164 // Output one pixel each iteration, calculating all channels (RGBA) together .

194 for (int out_x = 0; out_x < num_values; out_x++) {	165 for (int out_x = 0; out_x < num_values; out_x++) {

195 const SkConvolutionFilter1D::ConvolutionFixed* filter_values =	166 const SkConvolutionFilter1D::ConvolutionFixed* filter_values =

196 filter.FilterForValue(out_x, &filter_offset, &filter_length);	167 filter.FilterForValue(out_x, &filter_offset, &filter_length);

197	168

198 // four pixels in a column per iteration.	169 // four pixels in a column per iteration.

199 __m128i accum0 = _mm_setzero_si128();	170 __m128i accum0 = _mm_setzero_si128();

200 __m128i accum1 = _mm_setzero_si128();	171 __m128i accum1 = _mm_setzero_si128();

201 __m128i accum2 = _mm_setzero_si128();	172 __m128i accum2 = _mm_setzero_si128();

(...skipping 36 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
238 ITERATION(src_data[1] + start, accum1);	209 ITERATION(src_data[1] + start, accum1);

239 ITERATION(src_data[2] + start, accum2);	210 ITERATION(src_data[2] + start, accum2);

240 ITERATION(src_data[3] + start, accum3);	211 ITERATION(src_data[3] + start, accum3);

241	212

242 start += 16;	213 start += 16;

243 filter_values += 4;	214 filter_values += 4;

244 }	215 }

245	216

246 int r = filter_length & 3;	217 int r = filter_length & 3;

247 if (r) {	218 if (r) {

248 // Note: filter_values must be padded to align_up(filter_offset, 8);	219 ACCUM_REMAINDER(src_data[0], accum0);

249 __m128i coeff;	220 ACCUM_REMAINDER(src_data[1], accum1);

250 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));	221 ACCUM_REMAINDER(src_data[2], accum2);

251 // Mask out extra filter taps.	222 ACCUM_REMAINDER(src_data[3], accum3);

252 coeff = _mm_and_si128(coeff, mask[r]);

253

254 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

255 /* c1 c1 c1 c1 c0 c0 c0 c0 */

256 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);

257 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

258 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

259

260 __m128i src8, src16, mul_hi, mul_lo, t;

261

262 ITERATION(src_data[0] + start, accum0);

263 ITERATION(src_data[1] + start, accum1);

264 ITERATION(src_data[2] + start, accum2);

265 ITERATION(src_data[3] + start, accum3);

266 }	223 }

267	224

268 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);	225 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);

269 accum0 = _mm_packs_epi32(accum0, zero);	226 accum0 = _mm_packs_epi32(accum0, zero);

270 accum0 = _mm_packus_epi16(accum0, zero);	227 accum0 = _mm_packus_epi16(accum0, zero);

271 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);	228 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);

272 accum1 = _mm_packs_epi32(accum1, zero);	229 accum1 = _mm_packs_epi32(accum1, zero);

273 accum1 = _mm_packus_epi16(accum1, zero);	230 accum1 = _mm_packus_epi16(accum1, zero);

274 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);	231 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);

275 accum2 = _mm_packs_epi32(accum2, zero);	232 accum2 = _mm_packs_epi32(accum2, zero);

(...skipping 204 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
480 pixel_width,	437 pixel_width,

481 out_row);	438 out_row);

482 } else {	439 } else {

483 convolveVertically_SSE2<false>(filter_values,	440 convolveVertically_SSE2<false>(filter_values,

484 filter_length,	441 filter_length,

485 source_data_rows,	442 source_data_rows,

486 pixel_width,	443 pixel_width,

487 out_row);	444 out_row);

488 }	445 }

489 }	446 }

490

491 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {

492 // Padding \|paddingCount\| of more dummy coefficients after the coefficients

493 // of last filter to prevent SIMD instructions which load 8 or 16 bytes

494 // together to access invalid memory areas. We are not trying to align the

495 // coefficients right now due to the opaqueness of <vector> implementation.

496 // This has to be done after all \|AddFilter\| calls.

497 for (int i = 0; i < 8; ++i) {

498 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));

499 }

500 }

OLD	NEW

« no previous file with comments | « src/core/SkConvolver.cpp ('k') | src/opts/SkBitmapProcState_arm_neon.cpp » ('j') | no next file with comments »