OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2013 Google Inc. | 2 * Copyright 2013 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
| 8 #include <emmintrin.h> |
| 9 #include "SkBitmap.h" |
| 10 #include "SkBitmapFilter_opts_SSE2.h" |
8 #include "SkBitmapProcState.h" | 11 #include "SkBitmapProcState.h" |
9 #include "SkBitmap.h" | |
10 #include "SkColor.h" | 12 #include "SkColor.h" |
11 #include "SkColorPriv.h" | 13 #include "SkColorPriv.h" |
| 14 #include "SkConvolver.h" |
| 15 #include "SkShader.h" |
12 #include "SkUnPreMultiply.h" | 16 #include "SkUnPreMultiply.h" |
13 #include "SkShader.h" | |
14 #include "SkConvolver.h" | |
15 | |
16 #include "SkBitmapFilter_opts_SSE2.h" | |
17 | |
18 #include <emmintrin.h> | |
19 | 17 |
20 #if 0 | 18 #if 0 |
21 static inline void print128i(__m128i value) { | 19 static inline void print128i(__m128i value) { |
22 int *v = (int*) &value; | 20 int *v = (int*) &value; |
23 printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]); | 21 printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]); |
24 } | 22 } |
25 | 23 |
26 static inline void print128i_16(__m128i value) { | 24 static inline void print128i_16(__m128i value) { |
27 short *v = (short*) &value; | 25 short *v = (short*) &value; |
28 printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n", v[0], v[1], v[2]
, v[3], v[4], v[5], v[6], v[7]); | 26 printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n", v[0], v[1], v[2]
, v[3], v[4], v[5], v[6], v[7]); |
(...skipping 139 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
168 int r = SkClampMax(int(localResult[1]), a); | 166 int r = SkClampMax(int(localResult[1]), a); |
169 int g = SkClampMax(int(localResult[2]), a); | 167 int g = SkClampMax(int(localResult[2]), a); |
170 int b = SkClampMax(int(localResult[3]), a); | 168 int b = SkClampMax(int(localResult[3]), a); |
171 | 169 |
172 *colors++ = SkPackARGB32(a, r, g, b); | 170 *colors++ = SkPackARGB32(a, r, g, b); |
173 | 171 |
174 x++; | 172 x++; |
175 | 173 |
176 s.fInvProc(s.fInvMatrix, SkIntToScalar(x), | 174 s.fInvProc(s.fInvMatrix, SkIntToScalar(x), |
177 SkIntToScalar(y), &srcPt); | 175 SkIntToScalar(y), &srcPt); |
178 | |
179 } | 176 } |
180 } | 177 } |
181 | 178 |
182 // Convolves horizontally along a single row. The row data is given in | 179 // Convolves horizontally along a single row. The row data is given in |
183 // |src_data| and continues for the num_values() of the filter. | 180 // |src_data| and continues for the num_values() of the filter. |
184 void convolveHorizontally_SSE2(const unsigned char* src_data, | 181 void convolveHorizontally_SSE2(const unsigned char* src_data, |
185 const SkConvolutionFilter1D& filter, | 182 const SkConvolutionFilter1D& filter, |
186 unsigned char* out_row, | 183 unsigned char* out_row, |
187 bool /*has_alpha*/) { | 184 bool /*has_alpha*/) { |
188 int num_values = filter.numValues(); | 185 int num_values = filter.numValues(); |
189 | 186 |
190 int filter_offset, filter_length; | 187 int filter_offset, filter_length; |
191 __m128i zero = _mm_setzero_si128(); | 188 __m128i zero = _mm_setzero_si128(); |
192 __m128i mask[4]; | 189 __m128i mask[4]; |
193 // |mask| will be used to decimate all extra filter coefficients that are | 190 // |mask| will be used to decimate all extra filter coefficients that are |
194 // loaded by SIMD when |filter_length| is not divisible by 4. | 191 // loaded by SIMD when |filter_length| is not divisible by 4. |
195 // mask[0] is not used in following algorithm. | 192 // mask[0] is not used in following algorithm. |
196 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); | 193 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); |
197 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); | 194 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); |
198 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); | 195 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); |
199 | 196 |
200 // Output one pixel each iteration, calculating all channels (RGBA) together. | 197 // Output one pixel each iteration, calculating all channels (RGBA) together
. |
201 for (int out_x = 0; out_x < num_values; out_x++) { | 198 for (int out_x = 0; out_x < num_values; out_x++) { |
202 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = | 199 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = |
203 filter.FilterForValue(out_x, &filter_offset, &filter_length); | 200 filter.FilterForValue(out_x, &filter_offset, &filter_length); |
204 | 201 |
205 __m128i accum = _mm_setzero_si128(); | 202 __m128i accum = _mm_setzero_si128(); |
206 | 203 |
207 // Compute the first pixel in this row that the filter affects. It will | 204 // Compute the first pixel in this row that the filter affects. It will |
208 // touch |filter_length| pixels (4 bytes each) after this. | 205 // touch |filter_length| pixels (4 bytes each) after this. |
209 const __m128i* row_to_filter = | 206 const __m128i* row_to_filter = |
210 reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]); | 207 reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]); |
211 | 208 |
212 // We will load and accumulate with four coefficients per iteration. | 209 // We will load and accumulate with four coefficients per iteration. |
213 for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) { | 210 for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) { |
214 | 211 |
215 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels. | 212 // Load 4 coefficients => duplicate 1st and 2nd of them for all chan
nels. |
216 __m128i coeff, coeff16; | 213 __m128i coeff, coeff16; |
217 // [16] xx xx xx xx c3 c2 c1 c0 | 214 // [16] xx xx xx xx c3 c2 c1 c0 |
218 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | 215 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_valu
es)); |
219 // [16] xx xx xx xx c1 c1 c0 c0 | 216 // [16] xx xx xx xx c1 c1 c0 c0 |
220 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | 217 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
221 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | 218 // [16] c1 c1 c1 c1 c0 c0 c0 c0 |
222 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | 219 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
223 | 220 |
224 // Load four pixels => unpack the first two pixels to 16 bits => | 221 // Load four pixels => unpack the first two pixels to 16 bits => |
225 // multiply with coefficients => accumulate the convolution result. | 222 // multiply with coefficients => accumulate the convolution result. |
226 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | 223 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
227 __m128i src8 = _mm_loadu_si128(row_to_filter); | 224 __m128i src8 = _mm_loadu_si128(row_to_filter); |
228 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | 225 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
229 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | 226 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
230 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | 227 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
231 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | 228 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
232 // [32] a0*c0 b0*c0 g0*c0 r0*c0 | 229 // [32] a0*c0 b0*c0 g0*c0 r0*c0 |
233 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | 230 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
234 accum = _mm_add_epi32(accum, t); | 231 accum = _mm_add_epi32(accum, t); |
235 // [32] a1*c1 b1*c1 g1*c1 r1*c1 | 232 // [32] a1*c1 b1*c1 g1*c1 r1*c1 |
236 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | 233 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
237 accum = _mm_add_epi32(accum, t); | 234 accum = _mm_add_epi32(accum, t); |
238 | 235 |
239 // Duplicate 3rd and 4th coefficients for all channels => | 236 // Duplicate 3rd and 4th coefficients for all channels => |
240 // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients | 237 // unpack the 3rd and 4th pixels to 16 bits => multiply with coeffic
ients |
241 // => accumulate the convolution results. | 238 // => accumulate the convolution results. |
242 // [16] xx xx xx xx c3 c3 c2 c2 | 239 // [16] xx xx xx xx c3 c3 c2 c2 |
243 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | 240 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
244 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | 241 // [16] c3 c3 c3 c3 c2 c2 c2 c2 |
245 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | 242 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
246 // [16] a3 g3 b3 r3 a2 g2 b2 r2 | 243 // [16] a3 g3 b3 r3 a2 g2 b2 r2 |
247 src16 = _mm_unpackhi_epi8(src8, zero); | 244 src16 = _mm_unpackhi_epi8(src8, zero); |
248 mul_hi = _mm_mulhi_epi16(src16, coeff16); | 245 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
249 mul_lo = _mm_mullo_epi16(src16, coeff16); | 246 mul_lo = _mm_mullo_epi16(src16, coeff16); |
250 // [32] a2*c2 b2*c2 g2*c2 r2*c2 | 247 // [32] a2*c2 b2*c2 g2*c2 r2*c2 |
251 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | 248 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
252 accum = _mm_add_epi32(accum, t); | 249 accum = _mm_add_epi32(accum, t); |
253 // [32] a3*c3 b3*c3 g3*c3 r3*c3 | 250 // [32] a3*c3 b3*c3 g3*c3 r3*c3 |
254 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | 251 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
255 accum = _mm_add_epi32(accum, t); | 252 accum = _mm_add_epi32(accum, t); |
256 | 253 |
257 // Advance the pixel and coefficients pointers. | 254 // Advance the pixel and coefficients pointers. |
258 row_to_filter += 1; | 255 row_to_filter += 1; |
259 filter_values += 4; | 256 filter_values += 4; |
| 257 } |
| 258 |
| 259 // When |filter_length| is not divisible by 4, we need to decimate some
of |
| 260 // the filter coefficient that was loaded incorrectly to zero; Other tha
n |
 | 261 // that the algorithm is the same as above, except that the 4th pixel
 will be |
| 262 // always absent. |
| 263 int r = filter_length&3; |
| 264 if (r) { |
| 265 // Note: filter_values must be padded to align_up(filter_offset, 8). |
| 266 __m128i coeff, coeff16; |
| 267 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_valu
es)); |
| 268 // Mask out extra filter taps. |
| 269 coeff = _mm_and_si128(coeff, mask[r]); |
| 270 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
| 271 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| 272 |
| 273 // Note: line buffer must be padded to align_up(filter_offset, 16). |
 | 274 // We resolve this by using the C-version for the last horizontal line. |
| 275 __m128i src8 = _mm_loadu_si128(row_to_filter); |
| 276 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 277 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 278 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 279 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 280 accum = _mm_add_epi32(accum, t); |
| 281 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 282 accum = _mm_add_epi32(accum, t); |
| 283 |
| 284 src16 = _mm_unpackhi_epi8(src8, zero); |
| 285 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
| 286 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| 287 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 288 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 289 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 290 accum = _mm_add_epi32(accum, t); |
| 291 } |
| 292 |
| 293 // Shift right for fixed point implementation. |
| 294 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); |
| 295 |
| 296 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). |
| 297 accum = _mm_packs_epi32(accum, zero); |
| 298 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). |
| 299 accum = _mm_packus_epi16(accum, zero); |
| 300 |
| 301 // Store the pixel value of 32 bits. |
| 302 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum); |
| 303 out_row += 4; |
260 } | 304 } |
261 | |
262 // When |filter_length| is not divisible by 4, we need to decimate some of | |
263 // the filter coefficient that was loaded incorrectly to zero; Other than | |
264 // that the algorithm is the same as above, except that the 4th pixel will be | |
265 // always absent. | |
266 int r = filter_length&3; | |
267 if (r) { | |
268 // Note: filter_values must be padded to align_up(filter_offset, 8). | |
269 __m128i coeff, coeff16; | |
270 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
271 // Mask out extra filter taps. | |
272 coeff = _mm_and_si128(coeff, mask[r]); | |
273 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
274 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
275 | |
276 // Note: line buffer must be padded to align_up(filter_offset, 16). | |
277 // We resolve this by using the C-version for the last horizontal line. | |
278 __m128i src8 = _mm_loadu_si128(row_to_filter); | |
279 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
280 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
281 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
282 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
283 accum = _mm_add_epi32(accum, t); | |
284 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
285 accum = _mm_add_epi32(accum, t); | |
286 | |
287 src16 = _mm_unpackhi_epi8(src8, zero); | |
288 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
289 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
290 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
291 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
292 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
293 accum = _mm_add_epi32(accum, t); | |
294 } | |
295 | |
296 // Shift right for fixed point implementation. | |
297 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); | |
298 | |
299 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). | |
300 accum = _mm_packs_epi32(accum, zero); | |
301 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). | |
302 accum = _mm_packus_epi16(accum, zero); | |
303 | |
304 // Store the pixel value of 32 bits. | |
305 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum); | |
306 out_row += 4; | |
307 } | |
308 } | 305 } |
309 | 306 |
310 // Convolves horizontally along four rows. The row data is given in | 307 // Convolves horizontally along four rows. The row data is given in |
311 // |src_data| and continues for the num_values() of the filter. | 308 // |src_data| and continues for the num_values() of the filter. |
312 // The algorithm is almost the same as |ConvolveHorizontally_SSE2|. Please | 309 // The algorithm is almost the same as |ConvolveHorizontally_SSE2|. Please |
313 // refer to that function for detailed comments. | 310 // refer to that function for detailed comments. |
314 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], | 311 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], |
315 const SkConvolutionFilter1D& filter, | 312 const SkConvolutionFilter1D& filter, |
316 unsigned char* out_row[4]) { | 313 unsigned char* out_row[4]) { |
317 int num_values = filter.numValues(); | 314 int num_values = filter.numValues(); |
318 | 315 |
319 int filter_offset, filter_length; | 316 int filter_offset, filter_length; |
320 __m128i zero = _mm_setzero_si128(); | 317 __m128i zero = _mm_setzero_si128(); |
321 __m128i mask[4]; | 318 __m128i mask[4]; |
322 // |mask| will be used to decimate all extra filter coefficients that are | 319 // |mask| will be used to decimate all extra filter coefficients that are |
323 // loaded by SIMD when |filter_length| is not divisible by 4. | 320 // loaded by SIMD when |filter_length| is not divisible by 4. |
324 // mask[0] is not used in following algorithm. | 321 // mask[0] is not used in following algorithm. |
325 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); | 322 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); |
326 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); | 323 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); |
327 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); | 324 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); |
328 | 325 |
329 // Output one pixel each iteration, calculating all channels (RGBA) together. | 326 // Output one pixel each iteration, calculating all channels (RGBA) together
. |
330 for (int out_x = 0; out_x < num_values; out_x++) { | 327 for (int out_x = 0; out_x < num_values; out_x++) { |
331 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = | 328 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = |
332 filter.FilterForValue(out_x, &filter_offset, &filter_length); | 329 filter.FilterForValue(out_x, &filter_offset, &filter_length); |
333 | 330 |
334 // four pixels in a column per iteration. | 331 // four pixels in a column per iteration. |
335 __m128i accum0 = _mm_setzero_si128(); | 332 __m128i accum0 = _mm_setzero_si128(); |
336 __m128i accum1 = _mm_setzero_si128(); | 333 __m128i accum1 = _mm_setzero_si128(); |
337 __m128i accum2 = _mm_setzero_si128(); | 334 __m128i accum2 = _mm_setzero_si128(); |
338 __m128i accum3 = _mm_setzero_si128(); | 335 __m128i accum3 = _mm_setzero_si128(); |
339 int start = (filter_offset<<2); | 336 int start = (filter_offset<<2); |
340 // We will load and accumulate with four coefficients per iteration. | 337 // We will load and accumulate with four coefficients per iteration. |
341 for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) { | 338 for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) { |
342 __m128i coeff, coeff16lo, coeff16hi; | 339 __m128i coeff, coeff16lo, coeff16hi; |
343 // [16] xx xx xx xx c3 c2 c1 c0 | 340 // [16] xx xx xx xx c3 c2 c1 c0 |
344 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | 341 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_valu
es)); |
345 // [16] xx xx xx xx c1 c1 c0 c0 | 342 // [16] xx xx xx xx c1 c1 c0 c0 |
346 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | 343 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
347 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | 344 // [16] c1 c1 c1 c1 c0 c0 c0 c0 |
348 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); | 345 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); |
349 // [16] xx xx xx xx c3 c3 c2 c2 | 346 // [16] xx xx xx xx c3 c3 c2 c2 |
350 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | 347 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
351 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | 348 // [16] c3 c3 c3 c3 c2 c2 c2 c2 |
352 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); | 349 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); |
353 | 350 |
354 __m128i src8, src16, mul_hi, mul_lo, t; | 351 __m128i src8, src16, mul_hi, mul_lo, t; |
355 | 352 |
356 #define ITERATION(src, accum) \ | 353 #define ITERATION(src, accum) \ |
357 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ | 354 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ |
358 src16 = _mm_unpacklo_epi8(src8, zero); \ | 355 src16 = _mm_unpacklo_epi8(src8, zero); \ |
359 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ | 356 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ |
360 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ | 357 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ |
361 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | 358 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ |
362 accum = _mm_add_epi32(accum, t); \ | 359 accum = _mm_add_epi32(accum, t); \ |
363 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | 360 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ |
364 accum = _mm_add_epi32(accum, t); \ | 361 accum = _mm_add_epi32(accum, t); \ |
365 src16 = _mm_unpackhi_epi8(src8, zero); \ | 362 src16 = _mm_unpackhi_epi8(src8, zero); \ |
366 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ | 363 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ |
367 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ | 364 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ |
368 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | 365 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ |
369 accum = _mm_add_epi32(accum, t); \ | 366 accum = _mm_add_epi32(accum, t); \ |
370 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | 367 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ |
371 accum = _mm_add_epi32(accum, t) | 368 accum = _mm_add_epi32(accum, t) |
372 | 369 |
373 ITERATION(src_data[0] + start, accum0); | 370 ITERATION(src_data[0] + start, accum0); |
374 ITERATION(src_data[1] + start, accum1); | 371 ITERATION(src_data[1] + start, accum1); |
375 ITERATION(src_data[2] + start, accum2); | 372 ITERATION(src_data[2] + start, accum2); |
376 ITERATION(src_data[3] + start, accum3); | 373 ITERATION(src_data[3] + start, accum3); |
377 | 374 |
378 start += 16; | 375 start += 16; |
379 filter_values += 4; | 376 filter_values += 4; |
| 377 } |
| 378 |
| 379 int r = filter_length & 3; |
| 380 if (r) { |
| 381 // Note: filter_values must be padded to align_up(filter_offset, 8); |
| 382 __m128i coeff; |
| 383 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_valu
es)); |
| 384 // Mask out extra filter taps. |
| 385 coeff = _mm_and_si128(coeff, mask[r]); |
| 386 |
| 387 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0,
0)); |
| 388 /* c1 c1 c1 c1 c0 c0 c0 c0 */ |
| 389 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); |
| 390 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2,
2)); |
| 391 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); |
| 392 |
| 393 __m128i src8, src16, mul_hi, mul_lo, t; |
| 394 |
| 395 ITERATION(src_data[0] + start, accum0); |
| 396 ITERATION(src_data[1] + start, accum1); |
| 397 ITERATION(src_data[2] + start, accum2); |
| 398 ITERATION(src_data[3] + start, accum3); |
| 399 } |
| 400 |
| 401 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 402 accum0 = _mm_packs_epi32(accum0, zero); |
| 403 accum0 = _mm_packus_epi16(accum0, zero); |
| 404 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 405 accum1 = _mm_packs_epi32(accum1, zero); |
| 406 accum1 = _mm_packus_epi16(accum1, zero); |
| 407 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 408 accum2 = _mm_packs_epi32(accum2, zero); |
| 409 accum2 = _mm_packus_epi16(accum2, zero); |
| 410 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); |
| 411 accum3 = _mm_packs_epi32(accum3, zero); |
| 412 accum3 = _mm_packus_epi16(accum3, zero); |
| 413 |
| 414 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); |
| 415 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); |
| 416 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); |
| 417 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); |
| 418 |
| 419 out_row[0] += 4; |
| 420 out_row[1] += 4; |
| 421 out_row[2] += 4; |
| 422 out_row[3] += 4; |
380 } | 423 } |
381 | |
382 int r = filter_length & 3; | |
383 if (r) { | |
384 // Note: filter_values must be padded to align_up(filter_offset, 8); | |
385 __m128i coeff; | |
386 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
387 // Mask out extra filter taps. | |
388 coeff = _mm_and_si128(coeff, mask[r]); | |
389 | |
390 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
391 /* c1 c1 c1 c1 c0 c0 c0 c0 */ | |
392 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); | |
393 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
394 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); | |
395 | |
396 __m128i src8, src16, mul_hi, mul_lo, t; | |
397 | |
398 ITERATION(src_data[0] + start, accum0); | |
399 ITERATION(src_data[1] + start, accum1); | |
400 ITERATION(src_data[2] + start, accum2); | |
401 ITERATION(src_data[3] + start, accum3); | |
402 } | |
403 | |
404 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | |
405 accum0 = _mm_packs_epi32(accum0, zero); | |
406 accum0 = _mm_packus_epi16(accum0, zero); | |
407 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | |
408 accum1 = _mm_packs_epi32(accum1, zero); | |
409 accum1 = _mm_packus_epi16(accum1, zero); | |
410 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | |
411 accum2 = _mm_packs_epi32(accum2, zero); | |
412 accum2 = _mm_packus_epi16(accum2, zero); | |
413 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); | |
414 accum3 = _mm_packs_epi32(accum3, zero); | |
415 accum3 = _mm_packus_epi16(accum3, zero); | |
416 | |
417 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); | |
418 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); | |
419 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); | |
420 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); | |
421 | |
422 out_row[0] += 4; | |
423 out_row[1] += 4; | |
424 out_row[2] += 4; | |
425 out_row[3] += 4; | |
426 } | |
427 } | 424 } |
428 | 425 |
429 // Does vertical convolution to produce one output row. The filter values and | 426 // Does vertical convolution to produce one output row. The filter values and |
430 // length are given in the first two parameters. These are applied to each | 427 // length are given in the first two parameters. These are applied to each |
431 // of the rows pointed to in the |source_data_rows| array, with each row | 428 // of the rows pointed to in the |source_data_rows| array, with each row |
432 // being |pixel_width| wide. | 429 // being |pixel_width| wide. |
433 // | 430 // |
434 // The output must have room for |pixel_width * 4| bytes. | 431 // The output must have room for |pixel_width * 4| bytes. |
435 template<bool has_alpha> | 432 template<bool has_alpha> |
436 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
er_values, | 433 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
er_values, |
437 int filter_length, | 434 int filter_length, |
438 unsigned char* const* source_data_rows, | 435 unsigned char* const* source_data_rows, |
439 int pixel_width, | 436 int pixel_width, |
440 unsigned char* out_row) { | 437 unsigned char* out_row) { |
441 int width = pixel_width & ~3; | 438 int width = pixel_width & ~3; |
442 | 439 |
443 __m128i zero = _mm_setzero_si128(); | 440 __m128i zero = _mm_setzero_si128(); |
444 __m128i accum0, accum1, accum2, accum3, coeff16; | 441 __m128i accum0, accum1, accum2, accum3, coeff16; |
445 const __m128i* src; | 442 const __m128i* src; |
446 // Output four pixels per iteration (16 bytes). | 443 // Output four pixels per iteration (16 bytes). |
447 for (int out_x = 0; out_x < width; out_x += 4) { | 444 for (int out_x = 0; out_x < width; out_x += 4) { |
448 | 445 |
449 // Accumulated result for each pixel. 32 bits per RGBA channel. | 446 // Accumulated result for each pixel. 32 bits per RGBA channel. |
450 accum0 = _mm_setzero_si128(); | 447 accum0 = _mm_setzero_si128(); |
451 accum1 = _mm_setzero_si128(); | 448 accum1 = _mm_setzero_si128(); |
452 accum2 = _mm_setzero_si128(); | 449 accum2 = _mm_setzero_si128(); |
453 accum3 = _mm_setzero_si128(); | 450 accum3 = _mm_setzero_si128(); |
454 | 451 |
455 // Convolve with one filter coefficient per iteration. | 452 // Convolve with one filter coefficient per iteration. |
456 for (int filter_y = 0; filter_y < filter_length; filter_y++) { | 453 for (int filter_y = 0; filter_y < filter_length; filter_y++) { |
457 | 454 |
458 // Duplicate the filter coefficient 8 times. | 455 // Duplicate the filter coefficient 8 times. |
459 // [16] cj cj cj cj cj cj cj cj | 456 // [16] cj cj cj cj cj cj cj cj |
460 coeff16 = _mm_set1_epi16(filter_values[filter_y]); | 457 coeff16 = _mm_set1_epi16(filter_values[filter_y]); |
461 | 458 |
462 // Load four pixels (16 bytes) together. | 459 // Load four pixels (16 bytes) together. |
463 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | 460 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
464 src = reinterpret_cast<const __m128i*>( | 461 src = reinterpret_cast<const __m128i*>( |
465 &source_data_rows[filter_y][out_x << 2]); | 462 &source_data_rows[filter_y][out_x << 2]); |
466 __m128i src8 = _mm_loadu_si128(src); | 463 __m128i src8 = _mm_loadu_si128(src); |
467 | 464 |
468 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channel => | 465 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channel
 => |
469 // multiply with current coefficient => accumulate the result. | 466 // multiply with current coefficient => accumulate the result. |
470 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | 467 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
471 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | 468 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
472 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | 469 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
473 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | 470 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
474 // [32] a0 b0 g0 r0 | 471 // [32] a0 b0 g0 r0 |
475 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | 472 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
476 accum0 = _mm_add_epi32(accum0, t); | 473 accum0 = _mm_add_epi32(accum0, t); |
477 // [32] a1 b1 g1 r1 | 474 // [32] a1 b1 g1 r1 |
478 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | 475 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
479 accum1 = _mm_add_epi32(accum1, t); | 476 accum1 = _mm_add_epi32(accum1, t); |
480 | 477 |
481 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channel => | 478 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channel
 => |
482 // multiply with current coefficient => accumulate the result. | 479 // multiply with current coefficient => accumulate the result. |
483 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | 480 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
484 src16 = _mm_unpackhi_epi8(src8, zero); | 481 src16 = _mm_unpackhi_epi8(src8, zero); |
485 mul_hi = _mm_mulhi_epi16(src16, coeff16); | 482 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
486 mul_lo = _mm_mullo_epi16(src16, coeff16); | 483 mul_lo = _mm_mullo_epi16(src16, coeff16); |
487 // [32] a2 b2 g2 r2 | 484 // [32] a2 b2 g2 r2 |
488 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | 485 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
489 accum2 = _mm_add_epi32(accum2, t); | 486 accum2 = _mm_add_epi32(accum2, t); |
490 // [32] a3 b3 g3 r3 | 487 // [32] a3 b3 g3 r3 |
491 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | 488 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
492 accum3 = _mm_add_epi32(accum3, t); | 489 accum3 = _mm_add_epi32(accum3, t); |
| 490 } |
| 491 |
| 492 // Shift right for fixed point implementation. |
| 493 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 494 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 495 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 496 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); |
| 497 |
| 498 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). |
| 499 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 500 accum0 = _mm_packs_epi32(accum0, accum1); |
| 501 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 502 accum2 = _mm_packs_epi32(accum2, accum3); |
| 503 |
| 504 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). |
| 505 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 506 accum0 = _mm_packus_epi16(accum0, accum2); |
| 507 |
| 508 if (has_alpha) { |
| 509 // Compute the max(ri, gi, bi) for each pixel. |
| 510 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
| 511 __m128i a = _mm_srli_epi32(accum0, 8); |
| 512 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 513 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. |
| 514 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
| 515 a = _mm_srli_epi32(accum0, 16); |
| 516 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 517 b = _mm_max_epu8(a, b); // Max of r and g and b. |
| 518 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
| 519 b = _mm_slli_epi32(b, 24); |
| 520 |
| 521 // Make sure the value of alpha channel is always larger than maximu
m |
| 522 // value of color channels. |
| 523 accum0 = _mm_max_epu8(b, accum0); |
| 524 } else { |
| 525 // Set value of alpha channels to 0xFF. |
| 526 __m128i mask = _mm_set1_epi32(0xff000000); |
| 527 accum0 = _mm_or_si128(accum0, mask); |
| 528 } |
| 529 |
| 530 // Store the convolution result (16 bytes) and advance the pixel pointer
s. |
| 531 _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0); |
| 532 out_row += 16; |
493 } | 533 } |
494 | 534 |
495 // Shift right for fixed point implementation. | 535 // When the width of the output is not divisible by 4, We need to save one |
496 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | 536 // pixel (4 bytes) each time. And also the fourth pixel is always absent. |
497 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | 537 if (pixel_width & 3) { |
498 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | 538 accum0 = _mm_setzero_si128(); |
499 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); | 539 accum1 = _mm_setzero_si128(); |
| 540 accum2 = _mm_setzero_si128(); |
| 541 for (int filter_y = 0; filter_y < filter_length; ++filter_y) { |
| 542 coeff16 = _mm_set1_epi16(filter_values[filter_y]); |
| 543 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 544 src = reinterpret_cast<const __m128i*>( |
| 545 &source_data_rows[filter_y][width<<2]); |
| 546 __m128i src8 = _mm_loadu_si128(src); |
| 547 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 548 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 549 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 550 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 551 // [32] a0 b0 g0 r0 |
| 552 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 553 accum0 = _mm_add_epi32(accum0, t); |
| 554 // [32] a1 b1 g1 r1 |
| 555 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 556 accum1 = _mm_add_epi32(accum1, t); |
| 557 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 558 src16 = _mm_unpackhi_epi8(src8, zero); |
| 559 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 560 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 561 // [32] a2 b2 g2 r2 |
| 562 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 563 accum2 = _mm_add_epi32(accum2, t); |
| 564 } |
500 | 565 |
501 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). | 566 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
502 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | 567 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
503 accum0 = _mm_packs_epi32(accum0, accum1); | 568 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
504 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | 569 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
505 accum2 = _mm_packs_epi32(accum2, accum3); | 570 accum0 = _mm_packs_epi32(accum0, accum1); |
| 571 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 572 accum2 = _mm_packs_epi32(accum2, zero); |
| 573 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 574 accum0 = _mm_packus_epi16(accum0, accum2); |
| 575 if (has_alpha) { |
| 576 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
| 577 __m128i a = _mm_srli_epi32(accum0, 8); |
| 578 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 579 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. |
| 580 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
| 581 a = _mm_srli_epi32(accum0, 16); |
| 582 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 583 b = _mm_max_epu8(a, b); // Max of r and g and b. |
| 584 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
| 585 b = _mm_slli_epi32(b, 24); |
| 586 accum0 = _mm_max_epu8(b, accum0); |
| 587 } else { |
| 588 __m128i mask = _mm_set1_epi32(0xff000000); |
| 589 accum0 = _mm_or_si128(accum0, mask); |
| 590 } |
506 | 591 |
507 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). | 592 for (int out_x = width; out_x < pixel_width; out_x++) { |
508 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | 593 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0); |
509 accum0 = _mm_packus_epi16(accum0, accum2); | 594 accum0 = _mm_srli_si128(accum0, 4); |
510 | 595 out_row += 4; |
511 if (has_alpha) { | 596 } |
512 // Compute the max(ri, gi, bi) for each pixel. | |
513 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
514 __m128i a = _mm_srli_epi32(accum0, 8); | |
515 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
516 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
517 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
518 a = _mm_srli_epi32(accum0, 16); | |
519 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
520 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
521 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
522 b = _mm_slli_epi32(b, 24); | |
523 | |
524 // Make sure the value of alpha channel is always larger than maximum | |
525 // value of color channels. | |
526 accum0 = _mm_max_epu8(b, accum0); | |
527 } else { | |
528 // Set value of alpha channels to 0xFF. | |
529 __m128i mask = _mm_set1_epi32(0xff000000); | |
530 accum0 = _mm_or_si128(accum0, mask); | |
531 } | 597 } |
532 | |
533 // Store the convolution result (16 bytes) and advance the pixel pointers. | |
534 _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0); | |
535 out_row += 16; | |
536 } | |
537 | |
538 // When the width of the output is not divisible by 4, We need to save one | |
539 // pixel (4 bytes) each time. And also the fourth pixel is always absent. | |
540 if (pixel_width & 3) { | |
541 accum0 = _mm_setzero_si128(); | |
542 accum1 = _mm_setzero_si128(); | |
543 accum2 = _mm_setzero_si128(); | |
544 for (int filter_y = 0; filter_y < filter_length; ++filter_y) { | |
545 coeff16 = _mm_set1_epi16(filter_values[filter_y]); | |
546 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
547 src = reinterpret_cast<const __m128i*>( | |
548 &source_data_rows[filter_y][width<<2]); | |
549 __m128i src8 = _mm_loadu_si128(src); | |
550 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
551 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
552 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
553 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
554 // [32] a0 b0 g0 r0 | |
555 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
556 accum0 = _mm_add_epi32(accum0, t); | |
557 // [32] a1 b1 g1 r1 | |
558 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
559 accum1 = _mm_add_epi32(accum1, t); | |
560 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
561 src16 = _mm_unpackhi_epi8(src8, zero); | |
562 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
563 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
564 // [32] a2 b2 g2 r2 | |
565 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
566 accum2 = _mm_add_epi32(accum2, t); | |
567 } | |
568 | |
569 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | |
570 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | |
571 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | |
572 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
573 accum0 = _mm_packs_epi32(accum0, accum1); | |
574 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
575 accum2 = _mm_packs_epi32(accum2, zero); | |
576 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
577 accum0 = _mm_packus_epi16(accum0, accum2); | |
578 if (has_alpha) { | |
579 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
580 __m128i a = _mm_srli_epi32(accum0, 8); | |
581 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
582 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
583 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
584 a = _mm_srli_epi32(accum0, 16); | |
585 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
586 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
587 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
588 b = _mm_slli_epi32(b, 24); | |
589 accum0 = _mm_max_epu8(b, accum0); | |
590 } else { | |
591 __m128i mask = _mm_set1_epi32(0xff000000); | |
592 accum0 = _mm_or_si128(accum0, mask); | |
593 } | |
594 | |
595 for (int out_x = width; out_x < pixel_width; out_x++) { | |
596 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0); | |
597 accum0 = _mm_srli_si128(accum0, 4); | |
598 out_row += 4; | |
599 } | |
600 } | |
601 } | 598 } |
602 | 599 |
603 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
er_values, | 600 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
er_values, |
604 int filter_length, | 601 int filter_length, |
605 unsigned char* const* source_data_rows, | 602 unsigned char* const* source_data_rows, |
606 int pixel_width, | 603 int pixel_width, |
607 unsigned char* out_row, | 604 unsigned char* out_row, |
608 bool has_alpha) { | 605 bool has_alpha) { |
609 if (has_alpha) { | 606 if (has_alpha) { |
610 convolveVertically_SSE2<true>(filter_values, | 607 convolveVertically_SSE2<true>(filter_values, |
611 filter_length, | 608 filter_length, |
612 source_data_rows, | 609 source_data_rows, |
613 pixel_width, | 610 pixel_width, |
614 out_row); | 611 out_row); |
615 } else { | 612 } else { |
616 convolveVertically_SSE2<false>(filter_values, | 613 convolveVertically_SSE2<false>(filter_values, |
617 filter_length, | 614 filter_length, |
618 source_data_rows, | 615 source_data_rows, |
619 pixel_width, | 616 pixel_width, |
620 out_row); | 617 out_row); |
621 } | 618 } |
622 } | 619 } |
623 | 620 |
624 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { | 621 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { |
625 // Padding |paddingCount| of more dummy coefficients after the coefficients | 622 // Padding |paddingCount| of more dummy coefficients after the coefficients |
626 // of last filter to prevent SIMD instructions which load 8 or 16 bytes | 623 // of last filter to prevent SIMD instructions which load 8 or 16 bytes |
627 // together to access invalid memory areas. We are not trying to align the | 624 // together to access invalid memory areas. We are not trying to align the |
628 // coefficients right now due to the opaqueness of <vector> implementation. | 625 // coefficients right now due to the opaqueness of <vector> implementation. |
629 // This has to be done after all |AddFilter| calls. | 626 // This has to be done after all |AddFilter| calls. |
630 for (int i = 0; i < 8; ++i) { | 627 for (int i = 0; i < 8; ++i) { |
631 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix
ed>(0)); | 628 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix
ed>(0)); |
632 } | 629 } |
633 } | 630 } |
OLD | NEW |