Chromium Code Reviews

Side by Side Diff: skia/ext/convolver.cc

Issue 6334070: SIMD implementation of Convolver for Lanczos filter etc. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: oops Created 9 years, 10 months ago
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include <algorithm> 5 #include <algorithm>
6 6
7 #include "skia/ext/convolver.h" 7 #include "skia/ext/convolver.h"
8 #include "third_party/skia/include/core/SkTypes.h" 8 #include "third_party/skia/include/core/SkTypes.h"
9 9
10 #if defined(ARCH_CPU_X86_FAMILY)
11 #include <emmintrin.h> // ARCH_CPU_X86_FAMILY was defined in build/config.h
12 #endif
13
10 namespace skia { 14 namespace skia {
11 15
12 namespace { 16 namespace {
13 17
14 // Converts the argument to an 8-bit unsigned value by clamping to the range 18 // Converts the argument to an 8-bit unsigned value by clamping to the range
15 // 0-255. 19 // 0-255.
16 inline unsigned char ClampTo8(int a) { 20 inline unsigned char ClampTo8(int a) {
17 if (static_cast<unsigned>(a) < 256) 21 if (static_cast<unsigned>(a) < 256)
18 return a; // Avoid the extra check in the common case. 22 return a; // Avoid the extra check in the common case.
19 if (a < 0) 23 if (a < 0)
(...skipping 172 matching lines...)
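For reference, the clamp helper is cut off by the collapsed region above. A minimal sketch of the complete function, reconstructed from the visible lines (the exact body in the patch may differ slightly):

inline unsigned char ClampTo8(int a) {
  if (static_cast<unsigned>(a) < 256)
    return a;    // Common case: value is already in [0, 255].
  if (a < 0)
    return 0;    // Negative values clamp to 0.
  return 255;    // Everything else clamps to 255.
}

Casting to unsigned folds the two range checks into one comparison: negative values wrap around to large unsigned numbers and fail the "< 256" test together with overly large values.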
192 if (has_alpha) 196 if (has_alpha)
193 accum[3] >>= ConvolutionFilter1D::kShiftBits; 197 accum[3] >>= ConvolutionFilter1D::kShiftBits;
194 198
195 // Store the new pixel. 199 // Store the new pixel.
196 out_row[byte_offset + 0] = ClampTo8(accum[0]); 200 out_row[byte_offset + 0] = ClampTo8(accum[0]);
197 out_row[byte_offset + 1] = ClampTo8(accum[1]); 201 out_row[byte_offset + 1] = ClampTo8(accum[1]);
198 out_row[byte_offset + 2] = ClampTo8(accum[2]); 202 out_row[byte_offset + 2] = ClampTo8(accum[2]);
199 if (has_alpha) { 203 if (has_alpha) {
200 unsigned char alpha = ClampTo8(accum[3]); 204 unsigned char alpha = ClampTo8(accum[3]);
201 205
202 // Make sure the alpha channel doesn't come out larger than any of the 206 // Make sure the alpha channel doesn't come out smaller than any of the
 203 // color channels. We use premultiplied alpha channels, so this should 207 // color channels. We use premultiplied alpha channels, so this should
204 // never happen, but rounding errors will cause this from time to time. 208 // never happen, but rounding errors will cause this from time to time.
205 // These "impossible" colors will cause overflows (and hence random pixel 209 // These "impossible" colors will cause overflows (and hence random pixel
206 // values) when the resulting bitmap is drawn to the screen. 210 // values) when the resulting bitmap is drawn to the screen.
207 // 211 //
208 // We only need to do this when generating the final output row (here). 212 // We only need to do this when generating the final output row (here).
209 int max_color_channel = std::max(out_row[byte_offset + 0], 213 int max_color_channel = std::max(out_row[byte_offset + 0],
210 std::max(out_row[byte_offset + 1], out_row[byte_offset + 2])); 214 std::max(out_row[byte_offset + 1], out_row[byte_offset + 2]));
211 if (alpha < max_color_channel) 215 if (alpha < max_color_channel)
212 out_row[byte_offset + 3] = max_color_channel; 216 out_row[byte_offset + 3] = max_color_channel;
213 else 217 else
214 out_row[byte_offset + 3] = alpha; 218 out_row[byte_offset + 3] = alpha;
215 } else { 219 } else {
216 // No alpha channel, the image is opaque. 220 // No alpha channel, the image is opaque.
217 out_row[byte_offset + 3] = 0xff; 221 out_row[byte_offset + 3] = 0xff;
218 } 222 }
219 } 223 }
220 } 224 }
221 225
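A short illustration of the premultiplied-alpha fix-up described in the comment above: with premultiplied alpha, every color channel is stored already scaled by alpha, so no channel should exceed the alpha value. Fixed-point rounding in the filter can still produce, say, blue = 130 with alpha = 128, and such "impossible" pixels overflow when later unpremultiplied or blended. A scalar sketch of the same correction (illustrative only, not code from the patch; <algorithm> is already included by this file):

// Raise alpha to the largest color channel so the premultiplied invariant
// color <= alpha holds again after rounding. Pixel layout is B, G, R, A.
inline void FixPremultipliedAlpha(unsigned char* pixel) {
  unsigned char max_color = std::max(pixel[0], std::max(pixel[1], pixel[2]));
  if (pixel[3] < max_color)
    pixel[3] = max_color;
}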
226
227 // Convolves horizontally along a single row. The row data is given in
228 // |src_data| and continues for the num_values() of the filter.
229 void ConvolveHorizontally_SSE2(const unsigned char* src_data,
230 const ConvolutionFilter1D& filter,
231 unsigned char* out_row) {
232 #ifdef ARCH_CPU_X86_FAMILY
233 int num_values = filter.num_values();
234
235 int filter_offset, filter_length;
236 __m128i zero = _mm_setzero_si128();
237 __m128i mask[4];
brettw 2011/02/21 04:45:45 What's mask[0] for? Can you provide a comment for
jiesun 2011/02/22 21:37:03 yes, mask[0] is not used.
238 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
239 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
240 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
241
242 for (int out_x = 0; out_x < num_values; out_x += 1) {
243 const ConvolutionFilter1D::Fixed* filter_values =
244 filter.FilterForValue(out_x, &filter_offset, &filter_length);
245
246 __m128i accum = _mm_setzero_si128();
247
248 const __m128i* row_to_filter =
brettw 2011/02/21 04:45:45 Can you comment what this means?
jiesun 2011/02/22 21:37:03 Done.
249 reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
250 // Four filter taps per iteration.
251 for (int j = 0; j < filter_length >> 2; ++j) {
252 __m128i coeff, coeff16;
brettw 2011/02/21 04:45:45 For each of the "blocks" of SSE code you're writte
jiesun 2011/02/22 21:37:03 Done.
253 // [16] xx xx xx xx c3 c2 c1 c0
254 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
255 // [16] xx xx xx xx c1 c1 c0 c0
256 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
257 // [16] c1 c1 c1 c1 c0 c0 c0 c0
258 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
259
260 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
261 __m128i src8 = _mm_loadu_si128(row_to_filter);
262 // [16] a1 b1 g1 r1 a0 b0 g0 r0
263 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
264 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
265 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
266 // [32] a0*c0 b0*c0 g0*c0 r0*c0
267 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
268 accum = _mm_add_epi32(accum, t);
269 // [32] a1*c1 b1*c1 g1*c1 r1*c1
270 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
271 accum = _mm_add_epi32(accum, t);
272
273 // [16] xx xx xx xx c3 c3 c2 c2
274 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
275 // [16] c3 c3 c3 c3 c2 c2 c2 c2
276 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
277 // [16] a3 g3 b3 r3 a2 g2 b2 r2
278 src16 = _mm_unpackhi_epi8(src8, zero);
279 mul_hi = _mm_mulhi_epi16(src16, coeff16);
280 mul_lo = _mm_mullo_epi16(src16, coeff16);
281 // [32] a2*c2 b2*c2 g2*c2 r2*c2
282 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
283 accum = _mm_add_epi32(accum, t);
284 // [32] a3*c3 b3*c3 g3*c3 r3*c3
285 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
286 accum = _mm_add_epi32(accum, t);
287
288 row_to_filter += 1;
289 filter_values += 4;
290 }
291
 292 // Process the remaining taps when filter_length is not a multiple of four.
brettw 2011/02/21 04:45:45 Can you provide a better comment here?
jiesun 2011/02/22 21:37:03 Done.
293 int r = filter_length&3;
294 if (r) {
295 // Note: filter_values must be padded to align_up(filter_offset, 8).
296 __m128i coeff, coeff16;
297 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
298 // Mask out extra filter taps.
299 coeff = _mm_and_si128(coeff, mask[r]);
300 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
301 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
302
303 // Note: line buffer must be padded to align_up(filter_offset, 16).
 304 // We avoid that requirement by using the C version for the last horizontal line.
305 __m128i src8 = _mm_loadu_si128(row_to_filter);
306 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
307 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
308 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
309 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
310 accum = _mm_add_epi32(accum, t);
311 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
312 accum = _mm_add_epi32(accum, t);
313
314 src16 = _mm_unpackhi_epi8(src8, zero);
315 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
316 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
317 mul_hi = _mm_mulhi_epi16(src16, coeff16);
318 mul_lo = _mm_mullo_epi16(src16, coeff16);
319 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
320 accum = _mm_add_epi32(accum, t);
321 }
322
 323 // Shift right for the fixed-point representation, then saturate.
324 accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits);
325 accum = _mm_packs_epi32(accum, zero);
326 accum = _mm_packus_epi16(accum, zero);
327
328 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
329 out_row += 4;
330 }
331 #endif
332 }
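A note on the multiply pattern used in all of the SSE2 loops in this patch: SSE2 has no 16x16-to-32-bit widening multiply, so each block pairs _mm_mullo_epi16 (low 16 bits of each product) with _mm_mulhi_epi16 (high 16 bits) and interleaves the halves with _mm_unpacklo_epi16 / _mm_unpackhi_epi16 to recover the full 32-bit products, which are then summed in 32-bit accumulators and shifted right by kShiftBits. The signed multiply is what handles the negative Lanczos coefficients correctly. A self-contained sketch of the widening step (the helper name is ours, not from the patch):

#include <emmintrin.h>

// Returns the full 32-bit products of the four low 16-bit lanes of |a| and
// |b|, using the same mullo/mulhi/unpacklo sequence as the code above.
static inline __m128i MulWidenLow(__m128i a, __m128i b) {
  __m128i lo = _mm_mullo_epi16(a, b);  // low 16 bits of each 16x16 product
  __m128i hi = _mm_mulhi_epi16(a, b);  // high 16 bits of each product
  return _mm_unpacklo_epi16(lo, hi);   // interleave -> four int32 products
}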
333
334 // Convolves horizontally along four rows. The row data is given in
335 // |src_data| and continues for the num_values() of the filter.
336 void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
337 const ConvolutionFilter1D& filter,
338 unsigned char* out_row[4]) {
339 #ifdef ARCH_CPU_X86_FAMILY
340 int width = filter.num_values();
341
342 int filter_offset, filter_length;
343 __m128i zero = _mm_setzero_si128();
344 __m128i mask[4];
345 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
346 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
347 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
348
349 for (int i = 0; i < width; ++i) {
350 const ConvolutionFilter1D::Fixed* filter_values =
351 filter.FilterForValue(i, &filter_offset, &filter_length);
352
353 // four pixels in a column per iteration.
354 __m128i accum0 = _mm_setzero_si128();
355 __m128i accum1 = _mm_setzero_si128();
356 __m128i accum2 = _mm_setzero_si128();
357 __m128i accum3 = _mm_setzero_si128();
358 int start = (filter_offset<<2);
359 for (int j = 0; j < (filter_length >> 2); ++j) {
360 __m128i coeff, coeff16lo, coeff16hi;
361 // [16] xx xx xx xx c3 c2 c1 c0
362 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
363 // [16] xx xx xx xx c1 c1 c0 c0
364 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
365 // [16] c1 c1 c1 c1 c0 c0 c0 c0
366 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
367 // [16] xx xx xx xx c3 c3 c2 c2
368 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
369 // [16] c3 c3 c3 c3 c2 c2 c2 c2
370 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
371
372 __m128i src8, src16, mul_hi, mul_lo, t;
373
374 #define ITERATION(src, accum) \
375 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \
376 src16 = _mm_unpacklo_epi8(src8, zero); \
377 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \
378 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \
379 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
380 accum = _mm_add_epi32(accum, t); \
381 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
382 accum = _mm_add_epi32(accum, t); \
383 src16 = _mm_unpackhi_epi8(src8, zero); \
384 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \
385 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \
386 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
387 accum = _mm_add_epi32(accum, t); \
388 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
389 accum = _mm_add_epi32(accum, t)
390
391 ITERATION(src_data[0]+start, accum0);
392 ITERATION(src_data[1]+start, accum1);
393 ITERATION(src_data[2]+start, accum2);
394 ITERATION(src_data[3]+start, accum3);
395
396 start += 16;
397 filter_values += 4;
398 }
399
400 int r = filter_length&3;
401 if (r) {
402 // Note: filter_values must be padded to align_up(filter_offset, 8);
403 __m128i coeff;
404 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
405 // Mask out extra filter taps.
406 coeff = _mm_and_si128(coeff, mask[r]);
407
408 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
409 /* c1 c1 c1 c1 c0 c0 c0 c0 */
410 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
411 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
412 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
413
414 __m128i src8, src16, mul_hi, mul_lo, t;
415
416 ITERATION(src_data[0]+start, accum0);
417 ITERATION(src_data[1]+start, accum1);
418 ITERATION(src_data[2]+start, accum2);
419 ITERATION(src_data[3]+start, accum3);
420 }
421
422 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
423 accum0 = _mm_packs_epi32(accum0, zero);
424 accum0 = _mm_packus_epi16(accum0, zero);
425 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
426 accum1 = _mm_packs_epi32(accum1, zero);
427 accum1 = _mm_packus_epi16(accum1, zero);
428 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
429 accum2 = _mm_packs_epi32(accum2, zero);
430 accum2 = _mm_packus_epi16(accum2, zero);
431 accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
432 accum3 = _mm_packs_epi32(accum3, zero);
433 accum3 = _mm_packus_epi16(accum3, zero);
434
435 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
436 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
437 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
438 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
439
440 out_row[0] += 4;
441 out_row[1] += 4;
442 out_row[2] += 4;
443 out_row[3] += 4;
444 }
445 #endif
446 }
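Regarding the mask[] question in the review comments above: the main loop consumes four filter taps per iteration, so when filter_length is not a multiple of four the final _mm_loadl_epi64 still reads four coefficients but only r = filter_length & 3 of them are valid; ANDing with mask[r] zeroes the extra taps so they add nothing to the accumulators. mask[0] is never read, since r == 0 means the remainder block is skipped entirely. A scalar sketch of the same idea (hypothetical helper, not in the patch):

// Zero the invalid tail coefficients of the last group of four taps.
void MaskTailCoefficients(short coeff[4], int filter_length) {
  int r = filter_length & 3;   // number of valid taps in the partial group
  if (r == 0)
    return;                    // no partial group; nothing to mask
  for (int i = r; i < 4; ++i)
    coeff[i] = 0;              // masked taps contribute zero to the sum
}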
447
448 template<bool has_alpha>
449 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
450 int filter_length,
brettw 2011/02/21 04:45:45 Check indentation
jiesun 2011/02/22 21:37:03 Done.
451 unsigned char* const* source_data_rows,
452 int pixel_width,
453 unsigned char* out_row) {
454 #ifdef ARCH_CPU_X86_FAMILY
455 int width = pixel_width & ~3;
456
457 __m128i zero = _mm_setzero_si128();
458 __m128i accum0, accum1, accum2, accum3, coeff16;
459 const __m128i* src;
460 for (int i = 0; i < width; i += 4) { // Four pixels per iteration.
461 accum0 = _mm_setzero_si128();
462 accum1 = _mm_setzero_si128();
463 accum2 = _mm_setzero_si128();
464 accum3 = _mm_setzero_si128();
465 for (int j = 0; j < filter_length; ++j) {
466 coeff16 = _mm_set1_epi16(filter_values[j]);
467
 468 // An aligned load is possible here because row_buffer is 16-byte aligned.
469 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
470 src = reinterpret_cast<const __m128i*>(&source_data_rows[j][i<<2]);
471 __m128i src8 = _mm_loadu_si128(src);
472 // [16] a1 b1 g1 r1 a0 b0 g0 r0
473 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
474 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
475 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
476 // [32] a0 b0 g0 r0
477 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
478 accum0 = _mm_add_epi32(accum0, t);
479 // [32] a1 b1 g1 r1
480 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
481 accum1 = _mm_add_epi32(accum1, t);
482 // [16] a3 b3 g3 r3 a2 b2 g2 r2
483 src16 = _mm_unpackhi_epi8(src8, zero);
484 mul_hi = _mm_mulhi_epi16(src16, coeff16);
485 mul_lo = _mm_mullo_epi16(src16, coeff16);
486 // [32] a2 b2 g2 r2
487 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
488 accum2 = _mm_add_epi32(accum2, t);
489 // [32] a3 b3 g3 r3
490 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
491 accum3 = _mm_add_epi32(accum3, t);
492 }
493
494 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
495 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
496 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
497 accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
498 // [16] a1 b1 g1 r1 a0 b0 g0 r0
499 accum0 = _mm_packs_epi32(accum0, accum1);
500 // [16] a3 b3 g3 r3 a2 b2 g2 r2
501 accum2 = _mm_packs_epi32(accum2, accum3);
502 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
503 accum0 = _mm_packus_epi16(accum0, accum2);
504 if (has_alpha) {
505 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
506 __m128i a = _mm_srli_epi32(accum0, 8);
507 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
508 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
509 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
510 a = _mm_srli_epi32(accum0, 16);
511 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
512 b = _mm_max_epu8(a, b); // Max of r and g and b.
513 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
514 b = _mm_slli_epi32(b, 24);
515 accum0 = _mm_max_epu8(b, accum0);
516 } else {
517 __m128i mask = _mm_set1_epi32(0xff000000);
518 accum0 = _mm_or_si128(accum0, mask);
519 }
520 _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
521 out_row += 16;
522 }
523
524 if (pixel_width & 3) {
525 accum0 = _mm_setzero_si128();
526 accum1 = _mm_setzero_si128();
527 accum2 = _mm_setzero_si128();
528 for (int j = 0; j < filter_length; ++j) {
529 coeff16 = _mm_set1_epi16(filter_values[j]);
530 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
531 src = reinterpret_cast<const __m128i*>(&source_data_rows[j][width<<2]);
532 __m128i src8 = _mm_loadu_si128(src);
533 // [16] a1 b1 g1 r1 a0 b0 g0 r0
534 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
535 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
536 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
537 // [32] a0 b0 g0 r0
538 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
539 accum0 = _mm_add_epi32(accum0, t);
540 // [32] a1 b1 g1 r1
541 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
542 accum1 = _mm_add_epi32(accum1, t);
543 // [16] a3 b3 g3 r3 a2 b2 g2 r2
544 src16 = _mm_unpackhi_epi8(src8, zero);
545 mul_hi = _mm_mulhi_epi16(src16, coeff16);
546 mul_lo = _mm_mullo_epi16(src16, coeff16);
547 // [32] a2 b2 g2 r2
548 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
549 accum2 = _mm_add_epi32(accum2, t);
550 }
551
552 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
553 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
554 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
555 // [16] a1 b1 g1 r1 a0 b0 g0 r0
556 accum0 = _mm_packs_epi32(accum0, accum1);
557 // [16] a3 b3 g3 r3 a2 b2 g2 r2
558 accum2 = _mm_packs_epi32(accum2, zero);
559 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
560 accum0 = _mm_packus_epi16(accum0, accum2);
561 if (has_alpha) {
562 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
563 __m128i a = _mm_srli_epi32(accum0, 8);
564 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
565 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
566 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
567 a = _mm_srli_epi32(accum0, 16);
568 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
569 b = _mm_max_epu8(a, b); // Max of r and g and b.
570 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
571 b = _mm_slli_epi32(b, 24);
572 accum0 = _mm_max_epu8(b, accum0);
573 } else {
574 __m128i mask = _mm_set1_epi32(0xff000000);
575 accum0 = _mm_or_si128(accum0, mask);
576 }
577
578 for (int i = width; i < pixel_width; ++i) {
579 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
580 accum0 = _mm_srli_si128(accum0, 4);
581 out_row += 4;
582 }
583 }
584 #endif
585 }
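Note that the vertical pass (like the horizontal ones) never calls ClampTo8(); the clamping falls out of the pack instructions. _mm_packs_epi32 saturates each 32-bit accumulator to the signed 16-bit range and _mm_packus_epi16 then saturates to unsigned 8-bit, so anything below 0 becomes 0 and anything above 255 becomes 255, matching the scalar clamp. The _mm_max_epu8 sequence afterwards is the vector form of the alpha fix-up sketched earlier for the C path. A small sketch of the two-step saturating pack on one register (illustrative):

#include <emmintrin.h>

// Clamp four 32-bit accumulators to [0, 255]; the results land in the low
// four bytes of the returned register, as in the packs/packus pairs above.
static inline __m128i PackClampTo8(__m128i accum32) {
  __m128i zero = _mm_setzero_si128();
  __m128i v16 = _mm_packs_epi32(accum32, zero);  // saturate to int16
  return _mm_packus_epi16(v16, zero);            // saturate to uint8
}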
586
222 } // namespace 587 } // namespace
223 588
224 // ConvolutionFilter1D --------------------------------------------------------- 589 // ConvolutionFilter1D ---------------------------------------------------------
225 590
226 ConvolutionFilter1D::ConvolutionFilter1D() 591 ConvolutionFilter1D::ConvolutionFilter1D()
227 : max_filter_(0) { 592 : max_filter_(0) {
228 } 593 }
229 594
230 ConvolutionFilter1D::~ConvolutionFilter1D() { 595 ConvolutionFilter1D::~ConvolutionFilter1D() {
231 } 596 }
(...skipping 45 matching lines...)
277 // We pushed filter_length elements onto filter_values_ 642 // We pushed filter_length elements onto filter_values_
278 instance.data_location = (static_cast<int>(filter_values_.size()) - 643 instance.data_location = (static_cast<int>(filter_values_.size()) -
279 filter_length); 644 filter_length);
280 instance.offset = filter_offset; 645 instance.offset = filter_offset;
281 instance.length = filter_length; 646 instance.length = filter_length;
282 filters_.push_back(instance); 647 filters_.push_back(instance);
283 648
284 max_filter_ = std::max(max_filter_, filter_length); 649 max_filter_ = std::max(max_filter_, filter_length);
285 } 650 }
286 651
287 // BGRAConvolve2D ------------------------------------------------------------- 652 void BGRAConvolve2D_C(const unsigned char* source_data,
288 653 int source_byte_row_stride,
289 void BGRAConvolve2D(const unsigned char* source_data, 654 bool source_has_alpha,
290 int source_byte_row_stride, 655 const ConvolutionFilter1D& filter_x,
291 bool source_has_alpha, 656 const ConvolutionFilter1D& filter_y,
292 const ConvolutionFilter1D& filter_x, 657 int output_byte_row_stride,
293 const ConvolutionFilter1D& filter_y, 658 unsigned char* output) {
294 int output_byte_row_stride,
295 unsigned char* output) {
296 int max_y_filter_size = filter_y.max_filter(); 659 int max_y_filter_size = filter_y.max_filter();
297 660
298 // The next row in the input that we will generate a horizontally 661 // The next row in the input that we will generate a horizontally
299 // convolved row for. If the filter doesn't start at the beginning of the 662 // convolved row for. If the filter doesn't start at the beginning of the
300 // image (this is the case when we are only resizing a subset), then we 663 // image (this is the case when we are only resizing a subset), then we
301 // don't want to generate any output rows before that. Compute the starting 664 // don't want to generate any output rows before that. Compute the starting
302 // row for convolution as the first pixel for the first vertical filter. 665 // row for convolution as the first pixel for the first vertical filter.
303 int filter_offset, filter_length; 666 int filter_offset, filter_length;
304 const ConvolutionFilter1D::Fixed* filter_values = 667 const ConvolutionFilter1D::Fixed* filter_values =
305 filter_y.FilterForValue(0, &filter_offset, &filter_length); 668 filter_y.FilterForValue(0, &filter_offset, &filter_length);
306 int next_x_row = filter_offset; 669 int next_x_row = filter_offset;
307 670
308 // We loop over each row in the input doing a horizontal convolution. This 671 // We loop over each row in the input doing a horizontal convolution. This
309 // will result in a horizontally convolved image. We write the results into 672 // will result in a horizontally convolved image. We write the results into
310 // a circular buffer of convolved rows and do vertical convolution as rows 673 // a circular buffer of convolved rows and do vertical convolution as rows
311 // are available. This prevents us from having to store the entire 674 // are available. This prevents us from having to store the entire
312 // intermediate image and helps cache coherency. 675 // intermediate image and helps cache coherency.
313 CircularRowBuffer row_buffer(filter_x.num_values(), max_y_filter_size, 676 CircularRowBuffer row_buffer(filter_x.num_values(), max_y_filter_size,
314 filter_offset); 677 filter_offset);
315 678
316 // Loop over every possible output row, processing just enough horizontal 679 // Loop over every possible output row, processing just enough horizontal
317 // convolutions to run each subsequent vertical convolution. 680 // convolutions to run each subsequent vertical convolution.
318 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4); 681 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);
319 int num_output_rows = filter_y.num_values(); 682 int num_output_rows = filter_y.num_values();
683
684 int last_filter_offset, last_filter_length;
685 filter_y.FilterForValue(num_output_rows-1, &last_filter_offset,
686 &last_filter_length);
687
320 for (int out_y = 0; out_y < num_output_rows; out_y++) { 688 for (int out_y = 0; out_y < num_output_rows; out_y++) {
321 filter_values = filter_y.FilterForValue(out_y, 689 filter_values = filter_y.FilterForValue(out_y,
322 &filter_offset, &filter_length); 690 &filter_offset, &filter_length);
323 691
324 // Generate output rows until we have enough to run the current filter. 692 // Generate output rows until we have enough to run the current filter.
325 while (next_x_row < filter_offset + filter_length) { 693 while (next_x_row < filter_offset + filter_length) {
326 if (source_has_alpha) { 694 if (source_has_alpha) {
327 ConvolveHorizontally<true>( 695 ConvolveHorizontally<true>(
328 &source_data[next_x_row * source_byte_row_stride], 696 &source_data[next_x_row * source_byte_row_stride],
329 filter_x, row_buffer.AdvanceRow()); 697 filter_x, row_buffer.AdvanceRow());
(...skipping 17 matching lines...)
347 // needs. 715 // needs.
348 unsigned char* const* first_row_for_filter = 716 unsigned char* const* first_row_for_filter =
349 &rows_to_convolve[filter_offset - first_row_in_circular_buffer]; 717 &rows_to_convolve[filter_offset - first_row_in_circular_buffer];
350 718
351 if (source_has_alpha) { 719 if (source_has_alpha) {
352 ConvolveVertically<true>(filter_values, filter_length, 720 ConvolveVertically<true>(filter_values, filter_length,
353 first_row_for_filter, 721 first_row_for_filter,
354 filter_x.num_values(), cur_output_row); 722 filter_x.num_values(), cur_output_row);
355 } else { 723 } else {
356 ConvolveVertically<false>(filter_values, filter_length, 724 ConvolveVertically<false>(filter_values, filter_length,
357 first_row_for_filter, 725 first_row_for_filter,
358 filter_x.num_values(), cur_output_row); 726 filter_x.num_values(), cur_output_row);
359 } 727 }
360 } 728 }
361 } 729 }
362 730
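For readers without convolver.h at hand: CircularRowBuffer (used above and in the SSE2 variant below) keeps only the last max_filter horizontally convolved rows, reusing storage as new rows are produced, which is what lets the 2D convolution avoid materializing the whole intermediate image. A deliberately simplified sketch of the idea, with hypothetical names rather than the real class:

#include <vector>

// Ring of image rows: AdvanceRow() returns the next row slot, overwriting
// the oldest row once |num_rows| rows have been handed out.
class SimpleRowRing {
 public:
  SimpleRowRing(int row_bytes, int num_rows)
      : storage_(row_bytes * num_rows),
        row_bytes_(row_bytes),
        num_rows_(num_rows),
        next_(0) {}

  unsigned char* AdvanceRow() {
    unsigned char* row = &storage_[(next_ % num_rows_) * row_bytes_];
    ++next_;
    return row;
  }

 private:
  std::vector<unsigned char> storage_;
  int row_bytes_;
  int num_rows_;
  int next_;
};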
731 // BGRAConvolve2D -------------------------------------------------------------
732
733 void BGRAConvolve2D_SSE2(const unsigned char* source_data,
734 int source_byte_row_stride,
735 bool source_has_alpha,
736 const ConvolutionFilter1D& filter_x,
737 const ConvolutionFilter1D& filter_y,
738 int output_byte_row_stride,
739 unsigned char* output) {
740 int max_y_filter_size = filter_y.max_filter();
741
742 // The next row in the input that we will generate a horizontally
743 // convolved row for. If the filter doesn't start at the beginning of the
744 // image (this is the case when we are only resizing a subset), then we
745 // don't want to generate any output rows before that. Compute the starting
746 // row for convolution as the first pixel for the first vertical filter.
747 int filter_offset, filter_length;
748 const ConvolutionFilter1D::Fixed* filter_values =
749 filter_y.FilterForValue(0, &filter_offset, &filter_length);
750 int next_x_row = filter_offset;
751
752 // We loop over each row in the input doing a horizontal convolution. This
753 // will result in a horizontally convolved image. We write the results into
754 // a circular buffer of convolved rows and do vertical convolution as rows
755 // are available. This prevents us from having to store the entire
756 // intermediate image and helps cache coherency.
 757 // We need four extra rows so that the horizontal convolution can process
 758 // four rows of input simultaneously.
759 int row_buffer_width = (filter_x.num_values() + 15) & ~0xF;
760 int row_buffer_height = max_y_filter_size + 4;
761 CircularRowBuffer row_buffer(row_buffer_width,
762 row_buffer_height,
763 filter_offset);
764
765 // Loop over every possible output row, processing just enough horizontal
766 // convolutions to run each subsequent vertical convolution.
767 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);
768 int num_output_rows = filter_y.num_values();
769
770 int last_filter_offset, last_filter_length;
771 filter_y.FilterForValue(num_output_rows-1, &last_filter_offset,
772 &last_filter_length);
773
774 for (int out_y = 0; out_y < num_output_rows; out_y++) {
775 filter_values = filter_y.FilterForValue(out_y,
776 &filter_offset, &filter_length);
777
778 // Generate output rows until we have enough to run the current filter.
779 while (next_x_row < filter_offset + filter_length) {
780 if (next_x_row + 3 < last_filter_offset + last_filter_length - 1) {
781 const unsigned char* src[4];
782 unsigned char* out_row[4];
783 for (int i = 0; i < 4; ++i) {
784 src[i] = &source_data[(next_x_row+i) * source_byte_row_stride];
785 out_row[i] = row_buffer.AdvanceRow();
786 }
787 ConvolveHorizontally4_SSE2(src, filter_x, out_row);
788 next_x_row+=4;
789 } else {
 790 // For the last row, the SSE2 load may access data beyond the
 791 // image area, so we use the C version here. Hacking into Skia
 792 // to add row padding is not something we want to do.
793 if (next_x_row == last_filter_offset + last_filter_length - 1) {
794 if (source_has_alpha)
795 ConvolveHorizontally<true>(
796 &source_data[next_x_row * source_byte_row_stride],
797 filter_x, row_buffer.AdvanceRow());
798 else
799 ConvolveHorizontally<false>(
800 &source_data[next_x_row * source_byte_row_stride],
801 filter_x, row_buffer.AdvanceRow());
802 } else {
803 ConvolveHorizontally_SSE2(
804 &source_data[next_x_row * source_byte_row_stride],
805 filter_x, row_buffer.AdvanceRow());
806 }
807 next_x_row++;
808 }
809 }
810
811 // Compute where in the output image this row of final data will go.
812 unsigned char* cur_output_row = &output[out_y * output_byte_row_stride];
813
814 // Get the list of rows that the circular buffer has, in order.
815 int first_row_in_circular_buffer;
816 unsigned char* const* rows_to_convolve =
817 row_buffer.GetRowAddresses(&first_row_in_circular_buffer);
818
819 // Now compute the start of the subset of those rows that the filter
820 // needs.
821 unsigned char* const* first_row_for_filter =
822 &rows_to_convolve[filter_offset - first_row_in_circular_buffer];
823
824 if (source_has_alpha) {
825 ConvolveVertically_SSE2<true>(filter_values, filter_length,
826 first_row_for_filter,
827 filter_x.num_values(), cur_output_row);
828 } else {
829 ConvolveVertically_SSE2<false>(filter_values, filter_length,
830 first_row_for_filter,
831 filter_x.num_values(), cur_output_row);
832 }
833 }
834 }
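A brief note on the boundary handling in the loop above: _mm_loadu_si128 always reads 16 bytes (four BGRA pixels), so the horizontal SSE2 routines can read a few pixels past the last filter tap. That is harmless while there are more source rows below, but on the very last source row it could run past the end of the bitmap allocation, which is why that row falls back to the C routines rather than padding the Skia bitmap. The worst-case over-read is small and bounded, as a quick arithmetic sketch shows (illustrative, not code from the patch):

// Bytes the 16-byte loads may read beyond the pixels covered by the filter
// taps of one output pixel: 0, 4, 8, or 12 bytes (up to three extra pixels).
int BytesReadPastTaps(int filter_length) {
  int groups = (filter_length + 3) / 4;  // four taps handled per 16-byte load
  int bytes_loaded = groups * 16;        // bytes actually read
  int bytes_needed = filter_length * 4;  // bytes covered by valid taps
  return bytes_loaded - bytes_needed;
}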
835
836 void BGRAConvolve2D(const unsigned char* source_data,
837 int source_byte_row_stride,
838 bool source_has_alpha,
839 const ConvolutionFilter1D& filter_x,
840 const ConvolutionFilter1D& filter_y,
841 int output_byte_row_stride,
842 unsigned char* output) {
843 base::CPU cpu;
844 if (cpu.has_sse2()) {
845 BGRAConvolve2D_SSE2(source_data, source_byte_row_stride, source_has_alpha,
846 filter_x, filter_y, output_byte_row_stride, output);
847 } else {
848 BGRAConvolve2D_C(source_data, source_byte_row_stride, source_has_alpha,
849 filter_x, filter_y, output_byte_row_stride, output);
850 }
851 }
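The new entry point picks the implementation at run time with base::CPU, which probes CPUID when constructed, so the (cheap) feature check happens on every call. One possible refinement, shown only as a sketch of an alternative and not what this patch does, is to resolve the choice once and let callers cache a function pointer:

namespace {

typedef void (*Convolve2DProc)(const unsigned char*, int, bool,
                               const ConvolutionFilter1D&,
                               const ConvolutionFilter1D&,
                               int, unsigned char*);

// The returned pointer can be cached by the caller so repeated convolutions
// skip the CPU feature check.
Convolve2DProc ChooseConvolve2D() {
  base::CPU cpu;
  return cpu.has_sse2() ? &BGRAConvolve2D_SSE2 : &BGRAConvolve2D_C;
}

}  // namespace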
852
363 } // namespace skia 853 } // namespace skia