Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: src/opts/SkBlitRow_opts_SSE2.cpp

Issue 754733002: Add SkBlendARGB32_SSE2() to clean up code (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: add fast path for SkBlendARGB32_SSE2 with constant alpha factor Created 6 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | src/opts/SkColor_opts_SSE2.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include <emmintrin.h> 8 #include <emmintrin.h>
9 #include "SkBitmapProcState_opts_SSE2.h" 9 #include "SkBitmapProcState_opts_SSE2.h"
10 #include "SkBlitRow_opts_SSE2.h" 10 #include "SkBlitRow_opts_SSE2.h"
(...skipping 20 matching lines...) Expand all
31 SkASSERT(((size_t)dst & 0x03) == 0); 31 SkASSERT(((size_t)dst & 0x03) == 0);
32 while (((size_t)dst & 0x0F) != 0) { 32 while (((size_t)dst & 0x0F) != 0) {
33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
34 src++; 34 src++;
35 dst++; 35 dst++;
36 count--; 36 count--;
37 } 37 }
38 38
39 const __m128i *s = reinterpret_cast<const __m128i*>(src); 39 const __m128i *s = reinterpret_cast<const __m128i*>(src);
40 __m128i *d = reinterpret_cast<__m128i*>(dst); 40 __m128i *d = reinterpret_cast<__m128i*>(dst);
41 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
42 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
43 41
44 // Move scale factors to upper byte of word
45 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
46 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
47 while (count >= 4) { 42 while (count >= 4) {
48 // Load 4 pixels each of src and dest. 43 // Load 4 pixels each of src and dest.
49 __m128i src_pixel = _mm_loadu_si128(s); 44 __m128i src_pixel = _mm_loadu_si128(s);
50 __m128i dst_pixel = _mm_load_si128(d); 45 __m128i dst_pixel = _mm_load_si128(d);
51 46
52 // Interleave Atom port 0/1 operations based on the execution port 47 src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale);
53 // constraints that multiply can only be executed on port 0 (while 48 dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale);
54 // boolean operations can be executed on either port 0 or port 1)
55 // because GCC currently doesn't do a good job scheduling
56 // instructions based on these constraints.
57
58 // Get red and blue pixels into lower byte of each word.
59 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
60 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
61
62 // Multiply by scale.
63 // (4 x (0, rs.h, 0, bs.h))
64 // where rs.h stands for the higher byte of r * scale, and
65 // bs.h the higher byte of b * scale.
66 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
67
68 // Get alpha and green pixels into higher byte of each word.
69 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
70 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
71
72 // Multiply by scale.
73 // (4 x (as.h, as.l, gs.h, gs.l))
74 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
75
76 // Clear the lower byte of the a*scale and g*scale results
77 // (4 x (as.h, 0, gs.h, 0))
78 src_ag = _mm_and_si128(src_ag, ag_mask);
79
80 // Operations the destination pixels are the same as on the
81 // source pixels. See the comments above.
82 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
83 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
84 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
85 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
86 dst_ag = _mm_and_si128(dst_ag, ag_mask);
87
88 // Combine back into RGBA.
89 // (4 x (as.h, rs.h, gs.h, bs.h))
90 src_pixel = _mm_or_si128(src_rb, src_ag);
91 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
92 49
93 // Add result 50 // Add result
94 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 51 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
95 _mm_store_si128(d, result); 52 _mm_store_si128(d, result);
96 s++; 53 s++;
97 d++; 54 d++;
98 count -= 4; 55 count -= 4;
99 } 56 }
100 src = reinterpret_cast<const SkPMColor*>(s); 57 src = reinterpret_cast<const SkPMColor*>(s);
101 dst = reinterpret_cast<SkPMColor*>(d); 58 dst = reinterpret_cast<SkPMColor*>(d);
(...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after
243 } 200 }
244 201
245 if (count >= 4) { 202 if (count >= 4) {
246 while (((size_t)dst & 0x0F) != 0) { 203 while (((size_t)dst & 0x0F) != 0) {
247 *dst = SkBlendARGB32(*src, *dst, alpha); 204 *dst = SkBlendARGB32(*src, *dst, alpha);
248 src++; 205 src++;
249 dst++; 206 dst++;
250 count--; 207 count--;
251 } 208 }
252 209
253 uint32_t src_scale = SkAlpha255To256(alpha);
254
255 const __m128i *s = reinterpret_cast<const __m128i*>(src); 210 const __m128i *s = reinterpret_cast<const __m128i*>(src);
256 __m128i *d = reinterpret_cast<__m128i*>(dst); 211 __m128i *d = reinterpret_cast<__m128i*>(dst);
257 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
258 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
259 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
260 while (count >= 4) { 212 while (count >= 4) {
261 // Load 4 pixels each of src and dest. 213 // Load 4 pixels each of src and dest.
262 __m128i src_pixel = _mm_loadu_si128(s); 214 __m128i src_pixel = _mm_loadu_si128(s);
263 __m128i dst_pixel = _mm_load_si128(d); 215 __m128i dst_pixel = _mm_load_si128(d);
264 216
265 // Get red and blue pixels into lower byte of each word. 217 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
266 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
267 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
268
269 // Get alpha and green into lower byte of each word.
270 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
271 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
272
273 // Put per-pixel alpha in low byte of each word.
274 // After the following two statements, the dst_alpha looks like
275 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
276 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
277 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
278
279 // dst_alpha = dst_alpha * src_scale
280 // Because src_scales are in the higher byte of each word and
281 // we use mulhi here, the resulting alpha values are already
282 // in the right place and don't need to be divided by 256.
283 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
284 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
285
286 // Subtract alphas from 256, to get 1..256
287 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
288
289 // Multiply red and blue by dst pixel alpha.
290 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
291 // Multiply alpha and green by dst pixel alpha.
292 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
293
294 // Multiply red and blue by global alpha.
295 // (4 x (0, rs.h, 0, bs.h))
296 // where rs.h stands for the higher byte of r * src_scale,
297 // and bs.h the higher byte of b * src_scale.
298 // Again, because we use mulhi, the resuling red and blue
299 // values are already in the right place and don't need to
300 // be divided by 256.
301 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
302 // Multiply alpha and green by global alpha.
303 // (4 x (0, as.h, 0, gs.h))
304 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
305
306 // Divide by 256.
307 dst_rb = _mm_srli_epi16(dst_rb, 8);
308
309 // Mask out low bits (goodies already in the right place; no need to divide)
310 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
311 // Shift alpha and green to higher byte of each word.
312 // (4 x (as.h, 0, gs.h, 0))
313 src_ag = _mm_slli_epi16(src_ag, 8);
314
315 // Combine back into RGBA.
316 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
317 src_pixel = _mm_or_si128(src_rb, src_ag);
318
319 // Add two pixels into result.
320 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
321 _mm_store_si128(d, result); 218 _mm_store_si128(d, result);
322 s++; 219 s++;
323 d++; 220 d++;
324 count -= 4; 221 count -= 4;
325 } 222 }
326 src = reinterpret_cast<const SkPMColor*>(s); 223 src = reinterpret_cast<const SkPMColor*>(s);
327 dst = reinterpret_cast<SkPMColor*>(d); 224 dst = reinterpret_cast<SkPMColor*>(d);
328 } 225 }
329 226
330 while (count > 0) { 227 while (count > 0) {
(...skipping 30 matching lines...) Expand all
361 SkASSERT(((size_t)dst & 0x03) == 0); 258 SkASSERT(((size_t)dst & 0x03) == 0);
362 while (((size_t)dst & 0x0F) != 0) { 259 while (((size_t)dst & 0x0F) != 0) {
363 *dst = color + SkAlphaMulQ(*src, scale); 260 *dst = color + SkAlphaMulQ(*src, scale);
364 src++; 261 src++;
365 dst++; 262 dst++;
366 count--; 263 count--;
367 } 264 }
368 265
369 const __m128i *s = reinterpret_cast<const __m128i*>(src); 266 const __m128i *s = reinterpret_cast<const __m128i*>(src);
370 __m128i *d = reinterpret_cast<__m128i*>(dst); 267 __m128i *d = reinterpret_cast<__m128i*>(dst);
371 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
372 __m128i src_scale_wide = _mm_set1_epi16(scale);
373 __m128i color_wide = _mm_set1_epi32(color); 268 __m128i color_wide = _mm_set1_epi32(color);
374 while (count >= 4) { 269 while (count >= 4) {
375 // Load 4 pixels each of src and dest.
376 __m128i src_pixel = _mm_loadu_si128(s); 270 __m128i src_pixel = _mm_loadu_si128(s);
271 src_pixel = SkAlphaMulQ_SSE2(src_pixel, scale);
377 272
378 // Get red and blue pixels into lower byte of each word.
379 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
380
381 // Get alpha and green into lower byte of each word.
382 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
383
384 // Multiply by scale.
385 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
386 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
387
388 // Divide by 256.
389 src_rb = _mm_srli_epi16(src_rb, 8);
390 src_ag = _mm_andnot_si128(rb_mask, src_ag);
391
392 // Combine back into RGBA.
393 src_pixel = _mm_or_si128(src_rb, src_ag);
394
395 // Add color to result.
396 __m128i result = _mm_add_epi8(color_wide, src_pixel); 273 __m128i result = _mm_add_epi8(color_wide, src_pixel);
397
398 // Store result.
399 _mm_store_si128(d, result); 274 _mm_store_si128(d, result);
400 s++; 275 s++;
401 d++; 276 d++;
402 count -= 4; 277 count -= 4;
403 } 278 }
404 src = reinterpret_cast<const SkPMColor*>(s); 279 src = reinterpret_cast<const SkPMColor*>(s);
405 dst = reinterpret_cast<SkPMColor*>(d); 280 dst = reinterpret_cast<SkPMColor*>(d);
406 } 281 }
407 282
408 while (count > 0) { 283 while (count > 0) {
(...skipping 16 matching lines...) Expand all
425 do { 300 do {
426 int count = width; 301 int count = width;
427 if (count >= 4) { 302 if (count >= 4) {
428 while (((size_t)dst & 0x0F) != 0 && (count > 0)) { 303 while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
429 *dst = SkBlendARGB32(color, *dst, *mask); 304 *dst = SkBlendARGB32(color, *dst, *mask);
430 mask++; 305 mask++;
431 dst++; 306 dst++;
432 count--; 307 count--;
433 } 308 }
434 __m128i *d = reinterpret_cast<__m128i*>(dst); 309 __m128i *d = reinterpret_cast<__m128i*>(dst);
435 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
436 __m128i c_256 = _mm_set1_epi16(256);
437 __m128i c_1 = _mm_set1_epi16(1);
438 __m128i src_pixel = _mm_set1_epi32(color); 310 __m128i src_pixel = _mm_set1_epi32(color);
439 while (count >= 4) { 311 while (count >= 4) {
440 // Load 4 pixels each of src and dest. 312 // Load 4 dst pixels
441 __m128i dst_pixel = _mm_load_si128(d); 313 __m128i dst_pixel = _mm_load_si128(d);
442 314
443 //set the aphla value 315 // Set the alpha value
444 __m128i src_scale_wide = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask)); 316 __m128i alpha_wide = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask));
445 src_scale_wide = _mm_unpacklo_epi8(src_scale_wide, 317 alpha_wide = _mm_unpacklo_epi8(alpha_wide, _mm_setzero_si128());
446 _mm_setzero_si128()); 318 alpha_wide = _mm_unpacklo_epi16(alpha_wide, _mm_setzero_si128());
447 src_scale_wide = _mm_unpacklo_epi16(src_scale_wide, src_scale_wi de);
448 319
449 //call SkAlpha255To256() 320 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha_wide);
450 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
451
452 // Get red and blue pixels into lower byte of each word.
453 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
454 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
455
456 // Get alpha and green into lower byte of each word.
457 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
458 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
459
460 // Put per-pixel alpha in low byte of each word.
461 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
462 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
463
464 // dst_alpha = dst_alpha * src_scale
465 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
466
467 // Divide by 256.
468 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
469
470 // Subtract alphas from 256, to get 1..256
471 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
472 // Multiply red and blue by dst pixel alpha.
473 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
474 // Multiply alpha and green by dst pixel alpha.
475 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
476
477 // Multiply red and blue by global alpha.
478 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
479 // Multiply alpha and green by global alpha.
480 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
481 // Divide by 256.
482 dst_rb = _mm_srli_epi16(dst_rb, 8);
483 src_rb = _mm_srli_epi16(src_rb, 8);
484
485 // Mask out low bits (goodies already in the right place; no need to divide)
486 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
487 src_ag = _mm_andnot_si128(rb_mask, src_ag);
488
489 // Combine back into RGBA.
490 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
491 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
492
493 // Add two pixels into result.
494 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
495 _mm_store_si128(d, result); 321 _mm_store_si128(d, result);
496 // load the next 4 pixel 322 // Load the next 4 dst pixels and alphas
497 mask = mask + 4; 323 mask = mask + 4;
498 d++; 324 d++;
499 count -= 4; 325 count -= 4;
500 } 326 }
501 dst = reinterpret_cast<SkPMColor *>(d); 327 dst = reinterpret_cast<SkPMColor*>(d);
502 } 328 }
503 while (count > 0) { 329 while (count > 0) {
504 *dst= SkBlendARGB32(color, *dst, *mask); 330 *dst= SkBlendARGB32(color, *dst, *mask);
505 dst += 1; 331 dst += 1;
506 mask++; 332 mask++;
507 count --; 333 count --;
508 } 334 }
509 dst = (SkPMColor *)((char*)dst + dstOffset); 335 dst = (SkPMColor *)((char*)dst + dstOffset);
510 mask += maskOffset; 336 mask += maskOffset;
511 } while (--height != 0); 337 } while (--height != 0);
(...skipping 839 matching lines...) Expand 10 before | Expand all | Expand 10 after
1351 uint32_t dst_expanded = SkExpand_rgb_16(*dst); 1177 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1352 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 1178 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1353 // now src and dst expanded are in g:11 r:10 x:1 b:10 1179 // now src and dst expanded are in g:11 r:10 x:1 b:10
1354 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 1180 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1355 } 1181 }
1356 dst += 1; 1182 dst += 1;
1357 DITHER_INC_X(x); 1183 DITHER_INC_X(x);
1358 } while (--count != 0); 1184 } while (--count != 0);
1359 } 1185 }
1360 } 1186 }
OLDNEW
« no previous file with comments | « no previous file | src/opts/SkColor_opts_SSE2.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698