OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include <emmintrin.h> | 8 #include <emmintrin.h> |
9 #include "SkBitmapProcState_opts_SSE2.h" | 9 #include "SkBitmapProcState_opts_SSE2.h" |
10 #include "SkBlitRow_opts_SSE2.h" | 10 #include "SkBlitRow_opts_SSE2.h" |
(...skipping 20 matching lines...) Expand all Loading... |
31 SkASSERT(((size_t)dst & 0x03) == 0); | 31 SkASSERT(((size_t)dst & 0x03) == 0); |
32 while (((size_t)dst & 0x0F) != 0) { | 32 while (((size_t)dst & 0x0F) != 0) { |
33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); | 33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); |
34 src++; | 34 src++; |
35 dst++; | 35 dst++; |
36 count--; | 36 count--; |
37 } | 37 } |
38 | 38 |
39 const __m128i *s = reinterpret_cast<const __m128i*>(src); | 39 const __m128i *s = reinterpret_cast<const __m128i*>(src); |
40 __m128i *d = reinterpret_cast<__m128i*>(dst); | 40 __m128i *d = reinterpret_cast<__m128i*>(dst); |
41 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); | |
42 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00); | |
43 | 41 |
44 // Move scale factors to upper byte of word | |
45 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); | |
46 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8); | |
47 while (count >= 4) { | 42 while (count >= 4) { |
48 // Load 4 pixels each of src and dest. | 43 // Load 4 pixels each of src and dest. |
49 __m128i src_pixel = _mm_loadu_si128(s); | 44 __m128i src_pixel = _mm_loadu_si128(s); |
50 __m128i dst_pixel = _mm_load_si128(d); | 45 __m128i dst_pixel = _mm_load_si128(d); |
51 | 46 |
52 // Interleave Atom port 0/1 operations based on the execution port | 47 src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale); |
53 // constraints that multiply can only be executed on port 0 (while | 48 dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale); |
54 // boolean operations can be executed on either port 0 or port 1) | |
55 // because GCC currently doesn't do a good job scheduling | |
56 // instructions based on these constraints. | |
57 | |
58 // Get red and blue pixels into lower byte of each word. | |
59 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b) | |
60 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); | |
61 | |
62 // Multiply by scale. | |
63 // (4 x (0, rs.h, 0, bs.h)) | |
64 // where rs.h stands for the higher byte of r * scale, and | |
65 // bs.h the higher byte of b * scale. | |
66 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); | |
67 | |
68 // Get alpha and green pixels into higher byte of each word. | |
69 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0) | |
70 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel); | |
71 | |
72 // Multiply by scale. | |
73 // (4 x (as.h, as.l, gs.h, gs.l)) | |
74 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); | |
75 | |
76 // Clear the lower byte of the a*scale and g*scale results | |
77 // (4 x (as.h, 0, gs.h, 0)) | |
78 src_ag = _mm_and_si128(src_ag, ag_mask); | |
79 | |
80 // Operations on the destination pixels are the same as on the | |
81 // source pixels. See the comments above. | |
82 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); | |
83 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide); | |
84 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel); | |
85 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide); | |
86 dst_ag = _mm_and_si128(dst_ag, ag_mask); | |
87 | |
88 // Combine back into RGBA. | |
89 // (4 x (as.h, rs.h, gs.h, bs.h)) | |
90 src_pixel = _mm_or_si128(src_rb, src_ag); | |
91 dst_pixel = _mm_or_si128(dst_rb, dst_ag); | |
92 | 49 |
93 // Add result | 50 // Add result |
94 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); | 51 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); |
95 _mm_store_si128(d, result); | 52 _mm_store_si128(d, result); |
96 s++; | 53 s++; |
97 d++; | 54 d++; |
98 count -= 4; | 55 count -= 4; |
99 } | 56 } |
100 src = reinterpret_cast<const SkPMColor*>(s); | 57 src = reinterpret_cast<const SkPMColor*>(s); |
101 dst = reinterpret_cast<SkPMColor*>(d); | 58 dst = reinterpret_cast<SkPMColor*>(d); |
(...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
243 } | 200 } |
244 | 201 |
245 if (count >= 4) { | 202 if (count >= 4) { |
246 while (((size_t)dst & 0x0F) != 0) { | 203 while (((size_t)dst & 0x0F) != 0) { |
247 *dst = SkBlendARGB32(*src, *dst, alpha); | 204 *dst = SkBlendARGB32(*src, *dst, alpha); |
248 src++; | 205 src++; |
249 dst++; | 206 dst++; |
250 count--; | 207 count--; |
251 } | 208 } |
252 | 209 |
253 uint32_t src_scale = SkAlpha255To256(alpha); | |
254 | |
255 const __m128i *s = reinterpret_cast<const __m128i*>(src); | 210 const __m128i *s = reinterpret_cast<const __m128i*>(src); |
256 __m128i *d = reinterpret_cast<__m128i*>(dst); | 211 __m128i *d = reinterpret_cast<__m128i*>(dst); |
257 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); | |
258 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); | |
259 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) | |
260 while (count >= 4) { | 212 while (count >= 4) { |
261 // Load 4 pixels each of src and dest. | 213 // Load 4 pixels each of src and dest. |
262 __m128i src_pixel = _mm_loadu_si128(s); | 214 __m128i src_pixel = _mm_loadu_si128(s); |
263 __m128i dst_pixel = _mm_load_si128(d); | 215 __m128i dst_pixel = _mm_load_si128(d); |
264 | 216 |
265 // Get red and blue pixels into lower byte of each word. | 217 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha); |
266 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); | |
267 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); | |
268 | |
269 // Get alpha and green into lower byte of each word. | |
270 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); | |
271 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); | |
272 | |
273 // Put per-pixel alpha in low byte of each word. | |
274 // After the following two statements, the dst_alpha looks like | |
275 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3) | |
276 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); | |
277 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); | |
278 | |
279 // dst_alpha = dst_alpha * src_scale | |
280 // Because src_scales are in the higher byte of each word and | |
281 // we use mulhi here, the resulting alpha values are already | |
282 // in the right place and don't need to be divided by 256. | |
283 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3) | |
284 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide); | |
285 | |
286 // Subtract alphas from 256, to get 1..256 | |
287 dst_alpha = _mm_sub_epi16(c_256, dst_alpha); | |
288 | |
289 // Multiply red and blue by dst pixel alpha. | |
290 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); | |
291 // Multiply alpha and green by dst pixel alpha. | |
292 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); | |
293 | |
294 // Multiply red and blue by global alpha. | |
295 // (4 x (0, rs.h, 0, bs.h)) | |
296 // where rs.h stands for the higher byte of r * src_scale, | |
297 // and bs.h the higher byte of b * src_scale. | |
298 // Again, because we use mulhi, the resulting red and blue | |
299 // values are already in the right place and don't need to | |
300 // be divided by 256. | |
301 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); | |
302 // Multiply alpha and green by global alpha. | |
303 // (4 x (0, as.h, 0, gs.h)) | |
304 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); | |
305 | |
306 // Divide by 256. | |
307 dst_rb = _mm_srli_epi16(dst_rb, 8); | |
308 | |
309 // Mask out low bits (goodies already in the right place; no need to
divide) | |
310 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); | |
311 // Shift alpha and green to higher byte of each word. | |
312 // (4 x (as.h, 0, gs.h, 0)) | |
313 src_ag = _mm_slli_epi16(src_ag, 8); | |
314 | |
315 // Combine back into RGBA. | |
316 dst_pixel = _mm_or_si128(dst_rb, dst_ag); | |
317 src_pixel = _mm_or_si128(src_rb, src_ag); | |
318 | |
319 // Add two pixels into result. | |
320 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); | |
321 _mm_store_si128(d, result); | 218 _mm_store_si128(d, result); |
322 s++; | 219 s++; |
323 d++; | 220 d++; |
324 count -= 4; | 221 count -= 4; |
325 } | 222 } |
326 src = reinterpret_cast<const SkPMColor*>(s); | 223 src = reinterpret_cast<const SkPMColor*>(s); |
327 dst = reinterpret_cast<SkPMColor*>(d); | 224 dst = reinterpret_cast<SkPMColor*>(d); |
328 } | 225 } |
329 | 226 |
330 while (count > 0) { | 227 while (count > 0) { |
(...skipping 30 matching lines...) Expand all Loading... |
361 SkASSERT(((size_t)dst & 0x03) == 0); | 258 SkASSERT(((size_t)dst & 0x03) == 0); |
362 while (((size_t)dst & 0x0F) != 0) { | 259 while (((size_t)dst & 0x0F) != 0) { |
363 *dst = color + SkAlphaMulQ(*src, scale); | 260 *dst = color + SkAlphaMulQ(*src, scale); |
364 src++; | 261 src++; |
365 dst++; | 262 dst++; |
366 count--; | 263 count--; |
367 } | 264 } |
368 | 265 |
369 const __m128i *s = reinterpret_cast<const __m128i*>(src); | 266 const __m128i *s = reinterpret_cast<const __m128i*>(src); |
370 __m128i *d = reinterpret_cast<__m128i*>(dst); | 267 __m128i *d = reinterpret_cast<__m128i*>(dst); |
371 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); | |
372 __m128i src_scale_wide = _mm_set1_epi16(scale); | |
373 __m128i color_wide = _mm_set1_epi32(color); | 268 __m128i color_wide = _mm_set1_epi32(color); |
374 while (count >= 4) { | 269 while (count >= 4) { |
375 // Load 4 pixels each of src and dest. | |
376 __m128i src_pixel = _mm_loadu_si128(s); | 270 __m128i src_pixel = _mm_loadu_si128(s); |
| 271 src_pixel = SkAlphaMulQ_SSE2(src_pixel, scale); |
377 | 272 |
378 // Get red and blue pixels into lower byte of each word. | |
379 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); | |
380 | |
381 // Get alpha and green into lower byte of each word. | |
382 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); | |
383 | |
384 // Multiply by scale. | |
385 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); | |
386 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); | |
387 | |
388 // Divide by 256. | |
389 src_rb = _mm_srli_epi16(src_rb, 8); | |
390 src_ag = _mm_andnot_si128(rb_mask, src_ag); | |
391 | |
392 // Combine back into RGBA. | |
393 src_pixel = _mm_or_si128(src_rb, src_ag); | |
394 | |
395 // Add color to result. | |
396 __m128i result = _mm_add_epi8(color_wide, src_pixel); | 273 __m128i result = _mm_add_epi8(color_wide, src_pixel); |
397 | |
398 // Store result. | |
399 _mm_store_si128(d, result); | 274 _mm_store_si128(d, result); |
400 s++; | 275 s++; |
401 d++; | 276 d++; |
402 count -= 4; | 277 count -= 4; |
403 } | 278 } |
404 src = reinterpret_cast<const SkPMColor*>(s); | 279 src = reinterpret_cast<const SkPMColor*>(s); |
405 dst = reinterpret_cast<SkPMColor*>(d); | 280 dst = reinterpret_cast<SkPMColor*>(d); |
406 } | 281 } |
407 | 282 |
408 while (count > 0) { | 283 while (count > 0) { |
(...skipping 16 matching lines...) Expand all Loading... |
425 do { | 300 do { |
426 int count = width; | 301 int count = width; |
427 if (count >= 4) { | 302 if (count >= 4) { |
428 while (((size_t)dst & 0x0F) != 0 && (count > 0)) { | 303 while (((size_t)dst & 0x0F) != 0 && (count > 0)) { |
429 *dst = SkBlendARGB32(color, *dst, *mask); | 304 *dst = SkBlendARGB32(color, *dst, *mask); |
430 mask++; | 305 mask++; |
431 dst++; | 306 dst++; |
432 count--; | 307 count--; |
433 } | 308 } |
434 __m128i *d = reinterpret_cast<__m128i*>(dst); | 309 __m128i *d = reinterpret_cast<__m128i*>(dst); |
435 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); | |
436 __m128i c_256 = _mm_set1_epi16(256); | |
437 __m128i c_1 = _mm_set1_epi16(1); | |
438 __m128i src_pixel = _mm_set1_epi32(color); | 310 __m128i src_pixel = _mm_set1_epi32(color); |
439 while (count >= 4) { | 311 while (count >= 4) { |
440 // Load 4 pixels each of src and dest. | 312 // Load 4 dst pixels |
441 __m128i dst_pixel = _mm_load_si128(d); | 313 __m128i dst_pixel = _mm_load_si128(d); |
442 | 314 |
443 //set the aphla value | 315 // Set the alpha value |
444 __m128i src_scale_wide = _mm_cvtsi32_si128(*reinterpret_cast<con
st uint32_t*>(mask)); | 316 __m128i alpha_wide = _mm_cvtsi32_si128(*reinterpret_cast<const u
int32_t*>(mask)); |
445 src_scale_wide = _mm_unpacklo_epi8(src_scale_wide, | 317 alpha_wide = _mm_unpacklo_epi8(alpha_wide, _mm_setzero_si128()); |
446 _mm_setzero_si128()); | 318 alpha_wide = _mm_unpacklo_epi16(alpha_wide, _mm_setzero_si128())
; |
447 src_scale_wide = _mm_unpacklo_epi16(src_scale_wide, src_scale_wi
de); | |
448 | 319 |
449 //call SkAlpha255To256() | 320 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha_
wide); |
450 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1); | |
451 | |
452 // Get red and blue pixels into lower byte of each word. | |
453 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); | |
454 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); | |
455 | |
456 // Get alpha and green into lower byte of each word. | |
457 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); | |
458 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); | |
459 | |
460 // Put per-pixel alpha in low byte of each word. | |
461 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); | |
462 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); | |
463 | |
464 // dst_alpha = dst_alpha * src_scale | |
465 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); | |
466 | |
467 // Divide by 256. | |
468 dst_alpha = _mm_srli_epi16(dst_alpha, 8); | |
469 | |
470 // Subtract alphas from 256, to get 1..256 | |
471 dst_alpha = _mm_sub_epi16(c_256, dst_alpha); | |
472 // Multiply red and blue by dst pixel alpha. | |
473 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); | |
474 // Multiply alpha and green by dst pixel alpha. | |
475 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); | |
476 | |
477 // Multiply red and blue by global alpha. | |
478 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); | |
479 // Multiply alpha and green by global alpha. | |
480 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); | |
481 // Divide by 256. | |
482 dst_rb = _mm_srli_epi16(dst_rb, 8); | |
483 src_rb = _mm_srli_epi16(src_rb, 8); | |
484 | |
485 // Mask out low bits (goodies already in the right place; no nee
d to divide) | |
486 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); | |
487 src_ag = _mm_andnot_si128(rb_mask, src_ag); | |
488 | |
489 // Combine back into RGBA. | |
490 dst_pixel = _mm_or_si128(dst_rb, dst_ag); | |
491 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag); | |
492 | |
493 // Add two pixels into result. | |
494 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel); | |
495 _mm_store_si128(d, result); | 321 _mm_store_si128(d, result); |
496 // load the next 4 pixel | 322 // Load the next 4 dst pixels and alphas |
497 mask = mask + 4; | 323 mask = mask + 4; |
498 d++; | 324 d++; |
499 count -= 4; | 325 count -= 4; |
500 } | 326 } |
501 dst = reinterpret_cast<SkPMColor *>(d); | 327 dst = reinterpret_cast<SkPMColor*>(d); |
502 } | 328 } |
503 while (count > 0) { | 329 while (count > 0) { |
504 *dst= SkBlendARGB32(color, *dst, *mask); | 330 *dst= SkBlendARGB32(color, *dst, *mask); |
505 dst += 1; | 331 dst += 1; |
506 mask++; | 332 mask++; |
507 count --; | 333 count --; |
508 } | 334 } |
509 dst = (SkPMColor *)((char*)dst + dstOffset); | 335 dst = (SkPMColor *)((char*)dst + dstOffset); |
510 mask += maskOffset; | 336 mask += maskOffset; |
511 } while (--height != 0); | 337 } while (--height != 0); |
(...skipping 839 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1351 uint32_t dst_expanded = SkExpand_rgb_16(*dst); | 1177 uint32_t dst_expanded = SkExpand_rgb_16(*dst); |
1352 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); | 1178 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); |
1353 // now src and dst expanded are in g:11 r:10 x:1 b:10 | 1179 // now src and dst expanded are in g:11 r:10 x:1 b:10 |
1354 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); | 1180 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); |
1355 } | 1181 } |
1356 dst += 1; | 1182 dst += 1; |
1357 DITHER_INC_X(x); | 1183 DITHER_INC_X(x); |
1358 } while (--count != 0); | 1184 } while (--count != 0); |
1359 } | 1185 } |
1360 } | 1186 } |
OLD | NEW |