| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include <emmintrin.h> | 8 #include <emmintrin.h> |
| 9 #include "SkBitmapProcState_opts_SSE2.h" | 9 #include "SkBitmapProcState_opts_SSE2.h" |
| 10 #include "SkBlitRow_opts_SSE2.h" | 10 #include "SkBlitRow_opts_SSE2.h" |
| (...skipping 20 matching lines...) Expand all Loading... |
| 31 SkASSERT(((size_t)dst & 0x03) == 0); | 31 SkASSERT(((size_t)dst & 0x03) == 0); |
| 32 while (((size_t)dst & 0x0F) != 0) { | 32 while (((size_t)dst & 0x0F) != 0) { |
| 33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); | 33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); |
| 34 src++; | 34 src++; |
| 35 dst++; | 35 dst++; |
| 36 count--; | 36 count--; |
| 37 } | 37 } |
| 38 | 38 |
| 39 const __m128i *s = reinterpret_cast<const __m128i*>(src); | 39 const __m128i *s = reinterpret_cast<const __m128i*>(src); |
| 40 __m128i *d = reinterpret_cast<__m128i*>(dst); | 40 __m128i *d = reinterpret_cast<__m128i*>(dst); |
| 41 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); | |
| 42 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00); | |
| 43 | 41 |
| 44 // Move scale factors to upper byte of word | |
| 45 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); | |
| 46 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8); | |
| 47 while (count >= 4) { | 42 while (count >= 4) { |
| 48 // Load 4 pixels each of src and dest. | 43 // Load 4 pixels each of src and dest. |
| 49 __m128i src_pixel = _mm_loadu_si128(s); | 44 __m128i src_pixel = _mm_loadu_si128(s); |
| 50 __m128i dst_pixel = _mm_load_si128(d); | 45 __m128i dst_pixel = _mm_load_si128(d); |
| 51 | 46 |
| 52 // Interleave Atom port 0/1 operations based on the execution port | 47 src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale); |
| 53 // constraints that multiply can only be executed on port 0 (while | 48 dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale); |
| 54 // boolean operations can be executed on either port 0 or port 1) | |
| 55 // because GCC currently doesn't do a good job scheduling | |
| 56 // instructions based on these constraints. | |
| 57 | |
| 58 // Get red and blue pixels into lower byte of each word. | |
| 59 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b) | |
| 60 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); | |
| 61 | |
| 62 // Multiply by scale. | |
| 63 // (4 x (0, rs.h, 0, bs.h)) | |
| 64 // where rs.h stands for the higher byte of r * scale, and | |
| 65 // bs.h the higher byte of b * scale. | |
| 66 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); | |
| 67 | |
| 68 // Get alpha and green pixels into higher byte of each word. | |
| 69 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0) | |
| 70 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel); | |
| 71 | |
| 72 // Multiply by scale. | |
| 73 // (4 x (as.h, as.l, gs.h, gs.l)) | |
| 74 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); | |
| 75 | |
| 76 // Clear the lower byte of the a*scale and g*scale results | |
| 77 // (4 x (as.h, 0, gs.h, 0)) | |
| 78 src_ag = _mm_and_si128(src_ag, ag_mask); | |
| 79 | |
| 80 // Operations the destination pixels are the same as on the | |
| 81 // source pixels. See the comments above. | |
| 82 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); | |
| 83 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide); | |
| 84 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel); | |
| 85 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide); | |
| 86 dst_ag = _mm_and_si128(dst_ag, ag_mask); | |
| 87 | |
| 88 // Combine back into RGBA. | |
| 89 // (4 x (as.h, rs.h, gs.h, bs.h)) | |
| 90 src_pixel = _mm_or_si128(src_rb, src_ag); | |
| 91 dst_pixel = _mm_or_si128(dst_rb, dst_ag); | |
| 92 | 49 |
| 93 // Add result | 50 // Add result |
| 94 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); | 51 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); |
| 95 _mm_store_si128(d, result); | 52 _mm_store_si128(d, result); |
| 96 s++; | 53 s++; |
| 97 d++; | 54 d++; |
| 98 count -= 4; | 55 count -= 4; |
| 99 } | 56 } |
| 100 src = reinterpret_cast<const SkPMColor*>(s); | 57 src = reinterpret_cast<const SkPMColor*>(s); |
| 101 dst = reinterpret_cast<SkPMColor*>(d); | 58 dst = reinterpret_cast<SkPMColor*>(d); |
| (...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 243 } | 200 } |
| 244 | 201 |
| 245 if (count >= 4) { | 202 if (count >= 4) { |
| 246 while (((size_t)dst & 0x0F) != 0) { | 203 while (((size_t)dst & 0x0F) != 0) { |
| 247 *dst = SkBlendARGB32(*src, *dst, alpha); | 204 *dst = SkBlendARGB32(*src, *dst, alpha); |
| 248 src++; | 205 src++; |
| 249 dst++; | 206 dst++; |
| 250 count--; | 207 count--; |
| 251 } | 208 } |
| 252 | 209 |
| 253 uint32_t src_scale = SkAlpha255To256(alpha); | |
| 254 | |
| 255 const __m128i *s = reinterpret_cast<const __m128i*>(src); | 210 const __m128i *s = reinterpret_cast<const __m128i*>(src); |
| 256 __m128i *d = reinterpret_cast<__m128i*>(dst); | 211 __m128i *d = reinterpret_cast<__m128i*>(dst); |
| 257 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); | |
| 258 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); | |
| 259 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) | |
| 260 while (count >= 4) { | 212 while (count >= 4) { |
| 261 // Load 4 pixels each of src and dest. | 213 // Load 4 pixels each of src and dest. |
| 262 __m128i src_pixel = _mm_loadu_si128(s); | 214 __m128i src_pixel = _mm_loadu_si128(s); |
| 263 __m128i dst_pixel = _mm_load_si128(d); | 215 __m128i dst_pixel = _mm_load_si128(d); |
| 264 | 216 |
| 265 // Get red and blue pixels into lower byte of each word. | 217 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha); |
| 266 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); | |
| 267 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); | |
| 268 | |
| 269 // Get alpha and green into lower byte of each word. | |
| 270 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); | |
| 271 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); | |
| 272 | |
| 273 // Put per-pixel alpha in low byte of each word. | |
| 274 // After the following two statements, the dst_alpha looks like | |
| 275 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3) | |
| 276 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); | |
| 277 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); | |
| 278 | |
| 279 // dst_alpha = dst_alpha * src_scale | |
| 280 // Because src_scales are in the higher byte of each word and | |
| 281 // we use mulhi here, the resulting alpha values are already | |
| 282 // in the right place and don't need to be divided by 256. | |
| 283 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3) | |
| 284 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide); | |
| 285 | |
| 286 // Subtract alphas from 256, to get 1..256 | |
| 287 dst_alpha = _mm_sub_epi16(c_256, dst_alpha); | |
| 288 | |
| 289 // Multiply red and blue by dst pixel alpha. | |
| 290 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); | |
| 291 // Multiply alpha and green by dst pixel alpha. | |
| 292 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); | |
| 293 | |
| 294 // Multiply red and blue by global alpha. | |
| 295 // (4 x (0, rs.h, 0, bs.h)) | |
| 296 // where rs.h stands for the higher byte of r * src_scale, | |
| 297 // and bs.h the higher byte of b * src_scale. | |
| 298 // Again, because we use mulhi, the resuling red and blue | |
| 299 // values are already in the right place and don't need to | |
| 300 // be divided by 256. | |
| 301 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); | |
| 302 // Multiply alpha and green by global alpha. | |
| 303 // (4 x (0, as.h, 0, gs.h)) | |
| 304 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); | |
| 305 | |
| 306 // Divide by 256. | |
| 307 dst_rb = _mm_srli_epi16(dst_rb, 8); | |
| 308 | |
| 309 // Mask out low bits (goodies already in the right place; no need to divide) | |
| 310 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); | |
| 311 // Shift alpha and green to higher byte of each word. | |
| 312 // (4 x (as.h, 0, gs.h, 0)) | |
| 313 src_ag = _mm_slli_epi16(src_ag, 8); | |
| 314 | |
| 315 // Combine back into RGBA. | |
| 316 dst_pixel = _mm_or_si128(dst_rb, dst_ag); | |
| 317 src_pixel = _mm_or_si128(src_rb, src_ag); | |
| 318 | |
| 319 // Add two pixels into result. | |
| 320 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); | |
| 321 _mm_store_si128(d, result); | 218 _mm_store_si128(d, result); |
| 322 s++; | 219 s++; |
| 323 d++; | 220 d++; |
| 324 count -= 4; | 221 count -= 4; |
| 325 } | 222 } |
| 326 src = reinterpret_cast<const SkPMColor*>(s); | 223 src = reinterpret_cast<const SkPMColor*>(s); |
| 327 dst = reinterpret_cast<SkPMColor*>(d); | 224 dst = reinterpret_cast<SkPMColor*>(d); |
| 328 } | 225 } |
| 329 | 226 |
| 330 while (count > 0) { | 227 while (count > 0) { |
| (...skipping 30 matching lines...) Expand all Loading... |
| 361 SkASSERT(((size_t)dst & 0x03) == 0); | 258 SkASSERT(((size_t)dst & 0x03) == 0); |
| 362 while (((size_t)dst & 0x0F) != 0) { | 259 while (((size_t)dst & 0x0F) != 0) { |
| 363 *dst = color + SkAlphaMulQ(*src, scale); | 260 *dst = color + SkAlphaMulQ(*src, scale); |
| 364 src++; | 261 src++; |
| 365 dst++; | 262 dst++; |
| 366 count--; | 263 count--; |
| 367 } | 264 } |
| 368 | 265 |
| 369 const __m128i *s = reinterpret_cast<const __m128i*>(src); | 266 const __m128i *s = reinterpret_cast<const __m128i*>(src); |
| 370 __m128i *d = reinterpret_cast<__m128i*>(dst); | 267 __m128i *d = reinterpret_cast<__m128i*>(dst); |
| 371 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); | |
| 372 __m128i src_scale_wide = _mm_set1_epi16(scale); | |
| 373 __m128i color_wide = _mm_set1_epi32(color); | 268 __m128i color_wide = _mm_set1_epi32(color); |
| 374 while (count >= 4) { | 269 while (count >= 4) { |
| 375 // Load 4 pixels each of src and dest. | |
| 376 __m128i src_pixel = _mm_loadu_si128(s); | 270 __m128i src_pixel = _mm_loadu_si128(s); |
| 271 src_pixel = SkAlphaMulQ_SSE2(src_pixel, scale); |
| 377 | 272 |
| 378 // Get red and blue pixels into lower byte of each word. | |
| 379 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); | |
| 380 | |
| 381 // Get alpha and green into lower byte of each word. | |
| 382 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); | |
| 383 | |
| 384 // Multiply by scale. | |
| 385 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); | |
| 386 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); | |
| 387 | |
| 388 // Divide by 256. | |
| 389 src_rb = _mm_srli_epi16(src_rb, 8); | |
| 390 src_ag = _mm_andnot_si128(rb_mask, src_ag); | |
| 391 | |
| 392 // Combine back into RGBA. | |
| 393 src_pixel = _mm_or_si128(src_rb, src_ag); | |
| 394 | |
| 395 // Add color to result. | |
| 396 __m128i result = _mm_add_epi8(color_wide, src_pixel); | 273 __m128i result = _mm_add_epi8(color_wide, src_pixel); |
| 397 | |
| 398 // Store result. | |
| 399 _mm_store_si128(d, result); | 274 _mm_store_si128(d, result); |
| 400 s++; | 275 s++; |
| 401 d++; | 276 d++; |
| 402 count -= 4; | 277 count -= 4; |
| 403 } | 278 } |
| 404 src = reinterpret_cast<const SkPMColor*>(s); | 279 src = reinterpret_cast<const SkPMColor*>(s); |
| 405 dst = reinterpret_cast<SkPMColor*>(d); | 280 dst = reinterpret_cast<SkPMColor*>(d); |
| 406 } | 281 } |
| 407 | 282 |
| 408 while (count > 0) { | 283 while (count > 0) { |
| (...skipping 16 matching lines...) Expand all Loading... |
| 425 do { | 300 do { |
| 426 int count = width; | 301 int count = width; |
| 427 if (count >= 4) { | 302 if (count >= 4) { |
| 428 while (((size_t)dst & 0x0F) != 0 && (count > 0)) { | 303 while (((size_t)dst & 0x0F) != 0 && (count > 0)) { |
| 429 *dst = SkBlendARGB32(color, *dst, *mask); | 304 *dst = SkBlendARGB32(color, *dst, *mask); |
| 430 mask++; | 305 mask++; |
| 431 dst++; | 306 dst++; |
| 432 count--; | 307 count--; |
| 433 } | 308 } |
| 434 __m128i *d = reinterpret_cast<__m128i*>(dst); | 309 __m128i *d = reinterpret_cast<__m128i*>(dst); |
| 435 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); | |
| 436 __m128i c_256 = _mm_set1_epi16(256); | |
| 437 __m128i c_1 = _mm_set1_epi16(1); | |
| 438 __m128i src_pixel = _mm_set1_epi32(color); | 310 __m128i src_pixel = _mm_set1_epi32(color); |
| 439 while (count >= 4) { | 311 while (count >= 4) { |
| 440 // Load 4 pixels each of src and dest. | 312 // Load 4 dst pixels |
| 441 __m128i dst_pixel = _mm_load_si128(d); | 313 __m128i dst_pixel = _mm_load_si128(d); |
| 442 | 314 |
| 443 //set the aphla value | 315 // Set the alpha value |
| 444 __m128i src_scale_wide = _mm_cvtsi32_si128(*reinterpret_cast<con
st uint32_t*>(mask)); | 316 __m128i alpha_wide = _mm_cvtsi32_si128(*reinterpret_cast<const u
int32_t*>(mask)); |
| 445 src_scale_wide = _mm_unpacklo_epi8(src_scale_wide, | 317 alpha_wide = _mm_unpacklo_epi8(alpha_wide, _mm_setzero_si128()); |
| 446 _mm_setzero_si128()); | 318 alpha_wide = _mm_unpacklo_epi16(alpha_wide, _mm_setzero_si128())
; |
| 447 src_scale_wide = _mm_unpacklo_epi16(src_scale_wide, src_scale_wi
de); | |
| 448 | 319 |
| 449 //call SkAlpha255To256() | 320 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha_
wide); |
| 450 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1); | |
| 451 | |
| 452 // Get red and blue pixels into lower byte of each word. | |
| 453 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); | |
| 454 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); | |
| 455 | |
| 456 // Get alpha and green into lower byte of each word. | |
| 457 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); | |
| 458 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); | |
| 459 | |
| 460 // Put per-pixel alpha in low byte of each word. | |
| 461 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); | |
| 462 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); | |
| 463 | |
| 464 // dst_alpha = dst_alpha * src_scale | |
| 465 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); | |
| 466 | |
| 467 // Divide by 256. | |
| 468 dst_alpha = _mm_srli_epi16(dst_alpha, 8); | |
| 469 | |
| 470 // Subtract alphas from 256, to get 1..256 | |
| 471 dst_alpha = _mm_sub_epi16(c_256, dst_alpha); | |
| 472 // Multiply red and blue by dst pixel alpha. | |
| 473 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); | |
| 474 // Multiply alpha and green by dst pixel alpha. | |
| 475 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); | |
| 476 | |
| 477 // Multiply red and blue by global alpha. | |
| 478 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); | |
| 479 // Multiply alpha and green by global alpha. | |
| 480 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); | |
| 481 // Divide by 256. | |
| 482 dst_rb = _mm_srli_epi16(dst_rb, 8); | |
| 483 src_rb = _mm_srli_epi16(src_rb, 8); | |
| 484 | |
| 485 // Mask out low bits (goodies already in the right place; no need to divide) | |
| 486 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); | |
| 487 src_ag = _mm_andnot_si128(rb_mask, src_ag); | |
| 488 | |
| 489 // Combine back into RGBA. | |
| 490 dst_pixel = _mm_or_si128(dst_rb, dst_ag); | |
| 491 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag); | |
| 492 | |
| 493 // Add two pixels into result. | |
| 494 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel); | |
| 495 _mm_store_si128(d, result); | 321 _mm_store_si128(d, result); |
| 496 // load the next 4 pixel | 322 // Load the next 4 dst pixels and alphas |
| 497 mask = mask + 4; | 323 mask = mask + 4; |
| 498 d++; | 324 d++; |
| 499 count -= 4; | 325 count -= 4; |
| 500 } | 326 } |
| 501 dst = reinterpret_cast<SkPMColor *>(d); | 327 dst = reinterpret_cast<SkPMColor*>(d); |
| 502 } | 328 } |
| 503 while (count > 0) { | 329 while (count > 0) { |
| 504 *dst= SkBlendARGB32(color, *dst, *mask); | 330 *dst= SkBlendARGB32(color, *dst, *mask); |
| 505 dst += 1; | 331 dst += 1; |
| 506 mask++; | 332 mask++; |
| 507 count --; | 333 count --; |
| 508 } | 334 } |
| 509 dst = (SkPMColor *)((char*)dst + dstOffset); | 335 dst = (SkPMColor *)((char*)dst + dstOffset); |
| 510 mask += maskOffset; | 336 mask += maskOffset; |
| 511 } while (--height != 0); | 337 } while (--height != 0); |
| (...skipping 839 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1351 uint32_t dst_expanded = SkExpand_rgb_16(*dst); | 1177 uint32_t dst_expanded = SkExpand_rgb_16(*dst); |
| 1352 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); | 1178 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); |
| 1353 // now src and dst expanded are in g:11 r:10 x:1 b:10 | 1179 // now src and dst expanded are in g:11 r:10 x:1 b:10 |
| 1354 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); | 1180 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); |
| 1355 } | 1181 } |
| 1356 dst += 1; | 1182 dst += 1; |
| 1357 DITHER_INC_X(x); | 1183 DITHER_INC_X(x); |
| 1358 } while (--count != 0); | 1184 } while (--count != 0); |
| 1359 } | 1185 } |
| 1360 } | 1186 } |
| OLD | NEW |