| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include <emmintrin.h> | 8 #include <emmintrin.h> |
| 9 #include "SkBitmapProcState_opts_SSE2.h" | 9 #include "SkBitmapProcState_opts_SSE2.h" |
| 10 #include "SkBlitRow_opts_SSE2.h" | 10 #include "SkBlitRow_opts_SSE2.h" |
| (...skipping 20 matching lines...) Expand all Loading... |
| 31 SkASSERT(((size_t)dst & 0x03) == 0); | 31 SkASSERT(((size_t)dst & 0x03) == 0); |
| 32 while (((size_t)dst & 0x0F) != 0) { | 32 while (((size_t)dst & 0x0F) != 0) { |
| 33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); | 33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); |
| 34 src++; | 34 src++; |
| 35 dst++; | 35 dst++; |
| 36 count--; | 36 count--; |
| 37 } | 37 } |
| 38 | 38 |
| 39 const __m128i *s = reinterpret_cast<const __m128i*>(src); | 39 const __m128i *s = reinterpret_cast<const __m128i*>(src); |
| 40 __m128i *d = reinterpret_cast<__m128i*>(dst); | 40 __m128i *d = reinterpret_cast<__m128i*>(dst); |
| 41 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); | |
| 42 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00); | |
| 43 | 41 |
| 44 // Move scale factors to upper byte of word | |
| 45 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); | |
| 46 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8); | |
| 47 while (count >= 4) { | 42 while (count >= 4) { |
| 48 // Load 4 pixels each of src and dest. | 43 // Load 4 pixels each of src and dest. |
| 49 __m128i src_pixel = _mm_loadu_si128(s); | 44 __m128i src_pixel = _mm_loadu_si128(s); |
| 50 __m128i dst_pixel = _mm_load_si128(d); | 45 __m128i dst_pixel = _mm_load_si128(d); |
| 51 | 46 |
| 52 // Interleave Atom port 0/1 operations based on the execution port | 47 src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale); |
| 53 // constraints that multiply can only be executed on port 0 (while | 48 dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale); |
| 54 // boolean operations can be executed on either port 0 or port 1) | |
| 55 // because GCC currently doesn't do a good job scheduling | |
| 56 // instructions based on these constraints. | |
| 57 | |
| 58 // Get red and blue pixels into lower byte of each word. | |
| 59 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b) | |
| 60 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); | |
| 61 | |
| 62 // Multiply by scale. | |
| 63 // (4 x (0, rs.h, 0, bs.h)) | |
| 64 // where rs.h stands for the higher byte of r * scale, and | |
| 65 // bs.h the higher byte of b * scale. | |
| 66 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); | |
| 67 | |
| 68 // Get alpha and green pixels into higher byte of each word. | |
| 69 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0) | |
| 70 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel); | |
| 71 | |
| 72 // Multiply by scale. | |
| 73 // (4 x (as.h, as.l, gs.h, gs.l)) | |
| 74 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); | |
| 75 | |
| 76 // Clear the lower byte of the a*scale and g*scale results | |
| 77 // (4 x (as.h, 0, gs.h, 0)) | |
| 78 src_ag = _mm_and_si128(src_ag, ag_mask); | |
| 79 | |
| 80 // Operations on the destination pixels are the same as on the | |
| 81 // source pixels. See the comments above. | |
| 82 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); | |
| 83 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide); | |
| 84 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel); | |
| 85 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide); | |
| 86 dst_ag = _mm_and_si128(dst_ag, ag_mask); | |
| 87 | |
| 88 // Combine back into RGBA. | |
| 89 // (4 x (as.h, rs.h, gs.h, bs.h)) | |
| 90 src_pixel = _mm_or_si128(src_rb, src_ag); | |
| 91 dst_pixel = _mm_or_si128(dst_rb, dst_ag); | |
| 92 | 49 |
| 93 // Add result | 50 // Add result |
| 94 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); | 51 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); |
| 95 _mm_store_si128(d, result); | 52 _mm_store_si128(d, result); |
| 96 s++; | 53 s++; |
| 97 d++; | 54 d++; |
| 98 count -= 4; | 55 count -= 4; |
| 99 } | 56 } |
| 100 src = reinterpret_cast<const SkPMColor*>(s); | 57 src = reinterpret_cast<const SkPMColor*>(s); |
| 101 dst = reinterpret_cast<SkPMColor*>(d); | 58 dst = reinterpret_cast<SkPMColor*>(d); |
| (...skipping 259 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 361 SkASSERT(((size_t)dst & 0x03) == 0); | 318 SkASSERT(((size_t)dst & 0x03) == 0); |
| 362 while (((size_t)dst & 0x0F) != 0) { | 319 while (((size_t)dst & 0x0F) != 0) { |
| 363 *dst = color + SkAlphaMulQ(*src, scale); | 320 *dst = color + SkAlphaMulQ(*src, scale); |
| 364 src++; | 321 src++; |
| 365 dst++; | 322 dst++; |
| 366 count--; | 323 count--; |
| 367 } | 324 } |
| 368 | 325 |
| 369 const __m128i *s = reinterpret_cast<const __m128i*>(src); | 326 const __m128i *s = reinterpret_cast<const __m128i*>(src); |
| 370 __m128i *d = reinterpret_cast<__m128i*>(dst); | 327 __m128i *d = reinterpret_cast<__m128i*>(dst); |
| 371 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); | |
| 372 __m128i src_scale_wide = _mm_set1_epi16(scale); | |
| 373 __m128i color_wide = _mm_set1_epi32(color); | 328 __m128i color_wide = _mm_set1_epi32(color); |
| 374 while (count >= 4) { | 329 while (count >= 4) { |
| 375 // Load 4 pixels each of src and dest. | |
| 376 __m128i src_pixel = _mm_loadu_si128(s); | 330 __m128i src_pixel = _mm_loadu_si128(s); |
| 331 src_pixel = SkAlphaMulQ_SSE2(src_pixel, scale); |
| 377 | 332 |
| 378 // Get red and blue pixels into lower byte of each word. | |
| 379 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); | |
| 380 | |
| 381 // Get alpha and green into lower byte of each word. | |
| 382 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); | |
| 383 | |
| 384 // Multiply by scale. | |
| 385 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); | |
| 386 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); | |
| 387 | |
| 388 // Divide by 256. | |
| 389 src_rb = _mm_srli_epi16(src_rb, 8); | |
| 390 src_ag = _mm_andnot_si128(rb_mask, src_ag); | |
| 391 | |
| 392 // Combine back into RGBA. | |
| 393 src_pixel = _mm_or_si128(src_rb, src_ag); | |
| 394 | |
| 395 // Add color to result. | |
| 396 __m128i result = _mm_add_epi8(color_wide, src_pixel); | 333 __m128i result = _mm_add_epi8(color_wide, src_pixel); |
| 397 | |
| 398 // Store result. | |
| 399 _mm_store_si128(d, result); | 334 _mm_store_si128(d, result); |
| 400 s++; | 335 s++; |
| 401 d++; | 336 d++; |
| 402 count -= 4; | 337 count -= 4; |
| 403 } | 338 } |
| 404 src = reinterpret_cast<const SkPMColor*>(s); | 339 src = reinterpret_cast<const SkPMColor*>(s); |
| 405 dst = reinterpret_cast<SkPMColor*>(d); | 340 dst = reinterpret_cast<SkPMColor*>(d); |
| 406 } | 341 } |
| 407 | 342 |
| 408 while (count > 0) { | 343 while (count > 0) { |
| (...skipping 942 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1351 uint32_t dst_expanded = SkExpand_rgb_16(*dst); | 1286 uint32_t dst_expanded = SkExpand_rgb_16(*dst); |
| 1352 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); | 1287 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); |
| 1353 // now src and dst expanded are in g:11 r:10 x:1 b:10 | 1288 // now src and dst expanded are in g:11 r:10 x:1 b:10 |
| 1354 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); | 1289 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); |
| 1355 } | 1290 } |
| 1356 dst += 1; | 1291 dst += 1; |
| 1357 DITHER_INC_X(x); | 1292 DITHER_INC_X(x); |
| 1358 } while (--count != 0); | 1293 } while (--count != 0); |
| 1359 } | 1294 } |
| 1360 } | 1295 } |
| OLD | NEW |