Chromium Code Reviews| Index: src/opts/SkBlitRow_opts_SSE4.cpp |
| diff --git a/src/opts/SkBlitRow_opts_SSE4.cpp b/src/opts/SkBlitRow_opts_SSE4.cpp |
| index ae92a77eb2d234ba9bf5a7d816b860f469b022e9..bc9a6d394482372127f13b1f561302165c2b0b61 100644 |
| --- a/src/opts/SkBlitRow_opts_SSE4.cpp |
| +++ b/src/opts/SkBlitRow_opts_SSE4.cpp |
| @@ -7,6 +7,10 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_REST |
| sk_throw(); |
| } |
| +void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { | // Stub that only calls sk_throw(). NOTE(review): it sits before the #else, presumably the build variant without SSE4 support - confirm against the #if above this hunk.
| + sk_throw(); |
| +} |
| + |
| #else |
| #include <emmintrin.h> // SSE2: Most _mm_foo() in this file. |
| @@ -64,4 +68,112 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, |
| } |
| } |
| +void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { |
| + SkASSERT(count > 0); |
| + // Combines src with each of the count RGB565 pixels at dst; x and y are not referenced. dst must be 2-byte aligned. |
| + SkASSERT((((size_t)dst) & 0x01) == 0);

mtklein
2015/01/30 18:17:37
Seems paranoid? If we need to keep this, let's wr
henrik.smiding
2015/02/10 15:11:51
I thought so too, but I actually found it in the n
|
| + |
| + uint32_t src_expand = (SkGetPackedG32(src) << 24) | |
| + (SkGetPackedR32(src) << 13) | |
| + (SkGetPackedB32(src) << 2); |
| + unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; |
| + |
| + // Check if we have enough pixels to run SIMD, at all |
| + if (count >= 8) {

mtklein
2015/01/30 18:17:37
This logic seems complex, and I'm worried we're ov
henrik.smiding
2015/02/10 15:11:52
I did this optimization based on reports that it w
|
| + __m128i* dst_wide; |
| + const __m128i src_expand_wide = _mm_set1_epi32(src_expand); |
| + const __m128i scale_wide = _mm_set1_epi32(scale); |
| + const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE | |
| + SK_B16_MASK_IN_PLACE | |
| + (SK_G16_MASK_IN_PLACE << 16)); |
| + |
| + // Check if we should run the aligned SIMD loop |
| + if ((count >= 64) || ((((size_t)dst) & 0x0F) == 0)) {

mtklein
2015/01/30 18:17:37
How'd you pick 64?
henrik.smiding
2015/02/10 15:11:52
I measured different widths with skia bench, using
|
| + // Align dst to a 16-byte boundary with scalar steps (0-7 pixels). At least 8 pixels remain afterwards: either dst was already aligned (count >= 8) or count >= 64. |
| + while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) { |
| + uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;

mtklein
2015/01/30 18:17:37
You've got a lot of repeated code here. If it nee
henrik.smiding
2015/02/10 15:11:51
I'll re-factor the code a bit. When you're satisfi
|
| + *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); |
| + dst += 1; |
| + count--; |
| + } |
| + |
| + dst_wide = reinterpret_cast<__m128i*>(dst); |
| + do { |
| + // Load 8 RGB565 pixels |
| + __m128i pixels = _mm_load_si128(dst_wide); |
| + |
| + // Duplicate and mask |
| + __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels); |
| + pixels_high = _mm_and_si128(mask_green, pixels_high); |
| + pixels = _mm_unpacklo_epi16(pixels, pixels); |
| + pixels = _mm_and_si128(mask_green, pixels); |
| + |
| + // Scale with alpha |
| + pixels_high = _mm_mullo_epi32(pixels_high, scale_wide);

mtklein
2015/01/30 18:17:37
It doesn't seem like SSE4 is essential to the spee
henrik.smiding
2015/02/10 15:11:51
I added the _mm_hadd instruction at the last minut
|
| + pixels = _mm_mullo_epi32(pixels, scale_wide); |
| + |
| + // Add the pre-expanded src color to every 32-bit lane, then shift right by 5 (assumes scale <= 32; confirm against SkAlpha255To256) |
| + pixels_high = _mm_add_epi32(pixels_high, src_expand_wide); |
| + pixels_high = _mm_srli_epi32(pixels_high, 5); |
| + pixels = _mm_add_epi32(pixels, src_expand_wide); |
| + pixels = _mm_srli_epi32(pixels, 5); |
| + |
| + // Mask again so that only the R and B bits (low 16 bits) and the G bits (high 16 bits) of each lane remain |
| + pixels_high = _mm_and_si128(mask_green, pixels_high); |
| + pixels = _mm_and_si128(mask_green, pixels); |
| + |
| + // Horizontal add (_mm_hadd_epi16, SSSE3) sums the two 16-bit halves of each lane into one RGB565 pixel; then do an aligned store |
| + pixels = _mm_hadd_epi16(pixels, pixels_high); |
| + _mm_store_si128(dst_wide, pixels); |
| + count -= 8; |
| + dst_wide++; |
| + } while (count >= 8); |
| + } |
| + else { // Reached when 8 <= count < 64 and dst is not 16-byte aligned: same math with unaligned loads and stores |
| + dst_wide = reinterpret_cast<__m128i*>(dst); |
| + |
| + do { |
| + // Load 8 RGB565 pixels (unaligned load) |
| + __m128i pixels = _mm_loadu_si128(dst_wide); |
| + |
| + // Duplicate each 16-bit pixel into a 32-bit lane, then keep R and B in the low 16 bits and G in the high 16 bits |
| + __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels); |
| + pixels_high = _mm_and_si128(mask_green, pixels_high); |
| + pixels = _mm_unpacklo_epi16(pixels, pixels); |
| + pixels = _mm_and_si128(mask_green, pixels); |
| + |
| + // Multiply every lane by scale, which is derived from (0xFF - src alpha); _mm_mullo_epi32 is an SSE4.1 intrinsic |
| + pixels_high = _mm_mullo_epi32(pixels_high, scale_wide); |
| + pixels = _mm_mullo_epi32(pixels, scale_wide); |
| + |
| + // Add the pre-expanded src color to every 32-bit lane, then shift right by 5 (assumes scale <= 32; confirm against SkAlpha255To256) |
| + pixels_high = _mm_add_epi32(pixels_high, src_expand_wide); |
| + pixels_high = _mm_srli_epi32(pixels_high, 5); |
| + pixels = _mm_add_epi32(pixels, src_expand_wide); |
| + pixels = _mm_srli_epi32(pixels, 5); |
| + |
| + // Mask again so that only the R and B bits (low 16 bits) and the G bits (high 16 bits) of each lane remain |
| + pixels_high = _mm_and_si128(mask_green, pixels_high); |
| + pixels = _mm_and_si128(mask_green, pixels); |
| + |
| + // Horizontal add (_mm_hadd_epi16, SSSE3) sums the two 16-bit halves of each lane into one RGB565 pixel; then do an unaligned store |
| + pixels = _mm_hadd_epi16(pixels, pixels_high); |
| + _mm_storeu_si128(dst_wide, pixels); |
| + count -= 8; |
| + dst_wide++; |
| + } while (count >= 8); |
| + } |
| + |
| + dst = reinterpret_cast<uint16_t*>(dst_wide); |
| + } |
| + |
| + // Scalar loop for the remaining 0-7 pixels; it is the only loop that runs when count < 8. |
| + while (count > 0) { |
| + uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale; |
| + *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); |
| + dst += 1; |
| + count--; |
| + } |
| +} |
| + |
| #endif |