Index: src/opts/SkBlitRow_opts_SSE4.cpp |
diff --git a/src/opts/SkBlitRow_opts_SSE4.cpp b/src/opts/SkBlitRow_opts_SSE4.cpp |
index f4273d27b4b0bfaf81d1df0c5d86f8f25f5a8164..3649d175efc2cc5ad081e8c2d4d44bf25c55f495 100644 |
--- a/src/opts/SkBlitRow_opts_SSE4.cpp |
+++ b/src/opts/SkBlitRow_opts_SSE4.cpp |
@@ -7,14 +7,9 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_REST |
sk_throw(); |
} |
-void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { |
- sk_throw(); |
-} |
- |
#else |
-#include <smmintrin.h> // SSE4.1 intrinsics |
- |
+#include <smmintrin.h> // SSE4.1 intrinsics |
#include "SkColorPriv.h" |
#include "SkColor_opts_SSE2.h" |
@@ -66,76 +61,4 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, |
} |
} |
-static inline uint16_t Color32A_D565_1x(uint16_t dst, unsigned scale, uint32_t src_expand) { |
- uint32_t dst_expand = SkExpand_rgb_16(dst) * scale; |
- return SkCompact_rgb_16((src_expand + dst_expand) >> 5); |
-} |
- |
-void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { |
- SkASSERT(count > 0); |
- |
- uint32_t src_expand = (SkGetPackedG32(src) << 24) | |
- (SkGetPackedR32(src) << 13) | |
- (SkGetPackedB32(src) << 2); |
- unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; |
- |
- // Check if we have enough pixels to run SIMD |
- if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) { |
- __m128i* dst_wide; |
- const __m128i src_expand_wide = _mm_set1_epi32(src_expand); |
- const __m128i scale_wide = _mm_set1_epi32(scale); |
- const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE | |
- SK_B16_MASK_IN_PLACE | |
- (SK_G16_MASK_IN_PLACE << 16)); |
- |
- // Align dst to an even 16 byte address (0-7 pixels) |
- while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) { |
- *dst = Color32A_D565_1x(*dst, scale, src_expand); |
- dst += 1; |
- count--; |
- } |
- |
- dst_wide = reinterpret_cast<__m128i*>(dst); |
- do { |
- // Load 8 RGB565 pixels |
- __m128i pixels = _mm_load_si128(dst_wide); |
- |
- // Duplicate and mask |
- __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels); |
- pixels_high = _mm_and_si128(mask_green, pixels_high); |
- pixels = _mm_unpacklo_epi16(pixels, pixels); |
- pixels = _mm_and_si128(mask_green, pixels); |
- |
- // Scale with alpha |
- pixels_high = _mm_mullo_epi32(pixels_high, scale_wide); |
- pixels = _mm_mullo_epi32(pixels, scale_wide); |
- |
- // Add src_expand_wide and shift down again |
- pixels_high = _mm_add_epi32(pixels_high, src_expand_wide); |
- pixels_high = _mm_srli_epi32(pixels_high, 5); |
- pixels = _mm_add_epi32(pixels, src_expand_wide); |
- pixels = _mm_srli_epi32(pixels, 5); |
- |
- // Mask |
- pixels_high = _mm_and_si128(mask_green, pixels_high); |
- pixels = _mm_and_si128(mask_green, pixels); |
- |
- // Combine into RGB565 and store |
- pixels = _mm_hadd_epi16(pixels, pixels_high); |
- _mm_store_si128(dst_wide, pixels); |
- count -= 8; |
- dst_wide++; |
- } while (count >= 8); |
- |
- dst = reinterpret_cast<uint16_t*>(dst_wide); |
- } |
- |
- // Small loop to handle remaining pixels. |
- while (count > 0) { |
- *dst = Color32A_D565_1x(*dst, scale, src_expand); |
- dst += 1; |
- count--; |
- } |
-} |
- |
#endif |