| Index: src/opts/SkBlitRow_opts_SSE4.cpp
|
| diff --git a/src/opts/SkBlitRow_opts_SSE4.cpp b/src/opts/SkBlitRow_opts_SSE4.cpp
|
| index fd837d54fee4d879fd3d1d25887c343d1c51279c..f4273d27b4b0bfaf81d1df0c5d86f8f25f5a8164 100644
|
| --- a/src/opts/SkBlitRow_opts_SSE4.cpp
|
| +++ b/src/opts/SkBlitRow_opts_SSE4.cpp
|
| @@ -7,10 +7,13 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_REST
|
| sk_throw();
|
| }
|
|
|
| +void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { // stub for the preprocessor branch that precedes #else
|
| + sk_throw(); // no blending happens in this variant; the function only calls sk_throw()
|
| +}
|
| +
|
| #else
|
|
|
| -#include <emmintrin.h> // SSE2: Most _mm_foo() in this file.
|
| -#include <smmintrin.h> // SSE4.1: _mm_testz_si128 and _mm_testc_si128.
|
| +#include <smmintrin.h> // SSE4.1 intrinsics
|
|
|
| #include "SkColorPriv.h"
|
| #include "SkColor_opts_SSE2.h"
|
| @@ -63,4 +66,76 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
|
| }
|
| }
|
|
|
| +static inline uint16_t Color32A_D565_1x(uint16_t dst, unsigned scale, uint32_t src_expand) { // scalar blend of one 16-bit pixel
|
| + uint32_t dst_expand = SkExpand_rgb_16(dst) * scale; // expand dst with SkExpand_rgb_16, then weight it by scale
|
| + return SkCompact_rgb_16((src_expand + dst_expand) >> 5); // add the pre-expanded source, shift right by 5, repack with SkCompact_rgb_16
|
| +}
|
| +// Blends the SkPMColor src into count RGB565 pixels starting at dst, 8 pixels per SIMD iteration when possible; x and y are unused.
|
| +void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {
|
| + SkASSERT(count > 0);
|
| + // Pre-position the source channels (G << 24, R << 13, B << 2); scale, built from 0xFF - alpha, is the weight applied to dst.
|
| + uint32_t src_expand = (SkGetPackedG32(src) << 24) |
|
| + (SkGetPackedR32(src) << 13) |
|
| + (SkGetPackedB32(src) << 2);
|
| + unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
|
| +
|
| + // Take the SIMD path only if count covers the pixels needed to reach 16-byte alignment plus one full 8-pixel vector.
|
| + if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
|
| + __m128i* dst_wide;
|
| + const __m128i src_expand_wide = _mm_set1_epi32(src_expand);
|
| + const __m128i scale_wide = _mm_set1_epi32(scale);
|
| + const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE |
|
| + SK_B16_MASK_IN_PLACE |
|
| + (SK_G16_MASK_IN_PLACE << 16)); // R/B masks in the low 16 bits of each lane, G mask in the high 16
|
| +
|
| + // Blend single pixels until dst sits on a 16-byte boundary (0-7 pixels) for the aligned load/store below.
|
| + while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
|
| + *dst = Color32A_D565_1x(*dst, scale, src_expand);
|
| + dst += 1;
|
| + count--;
|
| + }
|
| + // NOTE(review): the entry check leaves count >= 8 here (assuming dst is 2-byte aligned), so the do/while body may run unconditionally once.
|
| + dst_wide = reinterpret_cast<__m128i*>(dst);
|
| + do {
|
| + // Load 8 RGB565 pixels
|
| + __m128i pixels = _mm_load_si128(dst_wide);
|
| +
|
| + // Duplicate each 16-bit pixel into both halves of a 32-bit lane, then keep R/B in the low half and G in the high half
|
| + __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels);
|
| + pixels_high = _mm_and_si128(mask_green, pixels_high);
|
| + pixels = _mm_unpacklo_epi16(pixels, pixels);
|
| + pixels = _mm_and_si128(mask_green, pixels);
|
| +
|
| + // Scale every 32-bit lane by scale
|
| + pixels_high = _mm_mullo_epi32(pixels_high, scale_wide);
|
| + pixels = _mm_mullo_epi32(pixels, scale_wide);
|
| +
|
| + // Add src_expand_wide and shift right by 5, the same arithmetic as Color32A_D565_1x
|
| + pixels_high = _mm_add_epi32(pixels_high, src_expand_wide);
|
| + pixels_high = _mm_srli_epi32(pixels_high, 5);
|
| + pixels = _mm_add_epi32(pixels, src_expand_wide);
|
| + pixels = _mm_srli_epi32(pixels, 5);
|
| +
|
| + // Mask again so only the R/B bits (low half) and G bits (high half) remain after the shift
|
| + pixels_high = _mm_and_si128(mask_green, pixels_high);
|
| + pixels = _mm_and_si128(mask_green, pixels);
|
| +
|
| + // Horizontally add the two 16-bit halves of each lane to rebuild 8 RGB565 pixels, then store them back
|
| + pixels = _mm_hadd_epi16(pixels, pixels_high);
|
| + _mm_store_si128(dst_wide, pixels);
|
| + count -= 8;
|
| + dst_wide++;
|
| + } while (count >= 8);
|
| +
|
| + dst = reinterpret_cast<uint16_t*>(dst_wide);
|
| + }
|
| +
|
| + // Scalar loop for the remaining pixels, or for all of them when the SIMD path was skipped.
|
| + while (count > 0) {
|
| + *dst = Color32A_D565_1x(*dst, scale, src_expand);
|
| + dst += 1;
|
| + count--;
|
| + }
|
| +}
|
| +
|
| #endif
|
|
|