src/opts/SkBlitRow_opts_SSE4.cpp - Issue 892623002: Add SSE optimization of Color32A_D565

Unified Diff: src/opts/SkBlitRow_opts_SSE4.cpp

Issue 892623002: Add SSE optimization of Color32A_D565 (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/opts/SkBlitRow_opts_SSE4.cpp

diff --git a/src/opts/SkBlitRow_opts_SSE4.cpp b/src/opts/SkBlitRow_opts_SSE4.cpp

index ae92a77eb2d234ba9bf5a7d816b860f469b022e9..bc9a6d394482372127f13b1f561302165c2b0b61 100644

--- a/src/opts/SkBlitRow_opts_SSE4.cpp

+++ b/src/opts/SkBlitRow_opts_SSE4.cpp

@@ -7,6 +7,10 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_REST

sk_throw();

}

+void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {

+ sk_throw();

#else

#include <emmintrin.h> // SSE2: Most _mm_foo() in this file.

@@ -64,4 +68,112 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,

}

+void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {

+ SkASSERT(count > 0);

+ // dst must be 2-byte aligned.

+ SkASSERT((((size_t)dst) & 0x01) == 0);

mtklein 2015/01/30 18:17:37 Seems paranoid? If we need to keep this, let's wr…

henrik.smiding 2015/02/10 15:11:51 I thought so too, but I actually found it in the n…

+ uint32_t src_expand = (SkGetPackedG32(src) << 24) |

+ (SkGetPackedR32(src) << 13) |

+ (SkGetPackedB32(src) << 2);

+ unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;

+ // Check if we have enough pixels to run SIMD, at all

+ if (count >= 8) {

mtklein 2015/01/30 18:17:37 This logic seems complex, and I'm worried we're ov…

henrik.smiding 2015/02/10 15:11:52 I did this optimization based on reports that it w…

+ __m128i* dst_wide;

+ const __m128i src_expand_wide = _mm_set1_epi32(src_expand);

+ const __m128i scale_wide = _mm_set1_epi32(scale);

+ const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE |

+ SK_B16_MASK_IN_PLACE |

+ (SK_G16_MASK_IN_PLACE << 16));

+ // Check if we should run the aligned SIMD loop

+ if ((count >= 64) || ((((size_t)dst) & 0x0F) == 0)) {

mtklein 2015/01/30 18:17:37 How'd you pick 64?

henrik.smiding 2015/02/10 15:11:52 I measured different widths with skia bench, using…

+ // Align dst to an even 16 byte address (0-7 pixels)

+ while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {

+ uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;

mtklein 2015/01/30 18:17:37 You've got a lot of repeated code here. If it nee…

henrik.smiding 2015/02/10 15:11:51 I'll re-factor the code a bit. When you're satisfi…

+ *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);

+ dst += 1;

+ count--;

+ }

+ dst_wide = reinterpret_cast<__m128i*>(dst);

+ do {

+ // Load 8 RGB565 pixels

+ __m128i pixels = _mm_load_si128(dst_wide);

+ // Duplicate and mask

+ __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels);

+ pixels_high = _mm_and_si128(mask_green, pixels_high);

+ pixels = _mm_unpacklo_epi16(pixels, pixels);

+ pixels = _mm_and_si128(mask_green, pixels);

+ // Scale with alpha

+ pixels_high = _mm_mullo_epi32(pixels_high, scale_wide);

mtklein 2015/01/30 18:17:37 It doesn't seem like SSE4 is essential to the speed…

henrik.smiding 2015/02/10 15:11:51 I added the _mm_hadd instruction at the last minut…

+ pixels = _mm_mullo_epi32(pixels, scale_wide);

+ // Add src_expand_wide and shift down again

+ pixels_high = _mm_add_epi32(pixels_high, src_expand_wide);

+ pixels_high = _mm_srli_epi32(pixels_high, 5);

+ pixels = _mm_add_epi32(pixels, src_expand_wide);

+ pixels = _mm_srli_epi32(pixels, 5);

+ // Mask

+ pixels_high = _mm_and_si128(mask_green, pixels_high);

+ pixels = _mm_and_si128(mask_green, pixels);

+ // Combine into RGB565 and store

+ pixels = _mm_hadd_epi16(pixels, pixels_high);

+ _mm_store_si128(dst_wide, pixels);

+ count -= 8;

+ dst_wide++;

+ } while (count >= 8);

+ }

+ else { // Unaligned loop to handle medium widths

+ dst_wide = reinterpret_cast<__m128i*>(dst);

+ do {

+ // Load 8 RGB565 pixels

+ __m128i pixels = _mm_loadu_si128(dst_wide);

+ // Duplicate and mask

+ __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels);

+ pixels_high = _mm_and_si128(mask_green, pixels_high);

+ pixels = _mm_unpacklo_epi16(pixels, pixels);

+ pixels = _mm_and_si128(mask_green, pixels);

+ // Scale with alpha

+ pixels_high = _mm_mullo_epi32(pixels_high, scale_wide);

+ pixels = _mm_mullo_epi32(pixels, scale_wide);

+ // Add src_expand_wide and shift down again

+ pixels_high = _mm_add_epi32(pixels_high, src_expand_wide);

+ pixels_high = _mm_srli_epi32(pixels_high, 5);

+ pixels = _mm_add_epi32(pixels, src_expand_wide);

+ pixels = _mm_srli_epi32(pixels, 5);

+ // Mask

+ pixels_high = _mm_and_si128(mask_green, pixels_high);

+ pixels = _mm_and_si128(mask_green, pixels);

+ // Combine into RGB565 and store

+ pixels = _mm_hadd_epi16(pixels, pixels_high);

+ _mm_storeu_si128(dst_wide, pixels);

+ count -= 8;

+ dst_wide++;

+ } while (count >= 8);

+ }

+ dst = reinterpret_cast<uint16_t*>(dst_wide);

+ }

+ // Small loop to handle remaining 0-7 pixels.

+ while (count > 0) {

+ uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;

+ *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);

+ dst += 1;

+ count--;

+ }

#endif

« src/core/SkBlitRow_D16.cpp ('K') | « src/opts/SkBlitRow_opts_SSE4.h ('k') | src/opts/opts_check_x86.cpp » ('j') | no next file with comments »