Chromium Code Reviews| Index: src/opts/SkColorXform_opts.h |
| diff --git a/src/opts/SkColorXform_opts.h b/src/opts/SkColorXform_opts.h |
| index e7a2b4594682176740f15afa6a123760109f9d25..895bab162c6e1771315c8ce1eb4e57c70301662d 100644 |
| --- a/src/opts/SkColorXform_opts.h |
| +++ b/src/opts/SkColorXform_opts.h |
| @@ -126,16 +126,37 @@ static void color_xform_RGB1(void* dst, const uint32_t* src, int len, |
| dst = SkTAddOffset<void>(dst, 4 * sizeof(uint32_t)); |
| } else { |
| - // FIXME (msarett): |
| - // Can we do better here? Should we store half floats as planar? |
| - // Should we write Intel/Arm specific code? Should we add a transpose |
| - // function to SkNx? Should we rewrite the algorithm to be interleaved? |
| + Sk4h halfReds = SkFloatToVectorHalf_finite(dstReds); |
| + Sk4h halfGreens = SkFloatToVectorHalf_finite(dstGreens); |
| + Sk4h halfBlues = SkFloatToVectorHalf_finite(dstBlues); |
| + Sk4h halfAlphas = Sk4h(0x3C00); |
|
mtklein
2016/07/19 12:45:46
I feel like this is going to come up from time to
msarett
2016/07/19 15:24:49
SGTM. Done.
|
| + |
|
mtklein
2016/07/19 12:45:46
// Transpose the half floats back to interleaved RGBA
msarett
2016/07/19 15:24:49
Added to Nx.
|
| +#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| + __m128i rg = _mm_unpacklo_epi16(halfReds.fVec, halfGreens.fVec); |
| + __m128i ba = _mm_unpacklo_epi16(halfBlues.fVec, halfAlphas.fVec); |
| + __m128i rgba0 = _mm_unpacklo_epi32(rg, ba); |
|
mtklein
2016/07/19 12:45:46
Maybe just lo,hi? Seeing rgba0 and rgba1 makes me
msarett
2016/07/19 15:24:49
Done.
|
| + __m128i rgba1 = _mm_unpackhi_epi32(rg, ba); |
| + _mm_storeu_si128(((__m128i*) dst) + 0, rgba0); |
| + _mm_storeu_si128(((__m128i*) dst) + 1, rgba1); |
| +#elif !defined(SKNX_NO_SIMD) && defined(SK_ARM_HAS_NEON) |
| + uint16x4x2_t rg = vzip_u16(halfReds.fVec, halfGreens.fVec); |
|
mtklein
2016/07/19 12:45:46
Huh. I was expecting:
vst4_u16((uint16_t*)ds
msarett
2016/07/19 15:24:49
Duh, yup that's way better. Think I was stuck in
|
| + uint16x4x2_t ba = vzip_u16(halfBlues.fVec, halfAlphas.fVec); |
| + uint32x4_t rg32 = vreinterpretq_u32_u16(vcombine_u16(rg.val[0], rg.val[1])); |
| + uint32x4_t ba32 = vreinterpretq_u32_u16(vcombine_u16(ba.val[0], ba.val[1])); |
| + uint32x4x2_t rgba = vzipq_u32(rg32, ba32); |
| + vst1q_u32((uint32_t*) dst, rgba.val[0]); |
| + vst1q_u32((uint32_t*) dst + 4, rgba.val[1]); |
| +#else |
| + Sk4h rgba0 = Sk4h(halfReds[0], halfGreens[0], halfBlues[0], halfAlphas[0]); |
|
mtklein
2016/07/19 12:45:46
How bad is it if you just used this strategy, i.e.
msarett
2016/07/19 15:24:49
Added way more performance measures to commit mess
|
| + Sk4h rgba1 = Sk4h(halfReds[1], halfGreens[1], halfBlues[1], halfAlphas[1]); |
| + Sk4h rgba2 = Sk4h(halfReds[2], halfGreens[2], halfBlues[2], halfAlphas[2]); |
| + Sk4h rgba3 = Sk4h(halfReds[3], halfGreens[3], halfBlues[3], halfAlphas[3]); |
| uint64_t* dst64 = (uint64_t*) dst; |
| - dst64[0] = SkFloatToHalf_finite(Sk4f(dstReds[0], dstGreens[0], dstBlues[0], 1.0f)); |
| - dst64[1] = SkFloatToHalf_finite(Sk4f(dstReds[1], dstGreens[1], dstBlues[1], 1.0f)); |
| - dst64[2] = SkFloatToHalf_finite(Sk4f(dstReds[2], dstGreens[2], dstBlues[2], 1.0f)); |
| - dst64[3] = SkFloatToHalf_finite(Sk4f(dstReds[3], dstGreens[3], dstBlues[3], 1.0f)); |
| - |
| + rgba0.store(dst64 + 0); |
| + rgba1.store(dst64 + 1); |
| + rgba2.store(dst64 + 2); |
| + rgba3.store(dst64 + 3); |
| +#endif |
| dst = SkTAddOffset<void>(dst, 4 * sizeof(uint64_t)); |
| } |
| }; |