Chromium Code Reviews| Index: src/opts/SkColorXform_opts.h |
| diff --git a/src/opts/SkColorXform_opts.h b/src/opts/SkColorXform_opts.h |
| index e7a2b4594682176740f15afa6a123760109f9d25..895bab162c6e1771315c8ce1eb4e57c70301662d 100644 |
| --- a/src/opts/SkColorXform_opts.h |
| +++ b/src/opts/SkColorXform_opts.h |
| @@ -126,16 +126,37 @@ static void color_xform_RGB1(void* dst, const uint32_t* src, int len, |
| dst = SkTAddOffset<void>(dst, 4 * sizeof(uint32_t)); |
| } else { |
| - // FIXME (msarett): |
| - // Can we do better here? Should we store half floats as planar? |
| - // Should we write Intel/Arm specific code? Should we add a transpose |
| - // function to SkNx? Should we rewrite the algorithm to be interleaved? |
| + Sk4h halfReds = SkFloatToVectorHalf_finite(dstReds); |
| + Sk4h halfGreens = SkFloatToVectorHalf_finite(dstGreens); |
| + Sk4h halfBlues = SkFloatToVectorHalf_finite(dstBlues); |
| + Sk4h halfAlphas = Sk4h(0x3C00); |
|
mtklein
2016/07/19 12:45:46
I feel like this is going to come up from time to
msarett
2016/07/19 15:24:49
SGTM. Done.
|
| + |
|
mtklein
2016/07/19 12:45:46
// Transpose the half floats back to interleaved RGBA
msarett
2016/07/19 15:24:49
Added to Nx.
|
| +#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| + __m128i rg = _mm_unpacklo_epi16(halfReds.fVec, halfGreens.fVec); |
| + __m128i ba = _mm_unpacklo_epi16(halfBlues.fVec, halfAlphas.fVec); |
| + __m128i rgba0 = _mm_unpacklo_epi32(rg, ba); |
|
mtklein
2016/07/19 12:45:46
Maybe just lo,hi? Seeing rgba0 and rgba1 makes me
msarett
2016/07/19 15:24:49
Done.
|
| + __m128i rgba1 = _mm_unpackhi_epi32(rg, ba); |
| + _mm_storeu_si128(((__m128i*) dst) + 0, rgba0); |
| + _mm_storeu_si128(((__m128i*) dst) + 1, rgba1); |
| +#elif !defined(SKNX_NO_SIMD) && defined(SK_ARM_HAS_NEON) |
| + uint16x4x2_t rg = vzip_u16(halfReds.fVec, halfGreens.fVec); |
|
mtklein
2016/07/19 12:45:46
Huh. I was expecting:
vst4_u16((uint16_t*)ds
msarett
2016/07/19 15:24:49
Duh, yup that's way better. Think I was stuck in
|
| + uint16x4x2_t ba = vzip_u16(halfBlues.fVec, halfAlphas.fVec); |
| + uint32x4_t rg32 = vreinterpretq_u32_u16(vcombine_u16(rg.val[0], rg.val[1])); |
| + uint32x4_t ba32 = vreinterpretq_u32_u16(vcombine_u16(ba.val[0], ba.val[1])); |
| + uint32x4x2_t rgba = vzipq_u32(rg32, ba32); |
| + vst1q_u32((uint32_t*) dst, rgba.val[0]); |
| + vst1q_u32((uint32_t*) dst + 4, rgba.val[1]); |
| +#else |
| + Sk4h rgba0 = Sk4h(halfReds[0], halfGreens[0], halfBlues[0], halfAlphas[0]); |
|
mtklein
2016/07/19 12:45:46
How bad is it if you just used this strategy, i.e.
msarett
2016/07/19 15:24:49
Added way more performance measures to commit mess
|
| + Sk4h rgba1 = Sk4h(halfReds[1], halfGreens[1], halfBlues[1], halfAlphas[1]); |
| + Sk4h rgba2 = Sk4h(halfReds[2], halfGreens[2], halfBlues[2], halfAlphas[2]); |
| + Sk4h rgba3 = Sk4h(halfReds[3], halfGreens[3], halfBlues[3], halfAlphas[3]); |
| uint64_t* dst64 = (uint64_t*) dst; |
| - dst64[0] = SkFloatToHalf_finite(Sk4f(dstReds[0], dstGreens[0], dstBlues[0], 1.0f)); |
| - dst64[1] = SkFloatToHalf_finite(Sk4f(dstReds[1], dstGreens[1], dstBlues[1], 1.0f)); |
| - dst64[2] = SkFloatToHalf_finite(Sk4f(dstReds[2], dstGreens[2], dstBlues[2], 1.0f)); |
| - dst64[3] = SkFloatToHalf_finite(Sk4f(dstReds[3], dstGreens[3], dstBlues[3], 1.0f)); |
| - |
| + rgba0.store(dst64 + 0); |
| + rgba1.store(dst64 + 1); |
| + rgba2.store(dst64 + 2); |
| + rgba3.store(dst64 + 3); |
| +#endif |
| dst = SkTAddOffset<void>(dst, 4 * sizeof(uint64_t)); |
| } |
| }; |