Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 /* | |
| 2 * Copyright 2016 Google Inc. | |
| 3 * | |
| 4 * Use of this source code is governed by a BSD-style license that can be | |
| 5 * found in the LICENSE file. | |
| 6 */ | |
| 7 | |
| 8 #ifndef SkColorXform_opts_DEFINED | |
| 9 #define SkColorXform_opts_DEFINED | |
| 10 | |
| 11 #include "SkColorPriv.h" | |
| 12 | |
| 13 namespace SK_OPTS_NS { | |
| 14 | |
| 15 static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_t* src, int len, | |
| 16 const float matrix[16]) { | |
| 17 while (len-- > 0) { | |
| 18 float srcFloats[3]; | |
| 19 srcFloats[0] = (float) ((*src >> 0) & 0xFF); | |
| 20 srcFloats[1] = (float) ((*src >> 8) & 0xFF); | |
| 21 srcFloats[2] = (float) ((*src >> 16) & 0xFF); | |
| 22 | |
| 23 // Convert to linear. | |
| 24 // TODO (msarett): | |
| 25 // We should use X^2.2 here instead of X^2. What is the impact on correctness? | |
| 26 // We should be able to get closer to 2.2 at a small performance cost. | |
| 27 srcFloats[0] = srcFloats[0] * srcFloats[0]; | |
| 28 srcFloats[1] = srcFloats[1] * srcFloats[1]; | |
| 29 srcFloats[2] = srcFloats[2] * srcFloats[2]; | |
| 30 | |
| 31 // Convert to dst gamut. | |
| 32 float dstFloats[3]; | |
| 33 // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost always zero. | |
| 34 // Should we have another optimized path that avoids the extra addition when they | |
| 35 // are zero? | |
| 36 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] + | |
| 37 srcFloats[2] * matrix[8] + matrix[12]; | |
| 38 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] + | |
| 39 srcFloats[2] * matrix[9] + matrix[13]; | |
| 40 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] + | |
| 41 srcFloats[2] * matrix[10] + matrix[14]; | |
| 42 | |
| 43 // Convert to dst gamma. | |
| 44 // TODO (msarett): | |
| 45 // We should use X^(1/2.2) here instead of X^(1/2). What is the impact on correctness? | |
| 46 // We should be able to get closer to (1/2.2) at a small performance cost. | |
| 47 dstFloats[0] = sqrtf(dstFloats[0]); | |
| 48 dstFloats[1] = sqrtf(dstFloats[1]); | |
| 49 dstFloats[2] = sqrtf(dstFloats[2]); | |
| 50 | |
| 51 // FIXME (msarett) | |
| 52 // We should clamp here, but I intentionally don't. Just curious to see what happens, | |
| 53 // since I think it's unnecessary for most of my current tests. It will definitely be | |
| 54 // necessary when converting wider gamuts to smaller gamuts. Should we always clamp, | |
| 55 // or is there a performance reason to clamp only when necessary? Also, clamping is not | |
|
mtklein
2016/06/08 01:41:40
I think clamping is always the right choice when c
msarett
2016/06/08 13:48:28
Yeah I think it's fine (and in many cases necessar
| |
| 56 // the right choice for all rendering intents. Should we consider the rendering intent? | |
| 57 *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF), | |
| 58 (uint8_t) (dstFloats[0] + 0.5f), | |
| 59 (uint8_t) (dstFloats[1] + 0.5f), | |
| 60 (uint8_t) (dstFloats[2] + 0.5f)); | |
| 61 | |
| 62 dst++; | |
| 63 src++; | |
| 64 } | |
| 65 } | |
| 66 | |
| 67 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | |
| 68 | |
| 69 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, int len, | |
| 70 const float matrix[16]) { | |
| 71 // Create masks. | |
| 72 __m128i byteMask = _mm_set1_epi32(0xFF); | |
| 73 __m128i alphaMask = _mm_set1_epi32(0xFF000000); | |
| 74 | |
| 75 // Load transformation matrix. | |
| 76 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); | |
| 77 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); | |
| 78 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); | |
| 79 __m128 rQgQbQ = _mm_loadu_ps(&matrix[12]); | |
| 80 | |
| 81 while (len >= 4) { | |
| 82 // Load 4 pixels and convert them to floats. | |
| 83 __m128i rgba = _mm_loadu_si128((const __m128i*) src); | |
| 84 __m128 reds = _mm_cvtepi32_ps(_mm_and_si128( rgba, byteMask)); | |
| 85 __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 8), byteMask)); | |
| 86 __m128 blues = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16), byteMask)); | |
| 87 | |
| 88 // Convert to linear. | |
| 89 // FIXME (msarett): | |
| 90 // Should we be more accurate? | |
|
mtklein
2016/06/08 01:41:40
Luckily we've been working on just this problem!
msarett
2016/06/08 13:48:28
Very cool, sounds good. I plan to start by landin
| |
| 91 reds = _mm_mul_ps(reds, reds); | |
| 92 greens = _mm_mul_ps(greens, greens); | |
| 93 blues = _mm_mul_ps(blues, blues); | |
| 94 | |
| 95 // Apply the transformation matrix to dst gamut. | |
| 96 // FIXME (msarett): | |
| 97 // rQ, gQ, and bQ are almost always zero. Can we save a couple instructions? | |
|
mtklein
2016/06/08 01:41:40
Seems worthwhile, given you can do the check once
msarett
2016/06/08 13:48:28
This actually improves performance more than I sus
| |
| 98 | |
| 99 // Splat rX, rY, rZ, and rQ each across its own vector. | |
|
mtklein
2016/06/08 01:41:41
I think I get what you're saying here, but this do
msarett
2016/06/08 13:48:28
Yes that's better :).
| |
| 100 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0); | |
|
mtklein
2016/06/08 01:41:41
I sort of like to write this sort of shuffle as 0x
msarett
2016/06/08 13:48:28
sgtm
| |
| 101 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0); | |
| 102 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0); | |
| 103 __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0); | |
| 104 | |
| 105 // dstReds = rX * reds + rY * greens + rZ * blues + rQ | |
| 106 __m128 dstReds = _mm_mul_ps(reds, rX); | |
| 107 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY)); | |
| 108 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ)); | |
| 109 dstReds = _mm_add_ps(dstReds, rQ); | |
| 110 | |
| 111 // Splat gX, gY, gZ, and gQ each across its own vector. | |
| 112 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55); | |
| 113 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55); | |
| 114 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55); | |
| 115 __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55); | |
| 116 | |
| 117 // dstGreens = gX * reds + gY * greens + gZ * blues + gQ | |
| 118 __m128 dstGreens = _mm_mul_ps(reds, gX); | |
|
mtklein
2016/06/08 01:41:41
This mul-add chain makes me think we should follow
msarett
2016/06/08 13:48:28
Yes! I had the same thought.
| |
| 119 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY)); | |
| 120 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ)); | |
| 121 dstGreens = _mm_add_ps(dstGreens, gQ); | |
| 122 | |
| 123 // Splat bX, bY, bZ, and bQ each across its own vector. | |
| 124 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA); | |
| 125 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA); | |
| 126 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA); | |
| 127 __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA); | |
| 128 | |
| 129 // dstBlues = bX * reds + bY * greens + bZ * blues + bQ | |
| 130 __m128 dstBlues = _mm_mul_ps(reds, bX); | |
| 131 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY)); | |
| 132 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ)); | |
| 133 dstBlues = _mm_add_ps(dstBlues, bQ); | |
| 134 | |
| 135 // Convert to dst gamma. | |
| 136 // FIXME (msarett): | |
| 137 // Should we be more accurate? | |
| 138 dstReds = _mm_sqrt_ps(dstReds); | |
|
mtklein
2016/06/08 01:41:40
You might want to try _mm_rcp_ps(_mm_rsqrt_ps(...)
msarett
2016/06/08 13:48:28
Great! Drops about 10us.
| |
| 139 dstGreens = _mm_sqrt_ps(dstGreens); | |
| 140 dstBlues = _mm_sqrt_ps(dstBlues); | |
| 141 | |
| 142 // Convert to bytes and store to memory. | |
| 143 // FIXME (msarett): | |
| 144 // We need to clamp. | |
|
mtklein
2016/06/08 01:41:40
Definitely. Any overflow here will clobber neighb
msarett
2016/06/08 13:48:28
Done.
| |
| 145 rgba = _mm_and_si128(alphaMask, rgba); | |
| 146 #ifdef SK_PMCOLOR_IS_RGBA | |
| 147 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) ); | |
| 148 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) ); | |
| 149 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) ); | |
| 150 #else | |
| 151 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstBlues) ); | |
| 152 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) ); | |
| 153 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstReds), 16) ); | |
| 154 #endif | |
| 155 _mm_storeu_si128((__m128i*) dst, rgba); | |
| 156 | |
| 157 dst += 4; | |
| 158 src += 4; | |
| 159 len -= 4; | |
| 160 } | |
| 161 | |
| 162 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); | |
| 163 } | |
| 164 | |
| 165 #else | |
| 166 | |
| 167 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, int len, | |
| 168 const float matrix[16]) { | |
| 169 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); | |
| 170 } | |
| 171 | |
| 172 #endif | |
| 173 | |
| 174 } | |
| 175 | |
| 176 #endif // SkColorXform_opts_DEFINED | |
| OLD | NEW |