OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright 2016 Google Inc. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. |
| 6 */ |
| 7 |
| 8 #ifndef SkColorXform_opts_DEFINED |
| 9 #define SkColorXform_opts_DEFINED |
| 10 |
| 11 #include "SkColorPriv.h" |
| 12 |
| 13 namespace SK_OPTS_NS { |
| 14 |
/**
 *  Rounds a float to the nearest integer and saturates the result to [0, 255].
 *
 *  @param v  Value to convert.
 *  @return   0 when v is below 0.5 or is NaN, 255 when v is 254.5 or above,
 *            otherwise v rounded half-up.
 *
 *  Every ordered comparison against NaN is false, so the lower bound is written
 *  as a negated ">=" test.  That routes NaN to 0 instead of letting it reach the
 *  float-to-integer cast, where the conversion would be undefined behavior.
 */
static uint8_t clamp_float_to_byte(float v) {
    if (!(v >= 0.5f)) {
        // Too small to round up to 1, or NaN.
        return 0;
    }
    if (v >= 254.5f) {
        return 255;
    }
    // v is in [0.5, 254.5), so v + 0.5f truncates to an integer in [1, 254].
    return (uint8_t) (v + 0.5f);
}
| 24 |
| 25 static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_
t* src, int len, |
| 26 const float matrix[16]) { |
| 27 while (len-- > 0) { |
| 28 float srcFloats[3]; |
| 29 srcFloats[0] = (float) ((*src >> 0) & 0xFF); |
| 30 srcFloats[1] = (float) ((*src >> 8) & 0xFF); |
| 31 srcFloats[2] = (float) ((*src >> 16) & 0xFF); |
| 32 |
| 33 // Convert to linear. |
| 34 // TODO (msarett): |
| 35 // We should use X^2.2 here instead of X^2. What is the impact on corre
ctness? |
| 36 // We should be able to get closer to 2.2 at a small performance cost. |
| 37 srcFloats[0] = srcFloats[0] * srcFloats[0]; |
| 38 srcFloats[1] = srcFloats[1] * srcFloats[1]; |
| 39 srcFloats[2] = srcFloats[2] * srcFloats[2]; |
| 40 |
| 41 // Convert to dst gamut. |
| 42 float dstFloats[3]; |
| 43 // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost alw
ays zero. |
| 44 // Should we have another optimized path that avoids the extra addition
when they |
| 45 // are zero? |
| 46 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] + |
| 47 srcFloats[2] * matrix[8] + matrix[12]; |
| 48 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] + |
| 49 srcFloats[2] * matrix[9] + matrix[13]; |
| 50 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] + |
| 51 srcFloats[2] * matrix[10] + matrix[14]; |
| 52 |
| 53 // Convert to dst gamma. |
| 54 // TODO (msarett): |
| 55 // We should use X^(1/2.2) here instead of X^(1/2). What is the impact
on correctness? |
| 56 // We should be able to get closer to (1/2.2) at a small performance cos
t. |
| 57 dstFloats[0] = sqrtf(dstFloats[0]); |
| 58 dstFloats[1] = sqrtf(dstFloats[1]); |
| 59 dstFloats[2] = sqrtf(dstFloats[2]); |
| 60 |
| 61 *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF), |
| 62 clamp_float_to_byte(dstFloats[0]), |
| 63 clamp_float_to_byte(dstFloats[1]), |
| 64 clamp_float_to_byte(dstFloats[2])); |
| 65 |
| 66 dst++; |
| 67 src++; |
| 68 } |
| 69 } |
| 70 |
| 71 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 72 |
| 73 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i
nt len, |
| 74 const float matrix[16]) { |
| 75 // Load transformation matrix. |
| 76 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); |
| 77 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); |
| 78 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); |
| 79 __m128 rQgQbQ = _mm_loadu_ps(&matrix[12]); |
| 80 |
| 81 while (len >= 4) { |
| 82 // Load 4 pixels and convert them to floats. |
| 83 __m128i rgba = _mm_loadu_si128((const __m128i*) src); |
| 84 __m128i byteMask = _mm_set1_epi32(0xFF); |
| 85 __m128 reds = _mm_cvtepi32_ps(_mm_and_si128( rgba,
byteMask)); |
| 86 __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 8),
byteMask)); |
| 87 __m128 blues = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16),
byteMask)); |
| 88 |
| 89 // Convert to linear. |
| 90 // FIXME (msarett): |
| 91 // Should we be more accurate? |
| 92 reds = _mm_mul_ps(reds, reds); |
| 93 greens = _mm_mul_ps(greens, greens); |
| 94 blues = _mm_mul_ps(blues, blues); |
| 95 |
| 96 // Apply the transformation matrix to dst gamut. |
| 97 // FIXME (msarett): |
| 98 // rQ, gQ, and bQ are almost always zero. Can we save a couple instruct
ions? |
| 99 |
| 100 // Splat rX, rY, rZ, and rQ each across a register. |
| 101 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00); |
| 102 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00); |
| 103 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00); |
| 104 __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x00); |
| 105 |
| 106 // dstReds = rX * reds + rY * greens + rZ * blues + rQ |
| 107 __m128 dstReds = _mm_mul_ps(reds, rX); |
| 108 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY)); |
| 109 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ)); |
| 110 dstReds = _mm_add_ps(dstReds, rQ); |
| 111 |
| 112 // Splat gX, gY, gZ, and gQ each across a register. |
| 113 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55); |
| 114 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55); |
| 115 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55); |
| 116 __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55); |
| 117 |
| 118 // dstGreens = gX * reds + gY * greens + gZ * blues + gQ |
| 119 __m128 dstGreens = _mm_mul_ps(reds, gX); |
| 120 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY)); |
| 121 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ)); |
| 122 dstGreens = _mm_add_ps(dstGreens, gQ); |
| 123 |
| 124 // Splat bX, bY, bZ, and bQ each across a register. |
| 125 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA); |
| 126 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA); |
| 127 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA); |
| 128 __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA); |
| 129 |
| 130 // dstBlues = bX * reds + bY * greens + bZ * blues + bQ |
| 131 __m128 dstBlues = _mm_mul_ps(reds, bX); |
| 132 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY)); |
| 133 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ)); |
| 134 dstBlues = _mm_add_ps(dstBlues, bQ); |
| 135 |
| 136 // Convert to dst gamma. |
| 137 // Note that the reciprocal of the reciprocal sqrt, is just a fast sqrt. |
| 138 // FIXME (msarett): |
| 139 // Should we be more accurate? |
| 140 dstReds = _mm_rcp_ps(_mm_rsqrt_ps(dstReds)); |
| 141 dstGreens = _mm_rcp_ps(_mm_rsqrt_ps(dstGreens)); |
| 142 dstBlues = _mm_rcp_ps(_mm_rsqrt_ps(dstBlues)); |
| 143 |
| 144 // Clamp floats to 0-255 range. |
| 145 dstReds = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstReds, _mm_set1_
ps(255.0f))); |
| 146 dstGreens = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstGreens, _mm_set1_
ps(255.0f))); |
| 147 dstBlues = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstBlues, _mm_set1_
ps(255.0f))); |
| 148 |
| 149 // Convert to bytes and store to memory. |
| 150 rgba = _mm_and_si128(_mm_set1_epi32(0xFF000000), rgba); |
| 151 #ifdef SK_PMCOLOR_IS_RGBA |
| 152 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds)
); |
| 153 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8)
); |
| 154 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16)
); |
| 155 #else |
| 156 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstBlues)
); |
| 157 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8)
); |
| 158 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstReds), 16)
); |
| 159 #endif |
| 160 _mm_storeu_si128((__m128i*) dst, rgba); |
| 161 |
| 162 dst += 4; |
| 163 src += 4; |
| 164 len -= 4; |
| 165 } |
| 166 |
| 167 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); |
| 168 } |
| 169 |
| 170 #else |
| 171 |
| 172 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i
nt len, |
| 173 const float matrix[16]) { |
| 174 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); |
| 175 } |
| 176 |
| 177 #endif |
| 178 |
| 179 } |
| 180 |
| 181 #endif // SkColorXform_opts_DEFINED |
OLD | NEW |