OLD | NEW |
---|---|
(Empty) | |
1 /* | |
2 * Copyright 2016 Google Inc. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license that can be | |
5 * found in the LICENSE file. | |
6 */ | |
7 | |
8 #ifndef SkColorXform_opts_DEFINED | |
9 #define SkColorXform_opts_DEFINED | |
10 | |
11 #include "SkColorPriv.h" | |
12 | |
13 namespace SK_OPTS_NS { | |
14 | |
// Rounds |v| to the nearest integer and saturates the result to the byte range.
//
//   v >= 254.5           -> 255
//   v <  0.5 (or NaN)    -> 0
//   otherwise            -> (uint8_t) (v + 0.5f), i.e. round half up
//
// NaN is mapped to 0 explicitly.  Callers may hand us NaN (for example the result of sqrtf()
// on a negative value), and converting NaN to an integer type is undefined behavior.
static uint8_t clamp_float_to_byte(float v) {
    // Deliberately written as a negated ">=" rather than "<": every ordered comparison with
    // NaN is false, so NaN takes this branch instead of falling through to the cast below.
    if (!(v >= 0.5f)) {
        return 0;
    }

    if (v >= 254.5f) {
        return 255;
    }

    // Here 0.5 <= v < 254.5, so v + 0.5f lies in [1, 255) and the truncating cast is safe.
    return (uint8_t) (v + 0.5f);
}
24 | |
25 static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_ t* src, int len, | |
26 const float matrix[16]) { | |
27 while (len-- > 0) { | |
28 float srcFloats[3]; | |
29 srcFloats[0] = (float) ((*src >> 0) & 0xFF); | |
30 srcFloats[1] = (float) ((*src >> 8) & 0xFF); | |
31 srcFloats[2] = (float) ((*src >> 16) & 0xFF); | |
32 | |
33 // Convert to linear. | |
34 // TODO (msarett): | |
35 // We should use X^2.2 here instead of X^2. What is the impact on corre ctness? | |
36 // We should be able to get closer to 2.2 at a small performance cost. | |
37 srcFloats[0] = srcFloats[0] * srcFloats[0]; | |
38 srcFloats[1] = srcFloats[1] * srcFloats[1]; | |
39 srcFloats[2] = srcFloats[2] * srcFloats[2]; | |
40 | |
41 // Convert to dst gamut. | |
42 float dstFloats[3]; | |
43 // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost alw ays zero. | |
44 // Should we have another optimized path that avoids the extra addition when they | |
45 // are zero? | |
46 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] + | |
47 srcFloats[2] * matrix[8] + matrix[12]; | |
48 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] + | |
49 srcFloats[2] * matrix[9] + matrix[13]; | |
50 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] + | |
51 srcFloats[2] * matrix[10] + matrix[14]; | |
52 | |
53 // Convert to dst gamma. | |
54 // TODO (msarett): | |
55 // We should use X^(1/2.2) here instead of X^(1/2). What is the impact on correctness? | |
56 // We should be able to get closer to (1/2.2) at a small performance cos t. | |
57 dstFloats[0] = sqrtf(dstFloats[0]); | |
58 dstFloats[1] = sqrtf(dstFloats[1]); | |
59 dstFloats[2] = sqrtf(dstFloats[2]); | |
60 | |
61 *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF), | |
62 clamp_float_to_byte(dstFloats[0]), | |
63 clamp_float_to_byte(dstFloats[1]), | |
64 clamp_float_to_byte(dstFloats[2])); | |
65 | |
66 dst++; | |
67 src++; | |
68 } | |
69 } | |
70 | |
71 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | |
72 | |
73 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i nt len, | |
74 const float matrix[16]) { | |
75 // Load transformation matrix. | |
76 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); | |
77 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); | |
78 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); | |
79 __m128 rQgQbQ = _mm_loadu_ps(&matrix[12]); | |
80 | |
81 while (len >= 4) { | |
82 // Load 4 pixels and convert them to floats. | |
83 __m128i rgba = _mm_loadu_si128((const __m128i*) src); | |
84 __m128i byteMask = _mm_set1_epi32(0xFF); | |
85 __m128 reds = _mm_cvtepi32_ps(_mm_and_si128( rgba, byteMask)); | |
86 __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 8), byteMask)); | |
87 __m128 blues = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16), byteMask)); | |
88 | |
89 // Convert to linear. | |
90 // FIXME (msarett): | |
91 // Should we be more accurate? | |
92 reds = _mm_mul_ps(reds, reds); | |
93 greens = _mm_mul_ps(greens, greens); | |
94 blues = _mm_mul_ps(blues, blues); | |
95 | |
96 // Apply the transformation matrix to dst gamut. | |
97 // FIXME (msarett): | |
98 // rQ, gQ, and bQ are almost always zero. Can we save a couple instruct ions? | |
99 | |
100 // Splat rX, rY, rZ, and rQ each across a register. | |
101 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00); | |
102 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00); | |
103 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00); | |
104 __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x00); | |
105 | |
106 // dstReds = rX * reds + rY * greens + rZ * blues + rQ | |
107 __m128 dstReds = _mm_mul_ps(reds, rX); | |
108 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY)); | |
109 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ)); | |
110 dstReds = _mm_add_ps(dstReds, rQ); | |
111 | |
112 // Splat gX, gY, gZ, and gQ each across a register. | |
113 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55); | |
114 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55); | |
115 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55); | |
116 __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55); | |
117 | |
118 // dstGreens = gX * reds + gY * greens + gZ * blues + gQ | |
119 __m128 dstGreens = _mm_mul_ps(reds, gX); | |
120 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY)); | |
121 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ)); | |
122 dstGreens = _mm_add_ps(dstGreens, gQ); | |
123 | |
124 // Splat bX, bY, bZ, and bQ each across a register. | |
125 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA); | |
126 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA); | |
127 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA); | |
128 __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA); | |
129 | |
130 // dstBlues = bX * reds + bY * greens + bZ * blues + bQ | |
131 __m128 dstBlues = _mm_mul_ps(reds, bX); | |
132 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY)); | |
133 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ)); | |
134 dstBlues = _mm_add_ps(dstBlues, bQ); | |
135 | |
136 // Convert to dst gamma. | |
137 // Note that the reciprocal of the reciprocal sqrt, is just a fast sqrt. | |
138 // FIXME (msarett): | |
139 // Should we be more accurate? | |
140 dstReds = _mm_rcp_ps(_mm_rsqrt_ps(dstReds)); | |
141 dstGreens = _mm_rcp_ps(_mm_rsqrt_ps(dstGreens)); | |
142 dstBlues = _mm_rcp_ps(_mm_rsqrt_ps(dstBlues)); | |
143 | |
144 // Clamp floats to 0-255 range. | |
145 dstReds = _mm_min_ps(_mm_max_ps(dstReds, _mm_setzero_ps()), _mm_set1 _ps(255.0f)); | |
146 dstGreens = _mm_min_ps(_mm_max_ps(dstGreens, _mm_setzero_ps()), _mm_set1 _ps(255.0f)); | |
147 dstBlues = _mm_min_ps(_mm_max_ps(dstBlues, _mm_setzero_ps()), _mm_set1 _ps(255.0f)); | |
148 | |
149 // Convert to bytes and store to memory. | |
150 rgba = _mm_and_si128(_mm_set1_epi32(0xFF000000), rgba); | |
151 #ifdef SK_PMCOLOR_IS_RGBA | |
152 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) ); | |
153 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) ); | |
154 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) ); | |
155 #else | |
156 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstBlues) ); | |
157 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) ); | |
158 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstReds), 16) ); | |
159 #endif | |
160 _mm_storeu_si128((__m128i*) dst, rgba); | |
161 | |
162 dst += 4; | |
163 src += 4; | |
164 len -= 4; | |
165 } | |
166 | |
167 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); | |
168 } | |
169 | |
170 #else | |
171 | |
172 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i nt len, | |
173 const float matrix[16]) { | |
174 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); | |
175 } | |
176 | |
177 #endif | |
178 | |
179 } | |
180 | |
181 #endif // SkColorXform_opts_DEFINED | |
OLD | NEW |