Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: src/opts/SkColorXform_opts.h

Issue 2078913003: port to Sk4f (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: nah Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | src/opts/SkNx_neon.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2016 Google Inc. 2 * Copyright 2016 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkColorXform_opts_DEFINED 8 #ifndef SkColorXform_opts_DEFINED
9 #define SkColorXform_opts_DEFINED 9 #define SkColorXform_opts_DEFINED
10 10
11 #include "SkNx.h"
11 #include "SkColorPriv.h" 12 #include "SkColorPriv.h"
12 13
13 namespace SK_OPTS_NS { 14 namespace SK_OPTS_NS {
14 15
15 extern const float linear_from_srgb[256] = { 16 extern const float linear_from_srgb[256] = {
16 0.000000000000000000f, 0.000303526983548838f, 0.000607053967097675f, 0.000910580950646513f, 17 0.000000000000000000f, 0.000303526983548838f, 0.000607053967097675f, 0.000910580950646513f,
17 0.001214107934195350f, 0.001517634917744190f, 0.001821161901293030f, 0.002124688884841860f, 18 0.001214107934195350f, 0.001517634917744190f, 0.001821161901293030f, 0.002124688884841860f,
18 0.002428215868390700f, 0.002731742851939540f, 0.003034518678424960f, 0.003346535763899160f, 19 0.002428215868390700f, 0.002731742851939540f, 0.003034518678424960f, 0.003346535763899160f,
19 0.003676507324047440f, 0.004024717018496310f, 0.004391442037410290f, 0.004776953480693730f, 20 0.003676507324047440f, 0.004024717018496310f, 0.004391442037410290f, 0.004776953480693730f,
20 0.005181516702338390f, 0.005605391624202720f, 0.006048833022857060f, 0.006512090792594470f, 21 0.005181516702338390f, 0.005605391624202720f, 0.006048833022857060f, 0.006512090792594470f,
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after
139 0.751895080583051000f, 0.759299550695091000f, 0.766743616862161000f, 0.774227314218442000f, 140 0.751895080583051000f, 0.759299550695091000f, 0.766743616862161000f, 0.774227314218442000f,
140 0.781750677773962000f, 0.789313742415586000f, 0.796916542907978000f, 0.804559113894567000f, 141 0.781750677773962000f, 0.789313742415586000f, 0.796916542907978000f, 0.804559113894567000f,
141 0.812241489898490000f, 0.819963705323528000f, 0.827725794455034000f, 0.835527791460841000f, 142 0.812241489898490000f, 0.819963705323528000f, 0.827725794455034000f, 0.835527791460841000f,
142 0.843369730392169000f, 0.851251645184515000f, 0.859173569658532000f, 0.867135537520905000f, 143 0.843369730392169000f, 0.851251645184515000f, 0.859173569658532000f, 0.867135537520905000f,
143 0.875137582365205000f, 0.883179737672745000f, 0.891262036813419000f, 0.899384513046529000f, 144 0.875137582365205000f, 0.883179737672745000f, 0.891262036813419000f, 0.899384513046529000f,
144 0.907547199521614000f, 0.915750129279253000f, 0.923993335251873000f, 0.932276850264543000f, 145 0.907547199521614000f, 0.915750129279253000f, 0.923993335251873000f, 0.932276850264543000f,
145 0.940600707035753000f, 0.948964938178195000f, 0.957369576199527000f, 0.965814653503130000f, 146 0.940600707035753000f, 0.948964938178195000f, 0.957369576199527000f, 0.965814653503130000f,
146 0.974300202388861000f, 0.982826255053791000f, 0.991392843592940000f, 1.000000000000000000f, 147 0.974300202388861000f, 0.982826255053791000f, 0.991392843592940000f, 1.000000000000000000f,
147 }; 148 };
148 149
149 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 150 static Sk4f linear_to_2dot2(const Sk4f& x) {
151 // x^(29/64) is a very good approximation of the true value, x^(1/2.2).
152 auto x2 = x.rsqrt(), // x^(-1/2)
153 x32 = x2.rsqrt().rsqrt().rsqrt().rsqrt(), // x^(-1/32)
154 x64 = x32.rsqrt(); // x^(+1/64)
150 155
151 // x^(29/64) is a very good approximation of the true value, x^(1/2.2). 156 // 29 = 32 - 2 - 1
152 static __m128 linear_to_2dot2(__m128 x) { 157 return 255.0f * x2.invert() * x32 * x64.invert();
153 // x^(-1/2)
154 __m128 x2 = _mm_rsqrt_ps(x);
155
156 // x^(-1/32)
157 __m128 x32 = _mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(x2))));
158
159 // x^(+1/64)
160 __m128 x64 = _mm_rsqrt_ps(x32);
161
162 // x^(+29/64) = x^(+1/2) * x^(-1/32) * x^(-1/64)
163 // Note that we also scale to the 0-255 range.
164 // These terms can be combined more minimally with 3 muls and 1 reciprocal. However, this
165 // is faster, because it allows us to start the muls in parallel with the rsqrts.
166 __m128 scale = _mm_set1_ps(255.0f);
167 return _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(scale, _mm_rcp_ps(x2)), x32), _mm_rcp_ps(x64));
168 } 158 }
169 159
170 static __m128 clamp_0_to_255(__m128 x) { 160 static Sk4f clamp_0_to_255(const Sk4f& x) {
171 // The order of the arguments is important here. We want to make sure that NaN 161 // The order of the arguments is important here. We want to make sure that NaN
172 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN. 162 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN.
173 return _mm_min_ps(_mm_max_ps(x, _mm_setzero_ps()), _mm_set1_ps(255.0f)); 163 return Sk4f::Min(Sk4f::Max(x, 0.0f), 255.0f);
174 } 164 }
175 165
176 template <const float (&linear_from_curve)[256]> 166 template <const float (&linear_from_curve)[256]>
177 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len, 167 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,
178 const float matrix[16]) { 168 const float matrix[16]) {
179 // Load transformation matrix. 169 // Load transformation matrix.
180 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); 170 auto rXgXbX = Sk4f::Load(matrix + 0),
181 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); 171 rYgYbY = Sk4f::Load(matrix + 4),
182 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); 172 rZgZbZ = Sk4f::Load(matrix + 8);
183 173
184 while (len >= 4) { 174 while (len >= 4) {
185 // Convert to linear. The look-up table has perfect accuracy. 175 // Convert to linear. The look-up table has perfect accuracy.
186 __m128 reds = _mm_setr_ps(linear_from_curve[(src[0] >> 0) & 0xFF], 176 auto reds = Sk4f{linear_from_curve[(src[0] >> 0) & 0xFF],
187 linear_from_curve[(src[1] >> 0) & 0xFF], 177 linear_from_curve[(src[1] >> 0) & 0xFF],
188 linear_from_curve[(src[2] >> 0) & 0xFF], 178 linear_from_curve[(src[2] >> 0) & 0xFF],
189 linear_from_curve[(src[3] >> 0) & 0xFF]); 179 linear_from_curve[(src[3] >> 0) & 0xFF]};
190 __m128 greens = _mm_setr_ps(linear_from_curve[(src[0] >> 8) & 0xFF], 180 auto greens = Sk4f{linear_from_curve[(src[0] >> 8) & 0xFF],
191 linear_from_curve[(src[1] >> 8) & 0xFF], 181 linear_from_curve[(src[1] >> 8) & 0xFF],
192 linear_from_curve[(src[2] >> 8) & 0xFF], 182 linear_from_curve[(src[2] >> 8) & 0xFF],
193 linear_from_curve[(src[3] >> 8) & 0xFF]); 183 linear_from_curve[(src[3] >> 8) & 0xFF]};
194 __m128 blues = _mm_setr_ps(linear_from_curve[(src[0] >> 16) & 0xFF], 184 auto blues = Sk4f{linear_from_curve[(src[0] >> 16) & 0xFF],
195 linear_from_curve[(src[1] >> 16) & 0xFF], 185 linear_from_curve[(src[1] >> 16) & 0xFF],
196 linear_from_curve[(src[2] >> 16) & 0xFF], 186 linear_from_curve[(src[2] >> 16) & 0xFF],
197 linear_from_curve[(src[3] >> 16) & 0xFF]); 187 linear_from_curve[(src[3] >> 16) & 0xFF]};
198 188
199 // Apply the transformation matrix to dst gamut. 189 // Apply the transformation matrix to dst gamut.
200 // Splat rX, rY, and rZ each across a register. 190 auto dstReds = rXgXbX[0]*reds + rYgYbY[0]*greens + rZgZbZ[0]*blues,
201 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00); 191 dstGreens = rXgXbX[1]*reds + rYgYbY[1]*greens + rZgZbZ[1]*blues,
202 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00); 192 dstBlues = rXgXbX[2]*reds + rYgYbY[2]*greens + rZgZbZ[2]*blues;
203 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00);
204
205 // dstReds = rX * reds + rY * greens + rZ * blues
206 __m128 dstReds = _mm_mul_ps(reds, rX);
207 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY));
208 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ));
209
210 // Splat gX, gY, and gZ each across a register.
211 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55);
212 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55);
213 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55);
214
215 // dstGreens = gX * reds + gY * greens + gZ * blues
216 __m128 dstGreens = _mm_mul_ps(reds, gX);
217 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY));
218 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ));
219
220 // Splat bX, bY, and bZ each across a register.
221 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);
222 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);
223 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);
224
225 // dstBlues = bX * reds + bY * greens + bZ * blues
226 __m128 dstBlues = _mm_mul_ps(reds, bX);
227 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));
228 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ));
229 193
230 // Convert to dst gamma. 194 // Convert to dst gamma.
231 dstReds = linear_to_2dot2(dstReds); 195 dstReds = linear_to_2dot2(dstReds);
232 dstGreens = linear_to_2dot2(dstGreens); 196 dstGreens = linear_to_2dot2(dstGreens);
233 dstBlues = linear_to_2dot2(dstBlues); 197 dstBlues = linear_to_2dot2(dstBlues);
234 198
235 // Clamp floats. 199 // Clamp floats to byte range.
236 dstReds = clamp_0_to_255(dstReds); 200 dstReds = clamp_0_to_255(dstReds);
237 dstGreens = clamp_0_to_255(dstGreens); 201 dstGreens = clamp_0_to_255(dstGreens);
238 dstBlues = clamp_0_to_255(dstBlues); 202 dstBlues = clamp_0_to_255(dstBlues);
239 203
240 // Convert to bytes and store to memory. 204 // Convert to bytes and store to memory.
241 __m128i rgba = _mm_set1_epi32(0xFF000000); 205 auto rgba = (Sk4i{(int)0xFF000000} )
242 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) ); 206 | (SkNx_cast<int>(dstReds) )
243 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) ); 207 | (SkNx_cast<int>(dstGreens) << 8)
244 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) ); 208 | (SkNx_cast<int>(dstBlues) << 16);
245 _mm_storeu_si128((__m128i*) dst, rgba); 209 rgba.store(dst);
246 210
247 dst += 4; 211 dst += 4;
248 src += 4; 212 src += 4;
249 len -= 4; 213 len -= 4;
250 } 214 }
251 215
252 while (len > 0) { 216 while (len > 0) {
253 // Splat the red, green, and blue components. 217 // Splat r,g,b across a register each.
254 __m128 r = _mm_set1_ps(linear_from_curve[(src[0] >> 0) & 0xFF]), 218 auto r = Sk4f{linear_from_curve[(*src >> 0) & 0xFF]},
255 g = _mm_set1_ps(linear_from_curve[(src[0] >> 8) & 0xFF]), 219 g = Sk4f{linear_from_curve[(*src >> 8) & 0xFF]},
256 b = _mm_set1_ps(linear_from_curve[(src[0] >> 16) & 0xFF]); 220 b = Sk4f{linear_from_curve[(*src >> 16) & 0xFF]};
257 221
258 // Apply the transformation matrix to dst gamut. 222 // Apply transformation matrix to dst gamut.
259 __m128 dstPixel = _mm_mul_ps(r, rXgXbX); 223 auto dstPixel = rXgXbX*r + rYgYbY*g + rZgZbZ*b;
260 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(g, rYgYbY));
261 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(b, rZgZbZ));
262 224
263 // Convert to dst gamma. 225 // Convert to dst gamma.
264 dstPixel = linear_to_2dot2(dstPixel); 226 dstPixel = linear_to_2dot2(dstPixel);
265 227
266 // Clamp floats to 0-255 range. 228 // Clamp floats to byte range.
267 dstPixel = clamp_0_to_255(dstPixel); 229 dstPixel = clamp_0_to_255(dstPixel);
268 230
269 // Convert to bytes and store to memory. 231 // Convert to bytes and store to memory.
270 __m128i dstInts = _mm_cvtps_epi32(dstPixel); 232 uint32_t rgba;
271 __m128i dstBytes = _mm_packus_epi16(_mm_packus_epi16(dstInts, dstInts), dstInts); 233 SkNx_cast<uint8_t>(dstPixel).store(&rgba);
272 dstBytes = _mm_or_si128(_mm_set1_epi32(0xFF000000), dstBytes); 234 rgba |= 0xFF000000;
273 _mm_store_ss((float*) dst, _mm_castsi128_ps(dstBytes)); 235 *dst = rgba;
274 236
275 dst += 1; 237 dst += 1;
276 src += 1; 238 src += 1;
277 len -= 1; 239 len -= 1;
278 } 240 }
279 } 241 }
280 242
281 #else
282
283 static uint8_t clamp_float_to_byte(float v) {
284 // The ordering of the logic is a little strange here in order
285 // to make sure we convert NaNs to 0.
286 if (v >= 254.5f) {
287 return 255;
288 } else if (v >= 0.5f) {
289 return (uint8_t) (v + 0.5f);
290 } else {
291 return 0;
292 }
293 }
294
295 template <const float (&linear_from_curve)[256]>
296 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,
297 const float matrix[16]) {
298 while (len-- > 0) {
299 // Convert to linear.
300 float srcFloats[3];
301 srcFloats[0] = linear_from_curve[(*src >> 0) & 0xFF];
302 srcFloats[1] = linear_from_curve[(*src >> 8) & 0xFF];
303 srcFloats[2] = linear_from_curve[(*src >> 16) & 0xFF];
304
305 // Convert to dst gamut.
306 float dstFloats[3];
307 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +
308 srcFloats[2] * matrix[8];
309 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +
310 srcFloats[2] * matrix[9];
311 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +
312 srcFloats[2] * matrix[10];
313
314 // Convert to dst gamma.
315 // Note: pow is really, really slow. We will suffer when SSE2 is not supported.
316 dstFloats[0] = powf(dstFloats[0], (1/2.2f)) * 255.0f;
317 dstFloats[1] = powf(dstFloats[1], (1/2.2f)) * 255.0f;
318 dstFloats[2] = powf(dstFloats[2], (1/2.2f)) * 255.0f;
319
320 *dst = (0xFF << 24) |
321 (clamp_float_to_byte(dstFloats[2]) << 16) |
322 (clamp_float_to_byte(dstFloats[1]) << 8) |
323 (clamp_float_to_byte(dstFloats[0]) << 0);
324
325 dst++;
326 src++;
327 }
328 }
329
330 #endif
331
332 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, int len, 243 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, int len,
333 const float matrix[16]) { 244 const float matrix[16]) {
334 color_xform_RGB1<linear_from_srgb>(dst, src, len, matrix); 245 color_xform_RGB1<linear_from_srgb>(dst, src, len, matrix);
335 } 246 }
336 247
337 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src, int len, 248 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src, int len,
338 const float matrix[16]) { 249 const float matrix[16]) {
339 color_xform_RGB1<linear_from_2dot2>(dst, src, len, matrix); 250 color_xform_RGB1<linear_from_2dot2>(dst, src, len, matrix);
340 } 251 }
341 252
342 } 253 } // namespace SK_OPTS_NS
343 254
344 #endif // SkColorXform_opts_DEFINED 255 #endif // SkColorXform_opts_DEFINED
OLDNEW
« no previous file with comments | « no previous file | src/opts/SkNx_neon.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698