src/opts/SkColorXform_opts.h - Issue 2078913003: port to Sk4f

Side by Side Diff: src/opts/SkColorXform_opts.h

Issue 2078913003: port to Sk4f (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: nah Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2016 Google Inc.	2 * Copyright 2016 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkColorXform_opts_DEFINED	8 #ifndef SkColorXform_opts_DEFINED

9 #define SkColorXform_opts_DEFINED	9 #define SkColorXform_opts_DEFINED

10	10

	11 #include "SkNx.h"

11 #include "SkColorPriv.h"	12 #include "SkColorPriv.h"

12	13

13 namespace SK_OPTS_NS {	14 namespace SK_OPTS_NS {

14	15

15 extern const float linear_from_srgb[256] = {	16 extern const float linear_from_srgb[256] = {

16 0.000000000000000000f, 0.000303526983548838f, 0.000607053967097675f, 0.000910580950646513f,	17 0.000000000000000000f, 0.000303526983548838f, 0.000607053967097675f, 0.000910580950646513f,

17 0.001214107934195350f, 0.001517634917744190f, 0.001821161901293030f, 0.002124688884841860f,	18 0.001214107934195350f, 0.001517634917744190f, 0.001821161901293030f, 0.002124688884841860f,

18 0.002428215868390700f, 0.002731742851939540f, 0.003034518678424960f, 0.003346535763899160f,	19 0.002428215868390700f, 0.002731742851939540f, 0.003034518678424960f, 0.003346535763899160f,

19 0.003676507324047440f, 0.004024717018496310f, 0.004391442037410290f, 0.004776953480693730f,	20 0.003676507324047440f, 0.004024717018496310f, 0.004391442037410290f, 0.004776953480693730f,

20 0.005181516702338390f, 0.005605391624202720f, 0.006048833022857060f, 0.006512090792594470f,	21 0.005181516702338390f, 0.005605391624202720f, 0.006048833022857060f, 0.006512090792594470f,

(...skipping 118 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
139 0.751895080583051000f, 0.759299550695091000f, 0.766743616862161000f, 0.774227314218442000f,	140 0.751895080583051000f, 0.759299550695091000f, 0.766743616862161000f, 0.774227314218442000f,

140 0.781750677773962000f, 0.789313742415586000f, 0.796916542907978000f, 0.804559113894567000f,	141 0.781750677773962000f, 0.789313742415586000f, 0.796916542907978000f, 0.804559113894567000f,

141 0.812241489898490000f, 0.819963705323528000f, 0.827725794455034000f, 0.835527791460841000f,	142 0.812241489898490000f, 0.819963705323528000f, 0.827725794455034000f, 0.835527791460841000f,

142 0.843369730392169000f, 0.851251645184515000f, 0.859173569658532000f, 0.867135537520905000f,	143 0.843369730392169000f, 0.851251645184515000f, 0.859173569658532000f, 0.867135537520905000f,

143 0.875137582365205000f, 0.883179737672745000f, 0.891262036813419000f, 0.899384513046529000f,	144 0.875137582365205000f, 0.883179737672745000f, 0.891262036813419000f, 0.899384513046529000f,

144 0.907547199521614000f, 0.915750129279253000f, 0.923993335251873000f, 0.932276850264543000f,	145 0.907547199521614000f, 0.915750129279253000f, 0.923993335251873000f, 0.932276850264543000f,

145 0.940600707035753000f, 0.948964938178195000f, 0.957369576199527000f, 0.965814653503130000f,	146 0.940600707035753000f, 0.948964938178195000f, 0.957369576199527000f, 0.965814653503130000f,

146 0.974300202388861000f, 0.982826255053791000f, 0.991392843592940000f, 1.000000000000000000f,	147 0.974300202388861000f, 0.982826255053791000f, 0.991392843592940000f, 1.000000000000000000f,

147 };	148 };

148	149

149 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2	150 static Sk4f linear_to_2dot2(const Sk4f& x) {

	151 // x^(29/64) is a very good approximation of the true value, x^(1/2.2).

	152 auto x2 = x.rsqrt(), // x^(-1/2)

	153 x32 = x2.rsqrt().rsqrt().rsqrt().rsqrt(), // x^(-1/32)

	154 x64 = x32.rsqrt(); // x^(+1/64)

150	155

151 // x^(29/64) is a very good approximation of the true value, x^(1/2.2).	156 // 29 = 32 - 2 - 1

152 static __m128 linear_to_2dot2(__m128 x) {	157 return 255.0f * x2.invert() * x32 * x64.invert();

153 // x^(-1/2)

154 __m128 x2 = _mm_rsqrt_ps(x);

155

156 // x^(-1/32)

157 __m128 x32 = _mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(x2))));

158

159 // x^(+1/64)

160 __m128 x64 = _mm_rsqrt_ps(x32);

161

162 // x^(+29/64) = x^(+1/2) * x^(-1/32) * x^(-1/64)

163 // Note that we also scale to the 0-255 range.

164 // These terms can be combined more minimally with 3 muls and 1 reciprocal. However, this

165 // is faster, because it allows us to start the muls in parallel with the rsqrts.

166 __m128 scale = _mm_set1_ps(255.0f);

167 return _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(scale, _mm_rcp_ps(x2)), x32), _mm_rcp_ps(x64));

168 }	158 }

169	159

170 static __m128 clamp_0_to_255(__m128 x) {	160 static Sk4f clamp_0_to_255(const Sk4f& x) {

171 // The order of the arguments is important here. We want to make sure that NaN	161 // The order of the arguments is important here. We want to make sure that NaN

172 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN.	162 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN.

173 return _mm_min_ps(_mm_max_ps(x, _mm_setzero_ps()), _mm_set1_ps(255.0f));	163 return Sk4f::Min(Sk4f::Max(x, 0.0f), 255.0f);

174 }	164 }

175	165

176 template <const float (&linear_from_curve)[256]>	166 template <const float (&linear_from_curve)[256]>

177 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,	167 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,

178 const float matrix[16]) {	168 const float matrix[16]) {

179 // Load transformation matrix.	169 // Load transformation matrix.

180 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]);	170 auto rXgXbX = Sk4f::Load(matrix + 0),

181 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]);	171 rYgYbY = Sk4f::Load(matrix + 4),

182 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]);	172 rZgZbZ = Sk4f::Load(matrix + 8);

183	173

184 while (len >= 4) {	174 while (len >= 4) {

185 // Convert to linear. The look-up table has perfect accuracy.	175 // Convert to linear. The look-up table has perfect accuracy.

186 __m128 reds = _mm_setr_ps(linear_from_curve[(src[0] >> 0) & 0xFF],	176 auto reds = Sk4f{linear_from_curve[(src[0] >> 0) & 0xFF],

187 linear_from_curve[(src[1] >> 0) & 0xFF],	177 linear_from_curve[(src[1] >> 0) & 0xFF],

188 linear_from_curve[(src[2] >> 0) & 0xFF],	178 linear_from_curve[(src[2] >> 0) & 0xFF],

189 linear_from_curve[(src[3] >> 0) & 0xFF]);	179 linear_from_curve[(src[3] >> 0) & 0xFF]};

190 __m128 greens = _mm_setr_ps(linear_from_curve[(src[0] >> 8) & 0xFF],	180 auto greens = Sk4f{linear_from_curve[(src[0] >> 8) & 0xFF],

191 linear_from_curve[(src[1] >> 8) & 0xFF],	181 linear_from_curve[(src[1] >> 8) & 0xFF],

192 linear_from_curve[(src[2] >> 8) & 0xFF],	182 linear_from_curve[(src[2] >> 8) & 0xFF],

193 linear_from_curve[(src[3] >> 8) & 0xFF]);	183 linear_from_curve[(src[3] >> 8) & 0xFF]};

194 __m128 blues = _mm_setr_ps(linear_from_curve[(src[0] >> 16) & 0xFF],	184 auto blues = Sk4f{linear_from_curve[(src[0] >> 16) & 0xFF],

195 linear_from_curve[(src[1] >> 16) & 0xFF],	185 linear_from_curve[(src[1] >> 16) & 0xFF],

196 linear_from_curve[(src[2] >> 16) & 0xFF],	186 linear_from_curve[(src[2] >> 16) & 0xFF],

197 linear_from_curve[(src[3] >> 16) & 0xFF]);	187 linear_from_curve[(src[3] >> 16) & 0xFF]};

198	188

199 // Apply the transformation matrix to dst gamut.	189 // Apply the transformation matrix to dst gamut.

200 // Splat rX, rY, and rZ each across a register.	190 auto dstReds = rXgXbX[0]*reds + rYgYbY[0]*greens + rZgZbZ[0]*blues,

201 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00);	191 dstGreens = rXgXbX[1]*reds + rYgYbY[1]*greens + rZgZbZ[1]*blues,

202 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00);	192 dstBlues = rXgXbX[2]*reds + rYgYbY[2]*greens + rZgZbZ[2]*blues;

203 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00);

204

205 // dstReds = rX * reds + rY * greens + rZ * blues

206 __m128 dstReds = _mm_mul_ps(reds, rX);

207 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY));

208 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ));

209

210 // Splat gX, gY, and gZ each across a register.

211 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55);

212 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55);

213 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55);

214

215 // dstGreens = gX * reds + gY * greens + gZ * blues

216 __m128 dstGreens = _mm_mul_ps(reds, gX);

217 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY));

218 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ));

219

220 // Splat bX, bY, and bZ each across a register.

221 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);

222 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);

223 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);

224

225 // dstBlues = bX * reds + bY * greens + bZ * blues

226 __m128 dstBlues = _mm_mul_ps(reds, bX);

227 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));

228 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ));

229	193

230 // Convert to dst gamma.	194 // Convert to dst gamma.

231 dstReds = linear_to_2dot2(dstReds);	195 dstReds = linear_to_2dot2(dstReds);

232 dstGreens = linear_to_2dot2(dstGreens);	196 dstGreens = linear_to_2dot2(dstGreens);

233 dstBlues = linear_to_2dot2(dstBlues);	197 dstBlues = linear_to_2dot2(dstBlues);

234	198

235 // Clamp floats.	199 // Clamp floats to byte range.

236 dstReds = clamp_0_to_255(dstReds);	200 dstReds = clamp_0_to_255(dstReds);

237 dstGreens = clamp_0_to_255(dstGreens);	201 dstGreens = clamp_0_to_255(dstGreens);

238 dstBlues = clamp_0_to_255(dstBlues);	202 dstBlues = clamp_0_to_255(dstBlues);

239	203

240 // Convert to bytes and store to memory.	204 // Convert to bytes and store to memory.

241 __m128i rgba = _mm_set1_epi32(0xFF000000);	205 auto rgba = (Sk4i{(int)0xFF000000} )

242 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) );	206 | (SkNx_cast<int>(dstReds) )

243 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );	207 | (SkNx_cast<int>(dstGreens) << 8)

244 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) );	208 | (SkNx_cast<int>(dstBlues) << 16);

245 _mm_storeu_si128((__m128i*) dst, rgba);	209 rgba.store(dst);

246	210

247 dst += 4;	211 dst += 4;

248 src += 4;	212 src += 4;

249 len -= 4;	213 len -= 4;

250 }	214 }

251	215

252 while (len > 0) {	216 while (len > 0) {

253 // Splat the red, green, and blue components.	217 // Splat r,g,b across a register each.

254 __m128 r = _mm_set1_ps(linear_from_curve[(src[0] >> 0) & 0xFF]),	218 auto r = Sk4f{linear_from_curve[(*src >> 0) & 0xFF]},

255 g = _mm_set1_ps(linear_from_curve[(src[0] >> 8) & 0xFF]),	219 g = Sk4f{linear_from_curve[(*src >> 8) & 0xFF]},

256 b = _mm_set1_ps(linear_from_curve[(src[0] >> 16) & 0xFF]);	220 b = Sk4f{linear_from_curve[(*src >> 16) & 0xFF]};

257	221

258 // Apply the transformation matrix to dst gamut.	222 // Apply transformation matrix to dst gamut.

259 __m128 dstPixel = _mm_mul_ps(r, rXgXbX);	223 auto dstPixel = rXgXbX*r + rYgYbY*g + rZgZbZ*b;

260 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(g, rYgYbY));

261 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(b, rZgZbZ));

262	224

263 // Convert to dst gamma.	225 // Convert to dst gamma.

264 dstPixel = linear_to_2dot2(dstPixel);	226 dstPixel = linear_to_2dot2(dstPixel);

265	227

266 // Clamp floats to 0-255 range.	228 // Clamp floats to byte range.

267 dstPixel = clamp_0_to_255(dstPixel);	229 dstPixel = clamp_0_to_255(dstPixel);

268	230

269 // Convert to bytes and store to memory.	231 // Convert to bytes and store to memory.

270 __m128i dstInts = _mm_cvtps_epi32(dstPixel);	232 uint32_t rgba;

271 __m128i dstBytes = _mm_packus_epi16(_mm_packus_epi16(dstInts, dstInts), dstInts);	233 SkNx_cast<uint8_t>(dstPixel).store(&rgba);

272 dstBytes = _mm_or_si128(_mm_set1_epi32(0xFF000000), dstBytes);	234 rgba |= 0xFF000000;

273 _mm_store_ss((float*) dst, _mm_castsi128_ps(dstBytes));	235 *dst = rgba;

274	236

275 dst += 1;	237 dst += 1;

276 src += 1;	238 src += 1;

277 len -= 1;	239 len -= 1;

278 }	240 }

279 }	241 }

280	242

281 #else

282

283 static uint8_t clamp_float_to_byte(float v) {

284 // The ordering of the logic is a little strange here in order

285 // to make sure we convert NaNs to 0.

286 if (v >= 254.5f) {

287 return 255;

288 } else if (v >= 0.5f) {

289 return (uint8_t) (v + 0.5f);

290 } else {

291 return 0;

292 }

293 }

294

295 template <const float (&linear_from_curve)[256]>

296 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,

297 const float matrix[16]) {

298 while (len-- > 0) {

299 // Convert to linear.

300 float srcFloats[3];

301 srcFloats[0] = linear_from_curve[(*src >> 0) & 0xFF];

302 srcFloats[1] = linear_from_curve[(*src >> 8) & 0xFF];

303 srcFloats[2] = linear_from_curve[(*src >> 16) & 0xFF];

304

305 // Convert to dst gamut.

306 float dstFloats[3];

307 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +

308 srcFloats[2] * matrix[8];

309 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +

310 srcFloats[2] * matrix[9];

311 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +

312 srcFloats[2] * matrix[10];

313

314 // Convert to dst gamma.

315 // Note: pow is really, really slow. We will suffer when SSE2 is not supported.

316 dstFloats[0] = powf(dstFloats[0], (1/2.2f)) * 255.0f;

317 dstFloats[1] = powf(dstFloats[1], (1/2.2f)) * 255.0f;

318 dstFloats[2] = powf(dstFloats[2], (1/2.2f)) * 255.0f;

319

320 *dst = (0xFF << 24) |

321 (clamp_float_to_byte(dstFloats[2]) << 16) |

322 (clamp_float_to_byte(dstFloats[1]) << 8) |

323 (clamp_float_to_byte(dstFloats[0]) << 0);

324

325 dst++;

326 src++;

327 }

328 }

329

330 #endif

331

332 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, int len,	243 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, int len,

333 const float matrix[16]) {	244 const float matrix[16]) {

334 color_xform_RGB1<linear_from_srgb>(dst, src, len, matrix);	245 color_xform_RGB1<linear_from_srgb>(dst, src, len, matrix);

335 }	246 }

336	247

337 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src, int len,	248 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src, int len,

338 const float matrix[16]) {	249 const float matrix[16]) {

339 color_xform_RGB1<linear_from_2dot2>(dst, src, len, matrix);	250 color_xform_RGB1<linear_from_2dot2>(dst, src, len, matrix);

340 }	251 }

341	252

342 }	253 } // namespace SK_OPTS_NS

343	254

344 #endif // SkColorXform_opts_DEFINED	255 #endif // SkColorXform_opts_DEFINED

OLD	NEW

« no previous file with comments | « no previous file | src/opts/SkNx_neon.h » ('j') | no next file with comments »