src/opts/SkRasterPipeline_opts.h - Issue 2449243003: Initial implementation of a SkColorSpace_A2B xform

Unified Diff: src/opts/SkRasterPipeline_opts.h

Issue 2449243003: Initial implementation of a SkColorSpace_A2B xform (Closed)

Patch Set: updated implementation to use SkRasterPipeline Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/opts/SkRasterPipeline_opts.h

diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h

index 5c5418be1a44d6d6ece160e288467828f10a75a5..db9a5bf290f270106a0178be146440179ef8ba31 100644

--- a/src/opts/SkRasterPipeline_opts.h

+++ b/src/opts/SkRasterPipeline_opts.h

@@ -9,7 +9,9 @@

#define SkRasterPipeline_opts_DEFINED

#include "SkColorPriv.h"

+#include "SkColorSpace_Base.h"

#include "SkHalf.h"

+#include "SkMatrix44.h"

#include "SkPM4f.h"

#include "SkPM4fPriv.h"

#include "SkRasterPipeline.h"

@@ -134,6 +136,59 @@ SI void SK_VECTORCALL next(TailStage* st, size_t x, size_t tail,

static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \

const SkNf& d, const SkNf& da)

+#define GAMMA_STAGE(name) \

msarett1 2016/11/09 00:01:05 Instead of this, I think I would prefer 6 normal s

mtklein_C 2016/11/09 11:04:36 I think you mean 3 normal stages? Each STAGE invo

raftias 2016/11/10 21:36:06 I did this with fn_1_r/g/b. If we add in specific

+ static SK_ALWAYS_INLINE SkNf name##_kernel(void* ctx, SkNf& s); \

+ SI void SK_VECTORCALL name##_r(BodyStage* st, size_t x, \

+ SkNf r, SkNf g, SkNf b, SkNf a, \

+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \

+ r = name##_kernel(st->ctx, r); \

+ next(st, x, r,g,b,a, dr,dg,db,da); \

+ } \

+ SI void SK_VECTORCALL name##_r(TailStage* st, size_t x, size_t tail, \

+ SkNf r, SkNf g, SkNf b, SkNf a, \

+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \

+ r = name##_kernel(st->ctx, r); \

+ next(st, x,tail, r,g,b,a, dr,dg,db,da); \

+ } \

+ SI void SK_VECTORCALL name##_g(BodyStage* st, size_t x, \

+ SkNf r, SkNf g, SkNf b, SkNf a, \

+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \

+ g = name##_kernel(st->ctx, g); \

+ next(st, x, r,g,b,a, dr,dg,db,da); \

+ } \

+ SI void SK_VECTORCALL name##_g(TailStage* st, size_t x, size_t tail, \

+ SkNf r, SkNf g, SkNf b, SkNf a, \

+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \

+ g = name##_kernel(st->ctx, g); \

+ next(st, x,tail, r,g,b,a, dr,dg,db,da); \

+ } \

+ SI void SK_VECTORCALL name##_b(BodyStage* st, size_t x, \

+ SkNf r, SkNf g, SkNf b, SkNf a, \

+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \

+ b = name##_kernel(st->ctx, b); \

+ next(st, x, r,g,b,a, dr,dg,db,da); \

+ } \

+ SI void SK_VECTORCALL name##_b(TailStage* st, size_t x, size_t tail, \

+ SkNf r, SkNf g, SkNf b, SkNf a, \

+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \

+ b = name##_kernel(st->ctx, b); \

+ next(st, x,tail, r,g,b,a, dr,dg,db,da); \

+ } \

+ SI void SK_VECTORCALL name##_a(BodyStage* st, size_t x, \

msarett1 2016/11/09 00:01:05 All we need to do with "a" is load it and store.

raftias 2016/11/10 21:36:07 It was indeed for CMYK/etc support.

+ SkNf r, SkNf g, SkNf b, SkNf a, \

+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \

+ a = name##_kernel(st->ctx, a); \

+ next(st, x, r,g,b,a, dr,dg,db,da); \

+ } \

+ SI void SK_VECTORCALL name##_a(TailStage* st, size_t x, size_t tail, \

+ SkNf r, SkNf g, SkNf b, SkNf a, \

+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \

+ a = name##_kernel(st->ctx, a); \

+ next(st, x,tail, r,g,b,a, dr,dg,db,da); \

+ } \

+ static SK_ALWAYS_INLINE SkNf name##_kernel(void* ctx, SkNf& s)

SI SkNf inv(const SkNf& x) { return 1.0f - x; }

SI SkNf lerp(const SkNf& from, const SkNf& to, const SkNf& cov) {

@@ -431,6 +486,59 @@ STAGE(store_srgb, false) {

| SkNx_cast<int>(0.5f + 255.0f * a) << SK_A32_SHIFT), (int*)ptr);

}

+STAGE(load_s_linear_rgba, true) {

msarett1 2016/11/09 00:01:05 nit: Follow style conventions from above Use whit

mtklein_C 2016/11/09 11:04:36 Let's call these _8888. That's our common shortha

+ auto ptr = *(const uint32_t**)ctx + x;

+ auto px = load<kIsTail>(tail, ptr);

+ auto to_int = [](const SkNx<N, uint32_t>& v) { return SkNi::Load(&v); };

+ r = (1/255.0f)*SkNx_cast<float>(to_int((px >> 0) & 0xFF));

+ g = (1/255.0f)*SkNx_cast<float>(to_int((px >> 8) & 0xFF));

+ b = (1/255.0f)*SkNx_cast<float>(to_int((px >> 16) & 0xFF));

+ a = (1/255.0f)*SkNx_cast<float>(to_int(px >> 24));

+STAGE(load_s_linear_bgra, true) {

mtklein_C 2016/11/09 11:04:36 How about we write everything in terms of rgba, an

raftias 2016/11/10 21:36:07 I had that before (with that exact name, even), then t

msarett1 2016/11/11 14:36:51 Let's defer to Mike on this one. lgtm, as is.

+ auto ptr = *(const uint32_t**)ctx + x;

+ auto px = load<kIsTail>(tail, ptr);

+ auto to_int = [](const SkNx<N, uint32_t>& v) { return SkNi::Load(&v); };

+ r = (1/255.0f)*SkNx_cast<float>(to_int((px >> 16) & 0xFF));

+ g = (1/255.0f)*SkNx_cast<float>(to_int((px >> 8) & 0xFF));

+ b = (1/255.0f)*SkNx_cast<float>(to_int((px >> 0) & 0xFF));

+ a = (1/255.0f)*SkNx_cast<float>(to_int((px >> 24)));

+// Clamp colors into [0,1] premul (e.g. just before storing back to memory).

raftias 2016/11/08 21:19:58 I noticed when I pulled before uploading that this

msarett1 2016/11/09 00:01:05 I believe the idea is to not waste time clamping w

mtklein_C 2016/11/09 11:04:36 This has now been split into two stages, clamp_0 a

+SI void clamp_01_premul(SkNf& r, SkNf& g, SkNf& b, SkNf& a) {

+ a = SkNf::Max(a, 0.0f);

+ r = SkNf::Max(r, 0.0f);

+ g = SkNf::Max(g, 0.0f);

+ b = SkNf::Max(b, 0.0f);

+ a = SkNf::Min(a, 1.0f);

+ r = SkNf::Min(r, a);

+ g = SkNf::Min(g, a);

+ b = SkNf::Min(b, a);

+STAGE(store_linear_rgba, false) {

+ clamp_01_premul(r,g,b,a);

+ auto ptr = *(uint32_t**)ctx + x;

+ store<kIsTail>(tail, ( SkNx_cast<int>(255.0f * r + 0.5f) << 0

msarett1 2016/11/09 00:01:05 I don't think you need the "+ 0.5f" terms. I thin

mtklein_C 2016/11/09 11:04:36 No, we're doing that to round to the nearest byte

+ | SkNx_cast<int>(255.0f * g + 0.5f) << 8

+ | SkNx_cast<int>(255.0f * b + 0.5f) << 16

+ | SkNx_cast<int>(255.0f * a + 0.5f) << 24 ), (int*)ptr);

+STAGE(store_linear_bgra, false) {

+ clamp_01_premul(r,g,b,a);

+ auto ptr = *(uint32_t**)ctx + x;

+ store<kIsTail>(tail, ( SkNx_cast<int>(255.0f * r + 0.5f) << 16

+ | SkNx_cast<int>(255.0f * g + 0.5f) << 8

+ | SkNx_cast<int>(255.0f * b + 0.5f) << 0

+ | SkNx_cast<int>(255.0f * a + 0.5f) << 24 ), (int*)ptr);

RGBA_XFERMODE(clear) { return 0.0f; }

//RGBA_XFERMODE(src) { return s; } // This would be a no-op stage, so we just omit it.

RGBA_XFERMODE(dst) { return d; }

@@ -490,6 +598,19 @@ STAGE(luminance_to_alpha, true) {

r = g = b = 0;

}

+STAGE(matrix_4x4, true) {

+ const SkMatrix44& mat = *(const SkMatrix44*)ctx;

mtklein_C 2016/11/09 11:04:36 I'd like matrix_4x4 and matrix_4x5 to look and beh

raftias 2016/11/10 21:36:06 They were just different since I had written it an

+ auto fma = [](const SkNf& f, const SkNf& m, const SkNf& a) { return SkNx_fma(f,m,a); };

+ dr = fma(mat.get(0, 0),r, fma(mat.get(0, 1),g, fma(mat.get(0, 2),b, mat.get(0, 3)*a)));

msarett1 2016/11/09 00:01:05 No need for "*a". Actually I think we don't want

mtklein_C 2016/11/09 11:04:36 If we don't *a here, we can't really call this sta

raftias 2016/11/10 21:36:06 It's 3x4 now.

+ dg = fma(mat.get(1, 0),r, fma(mat.get(1, 1),g, fma(mat.get(1, 2),b, mat.get(1, 3)*a)));

+ db = fma(mat.get(2, 0),r, fma(mat.get(2, 1),g, fma(mat.get(2, 2),b, mat.get(2, 3)*a)));

msarett1 2016/11/09 00:01:05 Mike, is it ok that we're destructive to dr, dg, d

mtklein_C 2016/11/09 11:04:36 The pedantic answer is that that depends what you'

raftias 2016/11/10 21:36:06 I'll remove these and put them in temporaries. I j

+ da = fma(mat.get(3, 0),r, fma(mat.get(3, 1),g, fma(mat.get(3, 2),b, mat.get(3, 3)*a)));

+ r = dr;

+ g = dg;

+ b = db;

+ a = da;

STAGE(matrix_4x5, true) {

auto m = (const float*)ctx;

@@ -504,6 +625,169 @@ STAGE(matrix_4x5, true) {

a = A;

}

+static inline Sk4f powNf(const Sk4f& x, float exp) {

mtklein_C 2016/11/09 11:04:36 Generally this file writes static inline as SI. I

raftias 2016/11/10 21:36:06 Acknowledged.

+ return Sk4f{::powf(x[0], exp), ::powf(x[1], exp), ::powf(x[2], exp), ::powf(x[3], exp)};

+static inline Sk8f powNf(const Sk8f& x, float exp) {

+ return Sk8f{::powf(x[0], exp), ::powf(x[1], exp), ::powf(x[2], exp), ::powf(x[3], exp),

+ ::powf(x[4], exp), ::powf(x[5], exp), ::powf(x[6], exp), ::powf(x[7], exp)};

+GAMMA_STAGE(param_gamma) {

+ const SkColorSpaceTransferFn& gamma = *(const SkColorSpaceTransferFn*)ctx;

+ return (s <= gamma.fD).thenElse(gamma.fE * s + gamma.fF,

msarett1 2016/11/09 00:01:05 nit: < instead of <=

raftias 2016/11/10 21:36:06 Done.

+ powNf(s * gamma.fA + gamma.fB, gamma.fG) + gamma.fC);

+static constexpr float kGammaTableSize = 1024;

+GAMMA_STAGE(table_gamma) {

+ constexpr float maxIndex = kGammaTableSize - 1;

+ const float* gammaTables = (const float*)ctx;

mtklein_C 2016/11/09 11:04:36 This name makes it seem like we're going to be usi

raftias 2016/11/10 21:36:06 Acknowledged.

+ s = SkNf::Min(SkNf::Max(maxIndex * s, 0.f), maxIndex);

mtklein_C 2016/11/09 11:04:36 If we're not going to source the 1024 constant fro

raftias 2016/11/10 21:36:06 ApplyTable stores it now.

+ float result[N];

+ for (int i = 0; i < N; ++i) {

+ result[i] = gammaTables[lrintf(s[i])];

+ }

+ return SkNf::Load(result);

+static inline void interp_3d_clut(float dst[3], float src[3], const SkColorLookUpTable* colorLUT) {

msarett1 2016/11/09 00:01:05 This maybe does not need to belong in this file.

mtklein_C 2016/11/09 11:04:36 Why don't we make this a normal, separately-compil

raftias 2016/11/10 21:36:06 Done.

+ // Call the src components x, y, and z.

+ uint8_t maxX = colorLUT->fGridPoints[0] - 1;

+ uint8_t maxY = colorLUT->fGridPoints[1] - 1;

+ uint8_t maxZ = colorLUT->fGridPoints[2] - 1;

+ // An approximate index into each of the three dimensions of the table.

+ float x = src[0] * maxX;

+ float y = src[1] * maxY;

+ float z = src[2] * maxZ;

+ // This gives us the low index for our interpolation.

+ int ix = sk_float_floor2int(x);

+ int iy = sk_float_floor2int(y);

+ int iz = sk_float_floor2int(z);

+ // Make sure the low index is not also the max index.

+ ix = (maxX == ix) ? ix - 1 : ix;

+ iy = (maxY == iy) ? iy - 1 : iy;

+ iz = (maxZ == iz) ? iz - 1 : iz;

+ // Weighting factors for the interpolation.

+ float diffX = x - ix;

+ float diffY = y - iy;

+ float diffZ = z - iz;

+ // Constants to help us navigate the 3D table.

+ // Ex: Assume x = a, y = b, z = c.

+ // table[a * n001 + b * n010 + c * n100] logically equals table[a][b][c].

+ const int n000 = 0;

+ const int n001 = 3 * colorLUT->fGridPoints[1] * colorLUT->fGridPoints[2];

+ const int n010 = 3 * colorLUT->fGridPoints[2];

+ const int n011 = n001 + n010;

+ const int n100 = 3;

+ const int n101 = n100 + n001;

+ const int n110 = n100 + n010;

+ const int n111 = n110 + n001;

+ // Base ptr into the table.

+ const float* ptr = &(colorLUT->table()[ix*n001 + iy*n010 + iz*n100]);

+ // The code below performs a tetrahedral interpolation for each of the three

+ // dst components. Once the tetrahedron containing the interpolation point is

+ // identified, the interpolation is a weighted sum of grid values at the

+ // vertices of the tetrahedron. The claim is that tetrahedral interpolation

+ // provides a more accurate color conversion.

+ // blogs.mathworks.com/steve/2006/11/24/tetrahedral-interpolation-for-colorspace-conversion/

+ //

+ // I have one test image, and visually I can't tell the difference between

+ // tetrahedral and trilinear interpolation. In terms of computation, the

+ // tetrahedral code requires more branches but less computation. The

+ // SampleICC library provides an option for the client to choose either

+ // tetrahedral or trilinear.

+ for (int i = 0; i < 3; i++) {

+ if (diffZ < diffY) {

+ if (diffZ < diffX) {

+ dst[i] = (ptr[n000] + diffZ * (ptr[n110] - ptr[n010]) +

+ diffY * (ptr[n010] - ptr[n000]) +

+ diffX * (ptr[n111] - ptr[n110]));

+ } else if (diffY < diffX) {

+ dst[i] = (ptr[n000] + diffZ * (ptr[n111] - ptr[n011]) +

+ diffY * (ptr[n011] - ptr[n001]) +

+ diffX * (ptr[n001] - ptr[n000]));

+ } else {

+ dst[i] = (ptr[n000] + diffZ * (ptr[n111] - ptr[n011]) +

+ diffY * (ptr[n010] - ptr[n000]) +

+ diffX * (ptr[n011] - ptr[n010]));

+ }

+ } else {

+ if (diffZ < diffX) {

+ dst[i] = (ptr[n000] + diffZ * (ptr[n101] - ptr[n001]) +

+ diffY * (ptr[n111] - ptr[n101]) +

+ diffX * (ptr[n001] - ptr[n000]));

+ } else if (diffY < diffX) {

+ dst[i] = (ptr[n000] + diffZ * (ptr[n100] - ptr[n000]) +

+ diffY * (ptr[n111] - ptr[n101]) +

+ diffX * (ptr[n101] - ptr[n100]));

+ } else {

+ dst[i] = (ptr[n000] + diffZ * (ptr[n100] - ptr[n000]) +

+ diffY * (ptr[n110] - ptr[n100]) +

+ diffX * (ptr[n111] - ptr[n110]));

+ }

+ // Increment the table ptr in order to handle the next component.

+ // Note that this is how the table is designed: all of the nXXX

+ // variables are multiples of 3 because there are 3 output

+ // components.

+ ptr++;

+ }

+STAGE(clut, true) {

mtklein_C 2016/11/09 11:04:36 how about color_lookup_table?

raftias 2016/11/10 21:36:06 Done.

+ const SkColorLookUpTable* colorLUT = (const SkColorLookUpTable*)ctx;

mtklein_C 2016/11/09 11:04:36 Side note: it's going to drive me nuts that we cap

raftias 2016/11/10 21:36:06 I didn't name it, but my guess is that it's becaus

msarett1 2016/11/11 14:36:51 I don't feel strongly about the name. Feel free t

+ float rgb[3];

+ alignas(alignof(SkNf)) float result[3][N];

mtklein_C 2016/11/09 11:04:36 Let's drop the alignment business. SkNf::Load() s

raftias 2016/11/10 21:36:06 Done.

+ for (int i = 0; i < N; ++i) {

+ rgb[0] = r[i];

+ rgb[1] = g[i];

+ rgb[2] = b[i];

+ interp_3d_clut(rgb, rgb, colorLUT);

+ result[0][i] = rgb[0];

+ result[1][i] = rgb[1];

+ result[2][i] = rgb[2];

+ }

+ r = SkNf::Load(result[0]);

+ g = SkNf::Load(result[1]);

+ b = SkNf::Load(result[2]);

+STAGE(labtoxyz, true) {

raftias 2016/11/08 21:19:58 I think this can be expressed as a matrix_4x4 foll

mtklein_C 2016/11/09 11:04:36 I think this is clearer as its own stage. It's pr

raftias 2016/11/10 21:36:06 Done.

+ const auto lab_l = r * 100.f;

+ const auto lab_a = g * 255.f - 128.f;

+ const auto lab_b = b * 255.f - 128.f;

+ auto Y = (lab_l + 16.f) * (1.f/116.f);

+ auto X = lab_a * (1.f/500.f) + Y;

mtklein_C 2016/11/09 11:04:36 One .f is plenty to get these solidly as float con

raftias 2016/11/10 21:36:06 Acknowledged.

+ auto Z = Y - (lab_b * (1.f/200.f));

+ auto cubed = X*X*X;

+ X = (cubed > 0.008856f).thenElse(cubed, (X - (16.f/116.f)) * (1.f/7.787f));

+ cubed = Y*Y*Y;

mtklein_C 2016/11/09 11:04:36 At a glance it looks like cubed must be accumulati

raftias 2016/11/10 21:36:06 Done.

+ Y = (cubed > 0.008856f).thenElse(cubed, (Y - (16.f/116.f)) * (1.f/7.787f));

+ cubed = Z*Z*Z;

+ Z = (cubed > 0.008856f).thenElse(cubed, (Z - (16.f/116.f)) * (1.f/7.787f));

+ // adjust to D50 illuminant

+ X *= 0.96422f;

+ Y *= 1.00000f;

+ Z *= 0.82521f;

+ r = X;

+ g = Y;

+ b = Z;

template <typename Fn>

SI Fn enum_to_Fn(SkRasterPipeline::StockStage st) {

switch (st) {

« src/core/SkColorSpace_A2B.h ('K') | « src/core/SkRasterPipeline.h ('k') | tests/ColorSpaceXformTest.cpp » ('j') | no next file with comments »