Index: src/opts/SkRasterPipeline_opts.h |
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h |
index 5c5418be1a44d6d6ece160e288467828f10a75a5..db9a5bf290f270106a0178be146440179ef8ba31 100644 |
--- a/src/opts/SkRasterPipeline_opts.h |
+++ b/src/opts/SkRasterPipeline_opts.h |
@@ -9,7 +9,9 @@ |
#define SkRasterPipeline_opts_DEFINED |
#include "SkColorPriv.h" |
+#include "SkColorSpace_Base.h" |
#include "SkHalf.h" |
+#include "SkMatrix44.h" |
#include "SkPM4f.h" |
#include "SkPM4fPriv.h" |
#include "SkRasterPipeline.h" |
@@ -134,6 +136,59 @@ SI void SK_VECTORCALL next(TailStage* st, size_t x, size_t tail, |
static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \ |
const SkNf& d, const SkNf& da) |
+/* GAMMA_STAGE(name) first declares name##_kernel(void* ctx, SkNf& s), then |
+ * defines the stages name##_r, name##_g, name##_b and name##_a, each with a |
+ * BodyStage overload and a TailStage overload (the latter also takes tail). |
+ * Every generated stage replaces exactly one channel with |
+ * name##_kernel(st->ctx, channel), leaves all other arguments as they were, and |
+ * forwards everything to next(). The macro ends with the kernel signature, so |
+ * the braces written after GAMMA_STAGE(name) become the kernel body. */ |
+#define GAMMA_STAGE(name) \ |
msarett1
2016/11/09 00:01:05
Instead of this, I think I would prefer 6 normal s
mtklein_C
2016/11/09 11:04:36
I think you mean 3 normal stages? Each STAGE invo
raftias
2016/11/10 21:36:06
I did this with fn_1_r/g/b. If we add in specific
|
+ static SK_ALWAYS_INLINE SkNf name##_kernel(void* ctx, SkNf& s); \ |
+ SI void SK_VECTORCALL name##_r(BodyStage* st, size_t x, \ |
+ SkNf r, SkNf g, SkNf b, SkNf a, \ |
+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \ |
+ r = name##_kernel(st->ctx, r); \ |
+ next(st, x, r,g,b,a, dr,dg,db,da); \ |
+ } \ |
+ SI void SK_VECTORCALL name##_r(TailStage* st, size_t x, size_t tail, \ |
+ SkNf r, SkNf g, SkNf b, SkNf a, \ |
+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \ |
+ r = name##_kernel(st->ctx, r); \ |
+ next(st, x,tail, r,g,b,a, dr,dg,db,da); \ |
+ } \ |
+ SI void SK_VECTORCALL name##_g(BodyStage* st, size_t x, \ |
+ SkNf r, SkNf g, SkNf b, SkNf a, \ |
+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \ |
+ g = name##_kernel(st->ctx, g); \ |
+ next(st, x, r,g,b,a, dr,dg,db,da); \ |
+ } \ |
+ SI void SK_VECTORCALL name##_g(TailStage* st, size_t x, size_t tail, \ |
+ SkNf r, SkNf g, SkNf b, SkNf a, \ |
+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \ |
+ g = name##_kernel(st->ctx, g); \ |
+ next(st, x,tail, r,g,b,a, dr,dg,db,da); \ |
+ } \ |
+ SI void SK_VECTORCALL name##_b(BodyStage* st, size_t x, \ |
+ SkNf r, SkNf g, SkNf b, SkNf a, \ |
+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \ |
+ b = name##_kernel(st->ctx, b); \ |
+ next(st, x, r,g,b,a, dr,dg,db,da); \ |
+ } \ |
+ SI void SK_VECTORCALL name##_b(TailStage* st, size_t x, size_t tail, \ |
+ SkNf r, SkNf g, SkNf b, SkNf a, \ |
+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \ |
+ b = name##_kernel(st->ctx, b); \ |
+ next(st, x,tail, r,g,b,a, dr,dg,db,da); \ |
+ } \ |
+ SI void SK_VECTORCALL name##_a(BodyStage* st, size_t x, \ |
msarett1
2016/11/09 00:01:05
All we need to do with "a" is load it and store.
raftias
2016/11/10 21:36:07
It was indeed for CMYK/etc support.
|
+ SkNf r, SkNf g, SkNf b, SkNf a, \ |
+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \ |
+ a = name##_kernel(st->ctx, a); \ |
+ next(st, x, r,g,b,a, dr,dg,db,da); \ |
+ } \ |
+ SI void SK_VECTORCALL name##_a(TailStage* st, size_t x, size_t tail, \ |
+ SkNf r, SkNf g, SkNf b, SkNf a, \ |
+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \ |
+ a = name##_kernel(st->ctx, a); \ |
+ next(st, x,tail, r,g,b,a, dr,dg,db,da); \ |
+ } \ |
+ static SK_ALWAYS_INLINE SkNf name##_kernel(void* ctx, SkNf& s) |
+ |
SI SkNf inv(const SkNf& x) { return 1.0f - x; } |
SI SkNf lerp(const SkNf& from, const SkNf& to, const SkNf& cov) { |
@@ -431,6 +486,59 @@ STAGE(store_srgb, false) { |
| SkNx_cast<int>(0.5f + 255.0f * a) << SK_A32_SHIFT), (int*)ptr); |
} |
+/* Reads packed 32-bit pixels starting at (*(const uint32_t**)ctx) + x and |
+ * unpacks each 8-bit field into a float scaled by 1/255, with no transfer |
+ * function applied: r = bits 0-7, g = bits 8-15, b = bits 16-23, a = bits 24-31. |
+ * to_int reloads the uint32 lanes as SkNi so SkNx_cast<float> can convert them. |
+ * NOTE(review): kIsTail, tail and N presumably come from the STAGE macro, which |
+ * is defined outside this hunk - confirm there. |
+ */ STAGE(load_s_linear_rgba, true) { |
msarett1
2016/11/09 00:01:05
nit: Follow style conventions from above
Use whit
mtklein_C
2016/11/09 11:04:36
Let's call these _8888. That's our common shortha
|
+ auto ptr = *(const uint32_t**)ctx + x; |
+ |
+ auto px = load<kIsTail>(tail, ptr); |
+ auto to_int = [](const SkNx<N, uint32_t>& v) { return SkNi::Load(&v); }; |
+ r = (1/255.0f)*SkNx_cast<float>(to_int((px >> 0) & 0xFF)); |
+ g = (1/255.0f)*SkNx_cast<float>(to_int((px >> 8) & 0xFF)); |
+ b = (1/255.0f)*SkNx_cast<float>(to_int((px >> 16) & 0xFF)); |
+ a = (1/255.0f)*SkNx_cast<float>(to_int(px >> 24)); |
+} |
+ |
+/* Same unpacking as the rgba load, but with the red and blue fields swapped: |
+ * reads packed 32-bit pixels starting at (*(const uint32_t**)ctx) + x and |
+ * scales each 8-bit field by 1/255 with no transfer function applied: |
+ * r = bits 16-23, g = bits 8-15, b = bits 0-7, a = bits 24-31. |
+ * to_int reloads the uint32 lanes as SkNi so SkNx_cast<float> can convert them. |
+ */ STAGE(load_s_linear_bgra, true) { |
mtklein_C
2016/11/09 11:04:36
How about we write everything in terms of rgba, an
raftias
2016/11/10 21:36:07
I that before (with that exact name, even), then t
msarett1
2016/11/11 14:36:51
Let's defer to Mike on this one. lgtm, as is.
|
+ auto ptr = *(const uint32_t**)ctx + x; |
+ |
+ auto px = load<kIsTail>(tail, ptr); |
+ auto to_int = [](const SkNx<N, uint32_t>& v) { return SkNi::Load(&v); }; |
+ r = (1/255.0f)*SkNx_cast<float>(to_int((px >> 16) & 0xFF)); |
+ g = (1/255.0f)*SkNx_cast<float>(to_int((px >> 8) & 0xFF)); |
+ b = (1/255.0f)*SkNx_cast<float>(to_int((px >> 0) & 0xFF)); |
+ a = (1/255.0f)*SkNx_cast<float>(to_int((px >> 24))); |
+} |
+ |
+// Clamp colors into [0,1] premul (e.g. just before storing back to memory). |
raftias
2016/11/08 21:19:58
I noticed when I pulled before uploading that this
msarett1
2016/11/09 00:01:05
I believe the idea is to not waste time clamping w
mtklein_C
2016/11/09 11:04:36
This has now been split into two stages, clamp_0 a
|
+// Clamps a premultiplied color in place: alpha is limited to [0,1] and each of |
+// r, g, b is limited to [0, alpha]. |
+SI void clamp_01_premul(SkNf& r, SkNf& g, SkNf& b, SkNf& a) { |
+    // Alpha is settled first because it is the upper bound for the color channels. |
+    a = SkNf::Min(SkNf::Max(a, 0.0f), 1.0f); |
+ |
+    // Lower bound 0, upper bound the already-clamped alpha. |
+    r = SkNf::Min(SkNf::Max(r, 0.0f), a); |
+    g = SkNf::Min(SkNf::Max(g, 0.0f), a); |
+    b = SkNf::Min(SkNf::Max(b, 0.0f), a); |
+} |
+ |
+/* Clamps (r,g,b,a) with clamp_01_premul, scales each channel by 255, adds 0.5f |
+ * before the int cast (to round to the nearest byte, per the review reply |
+ * below), and stores the packed pixels at (*(uint32_t**)ctx) + x with |
+ * r << 0, g << 8, b << 16, a << 24. No transfer function is applied. |
+ */ STAGE(store_linear_rgba, false) { |
+ clamp_01_premul(r,g,b,a); |
+ auto ptr = *(uint32_t**)ctx + x; |
+ store<kIsTail>(tail, ( SkNx_cast<int>(255.0f * r + 0.5f) << 0 |
msarett1
2016/11/09 00:01:05
I don't think you need the "+ 0.5f" terms. I thin
mtklein_C
2016/11/09 11:04:36
No, we're doing that to round to the nearest byte
|
+ | SkNx_cast<int>(255.0f * g + 0.5f) << 8 |
+ | SkNx_cast<int>(255.0f * b + 0.5f) << 16 |
+ | SkNx_cast<int>(255.0f * a + 0.5f) << 24 ), (int*)ptr); |
+} |
+ |
+// Clamps the premultiplied color, converts each channel to an 8-bit value and |
+// stores the packed pixels at (*(uint32_t**)ctx) + x in BGRA order: |
+// b << 0, g << 8, r << 16, a << 24. No transfer function is applied. |
+STAGE(store_linear_bgra, false) { |
+    clamp_01_premul(r,g,b,a); |
+ |
+    // Scale to [0,255]; the 0.5f bias before the int cast rounds to the nearest byte. |
+    auto to_byte = [](const SkNf& v) { return SkNx_cast<int>(255.0f * v + 0.5f); }; |
+ |
+    auto dst = *(uint32_t**)ctx + x; |
+    store<kIsTail>(tail, ( to_byte(b) <<  0 |
+                         | to_byte(g) <<  8 |
+                         | to_byte(r) << 16 |
+                         | to_byte(a) << 24 ), (int*)dst); |
+} |
+ |
RGBA_XFERMODE(clear) { return 0.0f; } |
//RGBA_XFERMODE(src) { return s; } // This would be a no-op stage, so we just omit it. |
RGBA_XFERMODE(dst) { return d; } |
@@ -490,6 +598,19 @@ STAGE(luminance_to_alpha, true) { |
r = g = b = 0; |
} |
+// Multiplies the color by the 4x4 SkMatrix44 that ctx points to: |
+//   out[i] = m(i,0)*r + m(i,1)*g + m(i,2)*b + m(i,3)*a,   i = 0..3. |
+// All four sums are computed from the incoming r,g,b,a and held in locals |
+// before anything is written back. dr/dg/db/da are deliberately not used as |
+// scratch space, so the destination color is left unmodified by this stage. |
+STAGE(matrix_4x4, true) { |
+    const SkMatrix44& mat = *(const SkMatrix44*)ctx; |
mtklein_C
2016/11/09 11:04:36
I'd like matrix_4x4 and matrix_4x5 to look and beh
raftias
2016/11/10 21:36:06
They were just different since I had written it an
|
+    auto fma = [](const SkNf& f, const SkNf& m, const SkNf& a) { return SkNx_fma(f,m,a); }; |
+    const auto R = fma(mat.get(0, 0),r, fma(mat.get(0, 1),g, fma(mat.get(0, 2),b, mat.get(0, 3)*a))); |
msarett1
2016/11/09 00:01:05
No need for "*a". Actually I think we don't want
mtklein_C
2016/11/09 11:04:36
If we don't *a here, we can't really call this sta
raftias
2016/11/10 21:36:06
It's 3x4 now.
|
+    const auto G = fma(mat.get(1, 0),r, fma(mat.get(1, 1),g, fma(mat.get(1, 2),b, mat.get(1, 3)*a))); |
+    const auto B = fma(mat.get(2, 0),r, fma(mat.get(2, 1),g, fma(mat.get(2, 2),b, mat.get(2, 3)*a))); |
msarett1
2016/11/09 00:01:05
Mike, is it ok that we're destructive to dr, dg, d
mtklein_C
2016/11/09 11:04:36
The pedantic answer is that that depends what you'
raftias
2016/11/10 21:36:06
I'll remove these and put them in temporaries. I j
|
+    const auto A = fma(mat.get(3, 0),r, fma(mat.get(3, 1),g, fma(mat.get(3, 2),b, mat.get(3, 3)*a))); |
+    r = R; |
+    g = G; |
+    b = B; |
+    a = A; |
+} |
+ |
STAGE(matrix_4x5, true) { |
auto m = (const float*)ctx; |
@@ -504,6 +625,169 @@ STAGE(matrix_4x5, true) { |
a = A; |
} |
+/* Returns a vector whose lane i is ::powf(x[i], exp), for the four lanes of x. |
+ * The same scalar exponent is used for every lane. |
+ */ static inline Sk4f powNf(const Sk4f& x, float exp) { |
mtklein_C
2016/11/09 11:04:36
Generally this file writes static inline as SI. I
raftias
2016/11/10 21:36:06
Acknowledged.
|
+ return Sk4f{::powf(x[0], exp), ::powf(x[1], exp), ::powf(x[2], exp), ::powf(x[3], exp)}; |
+} |
+ |
+// Eight-lane overload: lane i of the result is ::powf(x[i], exp), with the same |
+// scalar exponent applied to every lane. |
+static inline Sk8f powNf(const Sk8f& x, float exp) { |
+    const auto raise = [exp](float lane) { return ::powf(lane, exp); }; |
+    return Sk8f{raise(x[0]), raise(x[1]), raise(x[2]), raise(x[3]), |
+                raise(x[4]), raise(x[5]), raise(x[6]), raise(x[7])}; |
+} |
+ |
+// Per-channel kernel for a parametric transfer function. ctx points to an |
+// SkColorSpaceTransferFn; each lane of s is mapped as |
+//   s <  fD :  fE*s + fF |
+//   s >= fD :  (fA*s + fB)^fG + fC |
+GAMMA_STAGE(param_gamma) { |
+    const SkColorSpaceTransferFn& gamma = *(const SkColorSpaceTransferFn*)ctx; |
+    // Strict '<': the breakpoint s == fD belongs to the power segment, not the linear one. |
+    return (s < gamma.fD).thenElse(gamma.fE * s + gamma.fF, |
msarett1
2016/11/09 00:01:05
nit: < instead of <=
raftias
2016/11/10 21:36:06
Done.
|
+                                   powNf(s * gamma.fA + gamma.fB, gamma.fG) + gamma.fC); |
+} |
+ |
+/* Entry count used by table_gamma to scale and clamp its index (stored as a float). */ static constexpr float kGammaTableSize = 1024; |
+/* table_gamma kernel: ctx is read as a const float* table. Each lane of s is |
+ * scaled by maxIndex (kGammaTableSize - 1), clamped to [0, maxIndex], rounded |
+ * to an integer with lrintf, and replaced by the table entry at that index. |
+ * NOTE(review): presumably the table behind ctx holds kGammaTableSize floats; |
+ * nothing here checks that - confirm against the caller. */ |
+GAMMA_STAGE(table_gamma) { |
+ constexpr float maxIndex = kGammaTableSize - 1; |
+ const float* gammaTables = (const float*)ctx; |
mtklein_C
2016/11/09 11:04:36
This name makes it seem like we're going to be usi
raftias
2016/11/10 21:36:06
Acknowledged.
|
+ s = SkNf::Min(SkNf::Max(maxIndex * s, 0.f), maxIndex); |
mtklein_C
2016/11/09 11:04:36
If we're not going to source the 1024 constant fro
raftias
2016/11/10 21:36:06
ApplyTable stores it now.
|
+ float result[N]; |
+ for (int i = 0; i < N; ++i) { |
+ result[i] = gammaTables[lrintf(s[i])]; |
+ } |
+ return SkNf::Load(result); |
+} |
+ |
+// Looks up one color in a 3-input / 3-output lookup table using tetrahedral |
+// interpolation. |
+//   dst      - receives the 3 interpolated output components. |
+//   src      - the 3 input components; all of them are read before dst is |
+//              written, so dst and src may point to the same array. |
+//   colorLUT - supplies fGridPoints[0..2] (grid size per input) and table(). |
+// NOTE(review): nothing here range-checks src; the indexing presumably relies on |
+// every component being in [0,1] and every grid dimension having at least two |
+// points - confirm against the caller. |
+static inline void interp_3d_clut(float dst[3], float src[3], const SkColorLookUpTable* colorLUT) { |
msarett1
2016/11/09 00:01:05
This maybe does not need to belong in this file.
mtklein_C
2016/11/09 11:04:36
Why don't we make this a normal, separately-compil
raftias
2016/11/10 21:36:06
Done.
|
+    // Call the src components x, y, and z. |
+    uint8_t maxX = colorLUT->fGridPoints[0] - 1; |
+    uint8_t maxY = colorLUT->fGridPoints[1] - 1; |
+    uint8_t maxZ = colorLUT->fGridPoints[2] - 1; |
+ |
+    // An approximate index into each of the three dimensions of the table. |
+    float x = src[0] * maxX; |
+    float y = src[1] * maxY; |
+    float z = src[2] * maxZ; |
+ |
+    // This gives us the low index for our interpolation. |
+    int ix = sk_float_floor2int(x); |
+    int iy = sk_float_floor2int(y); |
+    int iz = sk_float_floor2int(z); |
+ |
+    // Make sure the low index is not also the max index, so that the "+1" |
+    // neighbor in each dimension stays inside the table. |
+    ix = (maxX == ix) ? ix - 1 : ix; |
+    iy = (maxY == iy) ? iy - 1 : iy; |
+    iz = (maxZ == iz) ? iz - 1 : iz; |
+ |
+    // Weighting factors for the interpolation. |
+    float diffX = x - ix; |
+    float diffY = y - iy; |
+    float diffZ = z - iz; |
+ |
+    // Constants to help us navigate the 3D table. |
+    // Ex: Assume x = a, y = b, z = c. |
+    //     table[a * n001 + b * n010 + c * n100] logically equals table[a][b][c]. |
+    // In an nZYX name, a 1 means "one grid step further along that axis". |
+    const int n000 = 0; |
+    const int n001 = 3 * colorLUT->fGridPoints[1] * colorLUT->fGridPoints[2]; |
+    const int n010 = 3 * colorLUT->fGridPoints[2]; |
+    const int n011 = n001 + n010; |
+    const int n100 = 3; |
+    const int n101 = n100 + n001; |
+    const int n110 = n100 + n010; |
+    const int n111 = n110 + n001; |
+ |
+    // Base ptr into the table. |
+    const float* ptr = &(colorLUT->table()[ix*n001 + iy*n010 + iz*n100]); |
+ |
+    // The code below performs a tetrahedral interpolation for each of the three |
+    // dst components.  Once the tetrahedron containing the interpolation point is |
+    // identified, the interpolation is a weighted sum of grid values at the |
+    // vertices of the tetrahedron.  The claim is that tetrahedral interpolation |
+    // provides a more accurate color conversion. |
+    // blogs.mathworks.com/steve/2006/11/24/tetrahedral-interpolation-for-colorspace-conversion/ |
+    // |
+    // I have one test image, and visually I can't tell the difference between |
+    // tetrahedral and trilinear interpolation.  In terms of computation, the |
+    // tetrahedral code requires more branches but less computation.  The |
+    // SampleICC library provides an option for the client to choose either |
+    // tetrahedral or trilinear. |
+    // |
+    // Each branch walks from corner n000 to corner n111 one axis at a time, |
+    // taking the axis with the largest fractional offset first.  That ordering |
+    // is what keeps all four vertex weights non-negative, so every branch |
+    // condition must match the ordering its formula assumes. |
+    for (int i = 0; i < 3; i++) { |
+        if (diffZ < diffY) { |
+            if (diffZ > diffX) { |
+                // diffY > diffZ > diffX: step y, then z, then x. |
+                dst[i] = (ptr[n000] + diffZ * (ptr[n110] - ptr[n010]) + |
+                                      diffY * (ptr[n010] - ptr[n000]) + |
+                                      diffX * (ptr[n111] - ptr[n110])); |
+            } else if (diffY < diffX) { |
+                // diffX > diffY > diffZ: step x, then y, then z. |
+                dst[i] = (ptr[n000] + diffZ * (ptr[n111] - ptr[n011]) + |
+                                      diffY * (ptr[n011] - ptr[n001]) + |
+                                      diffX * (ptr[n001] - ptr[n000])); |
+            } else { |
+                // diffY >= diffX >= diffZ: step y, then x, then z. |
+                dst[i] = (ptr[n000] + diffZ * (ptr[n111] - ptr[n011]) + |
+                                      diffY * (ptr[n010] - ptr[n000]) + |
+                                      diffX * (ptr[n011] - ptr[n010])); |
+            } |
+        } else { |
+            if (diffZ < diffX) { |
+                // diffX > diffZ >= diffY: step x, then z, then y. |
+                dst[i] = (ptr[n000] + diffZ * (ptr[n101] - ptr[n001]) + |
+                                      diffY * (ptr[n111] - ptr[n101]) + |
+                                      diffX * (ptr[n001] - ptr[n000])); |
+            } else if (diffY < diffX) { |
+                // diffZ >= diffX > diffY: step z, then x, then y. |
+                dst[i] = (ptr[n000] + diffZ * (ptr[n100] - ptr[n000]) + |
+                                      diffY * (ptr[n111] - ptr[n101]) + |
+                                      diffX * (ptr[n101] - ptr[n100])); |
+            } else { |
+                // diffZ >= diffY >= diffX: step z, then y, then x. |
+                dst[i] = (ptr[n000] + diffZ * (ptr[n100] - ptr[n000]) + |
+                                      diffY * (ptr[n110] - ptr[n100]) + |
+                                      diffX * (ptr[n111] - ptr[n110])); |
+            } |
+        } |
+ |
+        // Increment the table ptr in order to handle the next component. |
+        // Note that this is how the table is designed: all of the nXXX |
+        // variables are multiples of 3 because there are 3 output |
+        // components. |
+        ptr++; |
+    } |
+} |
+ |
+/* Runs interp_3d_clut once per lane. ctx is cast to const SkColorLookUpTable*. |
+ * For lane i, (r[i], g[i], b[i]) is copied into rgb[], interpolated in place |
+ * (rgb is passed as both dst and src), and the three outputs are gathered into |
+ * result[0..2][i]; afterwards r, g and b are reloaded from result with |
+ * SkNf::Load. a is not assigned in this stage. |
+ */ STAGE(clut, true) { |
mtklein_C
2016/11/09 11:04:36
how about color_lookup_table?
raftias
2016/11/10 21:36:06
Done.
|
+ const SkColorLookUpTable* colorLUT = (const SkColorLookUpTable*)ctx; |
mtklein_C
2016/11/09 11:04:36
Side note: it's going to drive me nuts that we cap
raftias
2016/11/10 21:36:06
I didn't name it, but my guess is that it's becaus
msarett1
2016/11/11 14:36:51
I don't feel strongly about the name. Feel free t
|
+ float rgb[3]; |
+ alignas(alignof(SkNf)) float result[3][N]; |
mtklein_C
2016/11/09 11:04:36
Let's drop the alignment business. SkNf::Load() s
raftias
2016/11/10 21:36:06
Done.
|
+ for (int i = 0; i < N; ++i) { |
+ rgb[0] = r[i]; |
+ rgb[1] = g[i]; |
+ rgb[2] = b[i]; |
+ interp_3d_clut(rgb, rgb, colorLUT); |
+ result[0][i] = rgb[0]; |
+ result[1][i] = rgb[1]; |
+ result[2][i] = rgb[2]; |
+ } |
+ r = SkNf::Load(result[0]); |
+ g = SkNf::Load(result[1]); |
+ b = SkNf::Load(result[2]); |
+} |
+ |
+/* Converts an encoded Lab color held in (r,g,b) to XYZ, written back to (r,g,b). |
+ * Decoding: L = r*100, a = g*255 - 128, b = b*255 - 128. |
+ * Intermediate values: Y = (L + 16)/116, X = a/500 + Y, Z = Y - b/200. |
+ * Each of X, Y, Z is then replaced by its cube when the cube exceeds 0.008856, |
+ * and by (value - 16/116) / 7.787 otherwise. |
+ * Finally X, Y, Z are scaled by 0.96422, 1.00000 and 0.82521 (the D50 |
+ * adjustment named in the comment inside the body). a (alpha) is not assigned. |
+ */ STAGE(labtoxyz, true) { |
raftias
2016/11/08 21:19:58
I think this can be expressed as a matrix_4x4 foll
mtklein_C
2016/11/09 11:04:36
I think this is clearer as its own stage.
It's pr
raftias
2016/11/10 21:36:06
Done.
|
+ const auto lab_l = r * 100.f; |
+ const auto lab_a = g * 255.f - 128.f; |
+ const auto lab_b = b * 255.f - 128.f; |
+ auto Y = (lab_l + 16.f) * (1.f/116.f); |
+ auto X = lab_a * (1.f/500.f) + Y; |
mtklein_C
2016/11/09 11:04:36
One .f is plenty to get these solidly as float con
raftias
2016/11/10 21:36:06
Acknowledged.
|
+ auto Z = Y - (lab_b * (1.f/200.f)); |
+ |
+ auto cubed = X*X*X; |
+ X = (cubed > 0.008856f).thenElse(cubed, (X - (16.f/116.f)) * (1.f/7.787f)); |
+ cubed = Y*Y*Y; |
mtklein_C
2016/11/09 11:04:36
At a glance it looks like cubed must be accumulati
raftias
2016/11/10 21:36:06
Done.
|
+ Y = (cubed > 0.008856f).thenElse(cubed, (Y - (16.f/116.f)) * (1.f/7.787f)); |
+ cubed = Z*Z*Z; |
+ Z = (cubed > 0.008856f).thenElse(cubed, (Z - (16.f/116.f)) * (1.f/7.787f)); |
+ |
+ // adjust to D50 illuminant |
+ X *= 0.96422f; |
+ Y *= 1.00000f; |
+ Z *= 0.82521f; |
+ |
+ r = X; |
+ g = Y; |
+ b = Z; |
+} |
+ |
template <typename Fn> |
SI Fn enum_to_Fn(SkRasterPipeline::StockStage st) { |
switch (st) { |