Chromium Code Reviews| Index: src/opts/SkRasterPipeline_opts.h |
| diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h |
| index 5c5418be1a44d6d6ece160e288467828f10a75a5..db9a5bf290f270106a0178be146440179ef8ba31 100644 |
| --- a/src/opts/SkRasterPipeline_opts.h |
| +++ b/src/opts/SkRasterPipeline_opts.h |
| @@ -9,7 +9,9 @@ |
| #define SkRasterPipeline_opts_DEFINED |
| #include "SkColorPriv.h" |
| +#include "SkColorSpace_Base.h" |
| #include "SkHalf.h" |
| +#include "SkMatrix44.h" |
| #include "SkPM4f.h" |
| #include "SkPM4fPriv.h" |
| #include "SkRasterPipeline.h" |
| @@ -134,6 +136,59 @@ SI void SK_VECTORCALL next(TailStage* st, size_t x, size_t tail, |
| static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \ |
| const SkNf& d, const SkNf& da) |
| + |
| +#define GAMMA_STAGE(name) \ |
|
msarett1
2016/11/09 00:01:05
Instead of this, I think I would prefer 6 normal s
mtklein_C
2016/11/09 11:04:36
I think you mean 3 normal stages? Each STAGE invo
raftias
2016/11/10 21:36:06
I did this with fn_1_r/g/b. If we add in specific
|
| + static SK_ALWAYS_INLINE SkNf name##_kernel(void* ctx, SkNf& s); \ |
| + SI void SK_VECTORCALL name##_r(BodyStage* st, size_t x, \ |
| + SkNf r, SkNf g, SkNf b, SkNf a, \ |
| + SkNf dr, SkNf dg, SkNf db, SkNf da) { \ |
| + r = name##_kernel(st->ctx, r); \ |
| + next(st, x, r,g,b,a, dr,dg,db,da); \ |
| + } \ |
| + SI void SK_VECTORCALL name##_r(TailStage* st, size_t x, size_t tail, \ |
| + SkNf r, SkNf g, SkNf b, SkNf a, \ |
| + SkNf dr, SkNf dg, SkNf db, SkNf da) { \ |
| + r = name##_kernel(st->ctx, r); \ |
| + next(st, x,tail, r,g,b,a, dr,dg,db,da); \ |
| + } \ |
| + SI void SK_VECTORCALL name##_g(BodyStage* st, size_t x, \ |
| + SkNf r, SkNf g, SkNf b, SkNf a, \ |
| + SkNf dr, SkNf dg, SkNf db, SkNf da) { \ |
| + g = name##_kernel(st->ctx, g); \ |
| + next(st, x, r,g,b,a, dr,dg,db,da); \ |
| + } \ |
| + SI void SK_VECTORCALL name##_g(TailStage* st, size_t x, size_t tail, \ |
| + SkNf r, SkNf g, SkNf b, SkNf a, \ |
| + SkNf dr, SkNf dg, SkNf db, SkNf da) { \ |
| + g = name##_kernel(st->ctx, g); \ |
| + next(st, x,tail, r,g,b,a, dr,dg,db,da); \ |
| + } \ |
| + SI void SK_VECTORCALL name##_b(BodyStage* st, size_t x, \ |
| + SkNf r, SkNf g, SkNf b, SkNf a, \ |
| + SkNf dr, SkNf dg, SkNf db, SkNf da) { \ |
| + b = name##_kernel(st->ctx, b); \ |
| + next(st, x, r,g,b,a, dr,dg,db,da); \ |
| + } \ |
| + SI void SK_VECTORCALL name##_b(TailStage* st, size_t x, size_t tail, \ |
| + SkNf r, SkNf g, SkNf b, SkNf a, \ |
| + SkNf dr, SkNf dg, SkNf db, SkNf da) { \ |
| + b = name##_kernel(st->ctx, b); \ |
| + next(st, x,tail, r,g,b,a, dr,dg,db,da); \ |
| + } \ |
| + SI void SK_VECTORCALL name##_a(BodyStage* st, size_t x, \ |
|
msarett1
2016/11/09 00:01:05
All we need to do with "a" is load it and store.
raftias
2016/11/10 21:36:07
It was indeed for CMYK/etc support.
|
| + SkNf r, SkNf g, SkNf b, SkNf a, \ |
| + SkNf dr, SkNf dg, SkNf db, SkNf da) { \ |
| + a = name##_kernel(st->ctx, a); \ |
| + next(st, x, r,g,b,a, dr,dg,db,da); \ |
| + } \ |
| + SI void SK_VECTORCALL name##_a(TailStage* st, size_t x, size_t tail, \ |
| + SkNf r, SkNf g, SkNf b, SkNf a, \ |
| + SkNf dr, SkNf dg, SkNf db, SkNf da) { \ |
| + a = name##_kernel(st->ctx, a); \ |
| + next(st, x,tail, r,g,b,a, dr,dg,db,da); \ |
| + } \ |
| + static SK_ALWAYS_INLINE SkNf name##_kernel(void* ctx, SkNf& s) |
| + |
| SI SkNf inv(const SkNf& x) { return 1.0f - x; } |
| SI SkNf lerp(const SkNf& from, const SkNf& to, const SkNf& cov) { |
| @@ -431,6 +486,59 @@ STAGE(store_srgb, false) { |
| | SkNx_cast<int>(0.5f + 255.0f * a) << SK_A32_SHIFT), (int*)ptr); |
| } |
| +STAGE(load_s_linear_rgba, true) { |
|
msarett1
2016/11/09 00:01:05
nit: Follow style conventions from above
Use whit
mtklein_C
2016/11/09 11:04:36
Let's call these _8888. That's our common shortha
|
| + auto ptr = *(const uint32_t**)ctx + x; |
| + |
| + auto px = load<kIsTail>(tail, ptr); |
| + auto to_int = [](const SkNx<N, uint32_t>& v) { return SkNi::Load(&v); }; |
| + r = (1/255.0f)*SkNx_cast<float>(to_int((px >> 0) & 0xFF)); |
| + g = (1/255.0f)*SkNx_cast<float>(to_int((px >> 8) & 0xFF)); |
| + b = (1/255.0f)*SkNx_cast<float>(to_int((px >> 16) & 0xFF)); |
| + a = (1/255.0f)*SkNx_cast<float>(to_int(px >> 24)); |
| +} |
| + |
| +STAGE(load_s_linear_bgra, true) { |
|
mtklein_C
2016/11/09 11:04:36
How about we write everything in terms of rgba, an
raftias
2016/11/10 21:36:07
I had that before (with that exact name, even), then t
msarett1
2016/11/11 14:36:51
Let's defer to Mike on this one. lgtm, as is.
|
| + auto ptr = *(const uint32_t**)ctx + x; |
| + |
| + auto px = load<kIsTail>(tail, ptr); |
| + auto to_int = [](const SkNx<N, uint32_t>& v) { return SkNi::Load(&v); }; |
| + r = (1/255.0f)*SkNx_cast<float>(to_int((px >> 16) & 0xFF)); |
| + g = (1/255.0f)*SkNx_cast<float>(to_int((px >> 8) & 0xFF)); |
| + b = (1/255.0f)*SkNx_cast<float>(to_int((px >> 0) & 0xFF)); |
| + a = (1/255.0f)*SkNx_cast<float>(to_int((px >> 24))); |
| +} |
| + |
| +// Clamp colors into [0,1] premul (e.g. just before storing back to memory). |
|
raftias
2016/11/08 21:19:58
I noticed when I pulled before uploading that this
msarett1
2016/11/09 00:01:05
I believe the idea is to not waste time clamping w
mtklein_C
2016/11/09 11:04:36
This has now been split into two stages, clamp_0 a
|
| +SI void clamp_01_premul(SkNf& r, SkNf& g, SkNf& b, SkNf& a) { |
| + a = SkNf::Max(a, 0.0f); |
| + r = SkNf::Max(r, 0.0f); |
| + g = SkNf::Max(g, 0.0f); |
| + b = SkNf::Max(b, 0.0f); |
| + |
| + a = SkNf::Min(a, 1.0f); |
| + r = SkNf::Min(r, a); |
| + g = SkNf::Min(g, a); |
| + b = SkNf::Min(b, a); |
| +} |
| + |
| +STAGE(store_linear_rgba, false) { |
| + clamp_01_premul(r,g,b,a); |
| + auto ptr = *(uint32_t**)ctx + x; |
| + store<kIsTail>(tail, ( SkNx_cast<int>(255.0f * r + 0.5f) << 0 |
|
msarett1
2016/11/09 00:01:05
I don't think you need the "+ 0.5f" terms. I thin
mtklein_C
2016/11/09 11:04:36
No, we're doing that to round to the nearest byte
|
| + | SkNx_cast<int>(255.0f * g + 0.5f) << 8 |
| + | SkNx_cast<int>(255.0f * b + 0.5f) << 16 |
| + | SkNx_cast<int>(255.0f * a + 0.5f) << 24 ), (int*)ptr); |
| +} |
| + |
| +STAGE(store_linear_bgra, false) { |
| + clamp_01_premul(r,g,b,a); |
| + auto ptr = *(uint32_t**)ctx + x; |
| + store<kIsTail>(tail, ( SkNx_cast<int>(255.0f * r + 0.5f) << 16 |
| + | SkNx_cast<int>(255.0f * g + 0.5f) << 8 |
| + | SkNx_cast<int>(255.0f * b + 0.5f) << 0 |
| + | SkNx_cast<int>(255.0f * a + 0.5f) << 24 ), (int*)ptr); |
| +} |
| + |
| RGBA_XFERMODE(clear) { return 0.0f; } |
| //RGBA_XFERMODE(src) { return s; } // This would be a no-op stage, so we just omit it. |
| RGBA_XFERMODE(dst) { return d; } |
| @@ -490,6 +598,19 @@ STAGE(luminance_to_alpha, true) { |
| r = g = b = 0; |
| } |
| +STAGE(matrix_4x4, true) { |
| + const SkMatrix44& mat = *(const SkMatrix44*)ctx; |
|
mtklein_C
2016/11/09 11:04:36
I'd like matrix_4x4 and matrix_4x5 to look and beh
raftias
2016/11/10 21:36:06
They were just different since I had written it an
|
| + auto fma = [](const SkNf& f, const SkNf& m, const SkNf& a) { return SkNx_fma(f,m,a); }; |
| + dr = fma(mat.get(0, 0),r, fma(mat.get(0, 1),g, fma(mat.get(0, 2),b, mat.get(0, 3)*a))); |
|
msarett1
2016/11/09 00:01:05
No need for "*a". Actually I think we don't want
mtklein_C
2016/11/09 11:04:36
If we don't *a here, we can't really call this sta
raftias
2016/11/10 21:36:06
It's 3x4 now.
|
| + dg = fma(mat.get(1, 0),r, fma(mat.get(1, 1),g, fma(mat.get(1, 2),b, mat.get(1, 3)*a))); |
| + db = fma(mat.get(2, 0),r, fma(mat.get(2, 1),g, fma(mat.get(2, 2),b, mat.get(2, 3)*a))); |
|
msarett1
2016/11/09 00:01:05
Mike, is it ok that we're destructive to dr, dg, d
mtklein_C
2016/11/09 11:04:36
The pedantic answer is that that depends what you'
raftias
2016/11/10 21:36:06
I'll remove these and put them in temporaries. I j
|
| + da = fma(mat.get(3, 0),r, fma(mat.get(3, 1),g, fma(mat.get(3, 2),b, mat.get(3, 3)*a))); |
| + r = dr; |
| + g = dg; |
| + b = db; |
| + a = da; |
| +} |
| + |
| STAGE(matrix_4x5, true) { |
| auto m = (const float*)ctx; |
| @@ -504,6 +625,169 @@ STAGE(matrix_4x5, true) { |
| a = A; |
| } |
| +static inline Sk4f powNf(const Sk4f& x, float exp) { |
|
mtklein_C
2016/11/09 11:04:36
Generally this file writes static inline as SI. I
raftias
2016/11/10 21:36:06
Acknowledged.
|
| + return Sk4f{::powf(x[0], exp), ::powf(x[1], exp), ::powf(x[2], exp), ::powf(x[3], exp)}; |
| +} |
| + |
| +static inline Sk8f powNf(const Sk8f& x, float exp) { |
| + return Sk8f{::powf(x[0], exp), ::powf(x[1], exp), ::powf(x[2], exp), ::powf(x[3], exp), |
| + ::powf(x[4], exp), ::powf(x[5], exp), ::powf(x[6], exp), ::powf(x[7], exp)}; |
| +} |
| + |
| +GAMMA_STAGE(param_gamma) { |
| + const SkColorSpaceTransferFn& gamma = *(const SkColorSpaceTransferFn*)ctx; |
| + return (s <= gamma.fD).thenElse(gamma.fE * s + gamma.fF, |
|
msarett1
2016/11/09 00:01:05
nit: < instead of <=
raftias
2016/11/10 21:36:06
Done.
|
| + powNf(s * gamma.fA + gamma.fB, gamma.fG) + gamma.fC); |
| +} |
| + |
| +static constexpr float kGammaTableSize = 1024; |
| + |
| +GAMMA_STAGE(table_gamma) { |
| + constexpr float maxIndex = kGammaTableSize - 1; |
| + const float* gammaTables = (const float*)ctx; |
|
mtklein_C
2016/11/09 11:04:36
This name makes it seem like we're going to be usi
raftias
2016/11/10 21:36:06
Acknowledged.
|
| + s = SkNf::Min(SkNf::Max(maxIndex * s, 0.f), maxIndex); |
|
mtklein_C
2016/11/09 11:04:36
If we're not going to source the 1024 constant fro
raftias
2016/11/10 21:36:06
ApplyTable stores it now.
|
| + float result[N]; |
| + for (int i = 0; i < N; ++i) { |
| + result[i] = gammaTables[lrintf(s[i])]; |
| + } |
| + return SkNf::Load(result); |
| +} |
| + |
| +static inline void interp_3d_clut(float dst[3], float src[3], const SkColorLookUpTable* colorLUT) { |
|
msarett1
2016/11/09 00:01:05
This maybe does not need to belong in this file.
mtklein_C
2016/11/09 11:04:36
Why don't we make this a normal, separately-compil
raftias
2016/11/10 21:36:06
Done.
|
| + // Call the src components x, y, and z. |
| + uint8_t maxX = colorLUT->fGridPoints[0] - 1; |
| + uint8_t maxY = colorLUT->fGridPoints[1] - 1; |
| + uint8_t maxZ = colorLUT->fGridPoints[2] - 1; |
| + |
| + // An approximate index into each of the three dimensions of the table. |
| + float x = src[0] * maxX; |
| + float y = src[1] * maxY; |
| + float z = src[2] * maxZ; |
| + |
| + // This gives us the low index for our interpolation. |
| + int ix = sk_float_floor2int(x); |
| + int iy = sk_float_floor2int(y); |
| + int iz = sk_float_floor2int(z); |
| + |
| + // Make sure the low index is not also the max index. |
| + ix = (maxX == ix) ? ix - 1 : ix; |
| + iy = (maxY == iy) ? iy - 1 : iy; |
| + iz = (maxZ == iz) ? iz - 1 : iz; |
| + |
| + // Weighting factors for the interpolation. |
| + float diffX = x - ix; |
| + float diffY = y - iy; |
| + float diffZ = z - iz; |
| + |
| + // Constants to help us navigate the 3D table. |
| + // Ex: Assume x = a, y = b, z = c. |
| + // table[a * n001 + b * n010 + c * n100] logically equals table[a][b][c]. |
| + const int n000 = 0; |
| + const int n001 = 3 * colorLUT->fGridPoints[1] * colorLUT->fGridPoints[2]; |
| + const int n010 = 3 * colorLUT->fGridPoints[2]; |
| + const int n011 = n001 + n010; |
| + const int n100 = 3; |
| + const int n101 = n100 + n001; |
| + const int n110 = n100 + n010; |
| + const int n111 = n110 + n001; |
| + |
| + // Base ptr into the table. |
| + const float* ptr = &(colorLUT->table()[ix*n001 + iy*n010 + iz*n100]); |
| + |
| + // The code below performs a tetrahedral interpolation for each of the three |
| + // dst components. Once the tetrahedron containing the interpolation point is |
| + // identified, the interpolation is a weighted sum of grid values at the |
| + // vertices of the tetrahedron. The claim is that tetrahedral interpolation |
| + // provides a more accurate color conversion. |
| + // blogs.mathworks.com/steve/2006/11/24/tetrahedral-interpolation-for-colorspace-conversion/ |
| + // |
| + // I have one test image, and visually I can't tell the difference between |
| + // tetrahedral and trilinear interpolation. In terms of computation, the |
| + // tetrahedral code requires more branches but less computation. The |
| + // SampleICC library provides an option for the client to choose either |
| + // tetrahedral or trilinear. |
| + for (int i = 0; i < 3; i++) { |
| + if (diffZ < diffY) { |
| + if (diffZ < diffX) { |
| + dst[i] = (ptr[n000] + diffZ * (ptr[n110] - ptr[n010]) + |
| + diffY * (ptr[n010] - ptr[n000]) + |
| + diffX * (ptr[n111] - ptr[n110])); |
| + } else if (diffY < diffX) { |
| + dst[i] = (ptr[n000] + diffZ * (ptr[n111] - ptr[n011]) + |
| + diffY * (ptr[n011] - ptr[n001]) + |
| + diffX * (ptr[n001] - ptr[n000])); |
| + } else { |
| + dst[i] = (ptr[n000] + diffZ * (ptr[n111] - ptr[n011]) + |
| + diffY * (ptr[n010] - ptr[n000]) + |
| + diffX * (ptr[n011] - ptr[n010])); |
| + } |
| + } else { |
| + if (diffZ < diffX) { |
| + dst[i] = (ptr[n000] + diffZ * (ptr[n101] - ptr[n001]) + |
| + diffY * (ptr[n111] - ptr[n101]) + |
| + diffX * (ptr[n001] - ptr[n000])); |
| + } else if (diffY < diffX) { |
| + dst[i] = (ptr[n000] + diffZ * (ptr[n100] - ptr[n000]) + |
| + diffY * (ptr[n111] - ptr[n101]) + |
| + diffX * (ptr[n101] - ptr[n100])); |
| + } else { |
| + dst[i] = (ptr[n000] + diffZ * (ptr[n100] - ptr[n000]) + |
| + diffY * (ptr[n110] - ptr[n100]) + |
| + diffX * (ptr[n111] - ptr[n110])); |
| + } |
| + } |
| + |
| + // Increment the table ptr in order to handle the next component. |
| + // Note that this is how the table is designed: all of the nXXX |
| + // variables are multiples of 3 because there are 3 output |
| + // components. |
| + ptr++; |
| + } |
| +} |
| + |
| +STAGE(clut, true) { |
|
mtklein_C
2016/11/09 11:04:36
how about color_lookup_table?
raftias
2016/11/10 21:36:06
Done.
|
| + const SkColorLookUpTable* colorLUT = (const SkColorLookUpTable*)ctx; |
|
mtklein_C
2016/11/09 11:04:36
Side note: it's going to drive me nuts that we cap
raftias
2016/11/10 21:36:06
I didn't name it, but my guess is that it's becaus
msarett1
2016/11/11 14:36:51
I don't feel strongly about the name. Feel free t
|
| + float rgb[3]; |
| + alignas(alignof(SkNf)) float result[3][N]; |
|
mtklein_C
2016/11/09 11:04:36
Let's drop the alignment business. SkNf::Load() s
raftias
2016/11/10 21:36:06
Done.
|
| + for (int i = 0; i < N; ++i) { |
| + rgb[0] = r[i]; |
| + rgb[1] = g[i]; |
| + rgb[2] = b[i]; |
| + interp_3d_clut(rgb, rgb, colorLUT); |
| + result[0][i] = rgb[0]; |
| + result[1][i] = rgb[1]; |
| + result[2][i] = rgb[2]; |
| + } |
| + r = SkNf::Load(result[0]); |
| + g = SkNf::Load(result[1]); |
| + b = SkNf::Load(result[2]); |
| +} |
| + |
| +STAGE(labtoxyz, true) { |
|
raftias
2016/11/08 21:19:58
I think this can be expressed as a matrix_4x4 foll
mtklein_C
2016/11/09 11:04:36
I think this is clearer as its own stage.
It's pr
raftias
2016/11/10 21:36:06
Done.
|
| + const auto lab_l = r * 100.f; |
| + const auto lab_a = g * 255.f - 128.f; |
| + const auto lab_b = b * 255.f - 128.f; |
| + auto Y = (lab_l + 16.f) * (1.f/116.f); |
| + auto X = lab_a * (1.f/500.f) + Y; |
|
mtklein_C
2016/11/09 11:04:36
One .f is plenty to get these solidly as float con
raftias
2016/11/10 21:36:06
Acknowledged.
|
| + auto Z = Y - (lab_b * (1.f/200.f)); |
| + |
| + auto cubed = X*X*X; |
| + X = (cubed > 0.008856f).thenElse(cubed, (X - (16.f/116.f)) * (1.f/7.787f)); |
| + cubed = Y*Y*Y; |
|
mtklein_C
2016/11/09 11:04:36
At a glance it looks like cubed must be accumulati
raftias
2016/11/10 21:36:06
Done.
|
| + Y = (cubed > 0.008856f).thenElse(cubed, (Y - (16.f/116.f)) * (1.f/7.787f)); |
| + cubed = Z*Z*Z; |
| + Z = (cubed > 0.008856f).thenElse(cubed, (Z - (16.f/116.f)) * (1.f/7.787f)); |
| + |
| + // adjust to D50 illuminant |
| + X *= 0.96422f; |
| + Y *= 1.00000f; |
| + Z *= 0.82521f; |
| + |
| + r = X; |
| + g = Y; |
| + b = Z; |
| +} |
| + |
| template <typename Fn> |
| SI Fn enum_to_Fn(SkRasterPipeline::StockStage st) { |
| switch (st) { |