Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3)

Side by Side Diff: src/opts/SkColorXform_opts.h

Issue 2060823003: Implement fast, correct gamma conversion for color xforms (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright 2016 Google Inc. 2 * Copyright 2016 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkColorXform_opts_DEFINED 8 #ifndef SkColorXform_opts_DEFINED
9 #define SkColorXform_opts_DEFINED 9 #define SkColorXform_opts_DEFINED
10 10
11 #include "SkColorPriv.h" 11 #include "SkColorPriv.h"
12 12
13 namespace SK_OPTS_NS { 13 namespace SK_OPTS_NS {
14 14
15 static uint8_t clamp_float_to_byte(float v) { 15 static constexpr float gamma_srgb_to_linear[256] = {
16 if (v >= 254.5f) { 16 0.000000000000000000f, 0.000303526983548838f, 0.000607053967097675f, 0.000910580950646513f,
17 return 255; 17 0.001214107934195350f, 0.001517634917744190f, 0.001821161901293030f, 0.002124688884841860f,
18 } else if (v < 0.5f) { 18 0.002428215868390700f, 0.002731742851939540f, 0.003034518678424960f, 0.003346535763899160f,
19 return 0; 19 0.003676507324047440f, 0.004024717018496310f, 0.004391442037410290f, 0.004776953480693730f,
20 0.005181516702338390f, 0.005605391624202720f, 0.006048833022857060f, 0.006512090792594470f,
21 0.006995410187265390f, 0.007499032043226180f, 0.008023192985384990f, 0.008568125618069310f,
22 0.009134058702220790f, 0.009721217320237850f, 0.010329823029626900f, 0.010960094006488200f,
23 0.011612245179743900f, 0.012286488356915900f, 0.012983032342173000f, 0.013702083047289700f,
24 0.014443843596092500f, 0.015208514422912700f, 0.015996293365509600f, 0.016807375752887400f,
25 0.017641954488384100f, 0.018500220128379700f, 0.019382360956935700f, 0.020288563056652400f,
26 0.021219010376003600f, 0.022173884793387400f, 0.023153366178110400f, 0.024157632448504800f,
27 0.025186859627361600f, 0.026241221894849900f, 0.027320891639074900f, 0.028426039504420800f,
28 0.029556834437808800f, 0.030713443732993600f, 0.031896033073011500f, 0.033104766570885100f,
29 0.034339806808682200f, 0.035601314875020300f, 0.036889450401100000f, 0.038204371595346500f,
30 0.039546235276732800f, 0.040915196906853200f, 0.042311410620809700f, 0.043735029256973500f,
31 0.045186204385675500f, 0.046665086336880100f, 0.048171824226889400f, 0.049706565984127200f,
32 0.051269458374043200f, 0.052860647023180200f, 0.054480276442442400f, 0.056128490049600100f,
33 0.057805430191067200f, 0.059511238162981200f, 0.061246054231617600f, 0.063010017653167700f,
34 0.064803266692905800f, 0.066625938643772900f, 0.068478169844400200f, 0.070360095696595900f,
35 0.072271850682317500f, 0.074213568380149600f, 0.076185381481307900f, 0.078187421805186300f,
36 0.080219820314468300f, 0.082282707129814800f, 0.084376211544148800f, 0.086500462036549800f,
37 0.088655586285772900f, 0.090841711183407700f, 0.093058962846687500f, 0.095307466630964700f,
38 0.097587347141862500f, 0.099898728247113900f, 0.102241733088101000f, 0.104616484091104000f,
39 0.107023102978268000f, 0.109461710778299000f, 0.111932427836906000f, 0.114435373826974000f,
40 0.116970667758511000f, 0.119538427988346000f, 0.122138772229602000f, 0.124771817560950000f,
41 0.127437680435647000f, 0.130136476690364000f, 0.132868321553818000f, 0.135633329655206000f,
42 0.138431615032452000f, 0.141263291140272000f, 0.144128470858058000f, 0.147027266497595000f,
43 0.149959789810609000f, 0.152926151996150000f, 0.155926463707827000f, 0.158960835060880000f,
44 0.162029375639111000f, 0.165132194501668000f, 0.168269400189691000f, 0.171441100732823000f,
45 0.174647403655585000f, 0.177888415983629000f, 0.181164244249860000f, 0.184474994500441000f,
46 0.187820772300678000f, 0.191201682740791000f, 0.194617830441576000f, 0.198069319559949000f,
47 0.201556253794397000f, 0.205078736390317000f, 0.208636870145256000f, 0.212230757414055000f,
48 0.215860500113899000f, 0.219526199729269000f, 0.223227957316809000f, 0.226965873510098000f,
49 0.230740048524349000f, 0.234550582161005000f, 0.238397573812271000f, 0.242281122465555000f,
50 0.246201326707835000f, 0.250158284729953000f, 0.254152094330827000f, 0.258182852921596000f,
51 0.262250657529696000f, 0.266355604802862000f, 0.270497791013066000f, 0.274677312060385000f,
52 0.278894263476810000f, 0.283148740429992000f, 0.287440837726918000f, 0.291770649817536000f,
53 0.296138270798321000f, 0.300543794415777000f, 0.304987314069886000f, 0.309468922817509000f,
54 0.313988713375718000f, 0.318546778125092000f, 0.323143209112951000f, 0.327778098056542000f,
55 0.332451536346179000f, 0.337163615048330000f, 0.341914424908661000f, 0.346704056355030000f,
56 0.351532599500439000f, 0.356400144145944000f, 0.361306779783510000f, 0.366252595598840000f,
57 0.371237680474149000f, 0.376262122990906000f, 0.381326011432530000f, 0.386429433787049000f,
58 0.391572477749723000f, 0.396755230725627000f, 0.401977779832196000f, 0.407240211901737000f,
59 0.412542613483904000f, 0.417885070848138000f, 0.423267669986072000f, 0.428690496613907000f,
60 0.434153636174749000f, 0.439657173840919000f, 0.445201194516228000f, 0.450785782838223000f,
61 0.456411023180405000f, 0.462076999654407000f, 0.467783796112159000f, 0.473531496148010000f,
62 0.479320183100827000f, 0.485149940056070000f, 0.491020849847836000f, 0.496932995060870000f,
63 0.502886458032569000f, 0.508881320854934000f, 0.514917665376521000f, 0.520995573204354000f,
64 0.527115125705813000f, 0.533276404010505000f, 0.539479489012107000f, 0.545724461370187000f,
65 0.552011401512000000f, 0.558340389634268000f, 0.564711505704929000f, 0.571124829464873000f,
66 0.577580440429651000f, 0.584078417891164000f, 0.590618840919337000f, 0.597201788363763000f,
67 0.603827338855338000f, 0.610495570807865000f, 0.617206562419651000f, 0.623960391675076000f,
68 0.630757136346147000f, 0.637596873994033000f, 0.644479681970582000f, 0.651405637419824000f,
69 0.658374817279448000f, 0.665387298282272000f, 0.672443156957688000f, 0.679542469633094000f,
70 0.686685312435314000f, 0.693871761291990000f, 0.701101891932973000f, 0.708375779891687000f,
71 0.715693500506481000f, 0.723055128921969000f, 0.730460740090354000f, 0.737910408772731000f,
72 0.745404209540387000f, 0.752942216776078000f, 0.760524504675292000f, 0.768151147247507000f,
73 0.775822218317423000f, 0.783537791526194000f, 0.791297940332630000f, 0.799102738014409000f,
74 0.806952257669252000f, 0.814846572216101000f, 0.822785754396284000f, 0.830769876774655000f,
75 0.838799011740740000f, 0.846873231509858000f, 0.854992608124234000f, 0.863157213454102000f,
76 0.871367119198797000f, 0.879622396887832000f, 0.887923117881966000f, 0.896269353374266000f,
77 0.904661174391149000f, 0.913098651793419000f, 0.921581856277295000f, 0.930110858375424000f,
78 0.938685728457888000f, 0.947306536733200000f, 0.955973353249286000f, 0.964686247894465000f,
79 0.973445290398413000f, 0.982250550333117000f, 0.991102097113830000f, 1.000000000000000000f,
80 };
81
82 static constexpr float gamma_2dot2_to_linear[256] = {
83 0.000000000000000000f, 0.000005077051900662f, 0.000023328004666099f, 0.000056921765712193f,
84 0.000107187362341244f, 0.000175123977503027f, 0.000261543754548491f, 0.000367136269815943f,
85 0.000492503787191433f, 0.000638182842167022f, 0.000804658499513058f, 0.000992374304074325f,
86 0.001201739522438400f, 0.001433134589671860f, 0.001686915316789280f, 0.001963416213396470f,
87 0.002262953160706430f, 0.002585825596234170f, 0.002932318323938360f, 0.003302703032003640f,
88 0.003697239578900130f, 0.004116177093282750f, 0.004559754922526020f, 0.005028203456855540f,
89 0.005521744850239660f, 0.006040593654849810f, 0.006584957382581690f, 0.007155037004573030f,
90 0.007751027397660610f, 0.008373117745148580f, 0.009021491898012130f, 0.009696328701658230f,
91 0.010397802292555300f, 0.011126082368383200f, 0.011881334434813700f, 0.012663720031582100f,
92 0.013473396940142600f, 0.014310519374884100f, 0.015175238159625200f, 0.016067700890886900f,
93 0.016988052089250000f, 0.017936433339950200f, 0.018912983423721500f, 0.019917838438785700f,
94 0.020951131914781100f, 0.022012994919336500f, 0.023103556157921400f, 0.024222942067534200f,
95 0.025371276904734600f, 0.026548682828472900f, 0.027755279978126000f, 0.028991186547107800f,
96 0.030256518852388700f, 0.031551391400226400f, 0.032875916948383800f, 0.034230206565082000f,
97 0.035614369684918800f, 0.037028514161960200f, 0.038472746320194600f, 0.039947171001525600f,
98 0.041451891611462500f, 0.042987010162657100f, 0.044552627316421400f, 0.046148842422351000f,
99 0.047775753556170600f, 0.049433457555908000f, 0.051122050056493400f, 0.052841625522879000f,
100 0.054592277281760300f, 0.056374097551979800f, 0.058187177473685400f, 0.060031607136313200f,
101 0.061907475605455800f, 0.063814870948677200f, 0.065753880260330100f, 0.067724589685424300f,
102 0.069727084442598800f, 0.071761448846239100f, 0.073827766327784600f, 0.075926119456264800f,
103 0.078056589958101900f, 0.080219258736215100f, 0.082414205888459200f, 0.084641510725429500f,
104 0.086901251787660300f, 0.089193506862247800f, 0.091518352998919500f, 0.093875866525577800f,
105 0.096266123063339700f, 0.098689197541094500f, 0.101145164209600000f, 0.103634096655137000f,
106 0.106156067812744000f, 0.108711149979039000f, 0.111299414824660000f, 0.113920933406333000f,
107 0.116575776178572000f, 0.119264013005047000f, 0.121985713169619000f, 0.124740945387051000f,
108 0.127529777813422000f, 0.130352278056244000f, 0.133208513184300000f, 0.136098549737202000f,
109 0.139022453734703000f, 0.141980290685736000f, 0.144972125597231000f, 0.147998022982685000f,
110 0.151058046870511000f, 0.154152260812165000f, 0.157280727890073000f, 0.160443510725344000f,
111 0.163640671485290000f, 0.166872271890766000f, 0.170138373223312000f, 0.173439036332135000f,
112 0.176774321640903000f, 0.180144289154390000f, 0.183548998464951000f, 0.186988508758844000f,
113 0.190462878822409000f, 0.193972167048093000f, 0.197516431440340000f, 0.201095729621346000f,
114 0.204710118836677000f, 0.208359655960767000f, 0.212044397502288000f, 0.215764399609395000f,
115 0.219519718074868000f, 0.223310408341127000f, 0.227136525505149000f, 0.230998124323267000f,
116 0.234895259215880000f, 0.238827984272048000f, 0.242796353254002000f, 0.246800419601550000f,
117 0.250840236436400000f, 0.254915856566385000f, 0.259027332489606000f, 0.263174716398492000f,
118 0.267358060183772000f, 0.271577415438375000f, 0.275832833461245000f, 0.280124365261085000f,
119 0.284452061560024000f, 0.288815972797219000f, 0.293216149132375000f, 0.297652640449211000f,
120 0.302125496358853000f, 0.306634766203158000f, 0.311180499057984000f, 0.315762743736397000f,
121 0.320381548791810000f, 0.325036962521076000f, 0.329729032967515000f, 0.334457807923889000f,
122 0.339223334935327000f, 0.344025661302187000f, 0.348864834082879000f, 0.353740900096629000f,
123 0.358653905926199000f, 0.363603897920553000f, 0.368590922197487000f, 0.373615024646202000f,
124 0.378676250929840000f, 0.383774646487975000f, 0.388910256539059000f, 0.394083126082829000f,
125 0.399293299902674000f, 0.404540822567962000f, 0.409825738436323000f, 0.415148091655907000f,
126 0.420507926167587000f, 0.425905285707146000f, 0.431340213807410000f, 0.436812753800359000f,
127 0.442322948819202000f, 0.447870841800410000f, 0.453456475485731000f, 0.459079892424160000f,
128 0.464741134973889000f, 0.470440245304218000f, 0.476177265397440000f, 0.481952237050698000f,
129 0.487765201877811000f, 0.493616201311074000f, 0.499505276603030000f, 0.505432468828216000f,
130 0.511397818884880000f, 0.517401367496673000f, 0.523443155214325000f, 0.529523222417277000f,
131 0.535641609315311000f, 0.541798355950137000f, 0.547993502196972000f, 0.554227087766085000f,
132 0.560499152204328000f, 0.566809734896638000f, 0.573158875067523000f, 0.579546611782525000f,
133 0.585972983949661000f, 0.592438030320847000f, 0.598941789493296000f, 0.605484299910907000f,
134 0.612065599865624000f, 0.618685727498780000f, 0.625344720802427000f, 0.632042617620641000f,
135 0.638779455650817000f, 0.645555272444935000f, 0.652370105410821000f, 0.659223991813387000f,
136 0.666116968775851000f, 0.673049073280942000f, 0.680020342172095000f, 0.687030812154625000f,
137 0.694080519796882000f, 0.701169501531402000f, 0.708297793656032000f, 0.715465432335048000f,
138 0.722672453600255000f, 0.729918893352071000f, 0.737204787360605000f, 0.744530171266715000f,
139 0.751895080583051000f, 0.759299550695091000f, 0.766743616862161000f, 0.774227314218442000f,
140 0.781750677773962000f, 0.789313742415586000f, 0.796916542907978000f, 0.804559113894567000f,
141 0.812241489898490000f, 0.819963705323528000f, 0.827725794455034000f, 0.835527791460841000f,
142 0.843369730392169000f, 0.851251645184515000f, 0.859173569658532000f, 0.867135537520905000f,
143 0.875137582365205000f, 0.883179737672745000f, 0.891262036813419000f, 0.899384513046529000f,
144 0.907547199521614000f, 0.915750129279253000f, 0.923993335251873000f, 0.932276850264543000f,
145 0.940600707035753000f, 0.948964938178195000f, 0.957369576199527000f, 0.965814653503130000f,
146 0.974300202388861000f, 0.982826255053791000f, 0.991392843592940000f, 1.000000000000000000f,
147 };
148
149 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
150
151 // x^(29/64) is a very good approximation of the true value, x^(1/2.2).
msarett 2016/06/14 22:52:56 This is the most accurate approach I tried. It's
152 static __m128 inverse_gamma_linear_to_2dot2(__m128 x) {
153 // x^(-1/2)
154 __m128 x2 = _mm_rsqrt_ps(x);
155
156 // x^(-1/32)
157 __m128 x32 = _mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(x2))));
158
159 // x^(+1/64)
160 __m128 x64 = _mm_rsqrt_ps(x32);
mtklein 2016/06/15 15:10:46 If I remember my high school math, the limit of x
msarett 2016/06/15 17:55:03 This is a good question, I'll need to follow up.
msarett 2016/06/16 15:46:11 I tested x^(29/64) and x^(30/64) over the 0 to 1 r
161
162 // x^(+29/64) = x * x^(-1/2) * x^(-1/32) * x^(-1/64)
mtklein 2016/06/15 15:10:46 Why do we need to involve x? Is it not simpler to
msarett 2016/06/15 17:55:03 Long story short: Looks good, let's do that! My f
163 // Note that we also scale to the 0-255 range.
164 // These terms can be combined more minimally with 3 muls and 1 reciprocal. However, order
165 // of operations is more important. We want to be able to start the muls in parallel with
166 // the rsqrts.
167 __m128 scale = _mm_set1_ps(255.0f);
168 return _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(scale, x), x2), x32), _mm_rcp_ps(x64));
169 }
170
171 template <SkColorSpace::GammaNamed kGammaNamed>
172 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,
173 const float matrix[16]) {
174 const float* gamma_to_linear;
175 if (SkColorSpace::kSRGB_GammaNamed == kGammaNamed) {
176 gamma_to_linear = gamma_srgb_to_linear;
20 } else { 177 } else {
21 return (uint8_t) (v + 0.5f); 178 gamma_to_linear = gamma_2dot2_to_linear;
22 } 179 }
23 } 180
24
25 static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_t* src, int len,
26 const float matrix[16]) {
27 while (len-- > 0) {
28 float srcFloats[3];
29 srcFloats[0] = (float) ((*src >> 0) & 0xFF);
30 srcFloats[1] = (float) ((*src >> 8) & 0xFF);
31 srcFloats[2] = (float) ((*src >> 16) & 0xFF);
32
33 // Convert to linear.
34 // TODO (msarett):
35 // We should use X^2.2 here instead of X^2. What is the impact on correctness?
36 // We should be able to get closer to 2.2 at a small performance cost.
37 srcFloats[0] = srcFloats[0] * srcFloats[0];
38 srcFloats[1] = srcFloats[1] * srcFloats[1];
39 srcFloats[2] = srcFloats[2] * srcFloats[2];
40
41 // Convert to dst gamut.
42 float dstFloats[3];
43 // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost always zero.
44 // Should we have another optimized path that avoids the extra addition when they
45 // are zero?
46 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +
47 srcFloats[2] * matrix[8] + matrix[12];
48 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +
49 srcFloats[2] * matrix[9] + matrix[13];
50 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +
51 srcFloats[2] * matrix[10] + matrix[14];
52
53 // Convert to dst gamma.
54 // TODO (msarett):
55 // We should use X^(1/2.2) here instead of X^(1/2). What is the impact on correctness?
56 // We should be able to get closer to (1/2.2) at a small performance cost.
57 dstFloats[0] = sqrtf(dstFloats[0]);
58 dstFloats[1] = sqrtf(dstFloats[1]);
59 dstFloats[2] = sqrtf(dstFloats[2]);
60
61 *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF),
62 clamp_float_to_byte(dstFloats[0]),
63 clamp_float_to_byte(dstFloats[1]),
64 clamp_float_to_byte(dstFloats[2]));
65
66 dst++;
67 src++;
68 }
69 }
70
71 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
72
73 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, int len,
74 const float matrix[16]) {
75 // Load transformation matrix. 181 // Load transformation matrix.
76 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); 182 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]);
77 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); 183 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]);
78 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); 184 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]);
79 __m128 rQgQbQ = _mm_loadu_ps(&matrix[12]);
80 185
81 while (len >= 4) { 186 while (len >= 4) {
82 // Load 4 pixels and convert them to floats. 187 // Convert to linear. The look-up table has perfect accuracy.
83 __m128i rgba = _mm_loadu_si128((const __m128i*) src); 188 __m128 reds = _mm_setr_ps(gamma_to_linear[(src[0] >> 0) & 0xFF],
mtklein 2016/06/15 15:10:46 Let's hook this all into SkOpts_sse41.cpp too? Th
msarett 2016/06/15 17:55:03 SGTM Woohoo! Another small performance win.
84 __m128i byteMask = _mm_set1_epi32(0xFF); 189 gamma_to_linear[(src[1] >> 0) & 0xFF],
85 __m128 reds = _mm_cvtepi32_ps(_mm_and_si128( rgba, byteMask)); 190 gamma_to_linear[(src[2] >> 0) & 0xFF],
86 __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 8), byteMask)); 191 gamma_to_linear[(src[3] >> 0) & 0xFF]);
87 __m128 blues = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16), byteMask)); 192 __m128 greens = _mm_setr_ps(gamma_to_linear[(src[0] >> 8) & 0xFF],
88 193 gamma_to_linear[(src[1] >> 8) & 0xFF],
89 // Convert to linear. 194 gamma_to_linear[(src[2] >> 8) & 0xFF],
90 // FIXME (msarett): 195 gamma_to_linear[(src[3] >> 8) & 0xFF]);
91 // Should we be more accurate? 196 __m128 blues = _mm_setr_ps(gamma_to_linear[(src[0] >> 16) & 0xFF],
92 reds = _mm_mul_ps(reds, reds); 197 gamma_to_linear[(src[1] >> 16) & 0xFF],
93 greens = _mm_mul_ps(greens, greens); 198 gamma_to_linear[(src[2] >> 16) & 0xFF],
94 blues = _mm_mul_ps(blues, blues); 199 gamma_to_linear[(src[3] >> 16) & 0xFF]);
95 200
96 // Apply the transformation matrix to dst gamut. 201 // Apply the transformation matrix to dst gamut.
97 // FIXME (msarett):
98 // rQ, gQ, and bQ are almost always zero. Can we save a couple instructions?
99
100 // Splat rX, rY, rZ, and rQ each across a register. 202 // Splat rX, rY, rZ, and rQ each across a register.
101 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00); 203 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00);
102 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00); 204 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00);
103 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00); 205 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00);
104 __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x00); 206
105 207 // dstReds = rX * reds + rY * greens + rZ * blues
106 // dstReds = rX * reds + rY * greens + rZ * blues + rQ
107 __m128 dstReds = _mm_mul_ps(reds, rX); 208 __m128 dstReds = _mm_mul_ps(reds, rX);
108 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY)); 209 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY));
109 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ)); 210 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ));
110 dstReds = _mm_add_ps(dstReds, rQ);
111 211
112 // Splat gX, gY, gZ, and gQ each across a register. 212 // Splat gX, gY, gZ, and gQ each across a register.
113 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55); 213 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55);
114 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55); 214 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55);
115 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55); 215 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55);
116 __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55); 216
117 217 // dstGreens = gX * reds + gY * greens + gZ * blues
118 // dstGreens = gX * reds + gY * greens + gZ * blues + gQ
119 __m128 dstGreens = _mm_mul_ps(reds, gX); 218 __m128 dstGreens = _mm_mul_ps(reds, gX);
120 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY)); 219 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY));
121 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ)); 220 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ));
122 dstGreens = _mm_add_ps(dstGreens, gQ);
123 221
124 // Splat bX, bY, bZ, and bQ each across a register. 222 // Splat bX, bY, bZ, and bQ each across a register.
125 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA); 223 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);
126 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA); 224 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);
127 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA); 225 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);
128 __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA); 226
129 227 // dstBlues = bX * reds + bY * greens + bZ * blues
130 // dstBlues = bX * reds + bY * greens + bZ * blues + bQ
131 __m128 dstBlues = _mm_mul_ps(reds, bX); 228 __m128 dstBlues = _mm_mul_ps(reds, bX);
132 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY)); 229 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));
133 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ)); 230 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ));
134 dstBlues = _mm_add_ps(dstBlues, bQ);
135 231
136 // Convert to dst gamma. 232 // Convert to dst gamma.
137 // Note that the reciprocal of the reciprocal sqrt, is just a fast sqrt. 233 dstReds = inverse_gamma_linear_to_2dot2(dstReds);
138 // FIXME (msarett): 234 dstGreens = inverse_gamma_linear_to_2dot2(dstGreens);
139 // Should we be more accurate? 235 dstBlues = inverse_gamma_linear_to_2dot2(dstBlues);
140 dstReds = _mm_rcp_ps(_mm_rsqrt_ps(dstReds));
141 dstGreens = _mm_rcp_ps(_mm_rsqrt_ps(dstGreens));
142 dstBlues = _mm_rcp_ps(_mm_rsqrt_ps(dstBlues));
143 236
144 // Clamp floats to 0-255 range. 237 // Clamp floats to 0-255 range.
145 dstReds = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstReds, _mm_set1_ps(255.0f))); 238 // The order of the arguments is important here. We want to make sure that NaN
146 dstGreens = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstGreens, _mm_set1_ps(255.0f))); 239 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN.
147 dstBlues = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstBlues, _mm_set1_ps(255.0f))); 240 dstReds = _mm_min_ps(_mm_max_ps(dstReds, _mm_setzero_ps()), _mm_set1_ps(255.0f));
241 dstGreens = _mm_min_ps(_mm_max_ps(dstGreens, _mm_setzero_ps()), _mm_set1_ps(255.0f));
242 dstBlues = _mm_min_ps(_mm_max_ps(dstBlues, _mm_setzero_ps()), _mm_set1_ps(255.0f));
148 243
149 // Convert to bytes and store to memory. 244 // Convert to bytes and store to memory.
150 rgba = _mm_and_si128(_mm_set1_epi32(0xFF000000), rgba); 245 __m128i rgba = _mm_set1_epi32(0xFF000000);
151 #ifdef SK_PMCOLOR_IS_RGBA
152 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) ); 246 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) );
153 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) ); 247 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );
154 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) ); 248 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) );
155 #else
156 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstBlues) );
157 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );
158 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstReds), 16) );
159 #endif
160 _mm_storeu_si128((__m128i*) dst, rgba); 249 _mm_storeu_si128((__m128i*) dst, rgba);
161 250
162 dst += 4; 251 dst += 4;
163 src += 4; 252 src += 4;
164 len -= 4; 253 len -= 4;
165 } 254 }
166 255
167 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); 256 while (len > 0) {
msarett 2016/06/14 22:52:56 The planar approach ultimately was faster than the
257 // Convert to linear. The look-up table has perfect accuracy.
258 __m128 srcPixel = _mm_setr_ps(gamma_srgb_to_linear[(src[0] >> 0) & 0xFF],
259 gamma_srgb_to_linear[(src[0] >> 8) & 0xFF],
260 gamma_srgb_to_linear[(src[0] >> 16) & 0xFF],
261 gamma_srgb_to_linear[(src[0] >> 24) & 0xFF]);
mtklein 2016/06/15 15:10:46 Can't this lane just be a constant, e.g. 0 or 1?
msarett 2016/06/15 17:55:03 Of course, removing thanks.
262
263 // Apply the transformation matrix to dst gamut.
264 // This time, splat the red, green, and blue components.
265 __m128 r = _mm_shuffle_ps(srcPixel, srcPixel, 0x00);
266 __m128 g = _mm_shuffle_ps(srcPixel, srcPixel, 0x55);
267 __m128 b = _mm_shuffle_ps(srcPixel, srcPixel, 0xAA);
268 __m128 dstPixel = _mm_mul_ps(r, rXgXbX);
mtklein 2016/06/15 15:10:46 funky formatting here?
msarett 2016/06/15 17:55:03 Done.
269 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(g, rYgYbY));
270 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(b, rZgZbZ));
271
272 // Convert to dst gamma.
273 dstPixel = inverse_gamma_linear_to_2dot2(dstPixel);
274
275 // Clamp floats to 0-255 range.
276 dstPixel = _mm_min_ps(_mm_max_ps(dstPixel, _mm_setzero_ps()), _mm_set1_ps(255.0f));
277
278 // Convert to bytes and store to memory.
279 __m128i dstInts = _mm_cvtps_epi32(dstPixel);
280 __m128i dstBytes = _mm_packus_epi16(_mm_packus_epi16(dstInts, dstInts), dstInts);
281 dstBytes = _mm_or_si128(_mm_set1_epi32(0xFF000000), dstBytes);
282 _mm_store_ss((float*) dst, _mm_castsi128_ps(dstBytes));
283
284 dst += 1;
285 src += 1;
286 len -= 1;
287 }
168 } 288 }
169 289
170 #else 290 #else
171 291
172 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, int len, 292 static uint8_t clamp_float_to_byte(float v) {
msarett 2016/06/14 22:52:56 Next two functions copied from above, with minor m
Brian Osman 2016/06/15 14:02:01 Not on ARM?
msarett 2016/06/15 14:05:27 True. NEON implementations are coming soon. We c
293 if (v >= 254.5f) {
294 return 255;
295 } else if (v < 0.5f) {
296 return 0;
297 } else {
298 return (uint8_t) (v + 0.5f);
299 }
300 }
301
302 template <SkColorSpace::GammaNamed kGammaNamed>
303 static void color_xform_RGB1_portable(uint32_t* dst, const uint32_t* src, int len,
304 const float matrix[16]) {
305 const float* gamma_to_linear;
306 if (SkColorSpace::kSRGB_GammaNamed == kGammaNamed) {
307 gamma_to_linear = gamma_srgb_to_linear;
308 } else {
309 gamma_to_linear = gamma_2dot2_to_linear;
310 }
311
312 while (len-- > 0) {
313 // Convert to linear.
314 float srcFloats[3];
315 srcFloats[0] = gamma_to_linear[(*src >> 0) & 0xFF];
316 srcFloats[1] = gamma_to_linear[(*src >> 8) & 0xFF];
317 srcFloats[2] = gamma_to_linear[(*src >> 16) & 0xFF];
318
319 // Convert to dst gamut.
320 float dstFloats[3];
321 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +
322 srcFloats[2] * matrix[8];
323 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +
324 srcFloats[2] * matrix[9];
325 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +
326 srcFloats[2] * matrix[10];
327
328 // Convert to dst gamma.
329 // Note: pow is really, really slow. We will suffer when SSE2 is not supported.
330 dstFloats[0] = pow(dstFloats[0], (1/2.2f)) * 255.0f;
mtklein 2016/06/15 15:10:46 might as well use powf. that at least avoids floa
msarett 2016/06/15 17:55:03 Done.
331 dstFloats[1] = pow(dstFloats[1], (1/2.2f)) * 255.0f;
332 dstFloats[2] = pow(dstFloats[2], (1/2.2f)) * 255.0f;
333
334 *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF),
335 clamp_float_to_byte(dstFloats[0]),
336 clamp_float_to_byte(dstFloats[1]),
337 clamp_float_to_byte(dstFloats[2]));
338
339 dst++;
340 src++;
341 }
342 }
343
344 template <SkColorSpace::GammaNamed kGammaNamed>
345 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,
173 const float matrix[16]) { 346 const float matrix[16]) {
174 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); 347 color_xform_RGB1_portable<kGammaNamed>(dst, src, len, matrix);
175 } 348 }
176 349
177 #endif 350 #endif
178 351
352 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, int len,
353 const float matrix[16]) {
354 color_xform_RGB1<SkColorSpace::kSRGB_GammaNamed>(dst, src, len, matrix);
355 }
356
357 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src, int len,
358 const float matrix[16]) {
359 color_xform_RGB1<SkColorSpace::k2Dot2Curve_GammaNamed>(dst, src, len, matrix);
360 }
361
179 } 362 }
180 363
181 #endif // SkColorXform_opts_DEFINED 364 #endif // SkColorXform_opts_DEFINED
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698