Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(203)

Side by Side Diff: src/opts/SkColorXform_opts.h

Issue 2060823003: Implement fast, correct gamma conversion for color xforms (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Should be 0xFF Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright 2016 Google Inc. 2 * Copyright 2016 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkColorXform_opts_DEFINED 8 #ifndef SkColorXform_opts_DEFINED
9 #define SkColorXform_opts_DEFINED 9 #define SkColorXform_opts_DEFINED
10 10
11 #include "SkColorPriv.h" 11 #include "SkColorPriv.h"
12 12
13 namespace SK_OPTS_NS { 13 namespace SK_OPTS_NS {
14 14
15 static uint8_t clamp_float_to_byte(float v) { 15 static constexpr float gamma_srgb_to_linear[256] = {
16 if (v >= 254.5f) { 16 0.000000000000000000f, 0.000303526983548838f, 0.000607053967097675f, 0.0 00910580950646513f,
17 return 255; 17 0.001214107934195350f, 0.001517634917744190f, 0.001821161901293030f, 0.0 02124688884841860f,
18 } else if (v < 0.5f) { 18 0.002428215868390700f, 0.002731742851939540f, 0.003034518678424960f, 0.0 03346535763899160f,
19 return 0; 19 0.003676507324047440f, 0.004024717018496310f, 0.004391442037410290f, 0.0 04776953480693730f,
20 0.005181516702338390f, 0.005605391624202720f, 0.006048833022857060f, 0.0 06512090792594470f,
21 0.006995410187265390f, 0.007499032043226180f, 0.008023192985384990f, 0.0 08568125618069310f,
22 0.009134058702220790f, 0.009721217320237850f, 0.010329823029626900f, 0.0 10960094006488200f,
23 0.011612245179743900f, 0.012286488356915900f, 0.012983032342173000f, 0.0 13702083047289700f,
24 0.014443843596092500f, 0.015208514422912700f, 0.015996293365509600f, 0.0 16807375752887400f,
25 0.017641954488384100f, 0.018500220128379700f, 0.019382360956935700f, 0.0 20288563056652400f,
26 0.021219010376003600f, 0.022173884793387400f, 0.023153366178110400f, 0.0 24157632448504800f,
27 0.025186859627361600f, 0.026241221894849900f, 0.027320891639074900f, 0.0 28426039504420800f,
28 0.029556834437808800f, 0.030713443732993600f, 0.031896033073011500f, 0.0 33104766570885100f,
29 0.034339806808682200f, 0.035601314875020300f, 0.036889450401100000f, 0.0 38204371595346500f,
30 0.039546235276732800f, 0.040915196906853200f, 0.042311410620809700f, 0.0 43735029256973500f,
31 0.045186204385675500f, 0.046665086336880100f, 0.048171824226889400f, 0.0 49706565984127200f,
32 0.051269458374043200f, 0.052860647023180200f, 0.054480276442442400f, 0.0 56128490049600100f,
33 0.057805430191067200f, 0.059511238162981200f, 0.061246054231617600f, 0.0 63010017653167700f,
34 0.064803266692905800f, 0.066625938643772900f, 0.068478169844400200f, 0.0 70360095696595900f,
35 0.072271850682317500f, 0.074213568380149600f, 0.076185381481307900f, 0.0 78187421805186300f,
36 0.080219820314468300f, 0.082282707129814800f, 0.084376211544148800f, 0.0 86500462036549800f,
37 0.088655586285772900f, 0.090841711183407700f, 0.093058962846687500f, 0.0 95307466630964700f,
38 0.097587347141862500f, 0.099898728247113900f, 0.102241733088101000f, 0.1 04616484091104000f,
39 0.107023102978268000f, 0.109461710778299000f, 0.111932427836906000f, 0.1 14435373826974000f,
40 0.116970667758511000f, 0.119538427988346000f, 0.122138772229602000f, 0.1 24771817560950000f,
41 0.127437680435647000f, 0.130136476690364000f, 0.132868321553818000f, 0.1 35633329655206000f,
42 0.138431615032452000f, 0.141263291140272000f, 0.144128470858058000f, 0.1 47027266497595000f,
43 0.149959789810609000f, 0.152926151996150000f, 0.155926463707827000f, 0.1 58960835060880000f,
44 0.162029375639111000f, 0.165132194501668000f, 0.168269400189691000f, 0.1 71441100732823000f,
45 0.174647403655585000f, 0.177888415983629000f, 0.181164244249860000f, 0.1 84474994500441000f,
46 0.187820772300678000f, 0.191201682740791000f, 0.194617830441576000f, 0.1 98069319559949000f,
47 0.201556253794397000f, 0.205078736390317000f, 0.208636870145256000f, 0.2 12230757414055000f,
48 0.215860500113899000f, 0.219526199729269000f, 0.223227957316809000f, 0.2 26965873510098000f,
49 0.230740048524349000f, 0.234550582161005000f, 0.238397573812271000f, 0.2 42281122465555000f,
50 0.246201326707835000f, 0.250158284729953000f, 0.254152094330827000f, 0.2 58182852921596000f,
51 0.262250657529696000f, 0.266355604802862000f, 0.270497791013066000f, 0.2 74677312060385000f,
52 0.278894263476810000f, 0.283148740429992000f, 0.287440837726918000f, 0.2 91770649817536000f,
53 0.296138270798321000f, 0.300543794415777000f, 0.304987314069886000f, 0.3 09468922817509000f,
54 0.313988713375718000f, 0.318546778125092000f, 0.323143209112951000f, 0.3 27778098056542000f,
55 0.332451536346179000f, 0.337163615048330000f, 0.341914424908661000f, 0.3 46704056355030000f,
56 0.351532599500439000f, 0.356400144145944000f, 0.361306779783510000f, 0.3 66252595598840000f,
57 0.371237680474149000f, 0.376262122990906000f, 0.381326011432530000f, 0.3 86429433787049000f,
58 0.391572477749723000f, 0.396755230725627000f, 0.401977779832196000f, 0.4 07240211901737000f,
59 0.412542613483904000f, 0.417885070848138000f, 0.423267669986072000f, 0.4 28690496613907000f,
60 0.434153636174749000f, 0.439657173840919000f, 0.445201194516228000f, 0.4 50785782838223000f,
61 0.456411023180405000f, 0.462076999654407000f, 0.467783796112159000f, 0.4 73531496148010000f,
62 0.479320183100827000f, 0.485149940056070000f, 0.491020849847836000f, 0.4 96932995060870000f,
63 0.502886458032569000f, 0.508881320854934000f, 0.514917665376521000f, 0.5 20995573204354000f,
64 0.527115125705813000f, 0.533276404010505000f, 0.539479489012107000f, 0.5 45724461370187000f,
65 0.552011401512000000f, 0.558340389634268000f, 0.564711505704929000f, 0.5 71124829464873000f,
66 0.577580440429651000f, 0.584078417891164000f, 0.590618840919337000f, 0.5 97201788363763000f,
67 0.603827338855338000f, 0.610495570807865000f, 0.617206562419651000f, 0.6 23960391675076000f,
68 0.630757136346147000f, 0.637596873994033000f, 0.644479681970582000f, 0.6 51405637419824000f,
69 0.658374817279448000f, 0.665387298282272000f, 0.672443156957688000f, 0.6 79542469633094000f,
70 0.686685312435314000f, 0.693871761291990000f, 0.701101891932973000f, 0.7 08375779891687000f,
71 0.715693500506481000f, 0.723055128921969000f, 0.730460740090354000f, 0.7 37910408772731000f,
72 0.745404209540387000f, 0.752942216776078000f, 0.760524504675292000f, 0.7 68151147247507000f,
73 0.775822218317423000f, 0.783537791526194000f, 0.791297940332630000f, 0.7 99102738014409000f,
74 0.806952257669252000f, 0.814846572216101000f, 0.822785754396284000f, 0.8 30769876774655000f,
75 0.838799011740740000f, 0.846873231509858000f, 0.854992608124234000f, 0.8 63157213454102000f,
76 0.871367119198797000f, 0.879622396887832000f, 0.887923117881966000f, 0.8 96269353374266000f,
77 0.904661174391149000f, 0.913098651793419000f, 0.921581856277295000f, 0.9 30110858375424000f,
78 0.938685728457888000f, 0.947306536733200000f, 0.955973353249286000f, 0.9 64686247894465000f,
79 0.973445290398413000f, 0.982250550333117000f, 0.991102097113830000f, 1.0 00000000000000000f,
80 };
81
82 static constexpr float gamma_2dot2_to_linear[256] = {
83 0.000000000000000000f, 0.000005077051900662f, 0.000023328004666099f, 0.0 00056921765712193f,
84 0.000107187362341244f, 0.000175123977503027f, 0.000261543754548491f, 0.0 00367136269815943f,
85 0.000492503787191433f, 0.000638182842167022f, 0.000804658499513058f, 0.0 00992374304074325f,
86 0.001201739522438400f, 0.001433134589671860f, 0.001686915316789280f, 0.0 01963416213396470f,
87 0.002262953160706430f, 0.002585825596234170f, 0.002932318323938360f, 0.0 03302703032003640f,
88 0.003697239578900130f, 0.004116177093282750f, 0.004559754922526020f, 0.0 05028203456855540f,
89 0.005521744850239660f, 0.006040593654849810f, 0.006584957382581690f, 0.0 07155037004573030f,
90 0.007751027397660610f, 0.008373117745148580f, 0.009021491898012130f, 0.0 09696328701658230f,
91 0.010397802292555300f, 0.011126082368383200f, 0.011881334434813700f, 0.0 12663720031582100f,
92 0.013473396940142600f, 0.014310519374884100f, 0.015175238159625200f, 0.0 16067700890886900f,
93 0.016988052089250000f, 0.017936433339950200f, 0.018912983423721500f, 0.0 19917838438785700f,
94 0.020951131914781100f, 0.022012994919336500f, 0.023103556157921400f, 0.0 24222942067534200f,
95 0.025371276904734600f, 0.026548682828472900f, 0.027755279978126000f, 0.0 28991186547107800f,
96 0.030256518852388700f, 0.031551391400226400f, 0.032875916948383800f, 0.0 34230206565082000f,
97 0.035614369684918800f, 0.037028514161960200f, 0.038472746320194600f, 0.0 39947171001525600f,
98 0.041451891611462500f, 0.042987010162657100f, 0.044552627316421400f, 0.0 46148842422351000f,
99 0.047775753556170600f, 0.049433457555908000f, 0.051122050056493400f, 0.0 52841625522879000f,
100 0.054592277281760300f, 0.056374097551979800f, 0.058187177473685400f, 0.0 60031607136313200f,
101 0.061907475605455800f, 0.063814870948677200f, 0.065753880260330100f, 0.0 67724589685424300f,
102 0.069727084442598800f, 0.071761448846239100f, 0.073827766327784600f, 0.0 75926119456264800f,
103 0.078056589958101900f, 0.080219258736215100f, 0.082414205888459200f, 0.0 84641510725429500f,
104 0.086901251787660300f, 0.089193506862247800f, 0.091518352998919500f, 0.0 93875866525577800f,
105 0.096266123063339700f, 0.098689197541094500f, 0.101145164209600000f, 0.1 03634096655137000f,
106 0.106156067812744000f, 0.108711149979039000f, 0.111299414824660000f, 0.1 13920933406333000f,
107 0.116575776178572000f, 0.119264013005047000f, 0.121985713169619000f, 0.1 24740945387051000f,
108 0.127529777813422000f, 0.130352278056244000f, 0.133208513184300000f, 0.1 36098549737202000f,
109 0.139022453734703000f, 0.141980290685736000f, 0.144972125597231000f, 0.1 47998022982685000f,
110 0.151058046870511000f, 0.154152260812165000f, 0.157280727890073000f, 0.1 60443510725344000f,
111 0.163640671485290000f, 0.166872271890766000f, 0.170138373223312000f, 0.1 73439036332135000f,
112 0.176774321640903000f, 0.180144289154390000f, 0.183548998464951000f, 0.1 86988508758844000f,
113 0.190462878822409000f, 0.193972167048093000f, 0.197516431440340000f, 0.2 01095729621346000f,
114 0.204710118836677000f, 0.208359655960767000f, 0.212044397502288000f, 0.2 15764399609395000f,
115 0.219519718074868000f, 0.223310408341127000f, 0.227136525505149000f, 0.2 30998124323267000f,
116 0.234895259215880000f, 0.238827984272048000f, 0.242796353254002000f, 0.2 46800419601550000f,
117 0.250840236436400000f, 0.254915856566385000f, 0.259027332489606000f, 0.2 63174716398492000f,
118 0.267358060183772000f, 0.271577415438375000f, 0.275832833461245000f, 0.2 80124365261085000f,
119 0.284452061560024000f, 0.288815972797219000f, 0.293216149132375000f, 0.2 97652640449211000f,
120 0.302125496358853000f, 0.306634766203158000f, 0.311180499057984000f, 0.3 15762743736397000f,
121 0.320381548791810000f, 0.325036962521076000f, 0.329729032967515000f, 0.3 34457807923889000f,
122 0.339223334935327000f, 0.344025661302187000f, 0.348864834082879000f, 0.3 53740900096629000f,
123 0.358653905926199000f, 0.363603897920553000f, 0.368590922197487000f, 0.3 73615024646202000f,
124 0.378676250929840000f, 0.383774646487975000f, 0.388910256539059000f, 0.3 94083126082829000f,
125 0.399293299902674000f, 0.404540822567962000f, 0.409825738436323000f, 0.4 15148091655907000f,
126 0.420507926167587000f, 0.425905285707146000f, 0.431340213807410000f, 0.4 36812753800359000f,
127 0.442322948819202000f, 0.447870841800410000f, 0.453456475485731000f, 0.4 59079892424160000f,
128 0.464741134973889000f, 0.470440245304218000f, 0.476177265397440000f, 0.4 81952237050698000f,
129 0.487765201877811000f, 0.493616201311074000f, 0.499505276603030000f, 0.5 05432468828216000f,
130 0.511397818884880000f, 0.517401367496673000f, 0.523443155214325000f, 0.5 29523222417277000f,
131 0.535641609315311000f, 0.541798355950137000f, 0.547993502196972000f, 0.5 54227087766085000f,
132 0.560499152204328000f, 0.566809734896638000f, 0.573158875067523000f, 0.5 79546611782525000f,
133 0.585972983949661000f, 0.592438030320847000f, 0.598941789493296000f, 0.6 05484299910907000f,
134 0.612065599865624000f, 0.618685727498780000f, 0.625344720802427000f, 0.6 32042617620641000f,
135 0.638779455650817000f, 0.645555272444935000f, 0.652370105410821000f, 0.6 59223991813387000f,
136 0.666116968775851000f, 0.673049073280942000f, 0.680020342172095000f, 0.6 87030812154625000f,
137 0.694080519796882000f, 0.701169501531402000f, 0.708297793656032000f, 0.7 15465432335048000f,
138 0.722672453600255000f, 0.729918893352071000f, 0.737204787360605000f, 0.7 44530171266715000f,
139 0.751895080583051000f, 0.759299550695091000f, 0.766743616862161000f, 0.7 74227314218442000f,
140 0.781750677773962000f, 0.789313742415586000f, 0.796916542907978000f, 0.8 04559113894567000f,
141 0.812241489898490000f, 0.819963705323528000f, 0.827725794455034000f, 0.8 35527791460841000f,
142 0.843369730392169000f, 0.851251645184515000f, 0.859173569658532000f, 0.8 67135537520905000f,
143 0.875137582365205000f, 0.883179737672745000f, 0.891262036813419000f, 0.8 99384513046529000f,
144 0.907547199521614000f, 0.915750129279253000f, 0.923993335251873000f, 0.9 32276850264543000f,
145 0.940600707035753000f, 0.948964938178195000f, 0.957369576199527000f, 0.9 65814653503130000f,
146 0.974300202388861000f, 0.982826255053791000f, 0.991392843592940000f, 1.0 00000000000000000f,
147 };
148
149 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
150
151 // x^(29/64) is a very good approximation of the true value, x^(1/2.2).
152 static __m128 inverse_gamma_linear_to_2dot2(__m128 x) {
mtklein_C 2016/06/16 13:27:41 Just some naming questions. Some of these names f
msarett 2016/06/16 15:46:11 sgtm, shorter names are better.
153 // x^(-1/2)
154 __m128 x2 = _mm_rsqrt_ps(x);
155
156 // x^(-1/32)
157 __m128 x32 = _mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(x2))));
158
159 // x^(+1/64)
160 __m128 x64 = _mm_rsqrt_ps(x32);
161
162 // x^(+29/64) = x^(+1/2) * x^(-1/32) * x^(-1/64)
163 // Note that we also scale to the 0-255 range.
164 // These terms can be combined more minimally with 3 muls and 1 reciprocal. However, this
165 // is faster, because it allows us to start the muls in parallel with the rsqrts.
166 __m128 scale = _mm_set1_ps(255.0f);
167 return _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(scale, _mm_rcp_ps(x2)), x32), _mm_rc p_ps(x64));
168 }
169
170 template <SkColorSpace::GammaNamed kGammaNamed>
171 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,
172 const float matrix[16]) {
mtklein_C 2016/06/16 13:27:41 This line might want to be re-wrapped?
msarett 2016/06/16 15:46:12 Done.
173 const float* gamma_to_linear;
174 if (SkColorSpace::kSRGB_GammaNamed == kGammaNamed) {
mtklein_C 2016/06/16 13:27:41 Since we're not otherwise using kGammaNamed, I thi
msarett 2016/06/16 15:46:12 Woohoo, this is cool!
175 gamma_to_linear = gamma_srgb_to_linear;
20 } else { 176 } else {
21 return (uint8_t) (v + 0.5f); 177 gamma_to_linear = gamma_2dot2_to_linear;
22 } 178 }
23 } 179
24
25 static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_ t* src, int len,
26 const float matrix[16]) {
27 while (len-- > 0) {
28 float srcFloats[3];
29 srcFloats[0] = (float) ((*src >> 0) & 0xFF);
30 srcFloats[1] = (float) ((*src >> 8) & 0xFF);
31 srcFloats[2] = (float) ((*src >> 16) & 0xFF);
32
33 // Convert to linear.
34 // TODO (msarett):
35 // We should use X^2.2 here instead of X^2. What is the impact on correctness?
36 // We should be able to get closer to 2.2 at a small performance cost.
37 srcFloats[0] = srcFloats[0] * srcFloats[0];
38 srcFloats[1] = srcFloats[1] * srcFloats[1];
39 srcFloats[2] = srcFloats[2] * srcFloats[2];
40
41 // Convert to dst gamut.
42 float dstFloats[3];
43 // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost always zero.
44 // Should we have another optimized path that avoids the extra addition when they
45 // are zero?
46 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +
47 srcFloats[2] * matrix[8] + matrix[12];
48 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +
49 srcFloats[2] * matrix[9] + matrix[13];
50 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +
51 srcFloats[2] * matrix[10] + matrix[14];
52
53 // Convert to dst gamma.
54 // TODO (msarett):
55 // We should use X^(1/2.2) here instead of X^(1/2). What is the impact on correctness?
56 // We should be able to get closer to (1/2.2) at a small performance cost.
57 dstFloats[0] = sqrtf(dstFloats[0]);
58 dstFloats[1] = sqrtf(dstFloats[1]);
59 dstFloats[2] = sqrtf(dstFloats[2]);
60
61 *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF),
62 clamp_float_to_byte(dstFloats[0]),
63 clamp_float_to_byte(dstFloats[1]),
64 clamp_float_to_byte(dstFloats[2]));
65
66 dst++;
67 src++;
68 }
69 }
70
71 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
72
73 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i nt len,
74 const float matrix[16]) {
75 // Load transformation matrix. 180 // Load transformation matrix.
76 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); 181 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]);
77 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); 182 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]);
78 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); 183 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]);
79 __m128 rQgQbQ = _mm_loadu_ps(&matrix[12]);
80 184
81 while (len >= 4) { 185 while (len >= 4) {
82 // Load 4 pixels and convert them to floats. 186 // Convert to linear. The look-up table has perfect accuracy.
83 __m128i rgba = _mm_loadu_si128((const __m128i*) src); 187 __m128 reds = _mm_setr_ps(gamma_to_linear[(src[0] >> 0) & 0xFF],
84 __m128i byteMask = _mm_set1_epi32(0xFF); 188 gamma_to_linear[(src[1] >> 0) & 0xFF],
85 __m128 reds = _mm_cvtepi32_ps(_mm_and_si128( rgba, byteMask)); 189 gamma_to_linear[(src[2] >> 0) & 0xFF],
86 __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 8), byteMask)); 190 gamma_to_linear[(src[3] >> 0) & 0xFF]);
87 __m128 blues = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16), byteMask)); 191 __m128 greens = _mm_setr_ps(gamma_to_linear[(src[0] >> 8) & 0xFF],
88 192 gamma_to_linear[(src[1] >> 8) & 0xFF],
89 // Convert to linear. 193 gamma_to_linear[(src[2] >> 8) & 0xFF],
90 // FIXME (msarett): 194 gamma_to_linear[(src[3] >> 8) & 0xFF]);
91 // Should we be more accurate? 195 __m128 blues = _mm_setr_ps(gamma_to_linear[(src[0] >> 16) & 0xFF],
92 reds = _mm_mul_ps(reds, reds); 196 gamma_to_linear[(src[1] >> 16) & 0xFF],
93 greens = _mm_mul_ps(greens, greens); 197 gamma_to_linear[(src[2] >> 16) & 0xFF],
94 blues = _mm_mul_ps(blues, blues); 198 gamma_to_linear[(src[3] >> 16) & 0xFF]);
95 199
96 // Apply the transformation matrix to dst gamut. 200 // Apply the transformation matrix to dst gamut.
97 // FIXME (msarett):
98 // rQ, gQ, and bQ are almost always zero. Can we save a couple instructions?
99
100 // Splat rX, rY, rZ, and rQ each across a register. 201 // Splat rX, rY, rZ, and rQ each across a register.
101 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00); 202 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00);
102 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00); 203 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00);
103 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00); 204 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00);
104 __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x00); 205
105 206 // dstReds = rX * reds + rY * greens + rZ * blues
106 // dstReds = rX * reds + rY * greens + rZ * blues + rQ
107 __m128 dstReds = _mm_mul_ps(reds, rX); 207 __m128 dstReds = _mm_mul_ps(reds, rX);
108 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY)); 208 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY));
109 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ)); 209 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ));
110 dstReds = _mm_add_ps(dstReds, rQ);
111 210
112 // Splat gX, gY, gZ, and gQ each across a register. 211 // Splat gX, gY, gZ, and gQ each across a register.
113 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55); 212 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55);
114 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55); 213 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55);
115 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55); 214 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55);
116 __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55); 215
117 216 // dstGreens = gX * reds + gY * greens + gZ * blues
118 // dstGreens = gX * reds + gY * greens + gZ * blues + gQ
119 __m128 dstGreens = _mm_mul_ps(reds, gX); 217 __m128 dstGreens = _mm_mul_ps(reds, gX);
120 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY)); 218 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY));
121 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ)); 219 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ));
122 dstGreens = _mm_add_ps(dstGreens, gQ);
123 220
124 // Splat bX, bY, bZ, and bQ each across a register. 221 // Splat bX, bY, bZ, and bQ each across a register.
125 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA); 222 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);
126 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA); 223 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);
127 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA); 224 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);
128 __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA); 225
129 226 // dstBlues = bX * reds + bY * greens + bZ * blues
130 // dstBlues = bX * reds + bY * greens + bZ * blues + bQ
131 __m128 dstBlues = _mm_mul_ps(reds, bX); 227 __m128 dstBlues = _mm_mul_ps(reds, bX);
132 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY)); 228 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));
133 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ)); 229 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ));
134 dstBlues = _mm_add_ps(dstBlues, bQ);
135 230
136 // Convert to dst gamma. 231 // Convert to dst gamma.
137 // Note that the reciprocal of the reciprocal sqrt, is just a fast sqrt. 232 dstReds = inverse_gamma_linear_to_2dot2(dstReds);
138 // FIXME (msarett): 233 dstGreens = inverse_gamma_linear_to_2dot2(dstGreens);
139 // Should we be more accurate? 234 dstBlues = inverse_gamma_linear_to_2dot2(dstBlues);
140 dstReds = _mm_rcp_ps(_mm_rsqrt_ps(dstReds));
141 dstGreens = _mm_rcp_ps(_mm_rsqrt_ps(dstGreens));
142 dstBlues = _mm_rcp_ps(_mm_rsqrt_ps(dstBlues));
143 235
144 // Clamp floats to 0-255 range. 236 // Clamp floats to 0-255 range.
145 dstReds = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstReds, _mm_set1_ ps(255.0f))); 237 // The order of the arguments is important here. We want to make sure t hat NaN
146 dstGreens = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstGreens, _mm_set1_ ps(255.0f))); 238 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN.
mtklein_C 2016/06/16 13:27:40 Do we have test cases exercising the NaN input? J
msarett 2016/06/16 15:46:11 Yes we are hitting this case quite frequently actu
147 dstBlues = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstBlues, _mm_set1_ ps(255.0f))); 239 dstReds = _mm_min_ps(_mm_max_ps(dstReds, _mm_setzero_ps()), _mm_set1 _ps(255.0f));
240 dstGreens = _mm_min_ps(_mm_max_ps(dstGreens, _mm_setzero_ps()), _mm_set1 _ps(255.0f));
241 dstBlues = _mm_min_ps(_mm_max_ps(dstBlues, _mm_setzero_ps()), _mm_set1 _ps(255.0f));
148 242
149 // Convert to bytes and store to memory. 243 // Convert to bytes and store to memory.
150 rgba = _mm_and_si128(_mm_set1_epi32(0xFF000000), rgba); 244 __m128i rgba = _mm_set1_epi32(0xFF000000);
151 #ifdef SK_PMCOLOR_IS_RGBA
152 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) ); 245 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) );
153 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) ); 246 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );
154 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) ); 247 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) );
155 #else
156 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstBlues) );
157 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );
158 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstReds), 16) );
159 #endif
160 _mm_storeu_si128((__m128i*) dst, rgba); 248 _mm_storeu_si128((__m128i*) dst, rgba);
161 249
162 dst += 4; 250 dst += 4;
163 src += 4; 251 src += 4;
164 len -= 4; 252 len -= 4;
165 } 253 }
166 254
167 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); 255 while (len > 0) {
256 // Convert to linear. The look-up table has perfect accuracy.
257 __m128 srcPixel = _mm_setr_ps(gamma_srgb_to_linear[(src[0] >> 0) & 0xFF ],
mtklein_C 2016/06/16 13:27:40 Wouldn't this part be simpler as, // Splat red, g
msarett 2016/06/16 15:46:12 Yes I like this better, done.
258 gamma_srgb_to_linear[(src[0] >> 8) & 0xFF ],
259 gamma_srgb_to_linear[(src[0] >> 16) & 0xFF ],
260 0.0f);
261
262 // Apply the transformation matrix to dst gamut.
263 // This time, splat the red, green, and blue components.
264 __m128 r = _mm_shuffle_ps(srcPixel, srcPixel, 0x00);
265 __m128 g = _mm_shuffle_ps(srcPixel, srcPixel, 0x55);
266 __m128 b = _mm_shuffle_ps(srcPixel, srcPixel, 0xAA);
267 __m128 dstPixel = _mm_mul_ps(r, rXgXbX);
268 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(g, rYgYbY));
269 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(b, rZgZbZ));
270
271 // Convert to dst gamma.
272 dstPixel = inverse_gamma_linear_to_2dot2(dstPixel);
273
274 // Clamp floats to 0-255 range.
275 dstPixel = _mm_min_ps(_mm_max_ps(dstPixel, _mm_setzero_ps()), _mm_set1_p s(255.0f));
mtklein_C 2016/06/16 13:27:41 Let's make the clamping a static function? That c
msarett 2016/06/16 15:46:12 SGTM
276
277 // Convert to bytes and store to memory.
278 __m128i dstInts = _mm_cvtps_epi32(dstPixel);
279 __m128i dstBytes = _mm_packus_epi16(_mm_packus_epi16(dstInts, dstInts), dstInts);
280 dstBytes = _mm_or_si128(_mm_set1_epi32(0xFF000000), dstBytes);
281 _mm_store_ss((float*) dst, _mm_castsi128_ps(dstBytes));
282
283 dst += 1;
284 src += 1;
285 len -= 1;
286 }
168 } 287 }
169 288
170 #else 289 #else
171 290
172 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i nt len, 291 static uint8_t clamp_float_to_byte(float v) {
292 if (v >= 254.5f) {
293 return 255;
294 } else if (v < 0.5f) {
295 return 0;
296 } else {
297 return (uint8_t) (v + 0.5f);
298 }
299 }
300
301 template <SkColorSpace::GammaNamed kGammaNamed>
302 static void color_xform_RGB1_portable(uint32_t* dst, const uint32_t* src, int le n,
303 const float matrix[16]) {
304 const float* gamma_to_linear;
305 if (SkColorSpace::kSRGB_GammaNamed == kGammaNamed) {
306 gamma_to_linear = gamma_srgb_to_linear;
307 } else {
308 gamma_to_linear = gamma_2dot2_to_linear;
309 }
310
311 while (len-- > 0) {
312 // Convert to linear.
313 float srcFloats[3];
314 srcFloats[0] = gamma_to_linear[(*src >> 0) & 0xFF];
315 srcFloats[1] = gamma_to_linear[(*src >> 8) & 0xFF];
316 srcFloats[2] = gamma_to_linear[(*src >> 16) & 0xFF];
317
318 // Convert to dst gamut.
319 float dstFloats[3];
320 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +
321 srcFloats[2] * matrix[8];
322 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +
323 srcFloats[2] * matrix[9];
324 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +
325 srcFloats[2] * matrix[10];
326
327 // Convert to dst gamma.
328 // Note: pow is really, really slow. We will suffer when SSE2 is not supported.
329 dstFloats[0] = powf(dstFloats[0], (1/2.2f)) * 255.0f;
330 dstFloats[1] = powf(dstFloats[1], (1/2.2f)) * 255.0f;
331 dstFloats[2] = powf(dstFloats[2], (1/2.2f)) * 255.0f;
332
333 *dst = SkPackARGB32NoCheck(0xFF,
mtklein_C 2016/06/16 13:27:41 Didn't you already munge the matrix so that we sho
msarett 2016/06/16 15:46:12 Agreed, this is wrong. Using shifts.
334 clamp_float_to_byte(dstFloats[0]),
335 clamp_float_to_byte(dstFloats[1]),
336 clamp_float_to_byte(dstFloats[2]));
337
338 dst++;
339 src++;
340 }
341 }
342
343 template <SkColorSpace::GammaNamed kGammaNamed>
344 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,
173 const float matrix[16]) { 345 const float matrix[16]) {
174 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); 346 color_xform_RGB1_portable<kGammaNamed>(dst, src, len, matrix);
mtklein_C 2016/06/16 13:27:40 This appears to be a complete passthrough function
msarett 2016/06/16 15:46:12 Of course, Done.
175 } 347 }
176 348
177 #endif 349 #endif
178 350
351 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, i nt len,
352 const float matrix[16]) {
353 color_xform_RGB1<SkColorSpace::kSRGB_GammaNamed>(dst, src, len, matrix);
354 }
355
356 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src, int len,
357 const float matrix[16]) {
358 color_xform_RGB1<SkColorSpace::k2Dot2Curve_GammaNamed>(dst, src, len, matrix );
359 }
360
179 } 361 }
180 362
181 #endif // SkColorXform_opts_DEFINED 363 #endif // SkColorXform_opts_DEFINED
OLDNEW
« src/core/SkOpts.h ('K') | « src/core/SkOpts.cpp ('k') | src/opts/SkOpts_sse41.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698