OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkColorXform_opts_DEFINED | 8 #ifndef SkColorXform_opts_DEFINED |
9 #define SkColorXform_opts_DEFINED | 9 #define SkColorXform_opts_DEFINED |
10 | 10 |
11 #include "SkColorPriv.h" | 11 #include "SkColorPriv.h" |
12 | 12 |
13 namespace SK_OPTS_NS { | 13 namespace SK_OPTS_NS { |
14 | 14 |
15 static uint8_t clamp_float_to_byte(float v) { | 15 static constexpr float gamma_srgb_to_linear[256] = { |
16 if (v >= 254.5f) { | 16 0.000000000000000000f, 0.000303526983548838f, 0.000607053967097675f, 0.0 00910580950646513f, |
17 return 255; | 17 0.001214107934195350f, 0.001517634917744190f, 0.001821161901293030f, 0.0 02124688884841860f, |
18 } else if (v < 0.5f) { | 18 0.002428215868390700f, 0.002731742851939540f, 0.003034518678424960f, 0.0 03346535763899160f, |
19 return 0; | 19 0.003676507324047440f, 0.004024717018496310f, 0.004391442037410290f, 0.0 04776953480693730f, |
20 0.005181516702338390f, 0.005605391624202720f, 0.006048833022857060f, 0.0 06512090792594470f, | |
21 0.006995410187265390f, 0.007499032043226180f, 0.008023192985384990f, 0.0 08568125618069310f, | |
22 0.009134058702220790f, 0.009721217320237850f, 0.010329823029626900f, 0.0 10960094006488200f, | |
23 0.011612245179743900f, 0.012286488356915900f, 0.012983032342173000f, 0.0 13702083047289700f, | |
24 0.014443843596092500f, 0.015208514422912700f, 0.015996293365509600f, 0.0 16807375752887400f, | |
25 0.017641954488384100f, 0.018500220128379700f, 0.019382360956935700f, 0.0 20288563056652400f, | |
26 0.021219010376003600f, 0.022173884793387400f, 0.023153366178110400f, 0.0 24157632448504800f, | |
27 0.025186859627361600f, 0.026241221894849900f, 0.027320891639074900f, 0.0 28426039504420800f, | |
28 0.029556834437808800f, 0.030713443732993600f, 0.031896033073011500f, 0.0 33104766570885100f, | |
29 0.034339806808682200f, 0.035601314875020300f, 0.036889450401100000f, 0.0 38204371595346500f, | |
30 0.039546235276732800f, 0.040915196906853200f, 0.042311410620809700f, 0.0 43735029256973500f, | |
31 0.045186204385675500f, 0.046665086336880100f, 0.048171824226889400f, 0.0 49706565984127200f, | |
32 0.051269458374043200f, 0.052860647023180200f, 0.054480276442442400f, 0.0 56128490049600100f, | |
33 0.057805430191067200f, 0.059511238162981200f, 0.061246054231617600f, 0.0 63010017653167700f, | |
34 0.064803266692905800f, 0.066625938643772900f, 0.068478169844400200f, 0.0 70360095696595900f, | |
35 0.072271850682317500f, 0.074213568380149600f, 0.076185381481307900f, 0.0 78187421805186300f, | |
36 0.080219820314468300f, 0.082282707129814800f, 0.084376211544148800f, 0.0 86500462036549800f, | |
37 0.088655586285772900f, 0.090841711183407700f, 0.093058962846687500f, 0.0 95307466630964700f, | |
38 0.097587347141862500f, 0.099898728247113900f, 0.102241733088101000f, 0.1 04616484091104000f, | |
39 0.107023102978268000f, 0.109461710778299000f, 0.111932427836906000f, 0.1 14435373826974000f, | |
40 0.116970667758511000f, 0.119538427988346000f, 0.122138772229602000f, 0.1 24771817560950000f, | |
41 0.127437680435647000f, 0.130136476690364000f, 0.132868321553818000f, 0.1 35633329655206000f, | |
42 0.138431615032452000f, 0.141263291140272000f, 0.144128470858058000f, 0.1 47027266497595000f, | |
43 0.149959789810609000f, 0.152926151996150000f, 0.155926463707827000f, 0.1 58960835060880000f, | |
44 0.162029375639111000f, 0.165132194501668000f, 0.168269400189691000f, 0.1 71441100732823000f, | |
45 0.174647403655585000f, 0.177888415983629000f, 0.181164244249860000f, 0.1 84474994500441000f, | |
46 0.187820772300678000f, 0.191201682740791000f, 0.194617830441576000f, 0.1 98069319559949000f, | |
47 0.201556253794397000f, 0.205078736390317000f, 0.208636870145256000f, 0.2 12230757414055000f, | |
48 0.215860500113899000f, 0.219526199729269000f, 0.223227957316809000f, 0.2 26965873510098000f, | |
49 0.230740048524349000f, 0.234550582161005000f, 0.238397573812271000f, 0.2 42281122465555000f, | |
50 0.246201326707835000f, 0.250158284729953000f, 0.254152094330827000f, 0.2 58182852921596000f, | |
51 0.262250657529696000f, 0.266355604802862000f, 0.270497791013066000f, 0.2 74677312060385000f, | |
52 0.278894263476810000f, 0.283148740429992000f, 0.287440837726918000f, 0.2 91770649817536000f, | |
53 0.296138270798321000f, 0.300543794415777000f, 0.304987314069886000f, 0.3 09468922817509000f, | |
54 0.313988713375718000f, 0.318546778125092000f, 0.323143209112951000f, 0.3 27778098056542000f, | |
55 0.332451536346179000f, 0.337163615048330000f, 0.341914424908661000f, 0.3 46704056355030000f, | |
56 0.351532599500439000f, 0.356400144145944000f, 0.361306779783510000f, 0.3 66252595598840000f, | |
57 0.371237680474149000f, 0.376262122990906000f, 0.381326011432530000f, 0.3 86429433787049000f, | |
58 0.391572477749723000f, 0.396755230725627000f, 0.401977779832196000f, 0.4 07240211901737000f, | |
59 0.412542613483904000f, 0.417885070848138000f, 0.423267669986072000f, 0.4 28690496613907000f, | |
60 0.434153636174749000f, 0.439657173840919000f, 0.445201194516228000f, 0.4 50785782838223000f, | |
61 0.456411023180405000f, 0.462076999654407000f, 0.467783796112159000f, 0.4 73531496148010000f, | |
62 0.479320183100827000f, 0.485149940056070000f, 0.491020849847836000f, 0.4 96932995060870000f, | |
63 0.502886458032569000f, 0.508881320854934000f, 0.514917665376521000f, 0.5 20995573204354000f, | |
64 0.527115125705813000f, 0.533276404010505000f, 0.539479489012107000f, 0.5 45724461370187000f, | |
65 0.552011401512000000f, 0.558340389634268000f, 0.564711505704929000f, 0.5 71124829464873000f, | |
66 0.577580440429651000f, 0.584078417891164000f, 0.590618840919337000f, 0.5 97201788363763000f, | |
67 0.603827338855338000f, 0.610495570807865000f, 0.617206562419651000f, 0.6 23960391675076000f, | |
68 0.630757136346147000f, 0.637596873994033000f, 0.644479681970582000f, 0.6 51405637419824000f, | |
69 0.658374817279448000f, 0.665387298282272000f, 0.672443156957688000f, 0.6 79542469633094000f, | |
70 0.686685312435314000f, 0.693871761291990000f, 0.701101891932973000f, 0.7 08375779891687000f, | |
71 0.715693500506481000f, 0.723055128921969000f, 0.730460740090354000f, 0.7 37910408772731000f, | |
72 0.745404209540387000f, 0.752942216776078000f, 0.760524504675292000f, 0.7 68151147247507000f, | |
73 0.775822218317423000f, 0.783537791526194000f, 0.791297940332630000f, 0.7 99102738014409000f, | |
74 0.806952257669252000f, 0.814846572216101000f, 0.822785754396284000f, 0.8 30769876774655000f, | |
75 0.838799011740740000f, 0.846873231509858000f, 0.854992608124234000f, 0.8 63157213454102000f, | |
76 0.871367119198797000f, 0.879622396887832000f, 0.887923117881966000f, 0.8 96269353374266000f, | |
77 0.904661174391149000f, 0.913098651793419000f, 0.921581856277295000f, 0.9 30110858375424000f, | |
78 0.938685728457888000f, 0.947306536733200000f, 0.955973353249286000f, 0.9 64686247894465000f, | |
79 0.973445290398413000f, 0.982250550333117000f, 0.991102097113830000f, 1.0 00000000000000000f, | |
80 }; | |
81 | |
82 static constexpr float gamma_2dot2_to_linear[256] = { | |
83 0.000000000000000000f, 0.000005077051900662f, 0.000023328004666099f, 0.0 00056921765712193f, | |
84 0.000107187362341244f, 0.000175123977503027f, 0.000261543754548491f, 0.0 00367136269815943f, | |
85 0.000492503787191433f, 0.000638182842167022f, 0.000804658499513058f, 0.0 00992374304074325f, | |
86 0.001201739522438400f, 0.001433134589671860f, 0.001686915316789280f, 0.0 01963416213396470f, | |
87 0.002262953160706430f, 0.002585825596234170f, 0.002932318323938360f, 0.0 03302703032003640f, | |
88 0.003697239578900130f, 0.004116177093282750f, 0.004559754922526020f, 0.0 05028203456855540f, | |
89 0.005521744850239660f, 0.006040593654849810f, 0.006584957382581690f, 0.0 07155037004573030f, | |
90 0.007751027397660610f, 0.008373117745148580f, 0.009021491898012130f, 0.0 09696328701658230f, | |
91 0.010397802292555300f, 0.011126082368383200f, 0.011881334434813700f, 0.0 12663720031582100f, | |
92 0.013473396940142600f, 0.014310519374884100f, 0.015175238159625200f, 0.0 16067700890886900f, | |
93 0.016988052089250000f, 0.017936433339950200f, 0.018912983423721500f, 0.0 19917838438785700f, | |
94 0.020951131914781100f, 0.022012994919336500f, 0.023103556157921400f, 0.0 24222942067534200f, | |
95 0.025371276904734600f, 0.026548682828472900f, 0.027755279978126000f, 0.0 28991186547107800f, | |
96 0.030256518852388700f, 0.031551391400226400f, 0.032875916948383800f, 0.0 34230206565082000f, | |
97 0.035614369684918800f, 0.037028514161960200f, 0.038472746320194600f, 0.0 39947171001525600f, | |
98 0.041451891611462500f, 0.042987010162657100f, 0.044552627316421400f, 0.0 46148842422351000f, | |
99 0.047775753556170600f, 0.049433457555908000f, 0.051122050056493400f, 0.0 52841625522879000f, | |
100 0.054592277281760300f, 0.056374097551979800f, 0.058187177473685400f, 0.0 60031607136313200f, | |
101 0.061907475605455800f, 0.063814870948677200f, 0.065753880260330100f, 0.0 67724589685424300f, | |
102 0.069727084442598800f, 0.071761448846239100f, 0.073827766327784600f, 0.0 75926119456264800f, | |
103 0.078056589958101900f, 0.080219258736215100f, 0.082414205888459200f, 0.0 84641510725429500f, | |
104 0.086901251787660300f, 0.089193506862247800f, 0.091518352998919500f, 0.0 93875866525577800f, | |
105 0.096266123063339700f, 0.098689197541094500f, 0.101145164209600000f, 0.1 03634096655137000f, | |
106 0.106156067812744000f, 0.108711149979039000f, 0.111299414824660000f, 0.1 13920933406333000f, | |
107 0.116575776178572000f, 0.119264013005047000f, 0.121985713169619000f, 0.1 24740945387051000f, | |
108 0.127529777813422000f, 0.130352278056244000f, 0.133208513184300000f, 0.1 36098549737202000f, | |
109 0.139022453734703000f, 0.141980290685736000f, 0.144972125597231000f, 0.1 47998022982685000f, | |
110 0.151058046870511000f, 0.154152260812165000f, 0.157280727890073000f, 0.1 60443510725344000f, | |
111 0.163640671485290000f, 0.166872271890766000f, 0.170138373223312000f, 0.1 73439036332135000f, | |
112 0.176774321640903000f, 0.180144289154390000f, 0.183548998464951000f, 0.1 86988508758844000f, | |
113 0.190462878822409000f, 0.193972167048093000f, 0.197516431440340000f, 0.2 01095729621346000f, | |
114 0.204710118836677000f, 0.208359655960767000f, 0.212044397502288000f, 0.2 15764399609395000f, | |
115 0.219519718074868000f, 0.223310408341127000f, 0.227136525505149000f, 0.2 30998124323267000f, | |
116 0.234895259215880000f, 0.238827984272048000f, 0.242796353254002000f, 0.2 46800419601550000f, | |
117 0.250840236436400000f, 0.254915856566385000f, 0.259027332489606000f, 0.2 63174716398492000f, | |
118 0.267358060183772000f, 0.271577415438375000f, 0.275832833461245000f, 0.2 80124365261085000f, | |
119 0.284452061560024000f, 0.288815972797219000f, 0.293216149132375000f, 0.2 97652640449211000f, | |
120 0.302125496358853000f, 0.306634766203158000f, 0.311180499057984000f, 0.3 15762743736397000f, | |
121 0.320381548791810000f, 0.325036962521076000f, 0.329729032967515000f, 0.3 34457807923889000f, | |
122 0.339223334935327000f, 0.344025661302187000f, 0.348864834082879000f, 0.3 53740900096629000f, | |
123 0.358653905926199000f, 0.363603897920553000f, 0.368590922197487000f, 0.3 73615024646202000f, | |
124 0.378676250929840000f, 0.383774646487975000f, 0.388910256539059000f, 0.3 94083126082829000f, | |
125 0.399293299902674000f, 0.404540822567962000f, 0.409825738436323000f, 0.4 15148091655907000f, | |
126 0.420507926167587000f, 0.425905285707146000f, 0.431340213807410000f, 0.4 36812753800359000f, | |
127 0.442322948819202000f, 0.447870841800410000f, 0.453456475485731000f, 0.4 59079892424160000f, | |
128 0.464741134973889000f, 0.470440245304218000f, 0.476177265397440000f, 0.4 81952237050698000f, | |
129 0.487765201877811000f, 0.493616201311074000f, 0.499505276603030000f, 0.5 05432468828216000f, | |
130 0.511397818884880000f, 0.517401367496673000f, 0.523443155214325000f, 0.5 29523222417277000f, | |
131 0.535641609315311000f, 0.541798355950137000f, 0.547993502196972000f, 0.5 54227087766085000f, | |
132 0.560499152204328000f, 0.566809734896638000f, 0.573158875067523000f, 0.5 79546611782525000f, | |
133 0.585972983949661000f, 0.592438030320847000f, 0.598941789493296000f, 0.6 05484299910907000f, | |
134 0.612065599865624000f, 0.618685727498780000f, 0.625344720802427000f, 0.6 32042617620641000f, | |
135 0.638779455650817000f, 0.645555272444935000f, 0.652370105410821000f, 0.6 59223991813387000f, | |
136 0.666116968775851000f, 0.673049073280942000f, 0.680020342172095000f, 0.6 87030812154625000f, | |
137 0.694080519796882000f, 0.701169501531402000f, 0.708297793656032000f, 0.7 15465432335048000f, | |
138 0.722672453600255000f, 0.729918893352071000f, 0.737204787360605000f, 0.7 44530171266715000f, | |
139 0.751895080583051000f, 0.759299550695091000f, 0.766743616862161000f, 0.7 74227314218442000f, | |
140 0.781750677773962000f, 0.789313742415586000f, 0.796916542907978000f, 0.8 04559113894567000f, | |
141 0.812241489898490000f, 0.819963705323528000f, 0.827725794455034000f, 0.8 35527791460841000f, | |
142 0.843369730392169000f, 0.851251645184515000f, 0.859173569658532000f, 0.8 67135537520905000f, | |
143 0.875137582365205000f, 0.883179737672745000f, 0.891262036813419000f, 0.8 99384513046529000f, | |
144 0.907547199521614000f, 0.915750129279253000f, 0.923993335251873000f, 0.9 32276850264543000f, | |
145 0.940600707035753000f, 0.948964938178195000f, 0.957369576199527000f, 0.9 65814653503130000f, | |
146 0.974300202388861000f, 0.982826255053791000f, 0.991392843592940000f, 1.0 00000000000000000f, | |
147 }; | |
148 | |
149 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | |
150 | |
151 // x^(29/64) is a very good approximation of the true value, x^(1/2.2). | |
152 static __m128 inverse_gamma_linear_to_2dot2(__m128 x) { | |
mtklein_C
2016/06/16 13:27:41
Just some naming questions. Some of these names f
msarett
2016/06/16 15:46:11
sgtm, shorter names are better.
| |
153 // x^(-1/2) | |
154 __m128 x2 = _mm_rsqrt_ps(x); | |
155 | |
156 // x^(-1/32) | |
157 __m128 x32 = _mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(x2)))); | |
158 | |
159 // x^(+1/64) | |
160 __m128 x64 = _mm_rsqrt_ps(x32); | |
161 | |
162 // x^(+29/64) = x^(+1/2) * x^(-1/32) * x^(-1/64) | |
163 // Note that we also scale to the 0-255 range. | |
164 // These terms can be combined more minimally with 3 muls and 1 reciprocal. However, this | |
165 // is faster, because it allows us to start the muls in parallel with the rsqrts. | |
166 __m128 scale = _mm_set1_ps(255.0f); | |
167 return _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(scale, _mm_rcp_ps(x2)), x32), _mm_rc p_ps(x64)); | |
168 } | |
169 | |
170 template <SkColorSpace::GammaNamed kGammaNamed> | |
171 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len, | |
172 const float matrix[16]) { | |
mtklein_C
2016/06/16 13:27:41
This line might want to be re-wrapped?
msarett
2016/06/16 15:46:12
Done.
| |
173 const float* gamma_to_linear; | |
174 if (SkColorSpace::kSRGB_GammaNamed == kGammaNamed) { | |
mtklein_C
2016/06/16 13:27:41
Since we're not otherwise using kGammaNamed, I thi
msarett
2016/06/16 15:46:12
Woohoo, this is cool!
| |
175 gamma_to_linear = gamma_srgb_to_linear; | |
20 } else { | 176 } else { |
21 return (uint8_t) (v + 0.5f); | 177 gamma_to_linear = gamma_2dot2_to_linear; |
22 } | 178 } |
23 } | 179 |
24 | |
25 static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_ t* src, int len, | |
26 const float matrix[16]) { | |
27 while (len-- > 0) { | |
28 float srcFloats[3]; | |
29 srcFloats[0] = (float) ((*src >> 0) & 0xFF); | |
30 srcFloats[1] = (float) ((*src >> 8) & 0xFF); | |
31 srcFloats[2] = (float) ((*src >> 16) & 0xFF); | |
32 | |
33 // Convert to linear. | |
34 // TODO (msarett): | |
35 // We should use X^2.2 here instead of X^2. What is the impact on correctness? | |
36 // We should be able to get closer to 2.2 at a small performance cost. | |
37 srcFloats[0] = srcFloats[0] * srcFloats[0]; | |
38 srcFloats[1] = srcFloats[1] * srcFloats[1]; | |
39 srcFloats[2] = srcFloats[2] * srcFloats[2]; | |
40 | |
41 // Convert to dst gamut. | |
42 float dstFloats[3]; | |
43 // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost always zero. | |
44 // Should we have another optimized path that avoids the extra addition when they | |
45 // are zero? | |
46 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] + | |
47 srcFloats[2] * matrix[8] + matrix[12]; | |
48 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] + | |
49 srcFloats[2] * matrix[9] + matrix[13]; | |
50 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] + | |
51 srcFloats[2] * matrix[10] + matrix[14]; | |
52 | |
53 // Convert to dst gamma. | |
54 // TODO (msarett): | |
55 // We should use X^(1/2.2) here instead of X^(1/2). What is the impact on correctness? | |
56 // We should be able to get closer to (1/2.2) at a small performance cost. | |
57 dstFloats[0] = sqrtf(dstFloats[0]); | |
58 dstFloats[1] = sqrtf(dstFloats[1]); | |
59 dstFloats[2] = sqrtf(dstFloats[2]); | |
60 | |
61 *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF), | |
62 clamp_float_to_byte(dstFloats[0]), | |
63 clamp_float_to_byte(dstFloats[1]), | |
64 clamp_float_to_byte(dstFloats[2])); | |
65 | |
66 dst++; | |
67 src++; | |
68 } | |
69 } | |
70 | |
71 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | |
72 | |
73 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i nt len, | |
74 const float matrix[16]) { | |
75 // Load transformation matrix. | 180 // Load transformation matrix. |
76 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); | 181 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); |
77 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); | 182 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); |
78 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); | 183 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); |
79 __m128 rQgQbQ = _mm_loadu_ps(&matrix[12]); | |
80 | 184 |
81 while (len >= 4) { | 185 while (len >= 4) { |
82 // Load 4 pixels and convert them to floats. | 186 // Convert to linear. The look-up table has perfect accuracy. |
83 __m128i rgba = _mm_loadu_si128((const __m128i*) src); | 187 __m128 reds = _mm_setr_ps(gamma_to_linear[(src[0] >> 0) & 0xFF], |
84 __m128i byteMask = _mm_set1_epi32(0xFF); | 188 gamma_to_linear[(src[1] >> 0) & 0xFF], |
85 __m128 reds = _mm_cvtepi32_ps(_mm_and_si128( rgba, byteMask)); | 189 gamma_to_linear[(src[2] >> 0) & 0xFF], |
86 __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 8), byteMask)); | 190 gamma_to_linear[(src[3] >> 0) & 0xFF]); |
87 __m128 blues = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16), byteMask)); | 191 __m128 greens = _mm_setr_ps(gamma_to_linear[(src[0] >> 8) & 0xFF], |
88 | 192 gamma_to_linear[(src[1] >> 8) & 0xFF], |
89 // Convert to linear. | 193 gamma_to_linear[(src[2] >> 8) & 0xFF], |
90 // FIXME (msarett): | 194 gamma_to_linear[(src[3] >> 8) & 0xFF]); |
91 // Should we be more accurate? | 195 __m128 blues = _mm_setr_ps(gamma_to_linear[(src[0] >> 16) & 0xFF], |
92 reds = _mm_mul_ps(reds, reds); | 196 gamma_to_linear[(src[1] >> 16) & 0xFF], |
93 greens = _mm_mul_ps(greens, greens); | 197 gamma_to_linear[(src[2] >> 16) & 0xFF], |
94 blues = _mm_mul_ps(blues, blues); | 198 gamma_to_linear[(src[3] >> 16) & 0xFF]); |
95 | 199 |
96 // Apply the transformation matrix to dst gamut. | 200 // Apply the transformation matrix to dst gamut. |
97 // FIXME (msarett): | |
98 // rQ, gQ, and bQ are almost always zero. Can we save a couple instructions? | |
99 | |
100 // Splat rX, rY, rZ, and rQ each across a register. | 201 // Splat rX, rY, rZ, and rQ each across a register. |
101 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00); | 202 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00); |
102 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00); | 203 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00); |
103 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00); | 204 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00); |
104 __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x00); | 205 |
105 | 206 // dstReds = rX * reds + rY * greens + rZ * blues |
106 // dstReds = rX * reds + rY * greens + rZ * blues + rQ | |
107 __m128 dstReds = _mm_mul_ps(reds, rX); | 207 __m128 dstReds = _mm_mul_ps(reds, rX); |
108 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY)); | 208 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY)); |
109 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ)); | 209 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ)); |
110 dstReds = _mm_add_ps(dstReds, rQ); | |
111 | 210 |
112 // Splat gX, gY, gZ, and gQ each across a register. | 211 // Splat gX, gY, gZ, and gQ each across a register. |
113 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55); | 212 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55); |
114 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55); | 213 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55); |
115 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55); | 214 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55); |
116 __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55); | 215 |
117 | 216 // dstGreens = gX * reds + gY * greens + gZ * blues |
118 // dstGreens = gX * reds + gY * greens + gZ * blues + gQ | |
119 __m128 dstGreens = _mm_mul_ps(reds, gX); | 217 __m128 dstGreens = _mm_mul_ps(reds, gX); |
120 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY)); | 218 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY)); |
121 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ)); | 219 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ)); |
122 dstGreens = _mm_add_ps(dstGreens, gQ); | |
123 | 220 |
124 // Splat bX, bY, bZ, and bQ each across a register. | 221 // Splat bX, bY, bZ, and bQ each across a register. |
125 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA); | 222 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA); |
126 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA); | 223 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA); |
127 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA); | 224 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA); |
128 __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA); | 225 |
129 | 226 // dstBlues = bX * reds + bY * greens + bZ * blues |
130 // dstBlues = bX * reds + bY * greens + bZ * blues + bQ | |
131 __m128 dstBlues = _mm_mul_ps(reds, bX); | 227 __m128 dstBlues = _mm_mul_ps(reds, bX); |
132 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY)); | 228 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY)); |
133 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ)); | 229 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ)); |
134 dstBlues = _mm_add_ps(dstBlues, bQ); | |
135 | 230 |
136 // Convert to dst gamma. | 231 // Convert to dst gamma. |
137 // Note that the reciprocal of the reciprocal sqrt, is just a fast sqrt. | 232 dstReds = inverse_gamma_linear_to_2dot2(dstReds); |
138 // FIXME (msarett): | 233 dstGreens = inverse_gamma_linear_to_2dot2(dstGreens); |
139 // Should we be more accurate? | 234 dstBlues = inverse_gamma_linear_to_2dot2(dstBlues); |
140 dstReds = _mm_rcp_ps(_mm_rsqrt_ps(dstReds)); | |
141 dstGreens = _mm_rcp_ps(_mm_rsqrt_ps(dstGreens)); | |
142 dstBlues = _mm_rcp_ps(_mm_rsqrt_ps(dstBlues)); | |
143 | 235 |
144 // Clamp floats to 0-255 range. | 236 // Clamp floats to 0-255 range. |
145 dstReds = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstReds, _mm_set1_ ps(255.0f))); | 237 // The order of the arguments is important here. We want to make sure that NaN |
146 dstGreens = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstGreens, _mm_set1_ ps(255.0f))); | 238 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN. |
mtklein_C
2016/06/16 13:27:40
Do we have test cases exercising the NaN input? J
msarett
2016/06/16 15:46:11
Yes we are hitting this case quite frequently actu
| |
147 dstBlues = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstBlues, _mm_set1_ ps(255.0f))); | 239 dstReds = _mm_min_ps(_mm_max_ps(dstReds, _mm_setzero_ps()), _mm_set1 _ps(255.0f)); |
240 dstGreens = _mm_min_ps(_mm_max_ps(dstGreens, _mm_setzero_ps()), _mm_set1 _ps(255.0f)); | |
241 dstBlues = _mm_min_ps(_mm_max_ps(dstBlues, _mm_setzero_ps()), _mm_set1 _ps(255.0f)); | |
148 | 242 |
149 // Convert to bytes and store to memory. | 243 // Convert to bytes and store to memory. |
150 rgba = _mm_and_si128(_mm_set1_epi32(0xFF000000), rgba); | 244 __m128i rgba = _mm_set1_epi32(0xFF000000); |
151 #ifdef SK_PMCOLOR_IS_RGBA | |
152 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) ); | 245 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) ); |
153 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) ); | 246 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) ); |
154 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) ); | 247 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) ); |
155 #else | |
156 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstBlues) ); | |
157 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) ); | |
158 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstReds), 16) ); | |
159 #endif | |
160 _mm_storeu_si128((__m128i*) dst, rgba); | 248 _mm_storeu_si128((__m128i*) dst, rgba); |
161 | 249 |
162 dst += 4; | 250 dst += 4; |
163 src += 4; | 251 src += 4; |
164 len -= 4; | 252 len -= 4; |
165 } | 253 } |
166 | 254 |
167 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); | 255 while (len > 0) { |
256 // Convert to linear. The look-up table has perfect accuracy. | |
257 __m128 srcPixel = _mm_setr_ps(gamma_srgb_to_linear[(src[0] >> 0) & 0xFF ], | |
mtklein_C
2016/06/16 13:27:40
Wouldn't this part be simpler as,
// Splat red, g
msarett
2016/06/16 15:46:12
Yes I like this better, done.
| |
258 gamma_srgb_to_linear[(src[0] >> 8) & 0xFF ], | |
259 gamma_srgb_to_linear[(src[0] >> 16) & 0xFF ], | |
260 0.0f); | |
261 | |
262 // Apply the transformation matrix to dst gamut. | |
263 // This time, splat the red, green, and blue components. | |
264 __m128 r = _mm_shuffle_ps(srcPixel, srcPixel, 0x00); | |
265 __m128 g = _mm_shuffle_ps(srcPixel, srcPixel, 0x55); | |
266 __m128 b = _mm_shuffle_ps(srcPixel, srcPixel, 0xAA); | |
267 __m128 dstPixel = _mm_mul_ps(r, rXgXbX); | |
268 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(g, rYgYbY)); | |
269 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(b, rZgZbZ)); | |
270 | |
271 // Convert to dst gamma. | |
272 dstPixel = inverse_gamma_linear_to_2dot2(dstPixel); | |
273 | |
274 // Clamp floats to 0-255 range. | |
275 dstPixel = _mm_min_ps(_mm_max_ps(dstPixel, _mm_setzero_ps()), _mm_set1_p s(255.0f)); | |
mtklein_C
2016/06/16 13:27:41
Let's make the clamping a static function? That c
msarett
2016/06/16 15:46:12
SGTM
| |
276 | |
277 // Convert to bytes and store to memory. | |
278 __m128i dstInts = _mm_cvtps_epi32(dstPixel); | |
279 __m128i dstBytes = _mm_packus_epi16(_mm_packus_epi16(dstInts, dstInts), dstInts); | |
280 dstBytes = _mm_or_si128(_mm_set1_epi32(0xFF000000), dstBytes); | |
281 _mm_store_ss((float*) dst, _mm_castsi128_ps(dstBytes)); | |
282 | |
283 dst += 1; | |
284 src += 1; | |
285 len -= 1; | |
286 } | |
168 } | 287 } |
169 | 288 |
170 #else | 289 #else |
171 | 290 |
172 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i nt len, | 291 static uint8_t clamp_float_to_byte(float v) { |
292 if (v >= 254.5f) { | |
293 return 255; | |
294 } else if (v < 0.5f) { | |
295 return 0; | |
296 } else { | |
297 return (uint8_t) (v + 0.5f); | |
298 } | |
299 } | |
300 | |
301 template <SkColorSpace::GammaNamed kGammaNamed> | |
302 static void color_xform_RGB1_portable(uint32_t* dst, const uint32_t* src, int le n, | |
303 const float matrix[16]) { | |
304 const float* gamma_to_linear; | |
305 if (SkColorSpace::kSRGB_GammaNamed == kGammaNamed) { | |
306 gamma_to_linear = gamma_srgb_to_linear; | |
307 } else { | |
308 gamma_to_linear = gamma_2dot2_to_linear; | |
309 } | |
310 | |
311 while (len-- > 0) { | |
312 // Convert to linear. | |
313 float srcFloats[3]; | |
314 srcFloats[0] = gamma_to_linear[(*src >> 0) & 0xFF]; | |
315 srcFloats[1] = gamma_to_linear[(*src >> 8) & 0xFF]; | |
316 srcFloats[2] = gamma_to_linear[(*src >> 16) & 0xFF]; | |
317 | |
318 // Convert to dst gamut. | |
319 float dstFloats[3]; | |
320 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] + | |
321 srcFloats[2] * matrix[8]; | |
322 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] + | |
323 srcFloats[2] * matrix[9]; | |
324 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] + | |
325 srcFloats[2] * matrix[10]; | |
326 | |
327 // Convert to dst gamma. | |
328 // Note: pow is really, really slow. We will suffer when SSE2 is not supported. | |
329 dstFloats[0] = powf(dstFloats[0], (1/2.2f)) * 255.0f; | |
330 dstFloats[1] = powf(dstFloats[1], (1/2.2f)) * 255.0f; | |
331 dstFloats[2] = powf(dstFloats[2], (1/2.2f)) * 255.0f; | |
332 | |
333 *dst = SkPackARGB32NoCheck(0xFF, | |
mtklein_C
2016/06/16 13:27:41
Didn't you already munge the matrix so that we sho
msarett
2016/06/16 15:46:12
Agreed, this is wrong. Using shifts.
| |
334 clamp_float_to_byte(dstFloats[0]), | |
335 clamp_float_to_byte(dstFloats[1]), | |
336 clamp_float_to_byte(dstFloats[2])); | |
337 | |
338 dst++; | |
339 src++; | |
340 } | |
341 } | |
342 | |
343 template <SkColorSpace::GammaNamed kGammaNamed> | |
344 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len, | |
173 const float matrix[16]) { | 345 const float matrix[16]) { |
174 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); | 346 color_xform_RGB1_portable<kGammaNamed>(dst, src, len, matrix); |
mtklein_C
2016/06/16 13:27:40
This appears to be a complete passthrough function
msarett
2016/06/16 15:46:12
Of course, Done.
| |
175 } | 347 } |
176 | 348 |
177 #endif | 349 #endif |
178 | 350 |
351 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, i nt len, | |
352 const float matrix[16]) { | |
353 color_xform_RGB1<SkColorSpace::kSRGB_GammaNamed>(dst, src, len, matrix); | |
354 } | |
355 | |
356 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src, int len, | |
357 const float matrix[16]) { | |
358 color_xform_RGB1<SkColorSpace::k2Dot2Curve_GammaNamed>(dst, src, len, matrix ); | |
359 } | |
360 | |
179 } | 361 } |
180 | 362 |
181 #endif // SkColorXform_opts_DEFINED | 363 #endif // SkColorXform_opts_DEFINED |
OLD | NEW |