Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: src/opts/SkColorXform_opts.h

Issue 2060823003: Implement fast, correct gamma conversion for color xforms (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Win test Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/core/SkOpts.cpp ('k') | src/opts/SkOpts_sse41.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2016 Google Inc. 2 * Copyright 2016 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkColorXform_opts_DEFINED 8 #ifndef SkColorXform_opts_DEFINED
9 #define SkColorXform_opts_DEFINED 9 #define SkColorXform_opts_DEFINED
10 10
11 #include "SkColorPriv.h" 11 #include "SkColorPriv.h"
12 12
13 namespace SK_OPTS_NS { 13 namespace SK_OPTS_NS {
14 14
15 static uint8_t clamp_float_to_byte(float v) { 15 extern const float linear_from_srgb[256] = {
16 if (v >= 254.5f) { 16 0.000000000000000000f, 0.000303526983548838f, 0.000607053967097675f, 0.0 00910580950646513f,
17 return 255; 17 0.001214107934195350f, 0.001517634917744190f, 0.001821161901293030f, 0.0 02124688884841860f,
18 } else if (v < 0.5f) { 18 0.002428215868390700f, 0.002731742851939540f, 0.003034518678424960f, 0.0 03346535763899160f,
19 return 0; 19 0.003676507324047440f, 0.004024717018496310f, 0.004391442037410290f, 0.0 04776953480693730f,
20 } else { 20 0.005181516702338390f, 0.005605391624202720f, 0.006048833022857060f, 0.0 06512090792594470f,
21 return (uint8_t) (v + 0.5f); 21 0.006995410187265390f, 0.007499032043226180f, 0.008023192985384990f, 0.0 08568125618069310f,
22 } 22 0.009134058702220790f, 0.009721217320237850f, 0.010329823029626900f, 0.0 10960094006488200f,
23 0.011612245179743900f, 0.012286488356915900f, 0.012983032342173000f, 0.0 13702083047289700f,
24 0.014443843596092500f, 0.015208514422912700f, 0.015996293365509600f, 0.0 16807375752887400f,
25 0.017641954488384100f, 0.018500220128379700f, 0.019382360956935700f, 0.0 20288563056652400f,
26 0.021219010376003600f, 0.022173884793387400f, 0.023153366178110400f, 0.0 24157632448504800f,
27 0.025186859627361600f, 0.026241221894849900f, 0.027320891639074900f, 0.0 28426039504420800f,
28 0.029556834437808800f, 0.030713443732993600f, 0.031896033073011500f, 0.0 33104766570885100f,
29 0.034339806808682200f, 0.035601314875020300f, 0.036889450401100000f, 0.0 38204371595346500f,
30 0.039546235276732800f, 0.040915196906853200f, 0.042311410620809700f, 0.0 43735029256973500f,
31 0.045186204385675500f, 0.046665086336880100f, 0.048171824226889400f, 0.0 49706565984127200f,
32 0.051269458374043200f, 0.052860647023180200f, 0.054480276442442400f, 0.0 56128490049600100f,
33 0.057805430191067200f, 0.059511238162981200f, 0.061246054231617600f, 0.0 63010017653167700f,
34 0.064803266692905800f, 0.066625938643772900f, 0.068478169844400200f, 0.0 70360095696595900f,
35 0.072271850682317500f, 0.074213568380149600f, 0.076185381481307900f, 0.0 78187421805186300f,
36 0.080219820314468300f, 0.082282707129814800f, 0.084376211544148800f, 0.0 86500462036549800f,
37 0.088655586285772900f, 0.090841711183407700f, 0.093058962846687500f, 0.0 95307466630964700f,
38 0.097587347141862500f, 0.099898728247113900f, 0.102241733088101000f, 0.1 04616484091104000f,
39 0.107023102978268000f, 0.109461710778299000f, 0.111932427836906000f, 0.1 14435373826974000f,
40 0.116970667758511000f, 0.119538427988346000f, 0.122138772229602000f, 0.1 24771817560950000f,
41 0.127437680435647000f, 0.130136476690364000f, 0.132868321553818000f, 0.1 35633329655206000f,
42 0.138431615032452000f, 0.141263291140272000f, 0.144128470858058000f, 0.1 47027266497595000f,
43 0.149959789810609000f, 0.152926151996150000f, 0.155926463707827000f, 0.1 58960835060880000f,
44 0.162029375639111000f, 0.165132194501668000f, 0.168269400189691000f, 0.1 71441100732823000f,
45 0.174647403655585000f, 0.177888415983629000f, 0.181164244249860000f, 0.1 84474994500441000f,
46 0.187820772300678000f, 0.191201682740791000f, 0.194617830441576000f, 0.1 98069319559949000f,
47 0.201556253794397000f, 0.205078736390317000f, 0.208636870145256000f, 0.2 12230757414055000f,
48 0.215860500113899000f, 0.219526199729269000f, 0.223227957316809000f, 0.2 26965873510098000f,
49 0.230740048524349000f, 0.234550582161005000f, 0.238397573812271000f, 0.2 42281122465555000f,
50 0.246201326707835000f, 0.250158284729953000f, 0.254152094330827000f, 0.2 58182852921596000f,
51 0.262250657529696000f, 0.266355604802862000f, 0.270497791013066000f, 0.2 74677312060385000f,
52 0.278894263476810000f, 0.283148740429992000f, 0.287440837726918000f, 0.2 91770649817536000f,
53 0.296138270798321000f, 0.300543794415777000f, 0.304987314069886000f, 0.3 09468922817509000f,
54 0.313988713375718000f, 0.318546778125092000f, 0.323143209112951000f, 0.3 27778098056542000f,
55 0.332451536346179000f, 0.337163615048330000f, 0.341914424908661000f, 0.3 46704056355030000f,
56 0.351532599500439000f, 0.356400144145944000f, 0.361306779783510000f, 0.3 66252595598840000f,
57 0.371237680474149000f, 0.376262122990906000f, 0.381326011432530000f, 0.3 86429433787049000f,
58 0.391572477749723000f, 0.396755230725627000f, 0.401977779832196000f, 0.4 07240211901737000f,
59 0.412542613483904000f, 0.417885070848138000f, 0.423267669986072000f, 0.4 28690496613907000f,
60 0.434153636174749000f, 0.439657173840919000f, 0.445201194516228000f, 0.4 50785782838223000f,
61 0.456411023180405000f, 0.462076999654407000f, 0.467783796112159000f, 0.4 73531496148010000f,
62 0.479320183100827000f, 0.485149940056070000f, 0.491020849847836000f, 0.4 96932995060870000f,
63 0.502886458032569000f, 0.508881320854934000f, 0.514917665376521000f, 0.5 20995573204354000f,
64 0.527115125705813000f, 0.533276404010505000f, 0.539479489012107000f, 0.5 45724461370187000f,
65 0.552011401512000000f, 0.558340389634268000f, 0.564711505704929000f, 0.5 71124829464873000f,
66 0.577580440429651000f, 0.584078417891164000f, 0.590618840919337000f, 0.5 97201788363763000f,
67 0.603827338855338000f, 0.610495570807865000f, 0.617206562419651000f, 0.6 23960391675076000f,
68 0.630757136346147000f, 0.637596873994033000f, 0.644479681970582000f, 0.6 51405637419824000f,
69 0.658374817279448000f, 0.665387298282272000f, 0.672443156957688000f, 0.6 79542469633094000f,
70 0.686685312435314000f, 0.693871761291990000f, 0.701101891932973000f, 0.7 08375779891687000f,
71 0.715693500506481000f, 0.723055128921969000f, 0.730460740090354000f, 0.7 37910408772731000f,
72 0.745404209540387000f, 0.752942216776078000f, 0.760524504675292000f, 0.7 68151147247507000f,
73 0.775822218317423000f, 0.783537791526194000f, 0.791297940332630000f, 0.7 99102738014409000f,
74 0.806952257669252000f, 0.814846572216101000f, 0.822785754396284000f, 0.8 30769876774655000f,
75 0.838799011740740000f, 0.846873231509858000f, 0.854992608124234000f, 0.8 63157213454102000f,
76 0.871367119198797000f, 0.879622396887832000f, 0.887923117881966000f, 0.8 96269353374266000f,
77 0.904661174391149000f, 0.913098651793419000f, 0.921581856277295000f, 0.9 30110858375424000f,
78 0.938685728457888000f, 0.947306536733200000f, 0.955973353249286000f, 0.9 64686247894465000f,
79 0.973445290398413000f, 0.982250550333117000f, 0.991102097113830000f, 1.0 00000000000000000f,
80 };
81
82 extern const float linear_from_2dot2[256] = {
83 0.000000000000000000f, 0.000005077051900662f, 0.000023328004666099f, 0.0 00056921765712193f,
84 0.000107187362341244f, 0.000175123977503027f, 0.000261543754548491f, 0.0 00367136269815943f,
85 0.000492503787191433f, 0.000638182842167022f, 0.000804658499513058f, 0.0 00992374304074325f,
86 0.001201739522438400f, 0.001433134589671860f, 0.001686915316789280f, 0.0 01963416213396470f,
87 0.002262953160706430f, 0.002585825596234170f, 0.002932318323938360f, 0.0 03302703032003640f,
88 0.003697239578900130f, 0.004116177093282750f, 0.004559754922526020f, 0.0 05028203456855540f,
89 0.005521744850239660f, 0.006040593654849810f, 0.006584957382581690f, 0.0 07155037004573030f,
90 0.007751027397660610f, 0.008373117745148580f, 0.009021491898012130f, 0.0 09696328701658230f,
91 0.010397802292555300f, 0.011126082368383200f, 0.011881334434813700f, 0.0 12663720031582100f,
92 0.013473396940142600f, 0.014310519374884100f, 0.015175238159625200f, 0.0 16067700890886900f,
93 0.016988052089250000f, 0.017936433339950200f, 0.018912983423721500f, 0.0 19917838438785700f,
94 0.020951131914781100f, 0.022012994919336500f, 0.023103556157921400f, 0.0 24222942067534200f,
95 0.025371276904734600f, 0.026548682828472900f, 0.027755279978126000f, 0.0 28991186547107800f,
96 0.030256518852388700f, 0.031551391400226400f, 0.032875916948383800f, 0.0 34230206565082000f,
97 0.035614369684918800f, 0.037028514161960200f, 0.038472746320194600f, 0.0 39947171001525600f,
98 0.041451891611462500f, 0.042987010162657100f, 0.044552627316421400f, 0.0 46148842422351000f,
99 0.047775753556170600f, 0.049433457555908000f, 0.051122050056493400f, 0.0 52841625522879000f,
100 0.054592277281760300f, 0.056374097551979800f, 0.058187177473685400f, 0.0 60031607136313200f,
101 0.061907475605455800f, 0.063814870948677200f, 0.065753880260330100f, 0.0 67724589685424300f,
102 0.069727084442598800f, 0.071761448846239100f, 0.073827766327784600f, 0.0 75926119456264800f,
103 0.078056589958101900f, 0.080219258736215100f, 0.082414205888459200f, 0.0 84641510725429500f,
104 0.086901251787660300f, 0.089193506862247800f, 0.091518352998919500f, 0.0 93875866525577800f,
105 0.096266123063339700f, 0.098689197541094500f, 0.101145164209600000f, 0.1 03634096655137000f,
106 0.106156067812744000f, 0.108711149979039000f, 0.111299414824660000f, 0.1 13920933406333000f,
107 0.116575776178572000f, 0.119264013005047000f, 0.121985713169619000f, 0.1 24740945387051000f,
108 0.127529777813422000f, 0.130352278056244000f, 0.133208513184300000f, 0.1 36098549737202000f,
109 0.139022453734703000f, 0.141980290685736000f, 0.144972125597231000f, 0.1 47998022982685000f,
110 0.151058046870511000f, 0.154152260812165000f, 0.157280727890073000f, 0.1 60443510725344000f,
111 0.163640671485290000f, 0.166872271890766000f, 0.170138373223312000f, 0.1 73439036332135000f,
112 0.176774321640903000f, 0.180144289154390000f, 0.183548998464951000f, 0.1 86988508758844000f,
113 0.190462878822409000f, 0.193972167048093000f, 0.197516431440340000f, 0.2 01095729621346000f,
114 0.204710118836677000f, 0.208359655960767000f, 0.212044397502288000f, 0.2 15764399609395000f,
115 0.219519718074868000f, 0.223310408341127000f, 0.227136525505149000f, 0.2 30998124323267000f,
116 0.234895259215880000f, 0.238827984272048000f, 0.242796353254002000f, 0.2 46800419601550000f,
117 0.250840236436400000f, 0.254915856566385000f, 0.259027332489606000f, 0.2 63174716398492000f,
118 0.267358060183772000f, 0.271577415438375000f, 0.275832833461245000f, 0.2 80124365261085000f,
119 0.284452061560024000f, 0.288815972797219000f, 0.293216149132375000f, 0.2 97652640449211000f,
120 0.302125496358853000f, 0.306634766203158000f, 0.311180499057984000f, 0.3 15762743736397000f,
121 0.320381548791810000f, 0.325036962521076000f, 0.329729032967515000f, 0.3 34457807923889000f,
122 0.339223334935327000f, 0.344025661302187000f, 0.348864834082879000f, 0.3 53740900096629000f,
123 0.358653905926199000f, 0.363603897920553000f, 0.368590922197487000f, 0.3 73615024646202000f,
124 0.378676250929840000f, 0.383774646487975000f, 0.388910256539059000f, 0.3 94083126082829000f,
125 0.399293299902674000f, 0.404540822567962000f, 0.409825738436323000f, 0.4 15148091655907000f,
126 0.420507926167587000f, 0.425905285707146000f, 0.431340213807410000f, 0.4 36812753800359000f,
127 0.442322948819202000f, 0.447870841800410000f, 0.453456475485731000f, 0.4 59079892424160000f,
128 0.464741134973889000f, 0.470440245304218000f, 0.476177265397440000f, 0.4 81952237050698000f,
129 0.487765201877811000f, 0.493616201311074000f, 0.499505276603030000f, 0.5 05432468828216000f,
130 0.511397818884880000f, 0.517401367496673000f, 0.523443155214325000f, 0.5 29523222417277000f,
131 0.535641609315311000f, 0.541798355950137000f, 0.547993502196972000f, 0.5 54227087766085000f,
132 0.560499152204328000f, 0.566809734896638000f, 0.573158875067523000f, 0.5 79546611782525000f,
133 0.585972983949661000f, 0.592438030320847000f, 0.598941789493296000f, 0.6 05484299910907000f,
134 0.612065599865624000f, 0.618685727498780000f, 0.625344720802427000f, 0.6 32042617620641000f,
135 0.638779455650817000f, 0.645555272444935000f, 0.652370105410821000f, 0.6 59223991813387000f,
136 0.666116968775851000f, 0.673049073280942000f, 0.680020342172095000f, 0.6 87030812154625000f,
137 0.694080519796882000f, 0.701169501531402000f, 0.708297793656032000f, 0.7 15465432335048000f,
138 0.722672453600255000f, 0.729918893352071000f, 0.737204787360605000f, 0.7 44530171266715000f,
139 0.751895080583051000f, 0.759299550695091000f, 0.766743616862161000f, 0.7 74227314218442000f,
140 0.781750677773962000f, 0.789313742415586000f, 0.796916542907978000f, 0.8 04559113894567000f,
141 0.812241489898490000f, 0.819963705323528000f, 0.827725794455034000f, 0.8 35527791460841000f,
142 0.843369730392169000f, 0.851251645184515000f, 0.859173569658532000f, 0.8 67135537520905000f,
143 0.875137582365205000f, 0.883179737672745000f, 0.891262036813419000f, 0.8 99384513046529000f,
144 0.907547199521614000f, 0.915750129279253000f, 0.923993335251873000f, 0.9 32276850264543000f,
145 0.940600707035753000f, 0.948964938178195000f, 0.957369576199527000f, 0.9 65814653503130000f,
146 0.974300202388861000f, 0.982826255053791000f, 0.991392843592940000f, 1.0 00000000000000000f,
147 };
148
149 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
150
151 // x^(29/64) is a very good approximation of the true value, x^(1/2.2).
152 static __m128 linear_to_2dot2(__m128 x) {
153 // x^(-1/2)
154 __m128 x2 = _mm_rsqrt_ps(x);
155
156 // x^(-1/32)
157 __m128 x32 = _mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(x2))));
158
159 // x^(+1/64)
160 __m128 x64 = _mm_rsqrt_ps(x32);
161
162 // x^(+29/64) = x^(+1/2) * x^(-1/32) * x^(-1/64)
163 // Note that we also scale to the 0-255 range.
164 // These terms can be combined more minimally with 3 muls and 1 reciprocal. However, this
165 // is faster, because it allows us to start the muls in parallel with the rs qrts.
166 __m128 scale = _mm_set1_ps(255.0f);
167 return _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(scale, _mm_rcp_ps(x2)), x32), _mm_rc p_ps(x64));
23 } 168 }
24 169
25 static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_ t* src, int len, 170 static __m128 clamp_0_to_255(__m128 x) {
26 const float matrix[16]) { 171 // The order of the arguments is important here. We want to make sure that NaN
27 while (len-- > 0) { 172 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN.
28 float srcFloats[3]; 173 return _mm_min_ps(_mm_max_ps(x, _mm_setzero_ps()), _mm_set1_ps(255.0f));
29 srcFloats[0] = (float) ((*src >> 0) & 0xFF);
30 srcFloats[1] = (float) ((*src >> 8) & 0xFF);
31 srcFloats[2] = (float) ((*src >> 16) & 0xFF);
32
33 // Convert to linear.
34 // TODO (msarett):
35 // We should use X^2.2 here instead of X^2. What is the impact on corre ctness?
36 // We should be able to get closer to 2.2 at a small performance cost.
37 srcFloats[0] = srcFloats[0] * srcFloats[0];
38 srcFloats[1] = srcFloats[1] * srcFloats[1];
39 srcFloats[2] = srcFloats[2] * srcFloats[2];
40
41 // Convert to dst gamut.
42 float dstFloats[3];
43 // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost alw ays zero.
44 // Should we have another optimized path that avoids the extra addition when they
45 // are zero?
46 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +
47 srcFloats[2] * matrix[8] + matrix[12];
48 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +
49 srcFloats[2] * matrix[9] + matrix[13];
50 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +
51 srcFloats[2] * matrix[10] + matrix[14];
52
53 // Convert to dst gamma.
54 // TODO (msarett):
55 // We should use X^(1/2.2) here instead of X^(1/2). What is the impact on correctness?
56 // We should be able to get closer to (1/2.2) at a small performance cos t.
57 dstFloats[0] = sqrtf(dstFloats[0]);
58 dstFloats[1] = sqrtf(dstFloats[1]);
59 dstFloats[2] = sqrtf(dstFloats[2]);
60
61 *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF),
62 clamp_float_to_byte(dstFloats[0]),
63 clamp_float_to_byte(dstFloats[1]),
64 clamp_float_to_byte(dstFloats[2]));
65
66 dst++;
67 src++;
68 }
69 } 174 }
70 175
71 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 176 template <const float (&linear_from_curve)[256]>
72 177 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,
73 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i nt len, 178 const float matrix[16]) {
74 const float matrix[16]) {
75 // Load transformation matrix. 179 // Load transformation matrix.
76 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); 180 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]);
77 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); 181 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]);
78 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); 182 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]);
79 __m128 rQgQbQ = _mm_loadu_ps(&matrix[12]);
80 183
81 while (len >= 4) { 184 while (len >= 4) {
82 // Load 4 pixels and convert them to floats. 185 // Convert to linear. The look-up table has perfect accuracy.
83 __m128i rgba = _mm_loadu_si128((const __m128i*) src); 186 __m128 reds = _mm_setr_ps(linear_from_curve[(src[0] >> 0) & 0xFF],
84 __m128i byteMask = _mm_set1_epi32(0xFF); 187 linear_from_curve[(src[1] >> 0) & 0xFF],
85 __m128 reds = _mm_cvtepi32_ps(_mm_and_si128( rgba, byteMask)); 188 linear_from_curve[(src[2] >> 0) & 0xFF],
86 __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 8), byteMask)); 189 linear_from_curve[(src[3] >> 0) & 0xFF]);
87 __m128 blues = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16), byteMask)); 190 __m128 greens = _mm_setr_ps(linear_from_curve[(src[0] >> 8) & 0xFF],
88 191 linear_from_curve[(src[1] >> 8) & 0xFF],
89 // Convert to linear. 192 linear_from_curve[(src[2] >> 8) & 0xFF],
90 // FIXME (msarett): 193 linear_from_curve[(src[3] >> 8) & 0xFF]);
91 // Should we be more accurate? 194 __m128 blues = _mm_setr_ps(linear_from_curve[(src[0] >> 16) & 0xFF],
92 reds = _mm_mul_ps(reds, reds); 195 linear_from_curve[(src[1] >> 16) & 0xFF],
93 greens = _mm_mul_ps(greens, greens); 196 linear_from_curve[(src[2] >> 16) & 0xFF],
94 blues = _mm_mul_ps(blues, blues); 197 linear_from_curve[(src[3] >> 16) & 0xFF]);
95 198
96 // Apply the transformation matrix to dst gamut. 199 // Apply the transformation matrix to dst gamut.
97 // FIXME (msarett): 200 // Splat rX, rY, and rZ each across a register.
98 // rQ, gQ, and bQ are almost always zero. Can we save a couple instruct ions?
99
100 // Splat rX, rY, rZ, and rQ each across a register.
101 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00); 201 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00);
102 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00); 202 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00);
103 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00); 203 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00);
104 __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x00); 204
105 205 // dstReds = rX * reds + rY * greens + rZ * blues
106 // dstReds = rX * reds + rY * greens + rZ * blues + rQ
107 __m128 dstReds = _mm_mul_ps(reds, rX); 206 __m128 dstReds = _mm_mul_ps(reds, rX);
108 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY)); 207 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY));
109 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ)); 208 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ));
110 dstReds = _mm_add_ps(dstReds, rQ); 209
111 210 // Splat gX, gY, and gZ each across a register.
112 // Splat gX, gY, gZ, and gQ each across a register.
113 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55); 211 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55);
114 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55); 212 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55);
115 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55); 213 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55);
116 __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55); 214
117 215 // dstGreens = gX * reds + gY * greens + gZ * blues
118 // dstGreens = gX * reds + gY * greens + gZ * blues + gQ
119 __m128 dstGreens = _mm_mul_ps(reds, gX); 216 __m128 dstGreens = _mm_mul_ps(reds, gX);
120 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY)); 217 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY));
121 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ)); 218 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ));
122 dstGreens = _mm_add_ps(dstGreens, gQ); 219
123 220 // Splat bX, bY, and bZ each across a register.
124 // Splat bX, bY, bZ, and bQ each across a register.
125 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA); 221 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);
126 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA); 222 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);
127 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA); 223 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);
128 __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA); 224
129 225 // dstBlues = bX * reds + bY * greens + bZ * blues
130 // dstBlues = bX * reds + bY * greens + bZ * blues + bQ
131 __m128 dstBlues = _mm_mul_ps(reds, bX); 226 __m128 dstBlues = _mm_mul_ps(reds, bX);
132 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY)); 227 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));
133 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ)); 228 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ));
134 dstBlues = _mm_add_ps(dstBlues, bQ);
135 229
136 // Convert to dst gamma. 230 // Convert to dst gamma.
137 // Note that the reciprocal of the reciprocal sqrt, is just a fast sqrt. 231 dstReds = linear_to_2dot2(dstReds);
138 // FIXME (msarett): 232 dstGreens = linear_to_2dot2(dstGreens);
139 // Should we be more accurate? 233 dstBlues = linear_to_2dot2(dstBlues);
140 dstReds = _mm_rcp_ps(_mm_rsqrt_ps(dstReds)); 234
141 dstGreens = _mm_rcp_ps(_mm_rsqrt_ps(dstGreens)); 235 // Clamp floats.
142 dstBlues = _mm_rcp_ps(_mm_rsqrt_ps(dstBlues)); 236 dstReds = clamp_0_to_255(dstReds);
143 237 dstGreens = clamp_0_to_255(dstGreens);
144 // Clamp floats to 0-255 range. 238 dstBlues = clamp_0_to_255(dstBlues);
145 dstReds = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstReds, _mm_set1_ ps(255.0f)));
146 dstGreens = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstGreens, _mm_set1_ ps(255.0f)));
147 dstBlues = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstBlues, _mm_set1_ ps(255.0f)));
148 239
149 // Convert to bytes and store to memory. 240 // Convert to bytes and store to memory.
150 rgba = _mm_and_si128(_mm_set1_epi32(0xFF000000), rgba); 241 __m128i rgba = _mm_set1_epi32(0xFF000000);
151 #ifdef SK_PMCOLOR_IS_RGBA
152 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) ); 242 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) );
153 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) ); 243 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );
154 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) ); 244 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) );
155 #else
156 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstBlues) );
157 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );
158 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstReds), 16) );
159 #endif
160 _mm_storeu_si128((__m128i*) dst, rgba); 245 _mm_storeu_si128((__m128i*) dst, rgba);
161 246
162 dst += 4; 247 dst += 4;
163 src += 4; 248 src += 4;
164 len -= 4; 249 len -= 4;
165 } 250 }
166 251
167 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); 252 while (len > 0) {
253 // Splat the red, green, and blue components.
254 __m128 r = _mm_set1_ps(linear_from_curve[(src[0] >> 0) & 0xFF]),
255 g = _mm_set1_ps(linear_from_curve[(src[0] >> 8) & 0xFF]),
256 b = _mm_set1_ps(linear_from_curve[(src[0] >> 16) & 0xFF]);
257
258 // Apply the transformation matrix to dst gamut.
259 __m128 dstPixel = _mm_mul_ps(r, rXgXbX);
260 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(g, rYgYbY));
261 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(b, rZgZbZ));
262
263 // Convert to dst gamma.
264 dstPixel = linear_to_2dot2(dstPixel);
265
266 // Clamp floats to 0-255 range.
267 dstPixel = clamp_0_to_255(dstPixel);
268
269 // Convert to bytes and store to memory.
270 __m128i dstInts = _mm_cvtps_epi32(dstPixel);
271 __m128i dstBytes = _mm_packus_epi16(_mm_packus_epi16(dstInts, dstInts), dstInts);
272 dstBytes = _mm_or_si128(_mm_set1_epi32(0xFF000000), dstBytes);
273 _mm_store_ss((float*) dst, _mm_castsi128_ps(dstBytes));
274
275 dst += 1;
276 src += 1;
277 len -= 1;
278 }
168 } 279 }
169 280
170 #else 281 #else
171 282
172 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i nt len, 283 static uint8_t clamp_float_to_byte(float v) {
173 const float matrix[16]) { 284 // The ordering of the logic is a little strange here in order
174 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); 285 // to make sure we convert NaNs to 0.
286 if (v >= 254.5f) {
287 return 255;
288 } else if (v >= 0.5f) {
289 return (uint8_t) (v + 0.5f);
290 } else {
291 return 0;
292 }
293 }
294
295 template <const float (&linear_from_curve)[256]>
296 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,
297 const float matrix[16]) {
298 while (len-- > 0) {
299 // Convert to linear.
300 float srcFloats[3];
301 srcFloats[0] = linear_from_curve[(*src >> 0) & 0xFF];
302 srcFloats[1] = linear_from_curve[(*src >> 8) & 0xFF];
303 srcFloats[2] = linear_from_curve[(*src >> 16) & 0xFF];
304
305 // Convert to dst gamut.
306 float dstFloats[3];
307 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +
308 srcFloats[2] * matrix[8];
309 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +
310 srcFloats[2] * matrix[9];
311 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +
312 srcFloats[2] * matrix[10];
313
314 // Convert to dst gamma.
315 // Note: pow is really, really slow. We will suffer when SSE2 is not su pported.
316 dstFloats[0] = powf(dstFloats[0], (1/2.2f)) * 255.0f;
317 dstFloats[1] = powf(dstFloats[1], (1/2.2f)) * 255.0f;
318 dstFloats[2] = powf(dstFloats[2], (1/2.2f)) * 255.0f;
319
320 *dst = (0xFF << 24) |
321 (clamp_float_to_byte(dstFloats[2]) << 16) |
322 (clamp_float_to_byte(dstFloats[1]) << 8) |
323 (clamp_float_to_byte(dstFloats[0]) << 0);
324
325 dst++;
326 src++;
327 }
175 } 328 }
176 329
177 #endif 330 #endif
178 331
332 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, i nt len,
333 const float matrix[16]) {
334 color_xform_RGB1<linear_from_srgb>(dst, src, len, matrix);
335 }
336
337 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src, int len,
338 const float matrix[16]) {
339 color_xform_RGB1<linear_from_2dot2>(dst, src, len, matrix);
340 }
341
179 } 342 }
180 343
181 #endif // SkColorXform_opts_DEFINED 344 #endif // SkColorXform_opts_DEFINED
OLDNEW
« no previous file with comments | « src/core/SkOpts.cpp ('k') | src/opts/SkOpts_sse41.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698