OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkColorXform_opts_DEFINED | 8 #ifndef SkColorXform_opts_DEFINED |
9 #define SkColorXform_opts_DEFINED | 9 #define SkColorXform_opts_DEFINED |
10 | 10 |
11 #include "SkColorPriv.h" | 11 #include "SkColorPriv.h" |
12 | 12 |
13 namespace SK_OPTS_NS { | 13 namespace SK_OPTS_NS { |
14 | 14 |
15 static uint8_t clamp_float_to_byte(float v) { | 15 extern const float linear_from_srgb[256] = { |
16 if (v >= 254.5f) { | 16 0.000000000000000000f, 0.000303526983548838f, 0.000607053967097675f, 0.0
00910580950646513f, |
17 return 255; | 17 0.001214107934195350f, 0.001517634917744190f, 0.001821161901293030f, 0.0
02124688884841860f, |
18 } else if (v < 0.5f) { | 18 0.002428215868390700f, 0.002731742851939540f, 0.003034518678424960f, 0.0
03346535763899160f, |
19 return 0; | 19 0.003676507324047440f, 0.004024717018496310f, 0.004391442037410290f, 0.0
04776953480693730f, |
20 } else { | 20 0.005181516702338390f, 0.005605391624202720f, 0.006048833022857060f, 0.0
06512090792594470f, |
21 return (uint8_t) (v + 0.5f); | 21 0.006995410187265390f, 0.007499032043226180f, 0.008023192985384990f, 0.0
08568125618069310f, |
22 } | 22 0.009134058702220790f, 0.009721217320237850f, 0.010329823029626900f, 0.0
10960094006488200f, |
| 23 0.011612245179743900f, 0.012286488356915900f, 0.012983032342173000f, 0.0
13702083047289700f, |
| 24 0.014443843596092500f, 0.015208514422912700f, 0.015996293365509600f, 0.0
16807375752887400f, |
| 25 0.017641954488384100f, 0.018500220128379700f, 0.019382360956935700f, 0.0
20288563056652400f, |
| 26 0.021219010376003600f, 0.022173884793387400f, 0.023153366178110400f, 0.0
24157632448504800f, |
| 27 0.025186859627361600f, 0.026241221894849900f, 0.027320891639074900f, 0.0
28426039504420800f, |
| 28 0.029556834437808800f, 0.030713443732993600f, 0.031896033073011500f, 0.0
33104766570885100f, |
| 29 0.034339806808682200f, 0.035601314875020300f, 0.036889450401100000f, 0.0
38204371595346500f, |
| 30 0.039546235276732800f, 0.040915196906853200f, 0.042311410620809700f, 0.0
43735029256973500f, |
| 31 0.045186204385675500f, 0.046665086336880100f, 0.048171824226889400f, 0.0
49706565984127200f, |
| 32 0.051269458374043200f, 0.052860647023180200f, 0.054480276442442400f, 0.0
56128490049600100f, |
| 33 0.057805430191067200f, 0.059511238162981200f, 0.061246054231617600f, 0.0
63010017653167700f, |
| 34 0.064803266692905800f, 0.066625938643772900f, 0.068478169844400200f, 0.0
70360095696595900f, |
| 35 0.072271850682317500f, 0.074213568380149600f, 0.076185381481307900f, 0.0
78187421805186300f, |
| 36 0.080219820314468300f, 0.082282707129814800f, 0.084376211544148800f, 0.0
86500462036549800f, |
| 37 0.088655586285772900f, 0.090841711183407700f, 0.093058962846687500f, 0.0
95307466630964700f, |
| 38 0.097587347141862500f, 0.099898728247113900f, 0.102241733088101000f, 0.1
04616484091104000f, |
| 39 0.107023102978268000f, 0.109461710778299000f, 0.111932427836906000f, 0.1
14435373826974000f, |
| 40 0.116970667758511000f, 0.119538427988346000f, 0.122138772229602000f, 0.1
24771817560950000f, |
| 41 0.127437680435647000f, 0.130136476690364000f, 0.132868321553818000f, 0.1
35633329655206000f, |
| 42 0.138431615032452000f, 0.141263291140272000f, 0.144128470858058000f, 0.1
47027266497595000f, |
| 43 0.149959789810609000f, 0.152926151996150000f, 0.155926463707827000f, 0.1
58960835060880000f, |
| 44 0.162029375639111000f, 0.165132194501668000f, 0.168269400189691000f, 0.1
71441100732823000f, |
| 45 0.174647403655585000f, 0.177888415983629000f, 0.181164244249860000f, 0.1
84474994500441000f, |
| 46 0.187820772300678000f, 0.191201682740791000f, 0.194617830441576000f, 0.1
98069319559949000f, |
| 47 0.201556253794397000f, 0.205078736390317000f, 0.208636870145256000f, 0.2
12230757414055000f, |
| 48 0.215860500113899000f, 0.219526199729269000f, 0.223227957316809000f, 0.2
26965873510098000f, |
| 49 0.230740048524349000f, 0.234550582161005000f, 0.238397573812271000f, 0.2
42281122465555000f, |
| 50 0.246201326707835000f, 0.250158284729953000f, 0.254152094330827000f, 0.2
58182852921596000f, |
| 51 0.262250657529696000f, 0.266355604802862000f, 0.270497791013066000f, 0.2
74677312060385000f, |
| 52 0.278894263476810000f, 0.283148740429992000f, 0.287440837726918000f, 0.2
91770649817536000f, |
| 53 0.296138270798321000f, 0.300543794415777000f, 0.304987314069886000f, 0.3
09468922817509000f, |
| 54 0.313988713375718000f, 0.318546778125092000f, 0.323143209112951000f, 0.3
27778098056542000f, |
| 55 0.332451536346179000f, 0.337163615048330000f, 0.341914424908661000f, 0.3
46704056355030000f, |
| 56 0.351532599500439000f, 0.356400144145944000f, 0.361306779783510000f, 0.3
66252595598840000f, |
| 57 0.371237680474149000f, 0.376262122990906000f, 0.381326011432530000f, 0.3
86429433787049000f, |
| 58 0.391572477749723000f, 0.396755230725627000f, 0.401977779832196000f, 0.4
07240211901737000f, |
| 59 0.412542613483904000f, 0.417885070848138000f, 0.423267669986072000f, 0.4
28690496613907000f, |
| 60 0.434153636174749000f, 0.439657173840919000f, 0.445201194516228000f, 0.4
50785782838223000f, |
| 61 0.456411023180405000f, 0.462076999654407000f, 0.467783796112159000f, 0.4
73531496148010000f, |
| 62 0.479320183100827000f, 0.485149940056070000f, 0.491020849847836000f, 0.4
96932995060870000f, |
| 63 0.502886458032569000f, 0.508881320854934000f, 0.514917665376521000f, 0.5
20995573204354000f, |
| 64 0.527115125705813000f, 0.533276404010505000f, 0.539479489012107000f, 0.5
45724461370187000f, |
| 65 0.552011401512000000f, 0.558340389634268000f, 0.564711505704929000f, 0.5
71124829464873000f, |
| 66 0.577580440429651000f, 0.584078417891164000f, 0.590618840919337000f, 0.5
97201788363763000f, |
| 67 0.603827338855338000f, 0.610495570807865000f, 0.617206562419651000f, 0.6
23960391675076000f, |
| 68 0.630757136346147000f, 0.637596873994033000f, 0.644479681970582000f, 0.6
51405637419824000f, |
| 69 0.658374817279448000f, 0.665387298282272000f, 0.672443156957688000f, 0.6
79542469633094000f, |
| 70 0.686685312435314000f, 0.693871761291990000f, 0.701101891932973000f, 0.7
08375779891687000f, |
| 71 0.715693500506481000f, 0.723055128921969000f, 0.730460740090354000f, 0.7
37910408772731000f, |
| 72 0.745404209540387000f, 0.752942216776078000f, 0.760524504675292000f, 0.7
68151147247507000f, |
| 73 0.775822218317423000f, 0.783537791526194000f, 0.791297940332630000f, 0.7
99102738014409000f, |
| 74 0.806952257669252000f, 0.814846572216101000f, 0.822785754396284000f, 0.8
30769876774655000f, |
| 75 0.838799011740740000f, 0.846873231509858000f, 0.854992608124234000f, 0.8
63157213454102000f, |
| 76 0.871367119198797000f, 0.879622396887832000f, 0.887923117881966000f, 0.8
96269353374266000f, |
| 77 0.904661174391149000f, 0.913098651793419000f, 0.921581856277295000f, 0.9
30110858375424000f, |
| 78 0.938685728457888000f, 0.947306536733200000f, 0.955973353249286000f, 0.9
64686247894465000f, |
| 79 0.973445290398413000f, 0.982250550333117000f, 0.991102097113830000f, 1.0
00000000000000000f, |
| 80 }; |
| 81 |
| 82 extern const float linear_from_2dot2[256] = { |
| 83 0.000000000000000000f, 0.000005077051900662f, 0.000023328004666099f, 0.0
00056921765712193f, |
| 84 0.000107187362341244f, 0.000175123977503027f, 0.000261543754548491f, 0.0
00367136269815943f, |
| 85 0.000492503787191433f, 0.000638182842167022f, 0.000804658499513058f, 0.0
00992374304074325f, |
| 86 0.001201739522438400f, 0.001433134589671860f, 0.001686915316789280f, 0.0
01963416213396470f, |
| 87 0.002262953160706430f, 0.002585825596234170f, 0.002932318323938360f, 0.0
03302703032003640f, |
| 88 0.003697239578900130f, 0.004116177093282750f, 0.004559754922526020f, 0.0
05028203456855540f, |
| 89 0.005521744850239660f, 0.006040593654849810f, 0.006584957382581690f, 0.0
07155037004573030f, |
| 90 0.007751027397660610f, 0.008373117745148580f, 0.009021491898012130f, 0.0
09696328701658230f, |
| 91 0.010397802292555300f, 0.011126082368383200f, 0.011881334434813700f, 0.0
12663720031582100f, |
| 92 0.013473396940142600f, 0.014310519374884100f, 0.015175238159625200f, 0.0
16067700890886900f, |
| 93 0.016988052089250000f, 0.017936433339950200f, 0.018912983423721500f, 0.0
19917838438785700f, |
| 94 0.020951131914781100f, 0.022012994919336500f, 0.023103556157921400f, 0.0
24222942067534200f, |
| 95 0.025371276904734600f, 0.026548682828472900f, 0.027755279978126000f, 0.0
28991186547107800f, |
| 96 0.030256518852388700f, 0.031551391400226400f, 0.032875916948383800f, 0.0
34230206565082000f, |
| 97 0.035614369684918800f, 0.037028514161960200f, 0.038472746320194600f, 0.0
39947171001525600f, |
| 98 0.041451891611462500f, 0.042987010162657100f, 0.044552627316421400f, 0.0
46148842422351000f, |
| 99 0.047775753556170600f, 0.049433457555908000f, 0.051122050056493400f, 0.0
52841625522879000f, |
| 100 0.054592277281760300f, 0.056374097551979800f, 0.058187177473685400f, 0.0
60031607136313200f, |
| 101 0.061907475605455800f, 0.063814870948677200f, 0.065753880260330100f, 0.0
67724589685424300f, |
| 102 0.069727084442598800f, 0.071761448846239100f, 0.073827766327784600f, 0.0
75926119456264800f, |
| 103 0.078056589958101900f, 0.080219258736215100f, 0.082414205888459200f, 0.0
84641510725429500f, |
| 104 0.086901251787660300f, 0.089193506862247800f, 0.091518352998919500f, 0.0
93875866525577800f, |
| 105 0.096266123063339700f, 0.098689197541094500f, 0.101145164209600000f, 0.1
03634096655137000f, |
| 106 0.106156067812744000f, 0.108711149979039000f, 0.111299414824660000f, 0.1
13920933406333000f, |
| 107 0.116575776178572000f, 0.119264013005047000f, 0.121985713169619000f, 0.1
24740945387051000f, |
| 108 0.127529777813422000f, 0.130352278056244000f, 0.133208513184300000f, 0.1
36098549737202000f, |
| 109 0.139022453734703000f, 0.141980290685736000f, 0.144972125597231000f, 0.1
47998022982685000f, |
| 110 0.151058046870511000f, 0.154152260812165000f, 0.157280727890073000f, 0.1
60443510725344000f, |
| 111 0.163640671485290000f, 0.166872271890766000f, 0.170138373223312000f, 0.1
73439036332135000f, |
| 112 0.176774321640903000f, 0.180144289154390000f, 0.183548998464951000f, 0.1
86988508758844000f, |
| 113 0.190462878822409000f, 0.193972167048093000f, 0.197516431440340000f, 0.2
01095729621346000f, |
| 114 0.204710118836677000f, 0.208359655960767000f, 0.212044397502288000f, 0.2
15764399609395000f, |
| 115 0.219519718074868000f, 0.223310408341127000f, 0.227136525505149000f, 0.2
30998124323267000f, |
| 116 0.234895259215880000f, 0.238827984272048000f, 0.242796353254002000f, 0.2
46800419601550000f, |
| 117 0.250840236436400000f, 0.254915856566385000f, 0.259027332489606000f, 0.2
63174716398492000f, |
| 118 0.267358060183772000f, 0.271577415438375000f, 0.275832833461245000f, 0.2
80124365261085000f, |
| 119 0.284452061560024000f, 0.288815972797219000f, 0.293216149132375000f, 0.2
97652640449211000f, |
| 120 0.302125496358853000f, 0.306634766203158000f, 0.311180499057984000f, 0.3
15762743736397000f, |
| 121 0.320381548791810000f, 0.325036962521076000f, 0.329729032967515000f, 0.3
34457807923889000f, |
| 122 0.339223334935327000f, 0.344025661302187000f, 0.348864834082879000f, 0.3
53740900096629000f, |
| 123 0.358653905926199000f, 0.363603897920553000f, 0.368590922197487000f, 0.3
73615024646202000f, |
| 124 0.378676250929840000f, 0.383774646487975000f, 0.388910256539059000f, 0.3
94083126082829000f, |
| 125 0.399293299902674000f, 0.404540822567962000f, 0.409825738436323000f, 0.4
15148091655907000f, |
| 126 0.420507926167587000f, 0.425905285707146000f, 0.431340213807410000f, 0.4
36812753800359000f, |
| 127 0.442322948819202000f, 0.447870841800410000f, 0.453456475485731000f, 0.4
59079892424160000f, |
| 128 0.464741134973889000f, 0.470440245304218000f, 0.476177265397440000f, 0.4
81952237050698000f, |
| 129 0.487765201877811000f, 0.493616201311074000f, 0.499505276603030000f, 0.5
05432468828216000f, |
| 130 0.511397818884880000f, 0.517401367496673000f, 0.523443155214325000f, 0.5
29523222417277000f, |
| 131 0.535641609315311000f, 0.541798355950137000f, 0.547993502196972000f, 0.5
54227087766085000f, |
| 132 0.560499152204328000f, 0.566809734896638000f, 0.573158875067523000f, 0.5
79546611782525000f, |
| 133 0.585972983949661000f, 0.592438030320847000f, 0.598941789493296000f, 0.6
05484299910907000f, |
| 134 0.612065599865624000f, 0.618685727498780000f, 0.625344720802427000f, 0.6
32042617620641000f, |
| 135 0.638779455650817000f, 0.645555272444935000f, 0.652370105410821000f, 0.6
59223991813387000f, |
| 136 0.666116968775851000f, 0.673049073280942000f, 0.680020342172095000f, 0.6
87030812154625000f, |
| 137 0.694080519796882000f, 0.701169501531402000f, 0.708297793656032000f, 0.7
15465432335048000f, |
| 138 0.722672453600255000f, 0.729918893352071000f, 0.737204787360605000f, 0.7
44530171266715000f, |
| 139 0.751895080583051000f, 0.759299550695091000f, 0.766743616862161000f, 0.7
74227314218442000f, |
| 140 0.781750677773962000f, 0.789313742415586000f, 0.796916542907978000f, 0.8
04559113894567000f, |
| 141 0.812241489898490000f, 0.819963705323528000f, 0.827725794455034000f, 0.8
35527791460841000f, |
| 142 0.843369730392169000f, 0.851251645184515000f, 0.859173569658532000f, 0.8
67135537520905000f, |
| 143 0.875137582365205000f, 0.883179737672745000f, 0.891262036813419000f, 0.8
99384513046529000f, |
| 144 0.907547199521614000f, 0.915750129279253000f, 0.923993335251873000f, 0.9
32276850264543000f, |
| 145 0.940600707035753000f, 0.948964938178195000f, 0.957369576199527000f, 0.9
65814653503130000f, |
| 146 0.974300202388861000f, 0.982826255053791000f, 0.991392843592940000f, 1.0
00000000000000000f, |
| 147 }; |
| 148 |
| 149 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 150 |
| 151 // x^(29/64) is a very good approximation of the true value, x^(1/2.2). |
| 152 static __m128 linear_to_2dot2(__m128 x) { |
| 153 // x^(-1/2) |
| 154 __m128 x2 = _mm_rsqrt_ps(x); |
| 155 |
| 156 // x^(-1/32) |
| 157 __m128 x32 = _mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(x2)))); |
| 158 |
| 159 // x^(+1/64) |
| 160 __m128 x64 = _mm_rsqrt_ps(x32); |
| 161 |
| 162 // x^(+29/64) = x^(+1/2) * x^(-1/32) * x^(-1/64) |
| 163 // Note that we also scale to the 0-255 range. |
| 164 // These terms can be combined more minimally with 3 muls and 1 reciprocal.
However, this |
| 165 // is faster, because it allows us to start the muls in parallel with the rs
qrts. |
| 166 __m128 scale = _mm_set1_ps(255.0f); |
| 167 return _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(scale, _mm_rcp_ps(x2)), x32), _mm_rc
p_ps(x64)); |
23 } | 168 } |
24 | 169 |
25 static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_
t* src, int len, | 170 static __m128 clamp_0_to_255(__m128 x) { |
26 const float matrix[16]) { | 171 // The order of the arguments is important here. We want to make sure that
NaN |
27 while (len-- > 0) { | 172 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN. |
28 float srcFloats[3]; | 173 return _mm_min_ps(_mm_max_ps(x, _mm_setzero_ps()), _mm_set1_ps(255.0f)); |
29 srcFloats[0] = (float) ((*src >> 0) & 0xFF); | |
30 srcFloats[1] = (float) ((*src >> 8) & 0xFF); | |
31 srcFloats[2] = (float) ((*src >> 16) & 0xFF); | |
32 | |
33 // Convert to linear. | |
34 // TODO (msarett): | |
35 // We should use X^2.2 here instead of X^2. What is the impact on corre
ctness? | |
36 // We should be able to get closer to 2.2 at a small performance cost. | |
37 srcFloats[0] = srcFloats[0] * srcFloats[0]; | |
38 srcFloats[1] = srcFloats[1] * srcFloats[1]; | |
39 srcFloats[2] = srcFloats[2] * srcFloats[2]; | |
40 | |
41 // Convert to dst gamut. | |
42 float dstFloats[3]; | |
43 // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost alw
ays zero. | |
44 // Should we have another optimized path that avoids the extra addition
when they | |
45 // are zero? | |
46 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] + | |
47 srcFloats[2] * matrix[8] + matrix[12]; | |
48 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] + | |
49 srcFloats[2] * matrix[9] + matrix[13]; | |
50 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] + | |
51 srcFloats[2] * matrix[10] + matrix[14]; | |
52 | |
53 // Convert to dst gamma. | |
54 // TODO (msarett): | |
55 // We should use X^(1/2.2) here instead of X^(1/2). What is the impact
on correctness? | |
56 // We should be able to get closer to (1/2.2) at a small performance cos
t. | |
57 dstFloats[0] = sqrtf(dstFloats[0]); | |
58 dstFloats[1] = sqrtf(dstFloats[1]); | |
59 dstFloats[2] = sqrtf(dstFloats[2]); | |
60 | |
61 *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF), | |
62 clamp_float_to_byte(dstFloats[0]), | |
63 clamp_float_to_byte(dstFloats[1]), | |
64 clamp_float_to_byte(dstFloats[2])); | |
65 | |
66 dst++; | |
67 src++; | |
68 } | |
69 } | 174 } |
70 | 175 |
71 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | 176 template <const float (&linear_from_curve)[256]> |
72 | 177 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len, |
73 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i
nt len, | 178 const float matrix[16]) { |
74 const float matrix[16]) { | |
75 // Load transformation matrix. | 179 // Load transformation matrix. |
76 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); | 180 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); |
77 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); | 181 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); |
78 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); | 182 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); |
79 __m128 rQgQbQ = _mm_loadu_ps(&matrix[12]); | |
80 | 183 |
81 while (len >= 4) { | 184 while (len >= 4) { |
82 // Load 4 pixels and convert them to floats. | 185 // Convert to linear. The look-up table has perfect accuracy. |
83 __m128i rgba = _mm_loadu_si128((const __m128i*) src); | 186 __m128 reds = _mm_setr_ps(linear_from_curve[(src[0] >> 0) & 0xFF], |
84 __m128i byteMask = _mm_set1_epi32(0xFF); | 187 linear_from_curve[(src[1] >> 0) & 0xFF], |
85 __m128 reds = _mm_cvtepi32_ps(_mm_and_si128( rgba,
byteMask)); | 188 linear_from_curve[(src[2] >> 0) & 0xFF], |
86 __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 8),
byteMask)); | 189 linear_from_curve[(src[3] >> 0) & 0xFF]); |
87 __m128 blues = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16),
byteMask)); | 190 __m128 greens = _mm_setr_ps(linear_from_curve[(src[0] >> 8) & 0xFF], |
88 | 191 linear_from_curve[(src[1] >> 8) & 0xFF], |
89 // Convert to linear. | 192 linear_from_curve[(src[2] >> 8) & 0xFF], |
90 // FIXME (msarett): | 193 linear_from_curve[(src[3] >> 8) & 0xFF]); |
91 // Should we be more accurate? | 194 __m128 blues = _mm_setr_ps(linear_from_curve[(src[0] >> 16) & 0xFF], |
92 reds = _mm_mul_ps(reds, reds); | 195 linear_from_curve[(src[1] >> 16) & 0xFF], |
93 greens = _mm_mul_ps(greens, greens); | 196 linear_from_curve[(src[2] >> 16) & 0xFF], |
94 blues = _mm_mul_ps(blues, blues); | 197 linear_from_curve[(src[3] >> 16) & 0xFF]); |
95 | 198 |
96 // Apply the transformation matrix to dst gamut. | 199 // Apply the transformation matrix to dst gamut. |
97 // FIXME (msarett): | 200 // Splat rX, rY, and rZ each across a register. |
98 // rQ, gQ, and bQ are almost always zero. Can we save a couple instruct
ions? | |
99 | |
100 // Splat rX, rY, rZ, and rQ each across a register. | |
101 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00); | 201 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00); |
102 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00); | 202 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00); |
103 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00); | 203 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00); |
104 __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x00); | 204 |
105 | 205 // dstReds = rX * reds + rY * greens + rZ * blues |
106 // dstReds = rX * reds + rY * greens + rZ * blues + rQ | |
107 __m128 dstReds = _mm_mul_ps(reds, rX); | 206 __m128 dstReds = _mm_mul_ps(reds, rX); |
108 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY)); | 207 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY)); |
109 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ)); | 208 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ)); |
110 dstReds = _mm_add_ps(dstReds, rQ); | 209 |
111 | 210 // Splat gX, gY, and gZ each across a register. |
112 // Splat gX, gY, gZ, and gQ each across a register. | |
113 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55); | 211 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55); |
114 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55); | 212 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55); |
115 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55); | 213 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55); |
116 __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55); | 214 |
117 | 215 // dstGreens = gX * reds + gY * greens + gZ * blues |
118 // dstGreens = gX * reds + gY * greens + gZ * blues + gQ | |
119 __m128 dstGreens = _mm_mul_ps(reds, gX); | 216 __m128 dstGreens = _mm_mul_ps(reds, gX); |
120 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY)); | 217 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY)); |
121 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ)); | 218 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ)); |
122 dstGreens = _mm_add_ps(dstGreens, gQ); | 219 |
123 | 220 // Splat bX, bY, and bZ each across a register. |
124 // Splat bX, bY, bZ, and bQ each across a register. | |
125 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA); | 221 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA); |
126 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA); | 222 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA); |
127 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA); | 223 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA); |
128 __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA); | 224 |
129 | 225 // dstBlues = bX * reds + bY * greens + bZ * blues |
130 // dstBlues = bX * reds + bY * greens + bZ * blues + bQ | |
131 __m128 dstBlues = _mm_mul_ps(reds, bX); | 226 __m128 dstBlues = _mm_mul_ps(reds, bX); |
132 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY)); | 227 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY)); |
133 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ)); | 228 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ)); |
134 dstBlues = _mm_add_ps(dstBlues, bQ); | |
135 | 229 |
136 // Convert to dst gamma. | 230 // Convert to dst gamma. |
137 // Note that the reciprocal of the reciprocal sqrt, is just a fast sqrt. | 231 dstReds = linear_to_2dot2(dstReds); |
138 // FIXME (msarett): | 232 dstGreens = linear_to_2dot2(dstGreens); |
139 // Should we be more accurate? | 233 dstBlues = linear_to_2dot2(dstBlues); |
140 dstReds = _mm_rcp_ps(_mm_rsqrt_ps(dstReds)); | 234 |
141 dstGreens = _mm_rcp_ps(_mm_rsqrt_ps(dstGreens)); | 235 // Clamp floats. |
142 dstBlues = _mm_rcp_ps(_mm_rsqrt_ps(dstBlues)); | 236 dstReds = clamp_0_to_255(dstReds); |
143 | 237 dstGreens = clamp_0_to_255(dstGreens); |
144 // Clamp floats to 0-255 range. | 238 dstBlues = clamp_0_to_255(dstBlues); |
145 dstReds = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstReds, _mm_set1_
ps(255.0f))); | |
146 dstGreens = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstGreens, _mm_set1_
ps(255.0f))); | |
147 dstBlues = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstBlues, _mm_set1_
ps(255.0f))); | |
148 | 239 |
149 // Convert to bytes and store to memory. | 240 // Convert to bytes and store to memory. |
150 rgba = _mm_and_si128(_mm_set1_epi32(0xFF000000), rgba); | 241 __m128i rgba = _mm_set1_epi32(0xFF000000); |
151 #ifdef SK_PMCOLOR_IS_RGBA | |
152 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds)
); | 242 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds)
); |
153 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8)
); | 243 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8)
); |
154 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16)
); | 244 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16)
); |
155 #else | |
156 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstBlues)
); | |
157 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8)
); | |
158 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstReds), 16)
); | |
159 #endif | |
160 _mm_storeu_si128((__m128i*) dst, rgba); | 245 _mm_storeu_si128((__m128i*) dst, rgba); |
161 | 246 |
162 dst += 4; | 247 dst += 4; |
163 src += 4; | 248 src += 4; |
164 len -= 4; | 249 len -= 4; |
165 } | 250 } |
166 | 251 |
167 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); | 252 while (len > 0) { |
| 253 // Splat the red, green, and blue components. |
| 254 __m128 r = _mm_set1_ps(linear_from_curve[(src[0] >> 0) & 0xFF]), |
| 255 g = _mm_set1_ps(linear_from_curve[(src[0] >> 8) & 0xFF]), |
| 256 b = _mm_set1_ps(linear_from_curve[(src[0] >> 16) & 0xFF]); |
| 257 |
| 258 // Apply the transformation matrix to dst gamut. |
| 259 __m128 dstPixel = _mm_mul_ps(r, rXgXbX); |
| 260 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(g, rYgYbY)); |
| 261 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(b, rZgZbZ)); |
| 262 |
| 263 // Convert to dst gamma. |
| 264 dstPixel = linear_to_2dot2(dstPixel); |
| 265 |
| 266 // Clamp floats to 0-255 range. |
| 267 dstPixel = clamp_0_to_255(dstPixel); |
| 268 |
| 269 // Convert to bytes and store to memory. |
| 270 __m128i dstInts = _mm_cvtps_epi32(dstPixel); |
| 271 __m128i dstBytes = _mm_packus_epi16(_mm_packus_epi16(dstInts, dstInts),
dstInts); |
| 272 dstBytes = _mm_or_si128(_mm_set1_epi32(0xFF000000), dstBytes); |
| 273 _mm_store_ss((float*) dst, _mm_castsi128_ps(dstBytes)); |
| 274 |
| 275 dst += 1; |
| 276 src += 1; |
| 277 len -= 1; |
| 278 } |
168 } | 279 } |
169 | 280 |
170 #else | 281 #else |
171 | 282 |
172 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i
nt len, | 283 static uint8_t clamp_float_to_byte(float v) { |
173 const float matrix[16]) { | 284 // The ordering of the logic is a little strange here in order |
174 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); | 285 // to make sure we convert NaNs to 0. |
| 286 if (v >= 254.5f) { |
| 287 return 255; |
| 288 } else if (v >= 0.5f) { |
| 289 return (uint8_t) (v + 0.5f); |
| 290 } else { |
| 291 return 0; |
| 292 } |
| 293 } |
| 294 |
| 295 template <const float (&linear_from_curve)[256]> |
| 296 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len, |
| 297 const float matrix[16]) { |
| 298 while (len-- > 0) { |
| 299 // Convert to linear. |
| 300 float srcFloats[3]; |
| 301 srcFloats[0] = linear_from_curve[(*src >> 0) & 0xFF]; |
| 302 srcFloats[1] = linear_from_curve[(*src >> 8) & 0xFF]; |
| 303 srcFloats[2] = linear_from_curve[(*src >> 16) & 0xFF]; |
| 304 |
| 305 // Convert to dst gamut. |
| 306 float dstFloats[3]; |
| 307 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] + |
| 308 srcFloats[2] * matrix[8]; |
| 309 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] + |
| 310 srcFloats[2] * matrix[9]; |
| 311 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] + |
| 312 srcFloats[2] * matrix[10]; |
| 313 |
| 314 // Convert to dst gamma. |
| 315 // Note: pow is really, really slow. We will suffer when SSE2 is not su
pported. |
| 316 dstFloats[0] = powf(dstFloats[0], (1/2.2f)) * 255.0f; |
| 317 dstFloats[1] = powf(dstFloats[1], (1/2.2f)) * 255.0f; |
| 318 dstFloats[2] = powf(dstFloats[2], (1/2.2f)) * 255.0f; |
| 319 |
| 320 *dst = (0xFF << 24) | |
| 321 (clamp_float_to_byte(dstFloats[2]) << 16) | |
| 322 (clamp_float_to_byte(dstFloats[1]) << 8) | |
| 323 (clamp_float_to_byte(dstFloats[0]) << 0); |
| 324 |
| 325 dst++; |
| 326 src++; |
| 327 } |
175 } | 328 } |
176 | 329 |
177 #endif | 330 #endif |
178 | 331 |
| 332 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, i
nt len, |
| 333 const float matrix[16]) { |
| 334 color_xform_RGB1<linear_from_srgb>(dst, src, len, matrix); |
| 335 } |
| 336 |
| 337 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src,
int len, |
| 338 const float matrix[16]) { |
| 339 color_xform_RGB1<linear_from_2dot2>(dst, src, len, matrix); |
| 340 } |
| 341 |
179 } | 342 } |
180 | 343 |
181 #endif // SkColorXform_opts_DEFINED | 344 #endif // SkColorXform_opts_DEFINED |
OLD | NEW |