src/opts/SkColorXform_opts.h - Issue 2060823003: Implement fast, correct gamma conversion for color xforms

Side by Side Diff: src/opts/SkColorXform_opts.h

Issue 2060823003: Implement fast, correct gamma conversion for color xforms (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Should be 0xFF Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2016 Google Inc.	2 * Copyright 2016 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkColorXform_opts_DEFINED	8 #ifndef SkColorXform_opts_DEFINED

9 #define SkColorXform_opts_DEFINED	9 #define SkColorXform_opts_DEFINED

10	10

11 #include "SkColorPriv.h"	11 #include "SkColorPriv.h"

12	12

13 namespace SK_OPTS_NS {	13 namespace SK_OPTS_NS {

14	14

15 static uint8_t clamp_float_to_byte(float v) {	15 static constexpr float gamma_srgb_to_linear[256] = {

16 if (v >= 254.5f) {	16 0.000000000000000000f, 0.000303526983548838f, 0.000607053967097675f, 0.0 00910580950646513f,

17 return 255;	17 0.001214107934195350f, 0.001517634917744190f, 0.001821161901293030f, 0.0 02124688884841860f,

18 } else if (v < 0.5f) {	18 0.002428215868390700f, 0.002731742851939540f, 0.003034518678424960f, 0.0 03346535763899160f,

19 return 0;	19 0.003676507324047440f, 0.004024717018496310f, 0.004391442037410290f, 0.0 04776953480693730f,

	20 0.005181516702338390f, 0.005605391624202720f, 0.006048833022857060f, 0.0 06512090792594470f,

	21 0.006995410187265390f, 0.007499032043226180f, 0.008023192985384990f, 0.0 08568125618069310f,

	22 0.009134058702220790f, 0.009721217320237850f, 0.010329823029626900f, 0.0 10960094006488200f,

	23 0.011612245179743900f, 0.012286488356915900f, 0.012983032342173000f, 0.0 13702083047289700f,

	24 0.014443843596092500f, 0.015208514422912700f, 0.015996293365509600f, 0.0 16807375752887400f,

	25 0.017641954488384100f, 0.018500220128379700f, 0.019382360956935700f, 0.0 20288563056652400f,

	26 0.021219010376003600f, 0.022173884793387400f, 0.023153366178110400f, 0.0 24157632448504800f,

	27 0.025186859627361600f, 0.026241221894849900f, 0.027320891639074900f, 0.0 28426039504420800f,

	28 0.029556834437808800f, 0.030713443732993600f, 0.031896033073011500f, 0.0 33104766570885100f,

	29 0.034339806808682200f, 0.035601314875020300f, 0.036889450401100000f, 0.0 38204371595346500f,

	30 0.039546235276732800f, 0.040915196906853200f, 0.042311410620809700f, 0.0 43735029256973500f,

	31 0.045186204385675500f, 0.046665086336880100f, 0.048171824226889400f, 0.0 49706565984127200f,

	32 0.051269458374043200f, 0.052860647023180200f, 0.054480276442442400f, 0.0 56128490049600100f,

	33 0.057805430191067200f, 0.059511238162981200f, 0.061246054231617600f, 0.0 63010017653167700f,

	34 0.064803266692905800f, 0.066625938643772900f, 0.068478169844400200f, 0.0 70360095696595900f,

	35 0.072271850682317500f, 0.074213568380149600f, 0.076185381481307900f, 0.0 78187421805186300f,

	36 0.080219820314468300f, 0.082282707129814800f, 0.084376211544148800f, 0.0 86500462036549800f,

	37 0.088655586285772900f, 0.090841711183407700f, 0.093058962846687500f, 0.0 95307466630964700f,

	38 0.097587347141862500f, 0.099898728247113900f, 0.102241733088101000f, 0.1 04616484091104000f,

	39 0.107023102978268000f, 0.109461710778299000f, 0.111932427836906000f, 0.1 14435373826974000f,

	40 0.116970667758511000f, 0.119538427988346000f, 0.122138772229602000f, 0.1 24771817560950000f,

	41 0.127437680435647000f, 0.130136476690364000f, 0.132868321553818000f, 0.1 35633329655206000f,

	42 0.138431615032452000f, 0.141263291140272000f, 0.144128470858058000f, 0.1 47027266497595000f,

	43 0.149959789810609000f, 0.152926151996150000f, 0.155926463707827000f, 0.1 58960835060880000f,

	44 0.162029375639111000f, 0.165132194501668000f, 0.168269400189691000f, 0.1 71441100732823000f,

	45 0.174647403655585000f, 0.177888415983629000f, 0.181164244249860000f, 0.1 84474994500441000f,

	46 0.187820772300678000f, 0.191201682740791000f, 0.194617830441576000f, 0.1 98069319559949000f,

	47 0.201556253794397000f, 0.205078736390317000f, 0.208636870145256000f, 0.2 12230757414055000f,

	48 0.215860500113899000f, 0.219526199729269000f, 0.223227957316809000f, 0.2 26965873510098000f,

	49 0.230740048524349000f, 0.234550582161005000f, 0.238397573812271000f, 0.2 42281122465555000f,

	50 0.246201326707835000f, 0.250158284729953000f, 0.254152094330827000f, 0.2 58182852921596000f,

	51 0.262250657529696000f, 0.266355604802862000f, 0.270497791013066000f, 0.2 74677312060385000f,

	52 0.278894263476810000f, 0.283148740429992000f, 0.287440837726918000f, 0.2 91770649817536000f,

	53 0.296138270798321000f, 0.300543794415777000f, 0.304987314069886000f, 0.3 09468922817509000f,

	54 0.313988713375718000f, 0.318546778125092000f, 0.323143209112951000f, 0.3 27778098056542000f,

	55 0.332451536346179000f, 0.337163615048330000f, 0.341914424908661000f, 0.3 46704056355030000f,

	56 0.351532599500439000f, 0.356400144145944000f, 0.361306779783510000f, 0.3 66252595598840000f,

	57 0.371237680474149000f, 0.376262122990906000f, 0.381326011432530000f, 0.3 86429433787049000f,

	58 0.391572477749723000f, 0.396755230725627000f, 0.401977779832196000f, 0.4 07240211901737000f,

	59 0.412542613483904000f, 0.417885070848138000f, 0.423267669986072000f, 0.4 28690496613907000f,

	60 0.434153636174749000f, 0.439657173840919000f, 0.445201194516228000f, 0.4 50785782838223000f,

	61 0.456411023180405000f, 0.462076999654407000f, 0.467783796112159000f, 0.4 73531496148010000f,

	62 0.479320183100827000f, 0.485149940056070000f, 0.491020849847836000f, 0.4 96932995060870000f,

	63 0.502886458032569000f, 0.508881320854934000f, 0.514917665376521000f, 0.5 20995573204354000f,

	64 0.527115125705813000f, 0.533276404010505000f, 0.539479489012107000f, 0.5 45724461370187000f,

	65 0.552011401512000000f, 0.558340389634268000f, 0.564711505704929000f, 0.5 71124829464873000f,

	66 0.577580440429651000f, 0.584078417891164000f, 0.590618840919337000f, 0.5 97201788363763000f,

	67 0.603827338855338000f, 0.610495570807865000f, 0.617206562419651000f, 0.6 23960391675076000f,

	68 0.630757136346147000f, 0.637596873994033000f, 0.644479681970582000f, 0.6 51405637419824000f,

	69 0.658374817279448000f, 0.665387298282272000f, 0.672443156957688000f, 0.6 79542469633094000f,

	70 0.686685312435314000f, 0.693871761291990000f, 0.701101891932973000f, 0.7 08375779891687000f,

	71 0.715693500506481000f, 0.723055128921969000f, 0.730460740090354000f, 0.7 37910408772731000f,

	72 0.745404209540387000f, 0.752942216776078000f, 0.760524504675292000f, 0.7 68151147247507000f,

	73 0.775822218317423000f, 0.783537791526194000f, 0.791297940332630000f, 0.7 99102738014409000f,

	74 0.806952257669252000f, 0.814846572216101000f, 0.822785754396284000f, 0.8 30769876774655000f,

	75 0.838799011740740000f, 0.846873231509858000f, 0.854992608124234000f, 0.8 63157213454102000f,

	76 0.871367119198797000f, 0.879622396887832000f, 0.887923117881966000f, 0.8 96269353374266000f,

	77 0.904661174391149000f, 0.913098651793419000f, 0.921581856277295000f, 0.9 30110858375424000f,

	78 0.938685728457888000f, 0.947306536733200000f, 0.955973353249286000f, 0.9 64686247894465000f,

	79 0.973445290398413000f, 0.982250550333117000f, 0.991102097113830000f, 1.0 00000000000000000f,

	80 };

	81

	82 static constexpr float gamma_2dot2_to_linear[256] = {

	83 0.000000000000000000f, 0.000005077051900662f, 0.000023328004666099f, 0.0 00056921765712193f,

	84 0.000107187362341244f, 0.000175123977503027f, 0.000261543754548491f, 0.0 00367136269815943f,

	85 0.000492503787191433f, 0.000638182842167022f, 0.000804658499513058f, 0.0 00992374304074325f,

	86 0.001201739522438400f, 0.001433134589671860f, 0.001686915316789280f, 0.0 01963416213396470f,

	87 0.002262953160706430f, 0.002585825596234170f, 0.002932318323938360f, 0.0 03302703032003640f,

	88 0.003697239578900130f, 0.004116177093282750f, 0.004559754922526020f, 0.0 05028203456855540f,

	89 0.005521744850239660f, 0.006040593654849810f, 0.006584957382581690f, 0.0 07155037004573030f,

	90 0.007751027397660610f, 0.008373117745148580f, 0.009021491898012130f, 0.0 09696328701658230f,

	91 0.010397802292555300f, 0.011126082368383200f, 0.011881334434813700f, 0.0 12663720031582100f,

	92 0.013473396940142600f, 0.014310519374884100f, 0.015175238159625200f, 0.0 16067700890886900f,

	93 0.016988052089250000f, 0.017936433339950200f, 0.018912983423721500f, 0.0 19917838438785700f,

	94 0.020951131914781100f, 0.022012994919336500f, 0.023103556157921400f, 0.0 24222942067534200f,

	95 0.025371276904734600f, 0.026548682828472900f, 0.027755279978126000f, 0.0 28991186547107800f,

	96 0.030256518852388700f, 0.031551391400226400f, 0.032875916948383800f, 0.0 34230206565082000f,

	97 0.035614369684918800f, 0.037028514161960200f, 0.038472746320194600f, 0.0 39947171001525600f,

	98 0.041451891611462500f, 0.042987010162657100f, 0.044552627316421400f, 0.0 46148842422351000f,

	99 0.047775753556170600f, 0.049433457555908000f, 0.051122050056493400f, 0.0 52841625522879000f,

	100 0.054592277281760300f, 0.056374097551979800f, 0.058187177473685400f, 0.0 60031607136313200f,

	101 0.061907475605455800f, 0.063814870948677200f, 0.065753880260330100f, 0.0 67724589685424300f,

	102 0.069727084442598800f, 0.071761448846239100f, 0.073827766327784600f, 0.0 75926119456264800f,

	103 0.078056589958101900f, 0.080219258736215100f, 0.082414205888459200f, 0.0 84641510725429500f,

	104 0.086901251787660300f, 0.089193506862247800f, 0.091518352998919500f, 0.0 93875866525577800f,

	105 0.096266123063339700f, 0.098689197541094500f, 0.101145164209600000f, 0.1 03634096655137000f,

	106 0.106156067812744000f, 0.108711149979039000f, 0.111299414824660000f, 0.1 13920933406333000f,

	107 0.116575776178572000f, 0.119264013005047000f, 0.121985713169619000f, 0.1 24740945387051000f,

	108 0.127529777813422000f, 0.130352278056244000f, 0.133208513184300000f, 0.1 36098549737202000f,

	109 0.139022453734703000f, 0.141980290685736000f, 0.144972125597231000f, 0.1 47998022982685000f,

	110 0.151058046870511000f, 0.154152260812165000f, 0.157280727890073000f, 0.1 60443510725344000f,

	111 0.163640671485290000f, 0.166872271890766000f, 0.170138373223312000f, 0.1 73439036332135000f,

	112 0.176774321640903000f, 0.180144289154390000f, 0.183548998464951000f, 0.1 86988508758844000f,

	113 0.190462878822409000f, 0.193972167048093000f, 0.197516431440340000f, 0.2 01095729621346000f,

	114 0.204710118836677000f, 0.208359655960767000f, 0.212044397502288000f, 0.2 15764399609395000f,

	115 0.219519718074868000f, 0.223310408341127000f, 0.227136525505149000f, 0.2 30998124323267000f,

	116 0.234895259215880000f, 0.238827984272048000f, 0.242796353254002000f, 0.2 46800419601550000f,

	117 0.250840236436400000f, 0.254915856566385000f, 0.259027332489606000f, 0.2 63174716398492000f,

	118 0.267358060183772000f, 0.271577415438375000f, 0.275832833461245000f, 0.2 80124365261085000f,

	119 0.284452061560024000f, 0.288815972797219000f, 0.293216149132375000f, 0.2 97652640449211000f,

	120 0.302125496358853000f, 0.306634766203158000f, 0.311180499057984000f, 0.3 15762743736397000f,

	121 0.320381548791810000f, 0.325036962521076000f, 0.329729032967515000f, 0.3 34457807923889000f,

	122 0.339223334935327000f, 0.344025661302187000f, 0.348864834082879000f, 0.3 53740900096629000f,

	123 0.358653905926199000f, 0.363603897920553000f, 0.368590922197487000f, 0.3 73615024646202000f,

	124 0.378676250929840000f, 0.383774646487975000f, 0.388910256539059000f, 0.3 94083126082829000f,

	125 0.399293299902674000f, 0.404540822567962000f, 0.409825738436323000f, 0.4 15148091655907000f,

	126 0.420507926167587000f, 0.425905285707146000f, 0.431340213807410000f, 0.4 36812753800359000f,

	127 0.442322948819202000f, 0.447870841800410000f, 0.453456475485731000f, 0.4 59079892424160000f,

	128 0.464741134973889000f, 0.470440245304218000f, 0.476177265397440000f, 0.4 81952237050698000f,

	129 0.487765201877811000f, 0.493616201311074000f, 0.499505276603030000f, 0.5 05432468828216000f,

	130 0.511397818884880000f, 0.517401367496673000f, 0.523443155214325000f, 0.5 29523222417277000f,

	131 0.535641609315311000f, 0.541798355950137000f, 0.547993502196972000f, 0.5 54227087766085000f,

	132 0.560499152204328000f, 0.566809734896638000f, 0.573158875067523000f, 0.5 79546611782525000f,

	133 0.585972983949661000f, 0.592438030320847000f, 0.598941789493296000f, 0.6 05484299910907000f,

	134 0.612065599865624000f, 0.618685727498780000f, 0.625344720802427000f, 0.6 32042617620641000f,

	135 0.638779455650817000f, 0.645555272444935000f, 0.652370105410821000f, 0.6 59223991813387000f,

	136 0.666116968775851000f, 0.673049073280942000f, 0.680020342172095000f, 0.6 87030812154625000f,

	137 0.694080519796882000f, 0.701169501531402000f, 0.708297793656032000f, 0.7 15465432335048000f,

	138 0.722672453600255000f, 0.729918893352071000f, 0.737204787360605000f, 0.7 44530171266715000f,

	139 0.751895080583051000f, 0.759299550695091000f, 0.766743616862161000f, 0.7 74227314218442000f,

	140 0.781750677773962000f, 0.789313742415586000f, 0.796916542907978000f, 0.8 04559113894567000f,

	141 0.812241489898490000f, 0.819963705323528000f, 0.827725794455034000f, 0.8 35527791460841000f,

	142 0.843369730392169000f, 0.851251645184515000f, 0.859173569658532000f, 0.8 67135537520905000f,

	143 0.875137582365205000f, 0.883179737672745000f, 0.891262036813419000f, 0.8 99384513046529000f,

	144 0.907547199521614000f, 0.915750129279253000f, 0.923993335251873000f, 0.9 32276850264543000f,

	145 0.940600707035753000f, 0.948964938178195000f, 0.957369576199527000f, 0.9 65814653503130000f,

	146 0.974300202388861000f, 0.982826255053791000f, 0.991392843592940000f, 1.0 00000000000000000f,

	147 };

	148

	149 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

	150

	151 // x^(29/64) is a very good approximation of the true value, x^(1/2.2).

	152 static __m128 inverse_gamma_linear_to_2dot2(__m128 x) {
	mtklein_C 2016/06/16 13:27:41 Just some naming questions. Some of these names feel redundant, and I'm not sure if they're really redundant or there's some important clarification going on. Would things still mean the same if they were rewritten like this? gamma_srgb_to_linear -> linear_from_srgb; gamma_2dot2_to_linear -> linear_from_2dot2; inverse_gamma_linear_to_2dot2 -> linear_to_2dot2. Or even from_srgb, from_2dot2, to_2dot2? msarett 2016/06/16 15:46:11 (in reply) sgtm, shorter names are better.
	153 // x^(-1/2)

	154 __m128 x2 = _mm_rsqrt_ps(x);

	155

	156 // x^(-1/32)

	157 __m128 x32 = _mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(x2))));

	158

	159 // x^(+1/64)

	160 __m128 x64 = _mm_rsqrt_ps(x32);

	161

	162 // x^(+29/64) = x^(+1/2) * x^(-1/32) * x^(-1/64)

	163 // Note that we also scale to the 0-255 range.

	164 // These terms can be combined more minimally with 3 muls and 1 reciprocal. However, this

	165 // is faster, because it allows us to start the muls in parallel with the rsqrts.

	166 __m128 scale = _mm_set1_ps(255.0f);

	167 return _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(scale, _mm_rcp_ps(x2)), x32), _mm_rc p_ps(x64));

	168 }

	169

	170 template <SkColorSpace::GammaNamed kGammaNamed>

	171 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,

	172 const float matrix[16]) {
	mtklein_C 2016/06/16 13:27:41 This line might want to be re-wrapped? msarett 2016/06/16 15:46:12 (in reply) Done.
	173 const float* gamma_to_linear;

	174 if (SkColorSpace::kSRGB_GammaNamed == kGammaNamed) {
	mtklein_C 2016/06/16 13:27:41 Since we're not otherwise using kGammaNamed, I think this would read more clearly if we just templatize these functions (here and portable) on the table pointers themselves: static const float gamma_srgb_to_linear[] = { ... }; template <const float* gamma_to_linear> static void color_xform_RGB1(...) { ... use gamma_to_linear directly ... } static void color_xform_RGB1_srgb_to_2dot2(...) { color_xform_RGB1<gamma_srgb_to_linear>(...); } It's one of the underhyped features of C++11 that you can use pointers to static const arrays as template arguments. In C++98 they had to be extern. And if you want to be really careful, you can specify the template like template <const float (&gamma_to_linear)[256]> which will make sure you passed an array with exactly 256 entries. msarett 2016/06/16 15:46:12 (in reply) Woohoo, this is cool!
	175 gamma_to_linear = gamma_srgb_to_linear;

20 } else {	176 } else {

21 return (uint8_t) (v + 0.5f);	177 gamma_to_linear = gamma_2dot2_to_linear;

22 }	178 }

23 }	179

24

25 static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_ t* src, int len,

26 const float matrix[16]) {

27 while (len-- > 0) {

28 float srcFloats[3];

29 srcFloats[0] = (float) ((*src >> 0) & 0xFF);

30 srcFloats[1] = (float) ((*src >> 8) & 0xFF);

31 srcFloats[2] = (float) ((*src >> 16) & 0xFF);

32

33 // Convert to linear.

34 // TODO (msarett):

35 // We should use X^2.2 here instead of X^2. What is the impact on correctness?

36 // We should be able to get closer to 2.2 at a small performance cost.

37 srcFloats[0] = srcFloats[0] * srcFloats[0];

38 srcFloats[1] = srcFloats[1] * srcFloats[1];

39 srcFloats[2] = srcFloats[2] * srcFloats[2];

40

41 // Convert to dst gamut.

42 float dstFloats[3];

43 // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost always zero.

44 // Should we have another optimized path that avoids the extra addition when they

45 // are zero?

46 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +

47 srcFloats[2] * matrix[8] + matrix[12];

48 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +

49 srcFloats[2] * matrix[9] + matrix[13];

50 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +

51 srcFloats[2] * matrix[10] + matrix[14];

52

53 // Convert to dst gamma.

54 // TODO (msarett):

55 // We should use X^(1/2.2) here instead of X^(1/2). What is the impact on correctness?

56 // We should be able to get closer to (1/2.2) at a small performance cost.

57 dstFloats[0] = sqrtf(dstFloats[0]);

58 dstFloats[1] = sqrtf(dstFloats[1]);

59 dstFloats[2] = sqrtf(dstFloats[2]);

60

61 dst = SkPackARGB32NoCheck(((src >> 24) & 0xFF),

62 clamp_float_to_byte(dstFloats[0]),

63 clamp_float_to_byte(dstFloats[1]),

64 clamp_float_to_byte(dstFloats[2]));

65

66 dst++;

67 src++;

68 }

69 }

70

71 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

72

73 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i nt len,

74 const float matrix[16]) {

75 // Load transformation matrix.	180 // Load transformation matrix.

76 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]);	181 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]);

77 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]);	182 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]);

78 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]);	183 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]);

79 __m128 rQgQbQ = _mm_loadu_ps(&matrix[12]);

80	184

81 while (len >= 4) {	185 while (len >= 4) {

82 // Load 4 pixels and convert them to floats.	186 // Convert to linear. The look-up table has perfect accuracy.

83 __m128i rgba = _mm_loadu_si128((const __m128i*) src);	187 __m128 reds = _mm_setr_ps(gamma_to_linear[(src[0] >> 0) & 0xFF],

84 __m128i byteMask = _mm_set1_epi32(0xFF);	188 gamma_to_linear[(src[1] >> 0) & 0xFF],

85 __m128 reds = _mm_cvtepi32_ps(_mm_and_si128( rgba, byteMask));	189 gamma_to_linear[(src[2] >> 0) & 0xFF],

86 __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 8), byteMask));	190 gamma_to_linear[(src[3] >> 0) & 0xFF]);

87 __m128 blues = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16), byteMask));	191 __m128 greens = _mm_setr_ps(gamma_to_linear[(src[0] >> 8) & 0xFF],

88	192 gamma_to_linear[(src[1] >> 8) & 0xFF],

89 // Convert to linear.	193 gamma_to_linear[(src[2] >> 8) & 0xFF],

90 // FIXME (msarett):	194 gamma_to_linear[(src[3] >> 8) & 0xFF]);

91 // Should we be more accurate?	195 __m128 blues = _mm_setr_ps(gamma_to_linear[(src[0] >> 16) & 0xFF],

92 reds = _mm_mul_ps(reds, reds);	196 gamma_to_linear[(src[1] >> 16) & 0xFF],

93 greens = _mm_mul_ps(greens, greens);	197 gamma_to_linear[(src[2] >> 16) & 0xFF],

94 blues = _mm_mul_ps(blues, blues);	198 gamma_to_linear[(src[3] >> 16) & 0xFF]);

95	199

96 // Apply the transformation matrix to dst gamut.	200 // Apply the transformation matrix to dst gamut.

97 // FIXME (msarett):

98 // rQ, gQ, and bQ are almost always zero. Can we save a couple instructions?

99

100 // Splat rX, rY, rZ, and rQ each across a register.	201 // Splat rX, rY, rZ, and rQ each across a register.

101 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00);	202 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00);

102 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00);	203 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00);

103 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00);	204 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00);

104 __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x00);	205

105	206 // dstReds = rX * reds + rY * greens + rZ * blues

106 // dstReds = rX * reds + rY * greens + rZ * blues + rQ

107 __m128 dstReds = _mm_mul_ps(reds, rX);	207 __m128 dstReds = _mm_mul_ps(reds, rX);

108 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY));	208 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY));

109 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ));	209 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ));

110 dstReds = _mm_add_ps(dstReds, rQ);

111	210

112 // Splat gX, gY, gZ, and gQ each across a register.	211 // Splat gX, gY, gZ, and gQ each across a register.

113 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55);	212 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55);

114 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55);	213 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55);

115 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55);	214 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55);

116 __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55);	215

117	216 // dstGreens = gX * reds + gY * greens + gZ * blues

118 // dstGreens = gX * reds + gY * greens + gZ * blues + gQ

119 __m128 dstGreens = _mm_mul_ps(reds, gX);	217 __m128 dstGreens = _mm_mul_ps(reds, gX);

120 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY));	218 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY));

121 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ));	219 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ));

122 dstGreens = _mm_add_ps(dstGreens, gQ);

123	220

124 // Splat bX, bY, bZ, and bQ each across a register.	221 // Splat bX, bY, bZ, and bQ each across a register.

125 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);	222 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);

126 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);	223 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);

127 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);	224 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);

128 __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA);	225

129	226 // dstBlues = bX * reds + bY * greens + bZ * blues

130 // dstBlues = bX * reds + bY * greens + bZ * blues + bQ

131 __m128 dstBlues = _mm_mul_ps(reds, bX);	227 __m128 dstBlues = _mm_mul_ps(reds, bX);

132 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));	228 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));

133 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ));	229 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ));

134 dstBlues = _mm_add_ps(dstBlues, bQ);

135	230

136 // Convert to dst gamma.	231 // Convert to dst gamma.

137 // Note that the reciprocal of the reciprocal sqrt, is just a fast sqrt.	232 dstReds = inverse_gamma_linear_to_2dot2(dstReds);

138 // FIXME (msarett):	233 dstGreens = inverse_gamma_linear_to_2dot2(dstGreens);

139 // Should we be more accurate?	234 dstBlues = inverse_gamma_linear_to_2dot2(dstBlues);

140 dstReds = _mm_rcp_ps(_mm_rsqrt_ps(dstReds));

141 dstGreens = _mm_rcp_ps(_mm_rsqrt_ps(dstGreens));

142 dstBlues = _mm_rcp_ps(_mm_rsqrt_ps(dstBlues));

143	235

144 // Clamp floats to 0-255 range.	236 // Clamp floats to 0-255 range.

145 dstReds = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstReds, _mm_set1_ ps(255.0f)));	237 // The order of the arguments is important here. We want to make sure t hat NaN

146 dstGreens = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstGreens, _mm_set1_ ps(255.0f)));	238 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN.
	mtklein_C 2016/06/16 13:27:40 Do we have test cases exercising the NaN input? Just want to make sure that if we think we care, we care enough to test it and keep it correct. msarett 2016/06/16 15:46:11 (in reply) Yes we are hitting this case quite frequently actually. The inverse_gamma_xform(0) is giving NaN.
147 dstBlues = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstBlues, _mm_set1_ ps(255.0f)));	239 dstReds = _mm_min_ps(_mm_max_ps(dstReds, _mm_setzero_ps()), _mm_set1 _ps(255.0f));

	240 dstGreens = _mm_min_ps(_mm_max_ps(dstGreens, _mm_setzero_ps()), _mm_set1 _ps(255.0f));

	241 dstBlues = _mm_min_ps(_mm_max_ps(dstBlues, _mm_setzero_ps()), _mm_set1 _ps(255.0f));

148	242

149 // Convert to bytes and store to memory.	243 // Convert to bytes and store to memory.

150 rgba = _mm_and_si128(_mm_set1_epi32(0xFF000000), rgba);	244 __m128i rgba = _mm_set1_epi32(0xFF000000);

151 #ifdef SK_PMCOLOR_IS_RGBA

152 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) );	245 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) );

153 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );	246 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );

154 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) );	247 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) );

155 #else

156 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstBlues) );

157 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );

158 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstReds), 16) );

159 #endif

160 _mm_storeu_si128((__m128i*) dst, rgba);	248 _mm_storeu_si128((__m128i*) dst, rgba);

161	249

162 dst += 4;	250 dst += 4;

163 src += 4;	251 src += 4;

164 len -= 4;	252 len -= 4;

165 }	253 }

166	254

167 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix);	255 while (len > 0) {

	256 // Convert to linear. The look-up table has perfect accuracy.

	257 __m128 srcPixel = _mm_setr_ps(gamma_srgb_to_linear[(src[0] >> 0) & 0xFF ],
	mtklein_C 2016/06/16 13:27:40 Wouldn't this part be simpler as, // Splat red, green, and blue components. __m128 r = _mm_set1_ps(gamma_srgb_to_linear[...]), g = _mm_set1_ps(gamma_srgb_to_linear[...]), b = _mm_set1_ps(gamma_srgb_to_linear[...]); Seems like there's no need for the srcPixel intermediate. msarett 2016/06/16 15:46:12 (in reply) Yes I like this better, done.
	258 gamma_srgb_to_linear[(src[0] >> 8) & 0xFF ],

	259 gamma_srgb_to_linear[(src[0] >> 16) & 0xFF ],

	260 0.0f);

	261

	262 // Apply the transformation matrix to dst gamut.

	263 // This time, splat the red, green, and blue components.

	264 __m128 r = _mm_shuffle_ps(srcPixel, srcPixel, 0x00);

	265 __m128 g = _mm_shuffle_ps(srcPixel, srcPixel, 0x55);

	266 __m128 b = _mm_shuffle_ps(srcPixel, srcPixel, 0xAA);

	267 __m128 dstPixel = _mm_mul_ps(r, rXgXbX);

	268 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(g, rYgYbY));

	269 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(b, rZgZbZ));

	270

	271 // Convert to dst gamma.

	272 dstPixel = inverse_gamma_linear_to_2dot2(dstPixel);

	273

	274 // Clamp floats to 0-255 range.

	275 dstPixel = _mm_min_ps(_mm_max_ps(dstPixel, _mm_setzero_ps()), _mm_set1_p s(255.0f));
	mtklein_C 2016/06/16 13:27:41 Let's make the clamping a static function? That can help make sure the important comment about NaN and argument order is always paired up with the code. msarett 2016/06/16 15:46:12 (in reply) SGTM
	276

	277 // Convert to bytes and store to memory.

	278 __m128i dstInts = _mm_cvtps_epi32(dstPixel);

	279 __m128i dstBytes = _mm_packus_epi16(_mm_packus_epi16(dstInts, dstInts), dstInts);

	280 dstBytes = _mm_or_si128(_mm_set1_epi32(0xFF000000), dstBytes);

	281 _mm_store_ss((float*) dst, _mm_castsi128_ps(dstBytes));

	282

	283 dst += 1;

	284 src += 1;

	285 len -= 1;

	286 }

168 }	287 }

169	288

170 #else	289 #else

171	290

172 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i nt len,	291 static uint8_t clamp_float_to_byte(float v) {

	292 if (v >= 254.5f) {

	293 return 255;

	294 } else if (v < 0.5f) {

	295 return 0;

	296 } else {

	297 return (uint8_t) (v + 0.5f);

	298 }

	299 }

	300

	301 template <SkColorSpace::GammaNamed kGammaNamed>

	302 static void color_xform_RGB1_portable(uint32_t* dst, const uint32_t* src, int le n,

	303 const float matrix[16]) {

	304 const float* gamma_to_linear;

	305 if (SkColorSpace::kSRGB_GammaNamed == kGammaNamed) {

	306 gamma_to_linear = gamma_srgb_to_linear;

	307 } else {

	308 gamma_to_linear = gamma_2dot2_to_linear;

	309 }

	310

	311 while (len-- > 0) {

	312 // Convert to linear.

	313 float srcFloats[3];

	314 srcFloats[0] = gamma_to_linear[(*src >> 0) & 0xFF];

	315 srcFloats[1] = gamma_to_linear[(*src >> 8) & 0xFF];

	316 srcFloats[2] = gamma_to_linear[(*src >> 16) & 0xFF];

	317

	318 // Convert to dst gamut.

	319 float dstFloats[3];

	320 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +

	321 srcFloats[2] * matrix[8];

	322 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +

	323 srcFloats[2] * matrix[9];

	324 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +

	325 srcFloats[2] * matrix[10];

	326

	327 // Convert to dst gamma.

	328 // Note: pow is really, really slow. We will suffer when SSE2 is not supported.

	329 dstFloats[0] = powf(dstFloats[0], (1/2.2f)) * 255.0f;

	330 dstFloats[1] = powf(dstFloats[1], (1/2.2f)) * 255.0f;

	331 dstFloats[2] = powf(dstFloats[2], (1/2.2f)) * 255.0f;

	332

	333 *dst = SkPackARGB32NoCheck(0xFF,
	mtklein_C 2016/06/16 13:27:41 Didn't you already munge the matrix so that we should always act as if the output were RGBA and then that is actually SkPMColor order? SkPackARGB32NoCheck applies SkPMColor order itself, which seems wrong... shouldn't this just be shifts? This is made complicated, of course, by all our non-x86 bots having RGBA as their SkPMColor order. There shouldn't be any visible bug here, but I think this code is semantically misleading. msarett 2016/06/16 15:46:12 (in reply) Agreed, this is wrong. Using shifts.
	334 clamp_float_to_byte(dstFloats[0]),

	335 clamp_float_to_byte(dstFloats[1]),

	336 clamp_float_to_byte(dstFloats[2]));

	337

	338 dst++;

	339 src++;

	340 }

	341 }

	342

	343 template <SkColorSpace::GammaNamed kGammaNamed>

	344 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,

173 const float matrix[16]) {	345 const float matrix[16]) {

174 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix);	346 color_xform_RGB1_portable<kGammaNamed>(dst, src, len, matrix);
	mtklein_C 2016/06/16 13:27:40 This appears to be a complete passthrough function. Why don't we just rename what's now color_xform_RGB1_portable to color_xform_RGB1? msarett 2016/06/16 15:46:12 (in reply) Of course, Done.
175 }	347 }

176	348

177 #endif	349 #endif

178	350

	351 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, i nt len,

	352 const float matrix[16]) {

	353 color_xform_RGB1<SkColorSpace::kSRGB_GammaNamed>(dst, src, len, matrix);

	354 }

	355

	356 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src, int len,

	357 const float matrix[16]) {

	358 color_xform_RGB1<SkColorSpace::k2Dot2Curve_GammaNamed>(dst, src, len, matrix );

	359 }

	360

179 }	361 }

180	362

181 #endif // SkColorXform_opts_DEFINED	363 #endif // SkColorXform_opts_DEFINED

OLD	NEW

« src/core/SkOpts.h ('K') | « src/core/SkOpts.cpp ('k') | src/opts/SkOpts_sse41.cpp » ('j') | no next file with comments »