openssl/crypto/bn/asm/sparcv9a-mont.pl - Issue 9254031: Upgrade chrome's OpenSSL to same version Android ships with.

Side by Side Diff: openssl/crypto/bn/asm/sparcv9a-mont.pl

Issue 9254031: Upgrade chrome's OpenSSL to same version Android ships with. (Closed) Base URL: http://src.chromium.org/svn/trunk/deps/third_party/openssl/

Patch Set: '' Created 8 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 #!/usr/bin/env perl

	2

	3 # ====================================================================

	4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL

	5 # project. The module is, however, dual licensed under OpenSSL and

	6 # CRYPTOGAMS licenses depending on where you obtain it. For further

	7 # details see http://www.openssl.org/~appro/cryptogams/.

	8 # ====================================================================

	9

	10 # October 2005

	11 #

	12 # "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?

	13 # Because unlike integer multiplier, which simply stalls whole CPU,

	14 # FPU is fully pipelined and can effectively emit 48 bit partial

	15 # product every cycle. Why not blended SPARC v9? One can argue that

	16 # making this module dependent on UltraSPARC VIS extension limits its

	17 # binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)

	18 # implementations from compatibility matrix. But the rest, whole Sun

	19 # UltraSPARC family and brand new Fujitsu's SPARC64 V, all support

	20 # VIS extension instructions used in this module. This is considered

	21 # good enough to not care about HAL SPARC64 users [if any] who have

	22 # integer-only pure SPARCv9 module to "fall down" to.

	23

	24 # USI&II cores currently exhibit uniform 2x improvement [over pre-

	25 # bn_mul_mont codebase] for all key lengths and benchmarks. On USIII

	26 # performance improves few percents for shorter keys and worsens few

	27 # percents for longer keys. This is because USIII integer multiplier

	28 # is >3x faster than USI&II one, which is harder to match [but see

	29 # TODO list below]. It should also be noted that SPARC64 V features

	30 # out-of-order execution, which might mean that integer multiplier

	31 # is pipelined, which in turn might be impossible to match... On

	32 # additional note, SPARC64 V implements FP Multiply-Add instruction,

	33 # which is perfectly usable in this context... In other words, as far

	34 # as Fujitsu SPARC64 V goes, talk to the author:-)

	35

	36 # The implementation implies following "non-natural" limitations on

	37 # input arguments:

	38 # - num may not be less than 4;

	39 # - num has to be even;

	40 # Failure to meet either condition has no fatal effects, simply

	41 # doesn't give any performance gain.

	42

	43 # TODO:

	44 # - modulo-schedule inner loop for better performance (on in-order

	45 # execution core such as UltraSPARC this shall result in further

	46 # noticeable(!) improvement);

	47 # - dedicated squaring procedure[?];

	48

	49 ######################################################################

	50 # November 2006

	51 #

	52 # Modulo-scheduled inner loops allow to interleave floating point and

	53 # integer instructions and minimize Read-After-Write penalties. This

	54 # results in further 20-50% performance improvement [depending on

	55 # key length, more for longer keys] on USI&II cores and 30-80% - on

	56 # USIII&IV.

	57

	58 $fname="bn_mul_mont_fpu";

	59 $bits=32;

	60 for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }

	61

	62 if ($bits==64) {

	63 $bias=2047;

	64 $frame=192;

	65 } else {

	66 $bias=0;

	67 $frame=128; # 96 rounded up to largest known cache-line

	68 }

	69 $locals=64;

	70

	71 # In order to provide for 32-/64-bit ABI duality, I keep integers wider

	72 # than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used

	73 # exclusively for pointers, indexes and other small values...

	74 # int bn_mul_mont(

	75 $rp="%i0"; # BN_ULONG *rp,

	76 $ap="%i1"; # const BN_ULONG *ap,

	77 $bp="%i2"; # const BN_ULONG *bp,

	78 $np="%i3"; # const BN_ULONG *np,

	79 $n0="%i4"; # const BN_ULONG *n0,

	80 $num="%i5"; # int num);

	81

	82 $tp="%l0"; # t[num]

	83 $ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved

	84 $ap_h="%l2"; # to these four vectors as double-precision FP values.

	85 $np_l="%l3"; # This way a bunch of fxtods are eliminated in second

	86 $np_h="%l4"; # loop and L1-cache aliasing is minimized...

	87 $i="%l5";

	88 $j="%l6";

	89 $mask="%l7"; # 16-bit mask, 0xffff

	90

	91 $n0="%g4"; # reassigned(!) to "64-bit" register

	92 $carry="%i4"; # %i4 reused(!) for a carry bit

	93

	94 # FP register naming chart

	95 #

	96 #     ..HILO

	97 #       dcba

	98 #   --------

	99 #        LOa

	100 #       LOb

	101 #      LOc

	102 #     LOd

	103 #      HIa

	104 #     HIb

	105 #    HIc

	106 #   HId

	107 #    ..a

	108 #   ..b

	109 $ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";

	110 $na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";

	111 $alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";

	112 $nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";

	113

	114 $dota="%f24"; $dotb="%f26";

	115

	116 $aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";

	117 $ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";

	118 $nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";

	119 $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";

	120

	121 $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load

	122

	123 $code=<<___;

	124 .section ".text",#alloc,#execinstr

	125

	126 .global $fname

	127 .align 32

	128 $fname:

	129 save %sp,-$frame-$locals,%sp

	130

	131 cmp $num,4

	132 bl,a,pn %icc,.Lret

	133 clr %i0

	134 andcc $num,1,%g0 ! $num has to be even...

	135 bnz,a,pn %icc,.Lret

	136 clr %i0 ! signal "unsupported input value"

	137

	138 srl $num,1,$num

	139 sethi %hi(0xffff),$mask

	140 ld [%i4+0],$n0 ! $n0 reassigned, remember?

	141 or $mask,%lo(0xffff),$mask

	142 ld [%i4+4],%o0

	143 sllx %o0,32,%o0

	144 or %o0,$n0,$n0 ! $n0=n0[1].n0[0]

	145

	146 sll $num,3,$num ! num*=8

	147

	148 add %sp,$bias,%o0 ! real top of stack

	149 sll $num,2,%o1

	150 add %o1,$num,%o1 ! %o1=num*5

	151 sub %o0,%o1,%o0

	152 and %o0,-2048,%o0 ! optimize TLB utilization

	153 sub %o0,$bias,%sp ! alloca(5*num*8)

	154

	155 rd %asi,%o7 ! save %asi

	156 add %sp,$bias+$frame+$locals,$tp

	157 add $tp,$num,$ap_l

	158 add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !

	159 add $ap_l,$num,$ap_h

	160 add $ap_h,$num,$np_l

	161 add $np_l,$num,$np_h

	162

	163 wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads

	164

	165 add $rp,$num,$rp ! readjust input pointers to point

	166 add $ap,$num,$ap ! at the ends too...

	167 add $bp,$num,$bp

	168 add $np,$num,$np

	169

	170 stx %o7,[%sp+$bias+$frame+48] ! save %asi

	171

	172 sub %g0,$num,$i ! i=-num

	173 sub %g0,$num,$j ! j=-num

	174

	175 add $ap,$j,%o3

	176 add $bp,$i,%o4

	177

	178 ld [%o3+4],%g1 ! ap[0]

	179 ld [%o3+0],%o0

	180 ld [%o4+4],%g5 ! bp[0]

	181 sllx %g1,32,%g1

	182 ld [%o4+0],%o1

	183 sllx %g5,32,%g5

	184 or %g1,%o0,%o0

	185 or %g5,%o1,%o1

	186

	187 add $np,$j,%o5

	188

	189 mulx %o1,%o0,%o0 ! ap[0]*bp[0]

	190 mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0

	191 stx %o0,[%sp+$bias+$frame+0]

	192

	193 ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words

	194 fzeros $alo

	195 ld [%o3+4],$ahi_

	196 fzeros $ahi

	197 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words

	198 fzeros $nlo

	199 ld [%o5+4],$nhi_

	200 fzeros $nhi

	201

	202 ! transfer b[i] to FPU as 4x16-bit values

	203 ldda [%o4+2]%asi,$ba

	204 fxtod $alo,$alo

	205 ldda [%o4+0]%asi,$bb

	206 fxtod $ahi,$ahi

	207 ldda [%o4+6]%asi,$bc

	208 fxtod $nlo,$nlo

	209 ldda [%o4+4]%asi,$bd

	210 fxtod $nhi,$nhi

	211

	212 ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values

	213 ldda [%sp+$bias+$frame+6]%asi,$na

	214 fxtod $ba,$ba

	215 ldda [%sp+$bias+$frame+4]%asi,$nb

	216 fxtod $bb,$bb

	217 ldda [%sp+$bias+$frame+2]%asi,$nc

	218 fxtod $bc,$bc

	219 ldda [%sp+$bias+$frame+0]%asi,$nd

	220 fxtod $bd,$bd

	221

	222 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format

	223 fxtod $na,$na

	224 std $ahi,[$ap_h+$j]

	225 fxtod $nb,$nb

	226 std $nlo,[$np_l+$j] ! save smashed np[j] in double format

	227 fxtod $nc,$nc

	228 std $nhi,[$np_h+$j]

	229 fxtod $nd,$nd

	230

	231 fmuld $alo,$ba,$aloa

	232 fmuld $nlo,$na,$nloa

	233 fmuld $alo,$bb,$alob

	234 fmuld $nlo,$nb,$nlob

	235 fmuld $alo,$bc,$aloc

	236 faddd $aloa,$nloa,$nloa

	237 fmuld $nlo,$nc,$nloc

	238 fmuld $alo,$bd,$alod

	239 faddd $alob,$nlob,$nlob

	240 fmuld $nlo,$nd,$nlod

	241 fmuld $ahi,$ba,$ahia

	242 faddd $aloc,$nloc,$nloc

	243 fmuld $nhi,$na,$nhia

	244 fmuld $ahi,$bb,$ahib

	245 faddd $alod,$nlod,$nlod

	246 fmuld $nhi,$nb,$nhib

	247 fmuld $ahi,$bc,$ahic

	248 faddd $ahia,$nhia,$nhia

	249 fmuld $nhi,$nc,$nhic

	250 fmuld $ahi,$bd,$ahid

	251 faddd $ahib,$nhib,$nhib

	252 fmuld $nhi,$nd,$nhid

	253

	254 faddd $ahic,$nhic,$dota ! $nhic

	255 faddd $ahid,$nhid,$dotb ! $nhid

	256

	257 faddd $nloc,$nhia,$nloc

	258 faddd $nlod,$nhib,$nlod

	259

	260 fdtox $nloa,$nloa

	261 fdtox $nlob,$nlob

	262 fdtox $nloc,$nloc

	263 fdtox $nlod,$nlod

	264

	265 std $nloa,[%sp+$bias+$frame+0]

	266 add $j,8,$j

	267 std $nlob,[%sp+$bias+$frame+8]

	268 add $ap,$j,%o4

	269 std $nloc,[%sp+$bias+$frame+16]

	270 add $np,$j,%o5

	271 std $nlod,[%sp+$bias+$frame+24]

	272

	273 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words

	274 fzeros $alo

	275 ld [%o4+4],$ahi_

	276 fzeros $ahi

	277 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words

	278 fzeros $nlo

	279 ld [%o5+4],$nhi_

	280 fzeros $nhi

	281

	282 fxtod $alo,$alo

	283 fxtod $ahi,$ahi

	284 fxtod $nlo,$nlo

	285 fxtod $nhi,$nhi

	286

	287 ldx [%sp+$bias+$frame+0],%o0

	288 fmuld $alo,$ba,$aloa

	289 ldx [%sp+$bias+$frame+8],%o1

	290 fmuld $nlo,$na,$nloa

	291 ldx [%sp+$bias+$frame+16],%o2

	292 fmuld $alo,$bb,$alob

	293 ldx [%sp+$bias+$frame+24],%o3

	294 fmuld $nlo,$nb,$nlob

	295

	296 srlx %o0,16,%o7

	297 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format

	298 fmuld $alo,$bc,$aloc

	299 add %o7,%o1,%o1

	300 std $ahi,[$ap_h+$j]

	301 faddd $aloa,$nloa,$nloa

	302 fmuld $nlo,$nc,$nloc

	303 srlx %o1,16,%o7

	304 std $nlo,[$np_l+$j] ! save smashed np[j] in double format

	305 fmuld $alo,$bd,$alod

	306 add %o7,%o2,%o2

	307 std $nhi,[$np_h+$j]

	308 faddd $alob,$nlob,$nlob

	309 fmuld $nlo,$nd,$nlod

	310 srlx %o2,16,%o7

	311 fmuld $ahi,$ba,$ahia

	312 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]

	313 faddd $aloc,$nloc,$nloc

	314 fmuld $nhi,$na,$nhia

	315 !and %o0,$mask,%o0

	316 !and %o1,$mask,%o1

	317 !and %o2,$mask,%o2

	318 !sllx %o1,16,%o1

	319 !sllx %o2,32,%o2

	320 !sllx %o3,48,%o7

	321 !or %o1,%o0,%o0

	322 !or %o2,%o0,%o0

	323 !or %o7,%o0,%o0 ! 64-bit result

	324 srlx %o3,16,%g1 ! 34-bit carry

	325 fmuld $ahi,$bb,$ahib

	326

	327 faddd $alod,$nlod,$nlod

	328 fmuld $nhi,$nb,$nhib

	329 fmuld $ahi,$bc,$ahic

	330 faddd $ahia,$nhia,$nhia

	331 fmuld $nhi,$nc,$nhic

	332 fmuld $ahi,$bd,$ahid

	333 faddd $ahib,$nhib,$nhib

	334 fmuld $nhi,$nd,$nhid

	335

	336 faddd $dota,$nloa,$nloa

	337 faddd $dotb,$nlob,$nlob

	338 faddd $ahic,$nhic,$dota ! $nhic

	339 faddd $ahid,$nhid,$dotb ! $nhid

	340

	341 faddd $nloc,$nhia,$nloc

	342 faddd $nlod,$nhib,$nlod

	343

	344 fdtox $nloa,$nloa

	345 fdtox $nlob,$nlob

	346 fdtox $nloc,$nloc

	347 fdtox $nlod,$nlod

	348

	349 std $nloa,[%sp+$bias+$frame+0]

	350 std $nlob,[%sp+$bias+$frame+8]

	351 addcc $j,8,$j

	352 std $nloc,[%sp+$bias+$frame+16]

	353 bz,pn %icc,.L1stskip

	354 std $nlod,[%sp+$bias+$frame+24]

	355

	356 .align 32 ! incidentally already aligned !

	357 .L1st:

	358 add $ap,$j,%o4

	359 add $np,$j,%o5

	360 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words

	361 fzeros $alo

	362 ld [%o4+4],$ahi_

	363 fzeros $ahi

	364 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words

	365 fzeros $nlo

	366 ld [%o5+4],$nhi_

	367 fzeros $nhi

	368

	369 fxtod $alo,$alo

	370 fxtod $ahi,$ahi

	371 fxtod $nlo,$nlo

	372 fxtod $nhi,$nhi

	373

	374 ldx [%sp+$bias+$frame+0],%o0

	375 fmuld $alo,$ba,$aloa

	376 ldx [%sp+$bias+$frame+8],%o1

	377 fmuld $nlo,$na,$nloa

	378 ldx [%sp+$bias+$frame+16],%o2

	379 fmuld $alo,$bb,$alob

	380 ldx [%sp+$bias+$frame+24],%o3

	381 fmuld $nlo,$nb,$nlob

	382

	383 srlx %o0,16,%o7

	384 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format

	385 fmuld $alo,$bc,$aloc

	386 add %o7,%o1,%o1

	387 std $ahi,[$ap_h+$j]

	388 faddd $aloa,$nloa,$nloa

	389 fmuld $nlo,$nc,$nloc

	390 srlx %o1,16,%o7

	391 std $nlo,[$np_l+$j] ! save smashed np[j] in double format

	392 fmuld $alo,$bd,$alod

	393 add %o7,%o2,%o2

	394 std $nhi,[$np_h+$j]

	395 faddd $alob,$nlob,$nlob

	396 fmuld $nlo,$nd,$nlod

	397 srlx %o2,16,%o7

	398 fmuld $ahi,$ba,$ahia

	399 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]

	400 and %o0,$mask,%o0

	401 faddd $aloc,$nloc,$nloc

	402 fmuld $nhi,$na,$nhia

	403 and %o1,$mask,%o1

	404 and %o2,$mask,%o2

	405 fmuld $ahi,$bb,$ahib

	406 sllx %o1,16,%o1

	407 faddd $alod,$nlod,$nlod

	408 fmuld $nhi,$nb,$nhib

	409 sllx %o2,32,%o2

	410 fmuld $ahi,$bc,$ahic

	411 sllx %o3,48,%o7

	412 or %o1,%o0,%o0

	413 faddd $ahia,$nhia,$nhia

	414 fmuld $nhi,$nc,$nhic

	415 or %o2,%o0,%o0

	416 fmuld $ahi,$bd,$ahid

	417 or %o7,%o0,%o0 ! 64-bit result

	418 faddd $ahib,$nhib,$nhib

	419 fmuld $nhi,$nd,$nhid

	420 addcc %g1,%o0,%o0

	421 faddd $dota,$nloa,$nloa

	422 srlx %o3,16,%g1 ! 34-bit carry

	423 faddd $dotb,$nlob,$nlob

	424 bcs,a %xcc,.+8

	425 add %g1,1,%g1

	426

	427 stx %o0,[$tp] ! tp[j-1]=

	428

	429 faddd $ahic,$nhic,$dota ! $nhic

	430 faddd $ahid,$nhid,$dotb ! $nhid

	431

	432 faddd $nloc,$nhia,$nloc

	433 faddd $nlod,$nhib,$nlod

	434

	435 fdtox $nloa,$nloa

	436 fdtox $nlob,$nlob

	437 fdtox $nloc,$nloc

	438 fdtox $nlod,$nlod

	439

	440 std $nloa,[%sp+$bias+$frame+0]

	441 std $nlob,[%sp+$bias+$frame+8]

	442 std $nloc,[%sp+$bias+$frame+16]

	443 std $nlod,[%sp+$bias+$frame+24]

	444

	445 addcc $j,8,$j

	446 bnz,pt %icc,.L1st

	447 add $tp,8,$tp

	448

	449 .L1stskip:

	450 fdtox $dota,$dota

	451 fdtox $dotb,$dotb

	452

	453 ldx [%sp+$bias+$frame+0],%o0

	454 ldx [%sp+$bias+$frame+8],%o1

	455 ldx [%sp+$bias+$frame+16],%o2

	456 ldx [%sp+$bias+$frame+24],%o3

	457

	458 srlx %o0,16,%o7

	459 std $dota,[%sp+$bias+$frame+32]

	460 add %o7,%o1,%o1

	461 std $dotb,[%sp+$bias+$frame+40]

	462 srlx %o1,16,%o7

	463 add %o7,%o2,%o2

	464 srlx %o2,16,%o7

	465 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]

	466 and %o0,$mask,%o0

	467 and %o1,$mask,%o1

	468 and %o2,$mask,%o2

	469 sllx %o1,16,%o1

	470 sllx %o2,32,%o2

	471 sllx %o3,48,%o7

	472 or %o1,%o0,%o0

	473 or %o2,%o0,%o0

	474 or %o7,%o0,%o0 ! 64-bit result

	475 ldx [%sp+$bias+$frame+32],%o4

	476 addcc %g1,%o0,%o0

	477 ldx [%sp+$bias+$frame+40],%o5

	478 srlx %o3,16,%g1 ! 34-bit carry

	479 bcs,a %xcc,.+8

	480 add %g1,1,%g1

	481

	482 stx %o0,[$tp] ! tp[j-1]=

	483 add $tp,8,$tp

	484

	485 srlx %o4,16,%o7

	486 add %o7,%o5,%o5

	487 and %o4,$mask,%o4

	488 sllx %o5,16,%o7

	489 or %o7,%o4,%o4

	490 addcc %g1,%o4,%o4

	491 srlx %o5,48,%g1

	492 bcs,a %xcc,.+8

	493 add %g1,1,%g1

	494

	495 mov %g1,$carry

	496 stx %o4,[$tp] ! tp[num-1]=

	497

	498 ba .Louter

	499 add $i,8,$i

	500 .align 32

	501 .Louter:

	502 sub %g0,$num,$j ! j=-num

	503 add %sp,$bias+$frame+$locals,$tp

	504

	505 add $ap,$j,%o3

	506 add $bp,$i,%o4

	507

	508 ld [%o3+4],%g1 ! ap[0]

	509 ld [%o3+0],%o0

	510 ld [%o4+4],%g5 ! bp[i]

	511 sllx %g1,32,%g1

	512 ld [%o4+0],%o1

	513 sllx %g5,32,%g5

	514 or %g1,%o0,%o0

	515 or %g5,%o1,%o1

	516

	517 ldx [$tp],%o2 ! tp[0]

	518 mulx %o1,%o0,%o0

	519 addcc %o2,%o0,%o0

	520 mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0

	521 stx %o0,[%sp+$bias+$frame+0]

	522

	523 ! transfer b[i] to FPU as 4x16-bit values

	524 ldda [%o4+2]%asi,$ba

	525 ldda [%o4+0]%asi,$bb

	526 ldda [%o4+6]%asi,$bc

	527 ldda [%o4+4]%asi,$bd

	528

	529 ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values

	530 ldda [%sp+$bias+$frame+6]%asi,$na

	531 fxtod $ba,$ba

	532 ldda [%sp+$bias+$frame+4]%asi,$nb

	533 fxtod $bb,$bb

	534 ldda [%sp+$bias+$frame+2]%asi,$nc

	535 fxtod $bc,$bc

	536 ldda [%sp+$bias+$frame+0]%asi,$nd

	537 fxtod $bd,$bd

	538 ldd [$ap_l+$j],$alo ! load a[j] in double format

	539 fxtod $na,$na

	540 ldd [$ap_h+$j],$ahi

	541 fxtod $nb,$nb

	542 ldd [$np_l+$j],$nlo ! load n[j] in double format

	543 fxtod $nc,$nc

	544 ldd [$np_h+$j],$nhi

	545 fxtod $nd,$nd

	546

	547 fmuld $alo,$ba,$aloa

	548 fmuld $nlo,$na,$nloa

	549 fmuld $alo,$bb,$alob

	550 fmuld $nlo,$nb,$nlob

	551 fmuld $alo,$bc,$aloc

	552 faddd $aloa,$nloa,$nloa

	553 fmuld $nlo,$nc,$nloc

	554 fmuld $alo,$bd,$alod

	555 faddd $alob,$nlob,$nlob

	556 fmuld $nlo,$nd,$nlod

	557 fmuld $ahi,$ba,$ahia

	558 faddd $aloc,$nloc,$nloc

	559 fmuld $nhi,$na,$nhia

	560 fmuld $ahi,$bb,$ahib

	561 faddd $alod,$nlod,$nlod

	562 fmuld $nhi,$nb,$nhib

	563 fmuld $ahi,$bc,$ahic

	564 faddd $ahia,$nhia,$nhia

	565 fmuld $nhi,$nc,$nhic

	566 fmuld $ahi,$bd,$ahid

	567 faddd $ahib,$nhib,$nhib

	568 fmuld $nhi,$nd,$nhid

	569

	570 faddd $ahic,$nhic,$dota ! $nhic

	571 faddd $ahid,$nhid,$dotb ! $nhid

	572

	573 faddd $nloc,$nhia,$nloc

	574 faddd $nlod,$nhib,$nlod

	575

	576 fdtox $nloa,$nloa

	577 fdtox $nlob,$nlob

	578 fdtox $nloc,$nloc

	579 fdtox $nlod,$nlod

	580

	581 std $nloa,[%sp+$bias+$frame+0]

	582 std $nlob,[%sp+$bias+$frame+8]

	583 std $nloc,[%sp+$bias+$frame+16]

	584 add $j,8,$j

	585 std $nlod,[%sp+$bias+$frame+24]

	586

	587 ldd [$ap_l+$j],$alo ! load a[j] in double format

	588 ldd [$ap_h+$j],$ahi

	589 ldd [$np_l+$j],$nlo ! load n[j] in double format

	590 ldd [$np_h+$j],$nhi

	591

	592 fmuld $alo,$ba,$aloa

	593 fmuld $nlo,$na,$nloa

	594 fmuld $alo,$bb,$alob

	595 fmuld $nlo,$nb,$nlob

	596 fmuld $alo,$bc,$aloc

	597 ldx [%sp+$bias+$frame+0],%o0

	598 faddd $aloa,$nloa,$nloa

	599 fmuld $nlo,$nc,$nloc

	600 ldx [%sp+$bias+$frame+8],%o1

	601 fmuld $alo,$bd,$alod

	602 ldx [%sp+$bias+$frame+16],%o2

	603 faddd $alob,$nlob,$nlob

	604 fmuld $nlo,$nd,$nlod

	605 ldx [%sp+$bias+$frame+24],%o3

	606 fmuld $ahi,$ba,$ahia

	607

	608 srlx %o0,16,%o7

	609 faddd $aloc,$nloc,$nloc

	610 fmuld $nhi,$na,$nhia

	611 add %o7,%o1,%o1

	612 fmuld $ahi,$bb,$ahib

	613 srlx %o1,16,%o7

	614 faddd $alod,$nlod,$nlod

	615 fmuld $nhi,$nb,$nhib

	616 add %o7,%o2,%o2

	617 fmuld $ahi,$bc,$ahic

	618 srlx %o2,16,%o7

	619 faddd $ahia,$nhia,$nhia

	620 fmuld $nhi,$nc,$nhic

	621 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]

	622 ! why?

	623 and %o0,$mask,%o0

	624 fmuld $ahi,$bd,$ahid

	625 and %o1,$mask,%o1

	626 and %o2,$mask,%o2

	627 faddd $ahib,$nhib,$nhib

	628 fmuld $nhi,$nd,$nhid

	629 sllx %o1,16,%o1

	630 faddd $dota,$nloa,$nloa

	631 sllx %o2,32,%o2

	632 faddd $dotb,$nlob,$nlob

	633 sllx %o3,48,%o7

	634 or %o1,%o0,%o0

	635 faddd $ahic,$nhic,$dota ! $nhic

	636 or %o2,%o0,%o0

	637 faddd $ahid,$nhid,$dotb ! $nhid

	638 or %o7,%o0,%o0 ! 64-bit result

	639 ldx [$tp],%o7

	640 faddd $nloc,$nhia,$nloc

	641 addcc %o7,%o0,%o0

	642 ! end-of-why?

	643 faddd $nlod,$nhib,$nlod

	644 srlx %o3,16,%g1 ! 34-bit carry

	645 fdtox $nloa,$nloa

	646 bcs,a %xcc,.+8

	647 add %g1,1,%g1

	648

	649 fdtox $nlob,$nlob

	650 fdtox $nloc,$nloc

	651 fdtox $nlod,$nlod

	652

	653 std $nloa,[%sp+$bias+$frame+0]

	654 std $nlob,[%sp+$bias+$frame+8]

	655 addcc $j,8,$j

	656 std $nloc,[%sp+$bias+$frame+16]

	657 bz,pn %icc,.Linnerskip

	658 std $nlod,[%sp+$bias+$frame+24]

	659

	660 ba .Linner

	661 nop

	662 .align 32

	663 .Linner:

	664 ldd [$ap_l+$j],$alo ! load a[j] in double format

	665 ldd [$ap_h+$j],$ahi

	666 ldd [$np_l+$j],$nlo ! load n[j] in double format

	667 ldd [$np_h+$j],$nhi

	668

	669 fmuld $alo,$ba,$aloa

	670 fmuld $nlo,$na,$nloa

	671 fmuld $alo,$bb,$alob

	672 fmuld $nlo,$nb,$nlob

	673 fmuld $alo,$bc,$aloc

	674 ldx [%sp+$bias+$frame+0],%o0

	675 faddd $aloa,$nloa,$nloa

	676 fmuld $nlo,$nc,$nloc

	677 ldx [%sp+$bias+$frame+8],%o1

	678 fmuld $alo,$bd,$alod

	679 ldx [%sp+$bias+$frame+16],%o2

	680 faddd $alob,$nlob,$nlob

	681 fmuld $nlo,$nd,$nlod

	682 ldx [%sp+$bias+$frame+24],%o3

	683 fmuld $ahi,$ba,$ahia

	684

	685 srlx %o0,16,%o7

	686 faddd $aloc,$nloc,$nloc

	687 fmuld $nhi,$na,$nhia

	688 add %o7,%o1,%o1

	689 fmuld $ahi,$bb,$ahib

	690 srlx %o1,16,%o7

	691 faddd $alod,$nlod,$nlod

	692 fmuld $nhi,$nb,$nhib

	693 add %o7,%o2,%o2

	694 fmuld $ahi,$bc,$ahic

	695 srlx %o2,16,%o7

	696 faddd $ahia,$nhia,$nhia

	697 fmuld $nhi,$nc,$nhic

	698 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]

	699 and %o0,$mask,%o0

	700 fmuld $ahi,$bd,$ahid

	701 and %o1,$mask,%o1

	702 and %o2,$mask,%o2

	703 faddd $ahib,$nhib,$nhib

	704 fmuld $nhi,$nd,$nhid

	705 sllx %o1,16,%o1

	706 faddd $dota,$nloa,$nloa

	707 sllx %o2,32,%o2

	708 faddd $dotb,$nlob,$nlob

	709 sllx %o3,48,%o7

	710 or %o1,%o0,%o0

	711 faddd $ahic,$nhic,$dota ! $nhic

	712 or %o2,%o0,%o0

	713 faddd $ahid,$nhid,$dotb ! $nhid

	714 or %o7,%o0,%o0 ! 64-bit result

	715 faddd $nloc,$nhia,$nloc

	716 addcc %g1,%o0,%o0

	717 ldx [$tp+8],%o7 ! tp[j]

	718 faddd $nlod,$nhib,$nlod

	719 srlx %o3,16,%g1 ! 34-bit carry

	720 fdtox $nloa,$nloa

	721 bcs,a %xcc,.+8

	722 add %g1,1,%g1

	723 fdtox $nlob,$nlob

	724 addcc %o7,%o0,%o0

	725 fdtox $nloc,$nloc

	726 bcs,a %xcc,.+8

	727 add %g1,1,%g1

	728

	729 stx %o0,[$tp] ! tp[j-1]

	730 fdtox $nlod,$nlod

	731

	732 std $nloa,[%sp+$bias+$frame+0]

	733 std $nlob,[%sp+$bias+$frame+8]

	734 std $nloc,[%sp+$bias+$frame+16]

	735 addcc $j,8,$j

	736 std $nlod,[%sp+$bias+$frame+24]

	737 bnz,pt %icc,.Linner

	738 add $tp,8,$tp

	739

	740 .Linnerskip:

	741 fdtox $dota,$dota

	742 fdtox $dotb,$dotb

	743

	744 ldx [%sp+$bias+$frame+0],%o0

	745 ldx [%sp+$bias+$frame+8],%o1

	746 ldx [%sp+$bias+$frame+16],%o2

	747 ldx [%sp+$bias+$frame+24],%o3

	748

	749 srlx %o0,16,%o7

	750 std $dota,[%sp+$bias+$frame+32]

	751 add %o7,%o1,%o1

	752 std $dotb,[%sp+$bias+$frame+40]

	753 srlx %o1,16,%o7

	754 add %o7,%o2,%o2

	755 srlx %o2,16,%o7

	756 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]

	757 and %o0,$mask,%o0

	758 and %o1,$mask,%o1

	759 and %o2,$mask,%o2

	760 sllx %o1,16,%o1

	761 sllx %o2,32,%o2

	762 sllx %o3,48,%o7

	763 or %o1,%o0,%o0

	764 or %o2,%o0,%o0

	765 ldx [%sp+$bias+$frame+32],%o4

	766 or %o7,%o0,%o0 ! 64-bit result

	767 ldx [%sp+$bias+$frame+40],%o5

	768 addcc %g1,%o0,%o0

	769 ldx [$tp+8],%o7 ! tp[j]

	770 srlx %o3,16,%g1 ! 34-bit carry

	771 bcs,a %xcc,.+8

	772 add %g1,1,%g1

	773

	774 addcc %o7,%o0,%o0

	775 bcs,a %xcc,.+8

	776 add %g1,1,%g1

	777

	778 stx %o0,[$tp] ! tp[j-1]

	779 add $tp,8,$tp

	780

	781 srlx %o4,16,%o7

	782 add %o7,%o5,%o5

	783 and %o4,$mask,%o4

	784 sllx %o5,16,%o7

	785 or %o7,%o4,%o4

	786 addcc %g1,%o4,%o4

	787 srlx %o5,48,%g1

	788 bcs,a %xcc,.+8

	789 add %g1,1,%g1

	790

	791 addcc $carry,%o4,%o4

	792 stx %o4,[$tp] ! tp[num-1]

	793 mov %g1,$carry

	794 bcs,a %xcc,.+8

	795 add $carry,1,$carry

	796

	797 addcc $i,8,$i

	798 bnz %icc,.Louter

	799 nop

	800

	801 add $tp,8,$tp ! adjust tp to point at the end

	802 orn %g0,%g0,%g4

	803 sub %g0,$num,%o7 ! n=-num

	804 ba .Lsub

	805 subcc %g0,%g0,%g0 ! clear %icc.c

	806

	807 .align 32

	808 .Lsub:

	809 ldx [$tp+%o7],%o0

	810 add $np,%o7,%g1

	811 ld [%g1+0],%o2

	812 ld [%g1+4],%o3

	813 srlx %o0,32,%o1

	814 subccc %o0,%o2,%o2

	815 add $rp,%o7,%g1

	816 subccc %o1,%o3,%o3

	817 st %o2,[%g1+0]

	818 add %o7,8,%o7

	819 brnz,pt %o7,.Lsub

	820 st %o3,[%g1+4]

	821 subc $carry,0,%g4

	822 sub %g0,$num,%o7 ! n=-num

	823 ba .Lcopy

	824 nop

	825

	826 .align 32

	827 .Lcopy:

	828 ldx [$tp+%o7],%o0

	829 add $rp,%o7,%g1

	830 ld [%g1+0],%o2

	831 ld [%g1+4],%o3

	832 stx %g0,[$tp+%o7]

	833 and %o0,%g4,%o0

	834 srlx %o0,32,%o1

	835 andn %o2,%g4,%o2

	836 andn %o3,%g4,%o3

	837 or %o2,%o0,%o0

	838 or %o3,%o1,%o1

	839 st %o0,[%g1+0]

	840 add %o7,8,%o7

	841 brnz,pt %o7,.Lcopy

	842 st %o1,[%g1+4]

	843 sub %g0,$num,%o7 ! n=-num

	844

	845 .Lzap:

	846 stx %g0,[$ap_l+%o7]

	847 stx %g0,[$ap_h+%o7]

	848 stx %g0,[$np_l+%o7]

	849 stx %g0,[$np_h+%o7]

	850 add %o7,8,%o7

	851 brnz,pt %o7,.Lzap

	852 nop

	853

	854 ldx [%sp+$bias+$frame+48],%o7

	855 wr %g0,%o7,%asi ! restore %asi

	856

	857 mov 1,%i0

	858 .Lret:

	859 ret

	860 restore

	861 .type $fname,#function

	862 .size $fname,(.-$fname)

	863 .asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"

	864 .align 32

	865 ___

	866

	867 $code =~ s/\`([^\`]*)\`/eval($1)/gem;

	868

	869 # Below substitution makes it possible to compile without demanding

	870 # VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I

	871 # dare to do this, because VIS capability is detected at run-time now

	872 # and this routine is not called on CPU not capable to execute it. Do

	873 # note that fzeros is not the only VIS dependency! Another dependency

	874 # is implicit and is just _a_ numerical value loaded to %asi register,

	875 # which assembler can't recognize as VIS specific...

	876 $code =~ s/fzeros\s+%f([0-9]+)/

	877 sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)

	878 /gem;

	879

	880 print $code;

	881 # flush

	882 close STDOUT;

OLD	NEW

« no previous file with comments | « openssl/crypto/bn/asm/sparcv9-mont.pl ('k') | openssl/crypto/bn/asm/via-mont.pl » ('j') | no next file with comments »