openssl/crypto/bn/asm/ppc.pl - Issue 9254031: Upgrade chrome's OpenSSL to same version Android ships with.

Side by Side Diff: openssl/crypto/bn/asm/ppc.pl

Issue 9254031: Upgrade chrome's OpenSSL to same version Android ships with. (Closed) Base URL: http://src.chromium.org/svn/trunk/deps/third_party/openssl/

Patch Set: '' Created 8 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 #!/usr/bin/env perl	1 #!/usr/bin/env perl

2 #	2 #

3 # Implemented as a Perl wrapper as we want to support several different	3 # Implemented as a Perl wrapper as we want to support several different

4 # architectures with single file. We pick up the target based on the	4 # architectures with single file. We pick up the target based on the

5 # file name we are asked to generate.	5 # file name we are asked to generate.

6 #	6 #

7 # It should be noted though that this perl code is nothing like	7 # It should be noted though that this perl code is nothing like

8 # <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much	8 # <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much

9 # as pre-processor to cover for platform differences in name decoration,	9 # as pre-processor to cover for platform differences in name decoration,

10 # linker tables, 32-/64-bit instruction sets...	10 # linker tables, 32-/64-bit instruction sets...

(...skipping 82 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
93 #dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2	93 #dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2

94 #dsa 1024 bits 0.0018s 0.0023s 545.0 442.2	94 #dsa 1024 bits 0.0018s 0.0023s 545.0 442.2

95 #dsa 2048 bits 0.0061s 0.0075s 163.5 132.8	95 #dsa 2048 bits 0.0061s 0.0075s 163.5 132.8

96 #	96 #

97 # Performance increase of ~60%	97 # Performance increase of ~60%

98 #	98 #

99 # If you have comments or suggestions to improve code send	99 # If you have comments or suggestions to improve code send

100 # me a note at schari@us.ibm.com	100 # me a note at schari@us.ibm.com

101 #	101 #

102	102

103 $opf = shift;	103 $flavour = shift;

104	104

105 if ($opf =~ /32\.s/) {	105 if ($flavour =~ /32/) {

106 $BITS= 32;	106 $BITS= 32;

107 $BNSZ= $BITS/8;	107 $BNSZ= $BITS/8;

108 $ISA= "\"ppc\"";	108 $ISA= "\"ppc\"";

109	109

110 $LD= "lwz"; # load	110 $LD= "lwz"; # load

111 $LDU= "lwzu"; # load and update	111 $LDU= "lwzu"; # load and update

112 $ST= "stw"; # store	112 $ST= "stw"; # store

113 $STU= "stwu"; # store and update	113 $STU= "stwu"; # store and update

114 $UMULL= "mullw"; # unsigned multiply low	114 $UMULL= "mullw"; # unsigned multiply low

115 $UMULH= "mulhwu"; # unsigned multiply high	115 $UMULH= "mulhwu"; # unsigned multiply high

116 $UDIV= "divwu"; # unsigned divide	116 $UDIV= "divwu"; # unsigned divide

117 $UCMPI= "cmplwi"; # unsigned compare with immediate	117 $UCMPI= "cmplwi"; # unsigned compare with immediate

118 $UCMP= "cmplw"; # unsigned compare	118 $UCMP= "cmplw"; # unsigned compare

119 $CNTLZ= "cntlzw"; # count leading zeros	119 $CNTLZ= "cntlzw"; # count leading zeros

120 $SHL= "slw"; # shift left	120 $SHL= "slw"; # shift left

121 $SHR= "srw"; # unsigned shift right	121 $SHR= "srw"; # unsigned shift right

122 $SHRI= "srwi"; # unsigned shift right by immediate	122 $SHRI= "srwi"; # unsigned shift right by immediate

123 $SHLI= "slwi"; # shift left by immediate	123 $SHLI= "slwi"; # shift left by immediate

124 $CLRU= "clrlwi"; # clear upper bits	124 $CLRU= "clrlwi"; # clear upper bits

125 $INSR= "insrwi"; # insert right	125 $INSR= "insrwi"; # insert right

126 $ROTL= "rotlwi"; # rotate left by immediate	126 $ROTL= "rotlwi"; # rotate left by immediate

127 $TR= "tw"; # conditional trap	127 $TR= "tw"; # conditional trap

128 } elsif ($opf =~ /64\.s/) {	128 } elsif ($flavour =~ /64/) {

129 $BITS= 64;	129 $BITS= 64;

130 $BNSZ= $BITS/8;	130 $BNSZ= $BITS/8;

131 $ISA= "\"ppc64\"";	131 $ISA= "\"ppc64\"";

132	132

133 # same as above, but 64-bit mnemonics...	133 # same as above, but 64-bit mnemonics...

134 $LD= "ld"; # load	134 $LD= "ld"; # load

135 $LDU= "ldu"; # load and update	135 $LDU= "ldu"; # load and update

136 $ST= "std"; # store	136 $ST= "std"; # store

137 $STU= "stdu"; # store and update	137 $STU= "stdu"; # store and update

138 $UMULL= "mulld"; # unsigned multiply low	138 $UMULL= "mulld"; # unsigned multiply low

139 $UMULH= "mulhdu"; # unsigned multiply high	139 $UMULH= "mulhdu"; # unsigned multiply high

140 $UDIV= "divdu"; # unsigned divide	140 $UDIV= "divdu"; # unsigned divide

141 $UCMPI= "cmpldi"; # unsigned compare with immediate	141 $UCMPI= "cmpldi"; # unsigned compare with immediate

142 $UCMP= "cmpld"; # unsigned compare	142 $UCMP= "cmpld"; # unsigned compare

143 $CNTLZ= "cntlzd"; # count leading zeros	143 $CNTLZ= "cntlzd"; # count leading zeros

144 $SHL= "sld"; # shift left	144 $SHL= "sld"; # shift left

145 $SHR= "srd"; # unsigned shift right	145 $SHR= "srd"; # unsigned shift right

146 $SHRI= "srdi"; # unsigned shift right by immediate	146 $SHRI= "srdi"; # unsigned shift right by immediate

147 $SHLI= "sldi"; # shift left by immediate	147 $SHLI= "sldi"; # shift left by immediate

148 $CLRU= "clrldi"; # clear upper bits	148 $CLRU= "clrldi"; # clear upper bits

149 $INSR= "insrdi"; # insert right	149 $INSR= "insrdi"; # insert right

150 $ROTL= "rotldi"; # rotate left by immediate	150 $ROTL= "rotldi"; # rotate left by immediate

151 $TR= "td"; # conditional trap	151 $TR= "td"; # conditional trap

152 } else { die "nonsense $opf"; }	152 } else { die "nonsense $flavour"; }

153	153

154 ( defined shift \|\| open STDOUT,">$opf" ) \|\| die "can't open $opf: $!";	154 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;

	155 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or

	156 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or

	157 die "can't locate ppc-xlate.pl";

155	158

156 # function entry points from the AIX code	159 open STDOUT,"\| $^X $xlate $flavour ".shift \|\| die "can't call $xlate: $!";

157 #

158 # There are other, more elegant, ways to handle this. We (IBM) chose

159 # this approach as it plays well with scripts we run to 'namespace'

160 # OpenSSL .i.e. we add a prefix to all the public symbols so we can

161 # co-exist in the same process with other implementations of OpenSSL.

162 # 'cleverer' ways of doing these substitutions tend to hide data we

163 # need to be obvious.

164 #

165 my @items = ("bn_sqr_comba4",

166 » "bn_sqr_comba8",

167 » "bn_mul_comba4",

168 » "bn_mul_comba8",

169 » "bn_sub_words",

170 » "bn_add_words",

171 » "bn_div_words",

172 » "bn_sqr_words",

173 » "bn_mul_words",

174 » "bn_mul_add_words");

175	160

176 if ($opf =~ /linux/)»{ do_linux();» }	161 $data=<<EOF;

177 elsif ($opf =~ /aix/)» { do_aix();» }

178 elsif ($opf =~ /osx/)» { do_osx();» }

179 else» » » { do_bsd();» }

180

181 sub do_linux {

182 $d=&data();

183

184 if ($BITS==64) {

185 foreach $t (@items) {

186 $d =~ s/\.$t:/\

187 \t.section\t".opd","aw"\

188 \t.align\t3\

189 \t.globl\t$t\

190 $t:\

191 \t.quad\t.$t,.TOC.\@tocbase,0\

192 \t.size\t$t,24\

193 \t.previous\n\

194 \t.type\t.$t,\@function\

195 \t.globl\t.$t\

196 .$t:/g;

197 }

198 }

199 else {

200 foreach $t (@items) {

201 $d=~s/\.$t/$t/g;

202 }

203 }

204 # hide internal labels to avoid pollution of name table...

205 $d=~s/Lppcasm_/.Lppcasm_/gm;

206 print $d;

207 }

208

209 sub do_aix {

210 # AIX assembler is smart enough to please the linker without

211 # making us do something special...

212 print &data();

213 }

214

215 # MacOSX 32 bit

216 sub do_osx {

217 $d=&data();

218 # Change the bn symbol prefix from '.' to '_'

219 foreach $t (@items) {

220 $d=~s/\.$t/_$t/g;

221 }

222 # Change .machine to something OS X asm will accept

223 $d=~s/\.machine.*/.text/g;

224 $d=~s/\#/;/g; # change comment from '#' to ';'

225 print $d;

226 }

227

228 # BSD (Untested)

229 sub do_bsd {

230 $d=&data();

231 foreach $t (@items) {

232 $d=~s/\.$t/_$t/g;

233 }

234 print $d;

235 }

236

237 sub data {

238 » local($data)=<<EOF;

239 #--------------------------------------------------------------------	162 #--------------------------------------------------------------------

240 #	163 #

241 #	164 #

242 #	165 #

243 #	166 #

244 # File: ppc32.s	167 # File: ppc32.s

245 #	168 #

246 # Created by: Suresh Chari	169 # Created by: Suresh Chari

247 # IBM Thomas J. Watson Research Library	170 # IBM Thomas J. Watson Research Library

248 # Hawthorne, NY	171 # Hawthorne, NY

(...skipping 41 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
290 # architecture the optimizations in this file do	213 # architecture the optimizations in this file do

291 # NOT provide much improvement.	214 # NOT provide much improvement.

292 #	215 #

293 # If you have comments or suggestions to improve code send	216 # If you have comments or suggestions to improve code send

294 # me a note at schari\@us.ibm.com	217 # me a note at schari\@us.ibm.com

295 #	218 #

296 #--------------------------------------------------------------------------	219 #--------------------------------------------------------------------------

297 #	220 #

298 # Defines to be used in the assembly code.	221 # Defines to be used in the assembly code.

299 #	222 #

300 .set r0,0» # we use it as storage for value of 0	223 #.set r0,0» # we use it as storage for value of 0

301 .set SP,1» # preserved	224 #.set SP,1» # preserved

302 .set RTOC,2» # preserved	225 #.set RTOC,2» # preserved

303 .set r3,3» # 1st argument/return value	226 #.set r3,3» # 1st argument/return value

304 .set r4,4» # 2nd argument/volatile register	227 #.set r4,4» # 2nd argument/volatile register

305 .set r5,5» # 3rd argument/volatile register	228 #.set r5,5» # 3rd argument/volatile register

306 .set r6,6» # ...	229 #.set r6,6» # ...

307 .set r7,7	230 #.set r7,7

308 .set r8,8	231 #.set r8,8

309 .set r9,9	232 #.set r9,9

310 .set r10,10	233 #.set r10,10

311 .set r11,11	234 #.set r11,11

312 .set r12,12	235 #.set r12,12

313 .set r13,13» # not used, nor any other "below" it...	236 #.set r13,13» # not used, nor any other "below" it...

314

315 .set BO_IF_NOT,4

316 .set BO_IF,12

317 .set BO_dCTR_NZERO,16

318 .set BO_dCTR_ZERO,18

319 .set BO_ALWAYS,20

320 .set CR0_LT,0;

321 .set CR0_GT,1;

322 .set CR0_EQ,2

323 .set CR1_FX,4;

324 .set CR1_FEX,5;

325 .set CR1_VX,6

326 .set LR,8

327	237

328 # Declare function names to be global	238 # Declare function names to be global

329 # NOTE: For gcc these names MUST be changed to remove	239 # NOTE: For gcc these names MUST be changed to remove

330 # the first . i.e. for example change ".bn_sqr_comba4"	240 # the first . i.e. for example change ".bn_sqr_comba4"

331 # to "bn_sqr_comba4". This should be automatically done	241 # to "bn_sqr_comba4". This should be automatically done

332 # in the build.	242 # in the build.

333	243

334 .globl .bn_sqr_comba4	244 .globl .bn_sqr_comba4

335 .globl .bn_sqr_comba8	245 .globl .bn_sqr_comba8

336 .globl .bn_mul_comba4	246 .globl .bn_mul_comba4

337 .globl .bn_mul_comba8	247 .globl .bn_mul_comba8

338 .globl .bn_sub_words	248 .globl .bn_sub_words

339 .globl .bn_add_words	249 .globl .bn_add_words

340 .globl .bn_div_words	250 .globl .bn_div_words

341 .globl .bn_sqr_words	251 .globl .bn_sqr_words

342 .globl .bn_mul_words	252 .globl .bn_mul_words

343 .globl .bn_mul_add_words	253 .globl .bn_mul_add_words

344	254

345 # .text section	255 # .text section

346	256

347 » .machine» $ISA	257 » .machine» "any"

348	258

349 #	259 #

350 # NOTE: The following label name should be changed to	260 # NOTE: The following label name should be changed to

351 # "bn_sqr_comba4" i.e. remove the first dot	261 # "bn_sqr_comba4" i.e. remove the first dot

352 # for the gcc compiler. This should be automatically	262 # for the gcc compiler. This should be automatically

353 # done in the build	263 # done in the build

354 #	264 #

355	265

356 .align 4	266 .align 4

357 .bn_sqr_comba4:	267 .bn_sqr_comba4:

(...skipping 113 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
471 addze r10,r10	381 addze r10,r10

472 $ST r11,`5*$BNSZ`(r3) #r[5] = c3	382 $ST r11,`5*$BNSZ`(r3) #r[5] = c3

473 #sqr_add_c(a,3,c1,c2,c3);	383 #sqr_add_c(a,3,c1,c2,c3);

474 $UMULL r7,r6,r6	384 $UMULL r7,r6,r6

475 $UMULH r8,r6,r6	385 $UMULH r8,r6,r6

476 addc r9,r7,r9	386 addc r9,r7,r9

477 adde r10,r8,r10	387 adde r10,r8,r10

478	388

479 $ST r9,`6*$BNSZ`(r3) #r[6]=c1	389 $ST r9,`6*$BNSZ`(r3) #r[6]=c1

480 $ST r10,`7*$BNSZ`(r3) #r[7]=c2	390 $ST r10,`7*$BNSZ`(r3) #r[7]=c2

481 » bclr» BO_ALWAYS,CR0_LT	391 » blr

482 .long 0x00000000	392 .long 0x00000000

483	393

484 #	394 #

485 # NOTE: The following label name should be changed to	395 # NOTE: The following label name should be changed to

486 # "bn_sqr_comba8" i.e. remove the first dot	396 # "bn_sqr_comba8" i.e. remove the first dot

487 # for the gcc compiler. This should be automatically	397 # for the gcc compiler. This should be automatically

488 # done in the build	398 # done in the build

489 #	399 #

490	400

491 .align 4	401 .align 4

(...skipping 404 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
896 $ST r10,`13*$BNSZ`(r3) #r[13]=c2;	806 $ST r10,`13*$BNSZ`(r3) #r[13]=c2;

897 #sqr_add_c(a,7,c3,c1,c2);	807 #sqr_add_c(a,7,c3,c1,c2);

898 $UMULL r7,r6,r6	808 $UMULL r7,r6,r6

899 $UMULH r8,r6,r6	809 $UMULH r8,r6,r6

900 addc r11,r7,r11	810 addc r11,r7,r11

901 adde r9,r8,r9	811 adde r9,r8,r9

902 $ST r11,`14*$BNSZ`(r3) #r[14]=c3;	812 $ST r11,`14*$BNSZ`(r3) #r[14]=c3;

903 $ST r9, `15*$BNSZ`(r3) #r[15]=c1;	813 $ST r9, `15*$BNSZ`(r3) #r[15]=c1;

904	814

905	815

906 » bclr» BO_ALWAYS,CR0_LT	816 » blr

907	817

908 .long 0x00000000	818 .long 0x00000000

909	819

910 #	820 #

911 # NOTE: The following label name should be changed to	821 # NOTE: The following label name should be changed to

912 # "bn_mul_comba4" i.e. remove the first dot	822 # "bn_mul_comba4" i.e. remove the first dot

913 # for the gcc compiler. This should be automatically	823 # for the gcc compiler. This should be automatically

914 # done in the build	824 # done in the build

915 #	825 #

916	826

(...skipping 115 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1032 $ST r11,`4*$BNSZ`(r3) #r[4]=c2	942 $ST r11,`4*$BNSZ`(r3) #r[4]=c2

1033 #mul_add_c(a[2],b[3],c3,c1,c2);	943 #mul_add_c(a[2],b[3],c3,c1,c2);

1034 $LD r6,`2*$BNSZ`(r4)	944 $LD r6,`2*$BNSZ`(r4)

1035 $UMULL r8,r6,r7	945 $UMULL r8,r6,r7

1036 $UMULH r9,r6,r7	946 $UMULH r9,r6,r7

1037 addc r12,r8,r12	947 addc r12,r8,r12

1038 adde r10,r9,r10	948 adde r10,r9,r10

1039 addze r11,r0	949 addze r11,r0

1040 #mul_add_c(a[3],b[2],c3,c1,c2);	950 #mul_add_c(a[3],b[2],c3,c1,c2);

1041 $LD r6,`3*$BNSZ`(r4)	951 $LD r6,`3*$BNSZ`(r4)

1042 » $LD» r7,`2*$BNSZ`(r4)	952 » $LD» r7,`2*$BNSZ`(r5)

1043 $UMULL r8,r6,r7	953 $UMULL r8,r6,r7

1044 $UMULH r9,r6,r7	954 $UMULH r9,r6,r7

1045 addc r12,r8,r12	955 addc r12,r8,r12

1046 adde r10,r9,r10	956 adde r10,r9,r10

1047 addze r11,r11	957 addze r11,r11

1048 $ST r12,`5*$BNSZ`(r3) #r[5]=c3	958 $ST r12,`5*$BNSZ`(r3) #r[5]=c3

1049 #mul_add_c(a[3],b[3],c1,c2,c3);	959 #mul_add_c(a[3],b[3],c1,c2,c3);

1050 $LD r7,`3*$BNSZ`(r5)	960 $LD r7,`3*$BNSZ`(r5)

1051 $UMULL r8,r6,r7	961 $UMULL r8,r6,r7

1052 $UMULH r9,r6,r7	962 $UMULH r9,r6,r7

1053 addc r10,r8,r10	963 addc r10,r8,r10

1054 adde r11,r9,r11	964 adde r11,r9,r11

1055	965

1056 $ST r10,`6*$BNSZ`(r3) #r[6]=c1	966 $ST r10,`6*$BNSZ`(r3) #r[6]=c1

1057 $ST r11,`7*$BNSZ`(r3) #r[7]=c2	967 $ST r11,`7*$BNSZ`(r3) #r[7]=c2

1058 » bclr» BO_ALWAYS,CR0_LT	968 » blr

1059 .long 0x00000000	969 .long 0x00000000

1060	970

1061 #	971 #

1062 # NOTE: The following label name should be changed to	972 # NOTE: The following label name should be changed to

1063 # "bn_mul_comba8" i.e. remove the first dot	973 # "bn_mul_comba8" i.e. remove the first dot

1064 # for the gcc compiler. This should be automatically	974 # for the gcc compiler. This should be automatically

1065 # done in the build	975 # done in the build

1066 #	976 #

1067	977

1068 .align 4	978 .align 4

(...skipping 515 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1584 addze r10,r10	1494 addze r10,r10

1585 $ST r11,`13*$BNSZ`(r3) #r[13]=c2;	1495 $ST r11,`13*$BNSZ`(r3) #r[13]=c2;

1586 #mul_add_c(a[7],b[7],c3,c1,c2);	1496 #mul_add_c(a[7],b[7],c3,c1,c2);

1587 $LD r7,`7*$BNSZ`(r5)	1497 $LD r7,`7*$BNSZ`(r5)

1588 $UMULL r8,r6,r7	1498 $UMULL r8,r6,r7

1589 $UMULH r9,r6,r7	1499 $UMULH r9,r6,r7

1590 addc r12,r12,r8	1500 addc r12,r12,r8

1591 adde r10,r10,r9	1501 adde r10,r10,r9

1592 $ST r12,`14*$BNSZ`(r3) #r[14]=c3;	1502 $ST r12,`14*$BNSZ`(r3) #r[14]=c3;

1593 $ST r10,`15*$BNSZ`(r3) #r[15]=c1;	1503 $ST r10,`15*$BNSZ`(r3) #r[15]=c1;

1594 » bclr» BO_ALWAYS,CR0_LT	1504 » blr

1595 .long 0x00000000	1505 .long 0x00000000

1596	1506

1597 #	1507 #

1598 # NOTE: The following label name should be changed to	1508 # NOTE: The following label name should be changed to

1599 # "bn_sub_words" i.e. remove the first dot	1509 # "bn_sub_words" i.e. remove the first dot

1600 # for the gcc compiler. This should be automatically	1510 # for the gcc compiler. This should be automatically

1601 # done in the build	1511 # done in the build

1602 #	1512 #

1603 #	1513 #

1604 .align 4	1514 .align 4

(...skipping 11 matching lines...) Expand all Loading...
1616 # Note: No loop unrolling done since this is not a performance	1526 # Note: No loop unrolling done since this is not a performance

1617 # critical loop.	1527 # critical loop.

1618	1528

1619 xor r0,r0,r0 #set r0 = 0	1529 xor r0,r0,r0 #set r0 = 0

1620 #	1530 #

1621 # check for r6 = 0 AND set carry bit.	1531 # check for r6 = 0 AND set carry bit.

1622 #	1532 #

1623 subfc. r7,r0,r6 # If r6 is 0 then result is 0.	1533 subfc. r7,r0,r6 # If r6 is 0 then result is 0.

1624 # if r6 > 0 then result !=0	1534 # if r6 > 0 then result !=0

1625 # In either case carry bit is set.	1535 # In either case carry bit is set.

1626 » bc» BO_IF,CR0_EQ,Lppcasm_sub_adios	1536 » beq» Lppcasm_sub_adios

1627 addi r4,r4,-$BNSZ	1537 addi r4,r4,-$BNSZ

1628 addi r3,r3,-$BNSZ	1538 addi r3,r3,-$BNSZ

1629 addi r5,r5,-$BNSZ	1539 addi r5,r5,-$BNSZ

1630 mtctr r6	1540 mtctr r6

1631 Lppcasm_sub_mainloop:	1541 Lppcasm_sub_mainloop:

1632 $LDU r7,$BNSZ(r4)	1542 $LDU r7,$BNSZ(r4)

1633 $LDU r8,$BNSZ(r5)	1543 $LDU r8,$BNSZ(r5)

1634 subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8)	1544 subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8)

1635 # if carry = 1 this is r7-r8. Else it	1545 # if carry = 1 this is r7-r8. Else it

1636 # is r7-r8 -1 as we need.	1546 # is r7-r8 -1 as we need.

1637 $STU r6,$BNSZ(r3)	1547 $STU r6,$BNSZ(r3)

1638 » bc» BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop	1548 » bdnz-» Lppcasm_sub_mainloop

1639 Lppcasm_sub_adios:	1549 Lppcasm_sub_adios:

1640 subfze r3,r0 # if carry bit is set then r3 = 0 else -1	1550 subfze r3,r0 # if carry bit is set then r3 = 0 else -1

1641 andi. r3,r3,1 # keep only last bit.	1551 andi. r3,r3,1 # keep only last bit.

1642 » bclr» BO_ALWAYS,CR0_LT	1552 » blr

1643 .long 0x00000000	1553 .long 0x00000000

1644	1554

1645	1555

1646 #	1556 #

1647 # NOTE: The following label name should be changed to	1557 # NOTE: The following label name should be changed to

1648 # "bn_add_words" i.e. remove the first dot	1558 # "bn_add_words" i.e. remove the first dot

1649 # for the gcc compiler. This should be automatically	1559 # for the gcc compiler. This should be automatically

1650 # done in the build	1560 # done in the build

1651 #	1561 #

1652	1562

(...skipping 10 matching lines...) Expand all Loading...
1663 # r6 = n	1573 # r6 = n

1664 #	1574 #

1665 # Note: No loop unrolling done since this is not a performance	1575 # Note: No loop unrolling done since this is not a performance

1666 # critical loop.	1576 # critical loop.

1667	1577

1668 xor r0,r0,r0	1578 xor r0,r0,r0

1669 #	1579 #

1670 # check for r6 = 0. Is this needed?	1580 # check for r6 = 0. Is this needed?

1671 #	1581 #

1672 addic. r6,r6,0 #test r6 and clear carry bit.	1582 addic. r6,r6,0 #test r6 and clear carry bit.

1673 » bc» BO_IF,CR0_EQ,Lppcasm_add_adios	1583 » beq» Lppcasm_add_adios

1674 addi r4,r4,-$BNSZ	1584 addi r4,r4,-$BNSZ

1675 addi r3,r3,-$BNSZ	1585 addi r3,r3,-$BNSZ

1676 addi r5,r5,-$BNSZ	1586 addi r5,r5,-$BNSZ

1677 mtctr r6	1587 mtctr r6

1678 Lppcasm_add_mainloop:	1588 Lppcasm_add_mainloop:

1679 $LDU r7,$BNSZ(r4)	1589 $LDU r7,$BNSZ(r4)

1680 $LDU r8,$BNSZ(r5)	1590 $LDU r8,$BNSZ(r5)

1681 adde r8,r7,r8	1591 adde r8,r7,r8

1682 $STU r8,$BNSZ(r3)	1592 $STU r8,$BNSZ(r3)

1683 » bc» BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop	1593 » bdnz-» Lppcasm_add_mainloop

1684 Lppcasm_add_adios:	1594 Lppcasm_add_adios:

1685 addze r3,r0 #return carry bit.	1595 addze r3,r0 #return carry bit.

1686 » bclr» BO_ALWAYS,CR0_LT	1596 » blr

1687 .long 0x00000000	1597 .long 0x00000000

1688	1598

1689 #	1599 #

1690 # NOTE: The following label name should be changed to	1600 # NOTE: The following label name should be changed to

1691 # "bn_div_words" i.e. remove the first dot	1601 # "bn_div_words" i.e. remove the first dot

1692 # for the gcc compiler. This should be automatically	1602 # for the gcc compiler. This should be automatically

1693 # done in the build	1603 # done in the build

1694 #	1604 #

1695	1605

1696 .align 4	1606 .align 4

1697 .bn_div_words:	1607 .bn_div_words:

1698 #	1608 #

1699 # This is a cleaned up version of code generated by	1609 # This is a cleaned up version of code generated by

1700 # the AIX compiler. The only optimization is to use	1610 # the AIX compiler. The only optimization is to use

1701 # the PPC instruction to count leading zeros instead	1611 # the PPC instruction to count leading zeros instead

1702 # of call to num_bits_word. Since this was compiled	1612 # of call to num_bits_word. Since this was compiled

1703 # only at level -O2 we can possibly squeeze it more?	1613 # only at level -O2 we can possibly squeeze it more?

1704 #	1614 #

1705 # r3 = h	1615 # r3 = h

1706 # r4 = l	1616 # r4 = l

1707 # r5 = d	1617 # r5 = d

1708	1618

1709 $UCMPI 0,r5,0 # compare r5 and 0	1619 $UCMPI 0,r5,0 # compare r5 and 0

1710 » bc» BO_IF_NOT,CR0_EQ,Lppcasm_div1» # proceed if d!=0	1620 » bne» Lppcasm_div1» » # proceed if d!=0

1711 li r3,-1 # d=0 return -1	1621 li r3,-1 # d=0 return -1

1712 » bclr» BO_ALWAYS,CR0_LT»	1622 » blr

1713 Lppcasm_div1:	1623 Lppcasm_div1:

1714 xor r0,r0,r0 #r0=0	1624 xor r0,r0,r0 #r0=0

1715 li r8,$BITS	1625 li r8,$BITS

1716 $CNTLZ. r7,r5 #r7 = num leading 0s in d.	1626 $CNTLZ. r7,r5 #r7 = num leading 0s in d.

1717 » bc» BO_IF,CR0_EQ,Lppcasm_div2» #proceed if no leading zeros	1627 » beq» Lppcasm_div2» » #proceed if no leading zeros

1718 subf r8,r7,r8 #r8 = BN_num_bits_word(d)	1628 subf r8,r7,r8 #r8 = BN_num_bits_word(d)

1719 $SHR. r9,r3,r8 #are there any bits above r8'th?	1629 $SHR. r9,r3,r8 #are there any bits above r8'th?

1720 $TR 16,r9,r0 #if there're, signal to dump core...	1630 $TR 16,r9,r0 #if there're, signal to dump core...

1721 Lppcasm_div2:	1631 Lppcasm_div2:

1722 $UCMP 0,r3,r5 #h>=d?	1632 $UCMP 0,r3,r5 #h>=d?

1723 » bc» BO_IF,CR0_LT,Lppcasm_div3» #goto Lppcasm_div3 if not	1633 » blt» Lppcasm_div3» » #goto Lppcasm_div3 if not

1724 subf r3,r5,r3 #h-=d ;	1634 subf r3,r5,r3 #h-=d ;

1725 Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i	1635 Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i

1726 cmpi 0,0,r7,0 # is (i == 0)?	1636 cmpi 0,0,r7,0 # is (i == 0)?

1727 » bc» BO_IF,CR0_EQ,Lppcasm_div4	1637 » beq» Lppcasm_div4

1728 $SHL r3,r3,r7 # h = (h<< i)	1638 $SHL r3,r3,r7 # h = (h<< i)

1729 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)	1639 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)

1730 $SHL r5,r5,r7 # d<<=i	1640 $SHL r5,r5,r7 # d<<=i

1731 or r3,r3,r8 # h = (h<<i)\|(l>>(BN_BITS2-i))	1641 or r3,r3,r8 # h = (h<<i)\|(l>>(BN_BITS2-i))

1732 $SHL r4,r4,r7 # l <<=i	1642 $SHL r4,r4,r7 # l <<=i

1733 Lppcasm_div4:	1643 Lppcasm_div4:

1734 $SHRI r9,r5,`$BITS/2` # r9 = dh	1644 $SHRI r9,r5,`$BITS/2` # r9 = dh

1735 # dl will be computed when needed	1645 # dl will be computed when needed

1736 # as it saves registers.	1646 # as it saves registers.

1737 li r6,2 #r6=2	1647 li r6,2 #r6=2

1738 mtctr r6 #counter will be in count.	1648 mtctr r6 #counter will be in count.

1739 Lppcasm_divouterloop:	1649 Lppcasm_divouterloop:

1740 $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)	1650 $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)

1741 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4	1651 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4

1742 # compute here for innerloop.	1652 # compute here for innerloop.

1743 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh	1653 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh

1744 » bc» BO_IF_NOT,CR0_EQ,Lppcasm_div5» # goto Lppcasm_div5 if not	1654 » bne» Lppcasm_div5» » # goto Lppcasm_div5 if not

1745	1655

1746 li r8,-1	1656 li r8,-1

1747 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l	1657 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l

1748 b Lppcasm_div6	1658 b Lppcasm_div6

1749 Lppcasm_div5:	1659 Lppcasm_div5:

1750 $UDIV r8,r3,r9 #q = h/dh	1660 $UDIV r8,r3,r9 #q = h/dh

1751 Lppcasm_div6:	1661 Lppcasm_div6:

1752 $UMULL r12,r9,r8 #th = q*dh	1662 $UMULL r12,r9,r8 #th = q*dh

1753 $CLRU r10,r5,`$BITS/2` #r10=dl	1663 $CLRU r10,r5,`$BITS/2` #r10=dl

1754 $UMULL r6,r8,r10 #tl = q*dl	1664 $UMULL r6,r8,r10 #tl = q*dl

1755	1665

1756 Lppcasm_divinnerloop:	1666 Lppcasm_divinnerloop:

1757 subf r10,r12,r3 #t = h -th	1667 subf r10,r12,r3 #t = h -th

1758 $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...	1668 $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...

1759 addic. r7,r7,0 #test if r7 == 0. used below.	1669 addic. r7,r7,0 #test if r7 == 0. used below.

1760 # now want to compute	1670 # now want to compute

1761 # r7 = (t<<BN_BITS4)\|((l&BN_MASK2h)>>BN_BITS4)	1671 # r7 = (t<<BN_BITS4)\|((l&BN_MASK2h)>>BN_BITS4)

1762 # the following 2 instructions do that	1672 # the following 2 instructions do that

1763 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)	1673 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)

1764 or r7,r7,r11 # r7\|=((l&BN_MASK2h)>>BN_BITS4)	1674 or r7,r7,r11 # r7\|=((l&BN_MASK2h)>>BN_BITS4)

1765 » $UCMP» 1,r6,r7»» » # compare (tl <= r7)	1675 » $UCMP» cr1,r6,r7» » # compare (tl <= r7)

1766 » bc» BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit	1676 » bne» Lppcasm_divinnerexit

1767 » bc» BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit	1677 » ble» cr1,Lppcasm_divinnerexit

1768 addi r8,r8,-1 #q--	1678 addi r8,r8,-1 #q--

1769 subf r12,r9,r12 #th -=dh	1679 subf r12,r9,r12 #th -=dh

1770 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.	1680 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.

1771 subf r6,r10,r6 #tl -=dl	1681 subf r6,r10,r6 #tl -=dl

1772 b Lppcasm_divinnerloop	1682 b Lppcasm_divinnerloop

1773 Lppcasm_divinnerexit:	1683 Lppcasm_divinnerexit:

1774 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)	1684 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)

1775 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;	1685 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;

1776 » $UCMP» 1,r4,r11» » # compare l and tl	1686 » $UCMP» cr1,r4,r11» » # compare l and tl

1777 add r12,r12,r10 # th+=t	1687 add r12,r12,r10 # th+=t

1778 » bc» BO_IF_NOT,CR1_FX,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7	1688 » bge» cr1,Lppcasm_div7» # if (l>=tl) goto Lppcasm_div7

1779 addi r12,r12,1 # th++	1689 addi r12,r12,1 # th++

1780 Lppcasm_div7:	1690 Lppcasm_div7:

1781 subf r11,r11,r4 #r11=l-tl	1691 subf r11,r11,r4 #r11=l-tl

1782 » $UCMP» 1,r3,r12» » #compare h and th	1692 » $UCMP» cr1,r3,r12» » #compare h and th

1783 » bc» BO_IF_NOT,CR1_FX,Lppcasm_div8» #if (h>=th) goto Lppcasm_div8	1693 » bge» cr1,Lppcasm_div8» #if (h>=th) goto Lppcasm_div8

1784 addi r8,r8,-1 # q--	1694 addi r8,r8,-1 # q--

1785 add r3,r5,r3 # h+=d	1695 add r3,r5,r3 # h+=d

1786 Lppcasm_div8:	1696 Lppcasm_div8:

1787 subf r12,r12,r3 #r12 = h-th	1697 subf r12,r12,r3 #r12 = h-th

1788 $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4	1698 $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4

1789 # want to compute	1699 # want to compute

1790 # h = ((h<<BN_BITS4)\|(l>>BN_BITS4))&BN_MASK2	1700 # h = ((h<<BN_BITS4)\|(l>>BN_BITS4))&BN_MASK2

1791 # the following 2 instructions will do this.	1701 # the following 2 instructions will do this.

1792 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.	1702 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.

1793 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3	1703 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3

1794 » bc» BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9#if (count==0) break ;	1704 » bdz» Lppcasm_div9» » #if (count==0) break ;

1795 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4	1705 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4

1796 b Lppcasm_divouterloop	1706 b Lppcasm_divouterloop

1797 Lppcasm_div9:	1707 Lppcasm_div9:

1798 or r3,r8,r0	1708 or r3,r8,r0

1799 » bclr» BO_ALWAYS,CR0_LT	1709 » blr

1800 .long 0x00000000	1710 .long 0x00000000

1801	1711

1802 #	1712 #

1803 # NOTE: The following label name should be changed to	1713 # NOTE: The following label name should be changed to

1804 # "bn_sqr_words" i.e. remove the first dot	1714 # "bn_sqr_words" i.e. remove the first dot

1805 # for the gcc compiler. This should be automatically	1715 # for the gcc compiler. This should be automatically

1806 # done in the build	1716 # done in the build

1807 #	1717 #

1808 .align 4	1718 .align 4

1809 .bn_sqr_words:	1719 .bn_sqr_words:

1810 #	1720 #

1811 # Optimized version of bn_sqr_words	1721 # Optimized version of bn_sqr_words

1812 #	1722 #

1813 # void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)	1723 # void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)

1814 #	1724 #

1815 # r3 = r	1725 # r3 = r

1816 # r4 = a	1726 # r4 = a

1817 # r5 = n	1727 # r5 = n

1818 #	1728 #

1819 # r6 = a[i].	1729 # r6 = a[i].

1820 # r7,r8 = product.	1730 # r7,r8 = product.

1821 #	1731 #

1822 # No unrolling done here. Not performance critical.	1732 # No unrolling done here. Not performance critical.

1823	1733

1824 addic. r5,r5,0 #test r5.	1734 addic. r5,r5,0 #test r5.

1825 » bc» BO_IF,CR0_EQ,Lppcasm_sqr_adios	1735 » beq» Lppcasm_sqr_adios

1826 addi r4,r4,-$BNSZ	1736 addi r4,r4,-$BNSZ

1827 addi r3,r3,-$BNSZ	1737 addi r3,r3,-$BNSZ

1828 mtctr r5	1738 mtctr r5

1829 Lppcasm_sqr_mainloop:	1739 Lppcasm_sqr_mainloop:

1830 #sqr(r[0],r[1],a[0]);	1740 #sqr(r[0],r[1],a[0]);

1831 $LDU r6,$BNSZ(r4)	1741 $LDU r6,$BNSZ(r4)

1832 $UMULL r7,r6,r6	1742 $UMULL r7,r6,r6

1833 $UMULH r8,r6,r6	1743 $UMULH r8,r6,r6

1834 $STU r7,$BNSZ(r3)	1744 $STU r7,$BNSZ(r3)

1835 $STU r8,$BNSZ(r3)	1745 $STU r8,$BNSZ(r3)

1836 » bc» BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop	1746 » bdnz-» Lppcasm_sqr_mainloop

1837 Lppcasm_sqr_adios:	1747 Lppcasm_sqr_adios:

1838 » bclr» BO_ALWAYS,CR0_LT	1748 » blr

1839 .long 0x00000000	1749 .long 0x00000000

1840	1750

1841	1751

1842 #	1752 #

1843 # NOTE: The following label name should be changed to	1753 # NOTE: The following label name should be changed to

1844 # "bn_mul_words" i.e. remove the first dot	1754 # "bn_mul_words" i.e. remove the first dot

1845 # for the gcc compiler. This should be automatically	1755 # for the gcc compiler. This should be automatically

1846 # done in the build	1756 # done in the build

1847 #	1757 #

1848	1758

1849 .align 4	1759 .align 4

1850 .bn_mul_words:	1760 .bn_mul_words:

1851 #	1761 #

1852 # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)	1762 # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)

1853 #	1763 #

1854 # r3 = rp	1764 # r3 = rp

1855 # r4 = ap	1765 # r4 = ap

1856 # r5 = num	1766 # r5 = num

1857 # r6 = w	1767 # r6 = w

1858 xor r0,r0,r0	1768 xor r0,r0,r0

1859 xor r12,r12,r12 # used for carry	1769 xor r12,r12,r12 # used for carry

1860 rlwinm. r7,r5,30,2,31 # num >> 2	1770 rlwinm. r7,r5,30,2,31 # num >> 2

1861 » bc» BO_IF,CR0_EQ,Lppcasm_mw_REM	1771 » beq» Lppcasm_mw_REM

1862 mtctr r7	1772 mtctr r7

1863 Lppcasm_mw_LOOP:	1773 Lppcasm_mw_LOOP:

1864 #mul(rp[0],ap[0],w,c1);	1774 #mul(rp[0],ap[0],w,c1);

1865 $LD r8,`0*$BNSZ`(r4)	1775 $LD r8,`0*$BNSZ`(r4)

1866 $UMULL r9,r6,r8	1776 $UMULL r9,r6,r8

1867 $UMULH r10,r6,r8	1777 $UMULH r10,r6,r8

1868 addc r9,r9,r12	1778 addc r9,r9,r12

1869 #addze r10,r10 #carry is NOT ignored.	1779 #addze r10,r10 #carry is NOT ignored.

1870 #will be taken care of	1780 #will be taken care of

1871 #in second spin below	1781 #in second spin below

(...skipping 17 matching lines...) Expand all Loading...
1889 $LD r8,`3*$BNSZ`(r4)	1799 $LD r8,`3*$BNSZ`(r4)

1890 $UMULL r11,r6,r8	1800 $UMULL r11,r6,r8

1891 $UMULH r12,r6,r8	1801 $UMULH r12,r6,r8

1892 adde r11,r11,r10	1802 adde r11,r11,r10

1893 addze r12,r12 #this spin we collect carry into	1803 addze r12,r12 #this spin we collect carry into

1894 #r12	1804 #r12

1895 $ST r11,`3*$BNSZ`(r3)	1805 $ST r11,`3*$BNSZ`(r3)

1896	1806

1897 addi r3,r3,`4*$BNSZ`	1807 addi r3,r3,`4*$BNSZ`

1898 addi r4,r4,`4*$BNSZ`	1808 addi r4,r4,`4*$BNSZ`

1899 » bc» BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP	1809 » bdnz-» Lppcasm_mw_LOOP

1900	1810

1901 Lppcasm_mw_REM:	1811 Lppcasm_mw_REM:

1902 andi. r5,r5,0x3	1812 andi. r5,r5,0x3

1903 » bc» BO_IF,CR0_EQ,Lppcasm_mw_OVER	1813 » beq» Lppcasm_mw_OVER

1904 #mul(rp[0],ap[0],w,c1);	1814 #mul(rp[0],ap[0],w,c1);

1905 $LD r8,`0*$BNSZ`(r4)	1815 $LD r8,`0*$BNSZ`(r4)

1906 $UMULL r9,r6,r8	1816 $UMULL r9,r6,r8

1907 $UMULH r10,r6,r8	1817 $UMULH r10,r6,r8

1908 addc r9,r9,r12	1818 addc r9,r9,r12

1909 addze r10,r10	1819 addze r10,r10

1910 $ST r9,`0*$BNSZ`(r3)	1820 $ST r9,`0*$BNSZ`(r3)

1911 addi r12,r10,0	1821 addi r12,r10,0

1912	1822

1913 addi r5,r5,-1	1823 addi r5,r5,-1

1914 cmpli 0,0,r5,0	1824 cmpli 0,0,r5,0

1915 » bc» BO_IF,CR0_EQ,Lppcasm_mw_OVER	1825 » beq» Lppcasm_mw_OVER

1916	1826

1917	1827

1918 #mul(rp[1],ap[1],w,c1);	1828 #mul(rp[1],ap[1],w,c1);

1919 $LD r8,`1*$BNSZ`(r4)	1829 $LD r8,`1*$BNSZ`(r4)

1920 $UMULL r9,r6,r8	1830 $UMULL r9,r6,r8

1921 $UMULH r10,r6,r8	1831 $UMULH r10,r6,r8

1922 addc r9,r9,r12	1832 addc r9,r9,r12

1923 addze r10,r10	1833 addze r10,r10

1924 $ST r9,`1*$BNSZ`(r3)	1834 $ST r9,`1*$BNSZ`(r3)

1925 addi r12,r10,0	1835 addi r12,r10,0

1926	1836

1927 addi r5,r5,-1	1837 addi r5,r5,-1

1928 cmpli 0,0,r5,0	1838 cmpli 0,0,r5,0

1929 » bc» BO_IF,CR0_EQ,Lppcasm_mw_OVER	1839 » beq» Lppcasm_mw_OVER

1930	1840

1931 #mul_add(rp[2],ap[2],w,c1);	1841 #mul_add(rp[2],ap[2],w,c1);

1932 $LD r8,`2*$BNSZ`(r4)	1842 $LD r8,`2*$BNSZ`(r4)

1933 $UMULL r9,r6,r8	1843 $UMULL r9,r6,r8

1934 $UMULH r10,r6,r8	1844 $UMULH r10,r6,r8

1935 addc r9,r9,r12	1845 addc r9,r9,r12

1936 addze r10,r10	1846 addze r10,r10

1937 $ST r9,`2*$BNSZ`(r3)	1847 $ST r9,`2*$BNSZ`(r3)

1938 addi r12,r10,0	1848 addi r12,r10,0

1939	1849

1940 Lppcasm_mw_OVER:	1850 Lppcasm_mw_OVER:

1941 addi r3,r12,0	1851 addi r3,r12,0

1942 » bclr» BO_ALWAYS,CR0_LT	1852 » blr

1943 .long 0x00000000	1853 .long 0x00000000

1944	1854

1945 #	1855 #

1946 # NOTE: The following label name should be changed to	1856 # NOTE: The following label name should be changed to

1947 # "bn_mul_add_words" i.e. remove the first dot	1857 # "bn_mul_add_words" i.e. remove the first dot

1948 # for the gcc compiler. This should be automatically	1858 # for the gcc compiler. This should be automatically

1949 # done in the build	1859 # done in the build

1950 #	1860 #

1951	1861

1952 .align 4	1862 .align 4

1953 .bn_mul_add_words:	1863 .bn_mul_add_words:

1954 #	1864 #

1955 # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)	1865 # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)

1956 #	1866 #

1957 # r3 = rp	1867 # r3 = rp

1958 # r4 = ap	1868 # r4 = ap

1959 # r5 = num	1869 # r5 = num

1960 # r6 = w	1870 # r6 = w

1961 #	1871 #

1962 # empirical evidence suggests that unrolled version performs best!!	1872 # empirical evidence suggests that unrolled version performs best!!

1963 #	1873 #

1964 xor r0,r0,r0 #r0 = 0	1874 xor r0,r0,r0 #r0 = 0

1965 xor r12,r12,r12 #r12 = 0 . used for carry	1875 xor r12,r12,r12 #r12 = 0 . used for carry

1966 rlwinm. r7,r5,30,2,31 # num >> 2	1876 rlwinm. r7,r5,30,2,31 # num >> 2

1967 » bc» BO_IF,CR0_EQ,Lppcasm_maw_leftover» # if (num < 4) go LPPCASM_maw_leftover	1877 » beq» Lppcasm_maw_leftover» # if (num < 4) go LPPCASM_maw_leftover

1968 mtctr r7	1878 mtctr r7

1969 Lppcasm_maw_mainloop:	1879 Lppcasm_maw_mainloop:

1970 #mul_add(rp[0],ap[0],w,c1);	1880 #mul_add(rp[0],ap[0],w,c1);

1971 $LD r8,`0*$BNSZ`(r4)	1881 $LD r8,`0*$BNSZ`(r4)

1972 $LD r11,`0*$BNSZ`(r3)	1882 $LD r11,`0*$BNSZ`(r3)

1973 $UMULL r9,r6,r8	1883 $UMULL r9,r6,r8

1974 $UMULH r10,r6,r8	1884 $UMULH r10,r6,r8

1975 addc r9,r9,r12 #r12 is carry.	1885 addc r9,r9,r12 #r12 is carry.

1976 addze r10,r10	1886 addze r10,r10

1977 addc r9,r9,r11	1887 addc r9,r9,r11

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2010 $UMULL r11,r6,r8	1920 $UMULL r11,r6,r8

2011 $LD r9,`3*$BNSZ`(r3)	1921 $LD r9,`3*$BNSZ`(r3)

2012 $UMULH r12,r6,r8	1922 $UMULH r12,r6,r8

2013 adde r11,r11,r10	1923 adde r11,r11,r10

2014 addze r12,r12	1924 addze r12,r12

2015 addc r11,r11,r9	1925 addc r11,r11,r9

2016 addze r12,r12	1926 addze r12,r12

2017 $ST r11,`3*$BNSZ`(r3)	1927 $ST r11,`3*$BNSZ`(r3)

2018 addi r3,r3,`4*$BNSZ`	1928 addi r3,r3,`4*$BNSZ`

2019 addi r4,r4,`4*$BNSZ`	1929 addi r4,r4,`4*$BNSZ`

2020 » bc» BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop	1930 » bdnz-» Lppcasm_maw_mainloop

2021	1931

2022 Lppcasm_maw_leftover:	1932 Lppcasm_maw_leftover:

2023 andi. r5,r5,0x3	1933 andi. r5,r5,0x3

2024 » bc» BO_IF,CR0_EQ,Lppcasm_maw_adios	1934 » beq» Lppcasm_maw_adios

2025 addi r3,r3,-$BNSZ	1935 addi r3,r3,-$BNSZ

2026 addi r4,r4,-$BNSZ	1936 addi r4,r4,-$BNSZ

2027 #mul_add(rp[0],ap[0],w,c1);	1937 #mul_add(rp[0],ap[0],w,c1);

2028 mtctr r5	1938 mtctr r5

2029 $LDU r8,$BNSZ(r4)	1939 $LDU r8,$BNSZ(r4)

2030 $UMULL r9,r6,r8	1940 $UMULL r9,r6,r8

2031 $UMULH r10,r6,r8	1941 $UMULH r10,r6,r8

2032 $LDU r11,$BNSZ(r3)	1942 $LDU r11,$BNSZ(r3)

2033 addc r9,r9,r11	1943 addc r9,r9,r11

2034 addze r10,r10	1944 addze r10,r10

2035 addc r9,r9,r12	1945 addc r9,r9,r12

2036 addze r12,r10	1946 addze r12,r10

2037 $ST r9,0(r3)	1947 $ST r9,0(r3)

2038	1948

2039 » bc» BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios	1949 » bdz» Lppcasm_maw_adios

2040 #mul_add(rp[1],ap[1],w,c1);	1950 #mul_add(rp[1],ap[1],w,c1);

2041 $LDU r8,$BNSZ(r4)	1951 $LDU r8,$BNSZ(r4)

2042 $UMULL r9,r6,r8	1952 $UMULL r9,r6,r8

2043 $UMULH r10,r6,r8	1953 $UMULH r10,r6,r8

2044 $LDU r11,$BNSZ(r3)	1954 $LDU r11,$BNSZ(r3)

2045 addc r9,r9,r11	1955 addc r9,r9,r11

2046 addze r10,r10	1956 addze r10,r10

2047 addc r9,r9,r12	1957 addc r9,r9,r12

2048 addze r12,r10	1958 addze r12,r10

2049 $ST r9,0(r3)	1959 $ST r9,0(r3)

2050	1960

2051 » bc» BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios	1961 » bdz» Lppcasm_maw_adios

2052 #mul_add(rp[2],ap[2],w,c1);	1962 #mul_add(rp[2],ap[2],w,c1);

2053 $LDU r8,$BNSZ(r4)	1963 $LDU r8,$BNSZ(r4)

2054 $UMULL r9,r6,r8	1964 $UMULL r9,r6,r8

2055 $UMULH r10,r6,r8	1965 $UMULH r10,r6,r8

2056 $LDU r11,$BNSZ(r3)	1966 $LDU r11,$BNSZ(r3)

2057 addc r9,r9,r11	1967 addc r9,r9,r11

2058 addze r10,r10	1968 addze r10,r10

2059 addc r9,r9,r12	1969 addc r9,r9,r12

2060 addze r12,r10	1970 addze r12,r10

2061 $ST r9,0(r3)	1971 $ST r9,0(r3)

2062	1972

2063 Lppcasm_maw_adios:	1973 Lppcasm_maw_adios:

2064 addi r3,r12,0	1974 addi r3,r12,0

2065 » bclr» BO_ALWAYS,CR0_LT	1975 » blr

2066 .long 0x00000000	1976 .long 0x00000000

2067 .align 4	1977 .align 4

2068 EOF	1978 EOF

2069 » $data =~ s/\`([^\`]*)\`/eval $1/gem;	1979 $data =~ s/\`([^\`]*)\`/eval $1/gem;

2070	1980 print $data;

2071 » # if some assembler chokes on some simplified mnemonic,	1981 close STDOUT;

2072 » # this is the spot to fix it up, e.g.:

2073 » # GNU as doesn't seem to accept cmplw, 32-bit unsigned compare

2074 » $data =~ s/^(\s)cmplw(\s+)([^,]+),(.)/$1cmpl$2$3,0,$4/gm;

2075 » # assembler X doesn't accept li, load immediate value

2076 » #$data =~ s/^(\s)li(\s+)([^,]+),(.)/$1addi$2$3,0,$4/gm;

2077 » # assembler Y chokes on apostrophes in comments

2078 » $data =~ s/'//gm;

2079 » return($data);

2080 }

OLD	NEW

« no previous file with comments | « openssl/crypto/bn/asm/mo-586.pl ('k') | openssl/crypto/bn/asm/ppc-mont.pl » ('j') | no next file with comments »