Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(897)

Side by Side Diff: openssl/crypto/bn/asm/ppc.pl

Issue 9254031: Upgrade chrome's OpenSSL to same version Android ships with. (Closed) Base URL: http://src.chromium.org/svn/trunk/deps/third_party/openssl/
Patch Set: '' Created 8 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « openssl/crypto/bn/asm/mo-586.pl ('k') | openssl/crypto/bn/asm/ppc-mont.pl » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 #!/usr/bin/env perl 1 #!/usr/bin/env perl
2 # 2 #
3 # Implemented as a Perl wrapper as we want to support several different 3 # Implemented as a Perl wrapper as we want to support several different
4 # architectures with single file. We pick up the target based on the 4 # architectures with single file. We pick up the target based on the
5 # file name we are asked to generate. 5 # file name we are asked to generate.
6 # 6 #
7 # It should be noted though that this perl code is nothing like 7 # It should be noted though that this perl code is nothing like
8 # <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much 8 # <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
9 # as pre-processor to cover for platform differences in name decoration, 9 # as pre-processor to cover for platform differences in name decoration,
10 # linker tables, 32-/64-bit instruction sets... 10 # linker tables, 32-/64-bit instruction sets...
(...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after
93 #dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2 93 #dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2
94 #dsa 1024 bits 0.0018s 0.0023s 545.0 442.2 94 #dsa 1024 bits 0.0018s 0.0023s 545.0 442.2
95 #dsa 2048 bits 0.0061s 0.0075s 163.5 132.8 95 #dsa 2048 bits 0.0061s 0.0075s 163.5 132.8
96 # 96 #
97 # Performance increase of ~60% 97 # Performance increase of ~60%
98 # 98 #
99 # If you have comments or suggestions to improve code send 99 # If you have comments or suggestions to improve code send
100 # me a note at schari@us.ibm.com 100 # me a note at schari@us.ibm.com
101 # 101 #
102 102
103 $opf = shift; 103 $flavour = shift;
104 104
105 if ($opf =~ /32\.s/) { 105 if ($flavour =~ /32/) {
106 $BITS= 32; 106 $BITS= 32;
107 $BNSZ= $BITS/8; 107 $BNSZ= $BITS/8;
108 $ISA= "\"ppc\""; 108 $ISA= "\"ppc\"";
109 109
110 $LD= "lwz"; # load 110 $LD= "lwz"; # load
111 $LDU= "lwzu"; # load and update 111 $LDU= "lwzu"; # load and update
112 $ST= "stw"; # store 112 $ST= "stw"; # store
113 $STU= "stwu"; # store and update 113 $STU= "stwu"; # store and update
114 $UMULL= "mullw"; # unsigned multiply low 114 $UMULL= "mullw"; # unsigned multiply low
115 $UMULH= "mulhwu"; # unsigned multiply high 115 $UMULH= "mulhwu"; # unsigned multiply high
116 $UDIV= "divwu"; # unsigned divide 116 $UDIV= "divwu"; # unsigned divide
117 $UCMPI= "cmplwi"; # unsigned compare with immediate 117 $UCMPI= "cmplwi"; # unsigned compare with immediate
118 $UCMP= "cmplw"; # unsigned compare 118 $UCMP= "cmplw"; # unsigned compare
119 $CNTLZ= "cntlzw"; # count leading zeros 119 $CNTLZ= "cntlzw"; # count leading zeros
120 $SHL= "slw"; # shift left 120 $SHL= "slw"; # shift left
121 $SHR= "srw"; # unsigned shift right 121 $SHR= "srw"; # unsigned shift right
122 $SHRI= "srwi"; # unsigned shift right by immediate 122 $SHRI= "srwi"; # unsigned shift right by immediate
123 $SHLI= "slwi"; # shift left by immediate 123 $SHLI= "slwi"; # shift left by immediate
124 $CLRU= "clrlwi"; # clear upper bits 124 $CLRU= "clrlwi"; # clear upper bits
125 $INSR= "insrwi"; # insert right 125 $INSR= "insrwi"; # insert right
126 $ROTL= "rotlwi"; # rotate left by immediate 126 $ROTL= "rotlwi"; # rotate left by immediate
127 $TR= "tw"; # conditional trap 127 $TR= "tw"; # conditional trap
128 } elsif ($opf =~ /64\.s/) { 128 } elsif ($flavour =~ /64/) {
129 $BITS= 64; 129 $BITS= 64;
130 $BNSZ= $BITS/8; 130 $BNSZ= $BITS/8;
131 $ISA= "\"ppc64\""; 131 $ISA= "\"ppc64\"";
132 132
133 # same as above, but 64-bit mnemonics... 133 # same as above, but 64-bit mnemonics...
134 $LD= "ld"; # load 134 $LD= "ld"; # load
135 $LDU= "ldu"; # load and update 135 $LDU= "ldu"; # load and update
136 $ST= "std"; # store 136 $ST= "std"; # store
137 $STU= "stdu"; # store and update 137 $STU= "stdu"; # store and update
138 $UMULL= "mulld"; # unsigned multiply low 138 $UMULL= "mulld"; # unsigned multiply low
139 $UMULH= "mulhdu"; # unsigned multiply high 139 $UMULH= "mulhdu"; # unsigned multiply high
140 $UDIV= "divdu"; # unsigned divide 140 $UDIV= "divdu"; # unsigned divide
141 $UCMPI= "cmpldi"; # unsigned compare with immediate 141 $UCMPI= "cmpldi"; # unsigned compare with immediate
142 $UCMP= "cmpld"; # unsigned compare 142 $UCMP= "cmpld"; # unsigned compare
143 $CNTLZ= "cntlzd"; # count leading zeros 143 $CNTLZ= "cntlzd"; # count leading zeros
144 $SHL= "sld"; # shift left 144 $SHL= "sld"; # shift left
145 $SHR= "srd"; # unsigned shift right 145 $SHR= "srd"; # unsigned shift right
146 $SHRI= "srdi"; # unsigned shift right by immediate 146 $SHRI= "srdi"; # unsigned shift right by immediate
147 $SHLI= "sldi"; # shift left by immediate 147 $SHLI= "sldi"; # shift left by immediate
148 $CLRU= "clrldi"; # clear upper bits 148 $CLRU= "clrldi"; # clear upper bits
149 $INSR= "insrdi"; # insert right 149 $INSR= "insrdi"; # insert right
150 $ROTL= "rotldi"; # rotate left by immediate 150 $ROTL= "rotldi"; # rotate left by immediate
151 $TR= "td"; # conditional trap 151 $TR= "td"; # conditional trap
152 } else { die "nonsense $opf"; } 152 } else { die "nonsense $flavour"; }
153 153
154 ( defined shift || open STDOUT,">$opf" ) || die "can't open $opf: $!"; 154 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
155 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
156 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
157 die "can't locate ppc-xlate.pl";
155 158
156 # function entry points from the AIX code 159 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
157 #
158 # There are other, more elegant, ways to handle this. We (IBM) chose
159 # this approach as it plays well with scripts we run to 'namespace'
160 # OpenSSL .i.e. we add a prefix to all the public symbols so we can
161 # co-exist in the same process with other implementations of OpenSSL.
162 # 'cleverer' ways of doing these substitutions tend to hide data we
163 # need to be obvious.
164 #
165 my @items = ("bn_sqr_comba4",
166 » "bn_sqr_comba8",
167 » "bn_mul_comba4",
168 » "bn_mul_comba8",
169 » "bn_sub_words",
170 » "bn_add_words",
171 » "bn_div_words",
172 » "bn_sqr_words",
173 » "bn_mul_words",
174 » "bn_mul_add_words");
175 160
176 if ($opf =~ /linux/)»{ do_linux();» } 161 $data=<<EOF;
177 elsif ($opf =~ /aix/)» { do_aix();» }
178 elsif ($opf =~ /osx/)» { do_osx();» }
179 else» » » { do_bsd();» }
180
181 sub do_linux {
182 $d=&data();
183
184 if ($BITS==64) {
185 foreach $t (@items) {
186 $d =~ s/\.$t:/\
187 \t.section\t".opd","aw"\
188 \t.align\t3\
189 \t.globl\t$t\
190 $t:\
191 \t.quad\t.$t,.TOC.\@tocbase,0\
192 \t.size\t$t,24\
193 \t.previous\n\
194 \t.type\t.$t,\@function\
195 \t.globl\t.$t\
196 .$t:/g;
197 }
198 }
199 else {
200 foreach $t (@items) {
201 $d=~s/\.$t/$t/g;
202 }
203 }
204 # hide internal labels to avoid pollution of name table...
205 $d=~s/Lppcasm_/.Lppcasm_/gm;
206 print $d;
207 }
208
209 sub do_aix {
210 # AIX assembler is smart enough to please the linker without
211 # making us do something special...
212 print &data();
213 }
214
215 # MacOSX 32 bit
216 sub do_osx {
217 $d=&data();
218 # Change the bn symbol prefix from '.' to '_'
219 foreach $t (@items) {
220 $d=~s/\.$t/_$t/g;
221 }
222 # Change .machine to something OS X asm will accept
223 $d=~s/\.machine.*/.text/g;
224 $d=~s/\#/;/g; # change comment from '#' to ';'
225 print $d;
226 }
227
228 # BSD (Untested)
229 sub do_bsd {
230 $d=&data();
231 foreach $t (@items) {
232 $d=~s/\.$t/_$t/g;
233 }
234 print $d;
235 }
236
237 sub data {
238 » local($data)=<<EOF;
239 #-------------------------------------------------------------------- 162 #--------------------------------------------------------------------
240 # 163 #
241 # 164 #
242 # 165 #
243 # 166 #
244 # File: ppc32.s 167 # File: ppc32.s
245 # 168 #
246 # Created by: Suresh Chari 169 # Created by: Suresh Chari
247 # IBM Thomas J. Watson Research Library 170 # IBM Thomas J. Watson Research Library
248 # Hawthorne, NY 171 # Hawthorne, NY
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
290 # architecture the optimizations in this file do 213 # architecture the optimizations in this file do
291 # NOT provide much improvement. 214 # NOT provide much improvement.
292 # 215 #
293 # If you have comments or suggestions to improve code send 216 # If you have comments or suggestions to improve code send
294 # me a note at schari\@us.ibm.com 217 # me a note at schari\@us.ibm.com
295 # 218 #
296 #-------------------------------------------------------------------------- 219 #--------------------------------------------------------------------------
297 # 220 #
298 # Defines to be used in the assembly code. 221 # Defines to be used in the assembly code.
299 # 222 #
300 .set r0,0» # we use it as storage for value of 0 223 #.set r0,0» # we use it as storage for value of 0
301 .set SP,1» # preserved 224 #.set SP,1» # preserved
302 .set RTOC,2» # preserved 225 #.set RTOC,2» # preserved
303 .set r3,3» # 1st argument/return value 226 #.set r3,3» # 1st argument/return value
304 .set r4,4» # 2nd argument/volatile register 227 #.set r4,4» # 2nd argument/volatile register
305 .set r5,5» # 3rd argument/volatile register 228 #.set r5,5» # 3rd argument/volatile register
306 .set r6,6» # ... 229 #.set r6,6» # ...
307 .set r7,7 230 #.set r7,7
308 .set r8,8 231 #.set r8,8
309 .set r9,9 232 #.set r9,9
310 .set r10,10 233 #.set r10,10
311 .set r11,11 234 #.set r11,11
312 .set r12,12 235 #.set r12,12
313 .set r13,13» # not used, nor any other "below" it... 236 #.set r13,13» # not used, nor any other "below" it...
314
315 .set BO_IF_NOT,4
316 .set BO_IF,12
317 .set BO_dCTR_NZERO,16
318 .set BO_dCTR_ZERO,18
319 .set BO_ALWAYS,20
320 .set CR0_LT,0;
321 .set CR0_GT,1;
322 .set CR0_EQ,2
323 .set CR1_FX,4;
324 .set CR1_FEX,5;
325 .set CR1_VX,6
326 .set LR,8
327 237
328 # Declare function names to be global 238 # Declare function names to be global
329 # NOTE: For gcc these names MUST be changed to remove 239 # NOTE: For gcc these names MUST be changed to remove
330 # the first . i.e. for example change ".bn_sqr_comba4" 240 # the first . i.e. for example change ".bn_sqr_comba4"
331 # to "bn_sqr_comba4". This should be automatically done 241 # to "bn_sqr_comba4". This should be automatically done
332 # in the build. 242 # in the build.
333 243
334 .globl .bn_sqr_comba4 244 .globl .bn_sqr_comba4
335 .globl .bn_sqr_comba8 245 .globl .bn_sqr_comba8
336 .globl .bn_mul_comba4 246 .globl .bn_mul_comba4
337 .globl .bn_mul_comba8 247 .globl .bn_mul_comba8
338 .globl .bn_sub_words 248 .globl .bn_sub_words
339 .globl .bn_add_words 249 .globl .bn_add_words
340 .globl .bn_div_words 250 .globl .bn_div_words
341 .globl .bn_sqr_words 251 .globl .bn_sqr_words
342 .globl .bn_mul_words 252 .globl .bn_mul_words
343 .globl .bn_mul_add_words 253 .globl .bn_mul_add_words
344 254
345 # .text section 255 # .text section
346 256
347 » .machine» $ISA 257 » .machine» "any"
348 258
349 # 259 #
350 # NOTE: The following label name should be changed to 260 # NOTE: The following label name should be changed to
351 # "bn_sqr_comba4" i.e. remove the first dot 261 # "bn_sqr_comba4" i.e. remove the first dot
352 # for the gcc compiler. This should be automatically 262 # for the gcc compiler. This should be automatically
353 # done in the build 263 # done in the build
354 # 264 #
355 265
356 .align 4 266 .align 4
357 .bn_sqr_comba4: 267 .bn_sqr_comba4:
(...skipping 113 matching lines...) Expand 10 before | Expand all | Expand 10 after
471 addze r10,r10 381 addze r10,r10
472 $ST r11,`5*$BNSZ`(r3) #r[5] = c3 382 $ST r11,`5*$BNSZ`(r3) #r[5] = c3
473 #sqr_add_c(a,3,c1,c2,c3); 383 #sqr_add_c(a,3,c1,c2,c3);
474 $UMULL r7,r6,r6 384 $UMULL r7,r6,r6
475 $UMULH r8,r6,r6 385 $UMULH r8,r6,r6
476 addc r9,r7,r9 386 addc r9,r7,r9
477 adde r10,r8,r10 387 adde r10,r8,r10
478 388
479 $ST r9,`6*$BNSZ`(r3) #r[6]=c1 389 $ST r9,`6*$BNSZ`(r3) #r[6]=c1
480 $ST r10,`7*$BNSZ`(r3) #r[7]=c2 390 $ST r10,`7*$BNSZ`(r3) #r[7]=c2
481 » bclr» BO_ALWAYS,CR0_LT 391 » blr
482 .long 0x00000000 392 .long 0x00000000
483 393
484 # 394 #
485 # NOTE: The following label name should be changed to 395 # NOTE: The following label name should be changed to
486 # "bn_sqr_comba8" i.e. remove the first dot 396 # "bn_sqr_comba8" i.e. remove the first dot
487 # for the gcc compiler. This should be automatically 397 # for the gcc compiler. This should be automatically
488 # done in the build 398 # done in the build
489 # 399 #
490 400
491 .align 4 401 .align 4
(...skipping 404 matching lines...) Expand 10 before | Expand all | Expand 10 after
896 $ST r10,`13*$BNSZ`(r3) #r[13]=c2; 806 $ST r10,`13*$BNSZ`(r3) #r[13]=c2;
897 #sqr_add_c(a,7,c3,c1,c2); 807 #sqr_add_c(a,7,c3,c1,c2);
898 $UMULL r7,r6,r6 808 $UMULL r7,r6,r6
899 $UMULH r8,r6,r6 809 $UMULH r8,r6,r6
900 addc r11,r7,r11 810 addc r11,r7,r11
901 adde r9,r8,r9 811 adde r9,r8,r9
902 $ST r11,`14*$BNSZ`(r3) #r[14]=c3; 812 $ST r11,`14*$BNSZ`(r3) #r[14]=c3;
903 $ST r9, `15*$BNSZ`(r3) #r[15]=c1; 813 $ST r9, `15*$BNSZ`(r3) #r[15]=c1;
904 814
905 815
906 » bclr» BO_ALWAYS,CR0_LT 816 » blr
907 817
908 .long 0x00000000 818 .long 0x00000000
909 819
910 # 820 #
911 # NOTE: The following label name should be changed to 821 # NOTE: The following label name should be changed to
912 # "bn_mul_comba4" i.e. remove the first dot 822 # "bn_mul_comba4" i.e. remove the first dot
913 # for the gcc compiler. This should be automatically 823 # for the gcc compiler. This should be automatically
914 # done in the build 824 # done in the build
915 # 825 #
916 826
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after
1032 $ST r11,`4*$BNSZ`(r3) #r[4]=c2 942 $ST r11,`4*$BNSZ`(r3) #r[4]=c2
1033 #mul_add_c(a[2],b[3],c3,c1,c2); 943 #mul_add_c(a[2],b[3],c3,c1,c2);
1034 $LD r6,`2*$BNSZ`(r4) 944 $LD r6,`2*$BNSZ`(r4)
1035 $UMULL r8,r6,r7 945 $UMULL r8,r6,r7
1036 $UMULH r9,r6,r7 946 $UMULH r9,r6,r7
1037 addc r12,r8,r12 947 addc r12,r8,r12
1038 adde r10,r9,r10 948 adde r10,r9,r10
1039 addze r11,r0 949 addze r11,r0
1040 #mul_add_c(a[3],b[2],c3,c1,c2); 950 #mul_add_c(a[3],b[2],c3,c1,c2);
1041 $LD r6,`3*$BNSZ`(r4) 951 $LD r6,`3*$BNSZ`(r4)
1042 » $LD» r7,`2*$BNSZ`(r4) 952 » $LD» r7,`2*$BNSZ`(r5)
1043 $UMULL r8,r6,r7 953 $UMULL r8,r6,r7
1044 $UMULH r9,r6,r7 954 $UMULH r9,r6,r7
1045 addc r12,r8,r12 955 addc r12,r8,r12
1046 adde r10,r9,r10 956 adde r10,r9,r10
1047 addze r11,r11 957 addze r11,r11
1048 $ST r12,`5*$BNSZ`(r3) #r[5]=c3 958 $ST r12,`5*$BNSZ`(r3) #r[5]=c3
1049 #mul_add_c(a[3],b[3],c1,c2,c3); 959 #mul_add_c(a[3],b[3],c1,c2,c3);
1050 $LD r7,`3*$BNSZ`(r5) 960 $LD r7,`3*$BNSZ`(r5)
1051 $UMULL r8,r6,r7 961 $UMULL r8,r6,r7
1052 $UMULH r9,r6,r7 962 $UMULH r9,r6,r7
1053 addc r10,r8,r10 963 addc r10,r8,r10
1054 adde r11,r9,r11 964 adde r11,r9,r11
1055 965
1056 $ST r10,`6*$BNSZ`(r3) #r[6]=c1 966 $ST r10,`6*$BNSZ`(r3) #r[6]=c1
1057 $ST r11,`7*$BNSZ`(r3) #r[7]=c2 967 $ST r11,`7*$BNSZ`(r3) #r[7]=c2
1058 » bclr» BO_ALWAYS,CR0_LT 968 » blr
1059 .long 0x00000000 969 .long 0x00000000
1060 970
1061 # 971 #
1062 # NOTE: The following label name should be changed to 972 # NOTE: The following label name should be changed to
1063 # "bn_mul_comba8" i.e. remove the first dot 973 # "bn_mul_comba8" i.e. remove the first dot
1064 # for the gcc compiler. This should be automatically 974 # for the gcc compiler. This should be automatically
1065 # done in the build 975 # done in the build
1066 # 976 #
1067 977
1068 .align 4 978 .align 4
(...skipping 515 matching lines...) Expand 10 before | Expand all | Expand 10 after
1584 addze r10,r10 1494 addze r10,r10
1585 $ST r11,`13*$BNSZ`(r3) #r[13]=c2; 1495 $ST r11,`13*$BNSZ`(r3) #r[13]=c2;
1586 #mul_add_c(a[7],b[7],c3,c1,c2); 1496 #mul_add_c(a[7],b[7],c3,c1,c2);
1587 $LD r7,`7*$BNSZ`(r5) 1497 $LD r7,`7*$BNSZ`(r5)
1588 $UMULL r8,r6,r7 1498 $UMULL r8,r6,r7
1589 $UMULH r9,r6,r7 1499 $UMULH r9,r6,r7
1590 addc r12,r12,r8 1500 addc r12,r12,r8
1591 adde r10,r10,r9 1501 adde r10,r10,r9
1592 $ST r12,`14*$BNSZ`(r3) #r[14]=c3; 1502 $ST r12,`14*$BNSZ`(r3) #r[14]=c3;
1593 $ST r10,`15*$BNSZ`(r3) #r[15]=c1; 1503 $ST r10,`15*$BNSZ`(r3) #r[15]=c1;
1594 » bclr» BO_ALWAYS,CR0_LT 1504 » blr
1595 .long 0x00000000 1505 .long 0x00000000
1596 1506
1597 # 1507 #
1598 # NOTE: The following label name should be changed to 1508 # NOTE: The following label name should be changed to
1599 # "bn_sub_words" i.e. remove the first dot 1509 # "bn_sub_words" i.e. remove the first dot
1600 # for the gcc compiler. This should be automatically 1510 # for the gcc compiler. This should be automatically
1601 # done in the build 1511 # done in the build
1602 # 1512 #
1603 # 1513 #
1604 .align 4 1514 .align 4
(...skipping 11 matching lines...) Expand all
1616 # Note: No loop unrolling done since this is not a performance 1526 # Note: No loop unrolling done since this is not a performance
1617 # critical loop. 1527 # critical loop.
1618 1528
1619 xor r0,r0,r0 #set r0 = 0 1529 xor r0,r0,r0 #set r0 = 0
1620 # 1530 #
1621 # check for r6 = 0 AND set carry bit. 1531 # check for r6 = 0 AND set carry bit.
1622 # 1532 #
1623 subfc. r7,r0,r6 # If r6 is 0 then result is 0. 1533 subfc. r7,r0,r6 # If r6 is 0 then result is 0.
1624 # if r6 > 0 then result !=0 1534 # if r6 > 0 then result !=0
1625 # In either case carry bit is set. 1535 # In either case carry bit is set.
1626 » bc» BO_IF,CR0_EQ,Lppcasm_sub_adios 1536 » beq» Lppcasm_sub_adios
1627 addi r4,r4,-$BNSZ 1537 addi r4,r4,-$BNSZ
1628 addi r3,r3,-$BNSZ 1538 addi r3,r3,-$BNSZ
1629 addi r5,r5,-$BNSZ 1539 addi r5,r5,-$BNSZ
1630 mtctr r6 1540 mtctr r6
1631 Lppcasm_sub_mainloop: 1541 Lppcasm_sub_mainloop:
1632 $LDU r7,$BNSZ(r4) 1542 $LDU r7,$BNSZ(r4)
1633 $LDU r8,$BNSZ(r5) 1543 $LDU r8,$BNSZ(r5)
1634 subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8) 1544 subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8)
1635 # if carry = 1 this is r7-r8. Else it 1545 # if carry = 1 this is r7-r8. Else it
1636 # is r7-r8 -1 as we need. 1546 # is r7-r8 -1 as we need.
1637 $STU r6,$BNSZ(r3) 1547 $STU r6,$BNSZ(r3)
1638 » bc» BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop 1548 » bdnz-» Lppcasm_sub_mainloop
1639 Lppcasm_sub_adios: 1549 Lppcasm_sub_adios:
1640 subfze r3,r0 # if carry bit is set then r3 = 0 else -1 1550 subfze r3,r0 # if carry bit is set then r3 = 0 else -1
1641 andi. r3,r3,1 # keep only last bit. 1551 andi. r3,r3,1 # keep only last bit.
1642 » bclr» BO_ALWAYS,CR0_LT 1552 » blr
1643 .long 0x00000000 1553 .long 0x00000000
1644 1554
1645 1555
1646 # 1556 #
1647 # NOTE: The following label name should be changed to 1557 # NOTE: The following label name should be changed to
1648 # "bn_add_words" i.e. remove the first dot 1558 # "bn_add_words" i.e. remove the first dot
1649 # for the gcc compiler. This should be automatically 1559 # for the gcc compiler. This should be automatically
1650 # done in the build 1560 # done in the build
1651 # 1561 #
1652 1562
(...skipping 10 matching lines...) Expand all
1663 # r6 = n 1573 # r6 = n
1664 # 1574 #
1665 # Note: No loop unrolling done since this is not a performance 1575 # Note: No loop unrolling done since this is not a performance
1666 # critical loop. 1576 # critical loop.
1667 1577
1668 xor r0,r0,r0 1578 xor r0,r0,r0
1669 # 1579 #
1670 # check for r6 = 0. Is this needed? 1580 # check for r6 = 0. Is this needed?
1671 # 1581 #
1672 addic. r6,r6,0 #test r6 and clear carry bit. 1582 addic. r6,r6,0 #test r6 and clear carry bit.
1673 » bc» BO_IF,CR0_EQ,Lppcasm_add_adios 1583 » beq» Lppcasm_add_adios
1674 addi r4,r4,-$BNSZ 1584 addi r4,r4,-$BNSZ
1675 addi r3,r3,-$BNSZ 1585 addi r3,r3,-$BNSZ
1676 addi r5,r5,-$BNSZ 1586 addi r5,r5,-$BNSZ
1677 mtctr r6 1587 mtctr r6
1678 Lppcasm_add_mainloop: 1588 Lppcasm_add_mainloop:
1679 $LDU r7,$BNSZ(r4) 1589 $LDU r7,$BNSZ(r4)
1680 $LDU r8,$BNSZ(r5) 1590 $LDU r8,$BNSZ(r5)
1681 adde r8,r7,r8 1591 adde r8,r7,r8
1682 $STU r8,$BNSZ(r3) 1592 $STU r8,$BNSZ(r3)
1683 » bc» BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop 1593 » bdnz-» Lppcasm_add_mainloop
1684 Lppcasm_add_adios: 1594 Lppcasm_add_adios:
1685 addze r3,r0 #return carry bit. 1595 addze r3,r0 #return carry bit.
1686 » bclr» BO_ALWAYS,CR0_LT 1596 » blr
1687 .long 0x00000000 1597 .long 0x00000000
1688 1598
1689 # 1599 #
1690 # NOTE: The following label name should be changed to 1600 # NOTE: The following label name should be changed to
1691 # "bn_div_words" i.e. remove the first dot 1601 # "bn_div_words" i.e. remove the first dot
1692 # for the gcc compiler. This should be automatically 1602 # for the gcc compiler. This should be automatically
1693 # done in the build 1603 # done in the build
1694 # 1604 #
1695 1605
1696 .align 4 1606 .align 4
1697 .bn_div_words: 1607 .bn_div_words:
1698 # 1608 #
1699 # This is a cleaned up version of code generated by 1609 # This is a cleaned up version of code generated by
1700 # the AIX compiler. The only optimization is to use 1610 # the AIX compiler. The only optimization is to use
1701 # the PPC instruction to count leading zeros instead 1611 # the PPC instruction to count leading zeros instead
1702 # of call to num_bits_word. Since this was compiled 1612 # of call to num_bits_word. Since this was compiled
1703 # only at level -O2 we can possibly squeeze it more? 1613 # only at level -O2 we can possibly squeeze it more?
1704 # 1614 #
1705 # r3 = h 1615 # r3 = h
1706 # r4 = l 1616 # r4 = l
1707 # r5 = d 1617 # r5 = d
1708 1618
1709 $UCMPI 0,r5,0 # compare r5 and 0 1619 $UCMPI 0,r5,0 # compare r5 and 0
1710 » bc» BO_IF_NOT,CR0_EQ,Lppcasm_div1» # proceed if d!=0 1620 » bne» Lppcasm_div1» » # proceed if d!=0
1711 li r3,-1 # d=0 return -1 1621 li r3,-1 # d=0 return -1
1712 » bclr» BO_ALWAYS,CR0_LT» 1622 » blr
1713 Lppcasm_div1: 1623 Lppcasm_div1:
1714 xor r0,r0,r0 #r0=0 1624 xor r0,r0,r0 #r0=0
1715 li r8,$BITS 1625 li r8,$BITS
1716 $CNTLZ. r7,r5 #r7 = num leading 0s in d. 1626 $CNTLZ. r7,r5 #r7 = num leading 0s in d.
1717 » bc» BO_IF,CR0_EQ,Lppcasm_div2» #proceed if no leading zeros 1627 » beq» Lppcasm_div2» » #proceed if no leading zeros
1718 subf r8,r7,r8 #r8 = BN_num_bits_word(d) 1628 subf r8,r7,r8 #r8 = BN_num_bits_word(d)
1719 $SHR. r9,r3,r8 #are there any bits above r8'th? 1629 $SHR. r9,r3,r8 #are there any bits above r8'th?
1720 $TR 16,r9,r0 #if there're, signal to dump core... 1630 $TR 16,r9,r0 #if there're, signal to dump core...
1721 Lppcasm_div2: 1631 Lppcasm_div2:
1722 $UCMP 0,r3,r5 #h>=d? 1632 $UCMP 0,r3,r5 #h>=d?
1723 » bc» BO_IF,CR0_LT,Lppcasm_div3» #goto Lppcasm_div3 if not 1633 » blt» Lppcasm_div3» » #goto Lppcasm_div3 if not
1724 subf r3,r5,r3 #h-=d ; 1634 subf r3,r5,r3 #h-=d ;
1725 Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i 1635 Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i
1726 cmpi 0,0,r7,0 # is (i == 0)? 1636 cmpi 0,0,r7,0 # is (i == 0)?
1727 » bc» BO_IF,CR0_EQ,Lppcasm_div4 1637 » beq» Lppcasm_div4
1728 $SHL r3,r3,r7 # h = (h<< i) 1638 $SHL r3,r3,r7 # h = (h<< i)
1729 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i) 1639 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)
1730 $SHL r5,r5,r7 # d<<=i 1640 $SHL r5,r5,r7 # d<<=i
1731 or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i)) 1641 or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i))
1732 $SHL r4,r4,r7 # l <<=i 1642 $SHL r4,r4,r7 # l <<=i
1733 Lppcasm_div4: 1643 Lppcasm_div4:
1734 $SHRI r9,r5,`$BITS/2` # r9 = dh 1644 $SHRI r9,r5,`$BITS/2` # r9 = dh
1735 # dl will be computed when needed 1645 # dl will be computed when needed
1736 # as it saves registers. 1646 # as it saves registers.
1737 li r6,2 #r6=2 1647 li r6,2 #r6=2
1738 mtctr r6 #counter will be in count. 1648 mtctr r6 #counter will be in count.
1739 Lppcasm_divouterloop: 1649 Lppcasm_divouterloop:
1740 $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4) 1650 $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)
1741 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4 1651 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
1742 # compute here for innerloop. 1652 # compute here for innerloop.
1743 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh 1653 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh
1744 » bc» BO_IF_NOT,CR0_EQ,Lppcasm_div5» # goto Lppcasm_div5 if not 1654 » bne» Lppcasm_div5» » # goto Lppcasm_div5 if not
1745 1655
1746 li r8,-1 1656 li r8,-1
1747 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l 1657 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
1748 b Lppcasm_div6 1658 b Lppcasm_div6
1749 Lppcasm_div5: 1659 Lppcasm_div5:
1750 $UDIV r8,r3,r9 #q = h/dh 1660 $UDIV r8,r3,r9 #q = h/dh
1751 Lppcasm_div6: 1661 Lppcasm_div6:
1752 $UMULL r12,r9,r8 #th = q*dh 1662 $UMULL r12,r9,r8 #th = q*dh
1753 $CLRU r10,r5,`$BITS/2` #r10=dl 1663 $CLRU r10,r5,`$BITS/2` #r10=dl
1754 $UMULL r6,r8,r10 #tl = q*dl 1664 $UMULL r6,r8,r10 #tl = q*dl
1755 1665
1756 Lppcasm_divinnerloop: 1666 Lppcasm_divinnerloop:
1757 subf r10,r12,r3 #t = h -th 1667 subf r10,r12,r3 #t = h -th
1758 $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of... 1668 $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...
1759 addic. r7,r7,0 #test if r7 == 0. used below. 1669 addic. r7,r7,0 #test if r7 == 0. used below.
1760 # now want to compute 1670 # now want to compute
1761 # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4) 1671 # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1762 # the following 2 instructions do that 1672 # the following 2 instructions do that
1763 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4) 1673 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)
1764 or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4) 1674 or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4)
1765 » $UCMP» 1,r6,r7»» » # compare (tl <= r7) 1675 » $UCMP» cr1,r6,r7» » # compare (tl <= r7)
1766 » bc» BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit 1676 » bne» Lppcasm_divinnerexit
1767 » bc» BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit 1677 » ble» cr1,Lppcasm_divinnerexit
1768 addi r8,r8,-1 #q-- 1678 addi r8,r8,-1 #q--
1769 subf r12,r9,r12 #th -=dh 1679 subf r12,r9,r12 #th -=dh
1770 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop. 1680 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.
1771 subf r6,r10,r6 #tl -=dl 1681 subf r6,r10,r6 #tl -=dl
1772 b Lppcasm_divinnerloop 1682 b Lppcasm_divinnerloop
1773 Lppcasm_divinnerexit: 1683 Lppcasm_divinnerexit:
1774 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4) 1684 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)
1775 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h; 1685 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;
1776 » $UCMP» 1,r4,r11» » # compare l and tl 1686 » $UCMP» cr1,r4,r11» » # compare l and tl
1777 add r12,r12,r10 # th+=t 1687 add r12,r12,r10 # th+=t
1778 » bc» BO_IF_NOT,CR1_FX,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7 1688 » bge» cr1,Lppcasm_div7» # if (l>=tl) goto Lppcasm_div7
1779 addi r12,r12,1 # th++ 1689 addi r12,r12,1 # th++
1780 Lppcasm_div7: 1690 Lppcasm_div7:
1781 subf r11,r11,r4 #r11=l-tl 1691 subf r11,r11,r4 #r11=l-tl
1782 » $UCMP» 1,r3,r12» » #compare h and th 1692 » $UCMP» cr1,r3,r12» » #compare h and th
1783 » bc» BO_IF_NOT,CR1_FX,Lppcasm_div8» #if (h>=th) goto Lppcasm_div8 1693 » bge» cr1,Lppcasm_div8» #if (h>=th) goto Lppcasm_div8
1784 addi r8,r8,-1 # q-- 1694 addi r8,r8,-1 # q--
1785 add r3,r5,r3 # h+=d 1695 add r3,r5,r3 # h+=d
1786 Lppcasm_div8: 1696 Lppcasm_div8:
1787 subf r12,r12,r3 #r12 = h-th 1697 subf r12,r12,r3 #r12 = h-th
1788 $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4 1698 $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4
1789 # want to compute 1699 # want to compute
1790 # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2 1700 # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1791 # the following 2 instructions will do this. 1701 # the following 2 instructions will do this.
1792 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2. 1702 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.
1793 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3 1703 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3
1794 » bc» BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9#if (count==0) break ; 1704 » bdz» Lppcasm_div9» » #if (count==0) break ;
1795 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4 1705 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4
1796 b Lppcasm_divouterloop 1706 b Lppcasm_divouterloop
1797 Lppcasm_div9: 1707 Lppcasm_div9:
1798 or r3,r8,r0 1708 or r3,r8,r0
1799 » bclr» BO_ALWAYS,CR0_LT 1709 » blr
1800 .long 0x00000000 1710 .long 0x00000000
1801 1711
1802 # 1712 #
1803 # NOTE: The following label name should be changed to 1713 # NOTE: The following label name should be changed to
1804 # "bn_sqr_words" i.e. remove the first dot 1714 # "bn_sqr_words" i.e. remove the first dot
1805 # for the gcc compiler. This should be automatically 1715 # for the gcc compiler. This should be automatically
1806 # done in the build 1716 # done in the build
1807 # 1717 #
1808 .align 4 1718 .align 4
1809 .bn_sqr_words: 1719 .bn_sqr_words:
1810 # 1720 #
1811 # Optimized version of bn_sqr_words 1721 # Optimized version of bn_sqr_words
1812 # 1722 #
1813 # void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) 1723 # void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1814 # 1724 #
1815 # r3 = r 1725 # r3 = r
1816 # r4 = a 1726 # r4 = a
1817 # r5 = n 1727 # r5 = n
1818 # 1728 #
1819 # r6 = a[i]. 1729 # r6 = a[i].
1820 # r7,r8 = product. 1730 # r7,r8 = product.
1821 # 1731 #
1822 # No unrolling done here. Not performance critical. 1732 # No unrolling done here. Not performance critical.
1823 1733
1824 addic. r5,r5,0 #test r5. 1734 addic. r5,r5,0 #test r5.
1825 » bc» BO_IF,CR0_EQ,Lppcasm_sqr_adios 1735 » beq» Lppcasm_sqr_adios
1826 addi r4,r4,-$BNSZ 1736 addi r4,r4,-$BNSZ
1827 addi r3,r3,-$BNSZ 1737 addi r3,r3,-$BNSZ
1828 mtctr r5 1738 mtctr r5
1829 Lppcasm_sqr_mainloop: 1739 Lppcasm_sqr_mainloop:
1830 #sqr(r[0],r[1],a[0]); 1740 #sqr(r[0],r[1],a[0]);
1831 $LDU r6,$BNSZ(r4) 1741 $LDU r6,$BNSZ(r4)
1832 $UMULL r7,r6,r6 1742 $UMULL r7,r6,r6
1833 $UMULH r8,r6,r6 1743 $UMULH r8,r6,r6
1834 $STU r7,$BNSZ(r3) 1744 $STU r7,$BNSZ(r3)
1835 $STU r8,$BNSZ(r3) 1745 $STU r8,$BNSZ(r3)
1836 » bc» BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop 1746 » bdnz-» Lppcasm_sqr_mainloop
1837 Lppcasm_sqr_adios: 1747 Lppcasm_sqr_adios:
1838 » bclr» BO_ALWAYS,CR0_LT 1748 » blr
1839 .long 0x00000000 1749 .long 0x00000000
1840 1750
1841 1751
1842 # 1752 #
1843 # NOTE: The following label name should be changed to 1753 # NOTE: The following label name should be changed to
1844 # "bn_mul_words" i.e. remove the first dot 1754 # "bn_mul_words" i.e. remove the first dot
1845 # for the gcc compiler. This should be automatically 1755 # for the gcc compiler. This should be automatically
1846 # done in the build 1756 # done in the build
1847 # 1757 #
1848 1758
1849 .align 4 1759 .align 4
1850 .bn_mul_words: 1760 .bn_mul_words:
1851 # 1761 #
1852 # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) 1762 # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1853 # 1763 #
1854 # r3 = rp 1764 # r3 = rp
1855 # r4 = ap 1765 # r4 = ap
1856 # r5 = num 1766 # r5 = num
1857 # r6 = w 1767 # r6 = w
1858 xor r0,r0,r0 1768 xor r0,r0,r0
1859 xor r12,r12,r12 # used for carry 1769 xor r12,r12,r12 # used for carry
1860 rlwinm. r7,r5,30,2,31 # num >> 2 1770 rlwinm. r7,r5,30,2,31 # num >> 2
1861 » bc» BO_IF,CR0_EQ,Lppcasm_mw_REM 1771 » beq» Lppcasm_mw_REM
1862 mtctr r7 1772 mtctr r7
1863 Lppcasm_mw_LOOP: 1773 Lppcasm_mw_LOOP:
1864 #mul(rp[0],ap[0],w,c1); 1774 #mul(rp[0],ap[0],w,c1);
1865 $LD r8,`0*$BNSZ`(r4) 1775 $LD r8,`0*$BNSZ`(r4)
1866 $UMULL r9,r6,r8 1776 $UMULL r9,r6,r8
1867 $UMULH r10,r6,r8 1777 $UMULH r10,r6,r8
1868 addc r9,r9,r12 1778 addc r9,r9,r12
1869 #addze r10,r10 #carry is NOT ignored. 1779 #addze r10,r10 #carry is NOT ignored.
1870 #will be taken care of 1780 #will be taken care of
1871 #in second spin below 1781 #in second spin below
(...skipping 17 matching lines...) Expand all
1889 $LD r8,`3*$BNSZ`(r4) 1799 $LD r8,`3*$BNSZ`(r4)
1890 $UMULL r11,r6,r8 1800 $UMULL r11,r6,r8
1891 $UMULH r12,r6,r8 1801 $UMULH r12,r6,r8
1892 adde r11,r11,r10 1802 adde r11,r11,r10
1893 addze r12,r12 #this spin we collect carry into 1803 addze r12,r12 #this spin we collect carry into
1894 #r12 1804 #r12
1895 $ST r11,`3*$BNSZ`(r3) 1805 $ST r11,`3*$BNSZ`(r3)
1896 1806
1897 addi r3,r3,`4*$BNSZ` 1807 addi r3,r3,`4*$BNSZ`
1898 addi r4,r4,`4*$BNSZ` 1808 addi r4,r4,`4*$BNSZ`
1899 » bc» BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP 1809 » bdnz-» Lppcasm_mw_LOOP
1900 1810
1901 Lppcasm_mw_REM: 1811 Lppcasm_mw_REM:
1902 andi. r5,r5,0x3 1812 andi. r5,r5,0x3
1903 » bc» BO_IF,CR0_EQ,Lppcasm_mw_OVER 1813 » beq» Lppcasm_mw_OVER
1904 #mul(rp[0],ap[0],w,c1); 1814 #mul(rp[0],ap[0],w,c1);
1905 $LD r8,`0*$BNSZ`(r4) 1815 $LD r8,`0*$BNSZ`(r4)
1906 $UMULL r9,r6,r8 1816 $UMULL r9,r6,r8
1907 $UMULH r10,r6,r8 1817 $UMULH r10,r6,r8
1908 addc r9,r9,r12 1818 addc r9,r9,r12
1909 addze r10,r10 1819 addze r10,r10
1910 $ST r9,`0*$BNSZ`(r3) 1820 $ST r9,`0*$BNSZ`(r3)
1911 addi r12,r10,0 1821 addi r12,r10,0
1912 1822
1913 addi r5,r5,-1 1823 addi r5,r5,-1
1914 cmpli 0,0,r5,0 1824 cmpli 0,0,r5,0
1915 » bc» BO_IF,CR0_EQ,Lppcasm_mw_OVER 1825 » beq» Lppcasm_mw_OVER
1916 1826
1917 1827
1918 #mul(rp[1],ap[1],w,c1); 1828 #mul(rp[1],ap[1],w,c1);
1919 $LD r8,`1*$BNSZ`(r4) 1829 $LD r8,`1*$BNSZ`(r4)
1920 $UMULL r9,r6,r8 1830 $UMULL r9,r6,r8
1921 $UMULH r10,r6,r8 1831 $UMULH r10,r6,r8
1922 addc r9,r9,r12 1832 addc r9,r9,r12
1923 addze r10,r10 1833 addze r10,r10
1924 $ST r9,`1*$BNSZ`(r3) 1834 $ST r9,`1*$BNSZ`(r3)
1925 addi r12,r10,0 1835 addi r12,r10,0
1926 1836
1927 addi r5,r5,-1 1837 addi r5,r5,-1
1928 cmpli 0,0,r5,0 1838 cmpli 0,0,r5,0
1929 » bc» BO_IF,CR0_EQ,Lppcasm_mw_OVER 1839 » beq» Lppcasm_mw_OVER
1930 1840
1931 #mul(rp[2],ap[2],w,c1); 1841 #mul(rp[2],ap[2],w,c1);
1932 $LD r8,`2*$BNSZ`(r4) 1842 $LD r8,`2*$BNSZ`(r4)
1933 $UMULL r9,r6,r8 1843 $UMULL r9,r6,r8
1934 $UMULH r10,r6,r8 1844 $UMULH r10,r6,r8
1935 addc r9,r9,r12 1845 addc r9,r9,r12
1936 addze r10,r10 1846 addze r10,r10
1937 $ST r9,`2*$BNSZ`(r3) 1847 $ST r9,`2*$BNSZ`(r3)
1938 addi r12,r10,0 1848 addi r12,r10,0
1939 1849
1940 Lppcasm_mw_OVER: 1850 Lppcasm_mw_OVER:
1941 addi r3,r12,0 1851 addi r3,r12,0
1942 » bclr» BO_ALWAYS,CR0_LT 1852 » blr
1943 .long 0x00000000 1853 .long 0x00000000
1944 1854
1945 # 1855 #
1946 # NOTE: The following label name should be changed to 1856 # NOTE: The following label name should be changed to
1947 # "bn_mul_add_words" i.e. remove the first dot 1857 # "bn_mul_add_words" i.e. remove the first dot
1948 # for the gcc compiler. This should be automatically 1858 # for the gcc compiler. This should be automatically
1949 # done in the build 1859 # done in the build
1950 # 1860 #
1951 1861
1952 .align 4 1862 .align 4
1953 .bn_mul_add_words: 1863 .bn_mul_add_words:
1954 # 1864 #
1955 # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) 1865 # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1956 # 1866 #
1957 # r3 = rp 1867 # r3 = rp
1958 # r4 = ap 1868 # r4 = ap
1959 # r5 = num 1869 # r5 = num
1960 # r6 = w 1870 # r6 = w
1961 # 1871 #
1962 # empirical evidence suggests that unrolled version performs best!! 1872 # empirical evidence suggests that unrolled version performs best!!
1963 # 1873 #
1964 xor r0,r0,r0 #r0 = 0 1874 xor r0,r0,r0 #r0 = 0
1965 xor r12,r12,r12 #r12 = 0 . used for carry 1875 xor r12,r12,r12 #r12 = 0 . used for carry
1966 rlwinm. r7,r5,30,2,31 # num >> 2 1876 rlwinm. r7,r5,30,2,31 # num >> 2
1967 » bc» BO_IF,CR0_EQ,Lppcasm_maw_leftover» # if (num < 4) go LPPCASM_maw_leftover 1877 » beq» Lppcasm_maw_leftover» # if (num < 4) go LPPCASM_maw_leftover
1968 mtctr r7 1878 mtctr r7
1969 Lppcasm_maw_mainloop: 1879 Lppcasm_maw_mainloop:
1970 #mul_add(rp[0],ap[0],w,c1); 1880 #mul_add(rp[0],ap[0],w,c1);
1971 $LD r8,`0*$BNSZ`(r4) 1881 $LD r8,`0*$BNSZ`(r4)
1972 $LD r11,`0*$BNSZ`(r3) 1882 $LD r11,`0*$BNSZ`(r3)
1973 $UMULL r9,r6,r8 1883 $UMULL r9,r6,r8
1974 $UMULH r10,r6,r8 1884 $UMULH r10,r6,r8
1975 addc r9,r9,r12 #r12 is carry. 1885 addc r9,r9,r12 #r12 is carry.
1976 addze r10,r10 1886 addze r10,r10
1977 addc r9,r9,r11 1887 addc r9,r9,r11
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
2010 $UMULL r11,r6,r8 1920 $UMULL r11,r6,r8
2011 $LD r9,`3*$BNSZ`(r3) 1921 $LD r9,`3*$BNSZ`(r3)
2012 $UMULH r12,r6,r8 1922 $UMULH r12,r6,r8
2013 adde r11,r11,r10 1923 adde r11,r11,r10
2014 addze r12,r12 1924 addze r12,r12
2015 addc r11,r11,r9 1925 addc r11,r11,r9
2016 addze r12,r12 1926 addze r12,r12
2017 $ST r11,`3*$BNSZ`(r3) 1927 $ST r11,`3*$BNSZ`(r3)
2018 addi r3,r3,`4*$BNSZ` 1928 addi r3,r3,`4*$BNSZ`
2019 addi r4,r4,`4*$BNSZ` 1929 addi r4,r4,`4*$BNSZ`
2020 » bc» BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop 1930 » bdnz-» Lppcasm_maw_mainloop
2021 1931
2022 Lppcasm_maw_leftover: 1932 Lppcasm_maw_leftover:
2023 andi. r5,r5,0x3 1933 andi. r5,r5,0x3
2024 » bc» BO_IF,CR0_EQ,Lppcasm_maw_adios 1934 » beq» Lppcasm_maw_adios
2025 addi r3,r3,-$BNSZ 1935 addi r3,r3,-$BNSZ
2026 addi r4,r4,-$BNSZ 1936 addi r4,r4,-$BNSZ
2027 #mul_add(rp[0],ap[0],w,c1); 1937 #mul_add(rp[0],ap[0],w,c1);
2028 mtctr r5 1938 mtctr r5
2029 $LDU r8,$BNSZ(r4) 1939 $LDU r8,$BNSZ(r4)
2030 $UMULL r9,r6,r8 1940 $UMULL r9,r6,r8
2031 $UMULH r10,r6,r8 1941 $UMULH r10,r6,r8
2032 $LDU r11,$BNSZ(r3) 1942 $LDU r11,$BNSZ(r3)
2033 addc r9,r9,r11 1943 addc r9,r9,r11
2034 addze r10,r10 1944 addze r10,r10
2035 addc r9,r9,r12 1945 addc r9,r9,r12
2036 addze r12,r10 1946 addze r12,r10
2037 $ST r9,0(r3) 1947 $ST r9,0(r3)
2038 1948
2039 » bc» BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios 1949 » bdz» Lppcasm_maw_adios
2040 #mul_add(rp[1],ap[1],w,c1); 1950 #mul_add(rp[1],ap[1],w,c1);
2041 $LDU r8,$BNSZ(r4) 1951 $LDU r8,$BNSZ(r4)
2042 $UMULL r9,r6,r8 1952 $UMULL r9,r6,r8
2043 $UMULH r10,r6,r8 1953 $UMULH r10,r6,r8
2044 $LDU r11,$BNSZ(r3) 1954 $LDU r11,$BNSZ(r3)
2045 addc r9,r9,r11 1955 addc r9,r9,r11
2046 addze r10,r10 1956 addze r10,r10
2047 addc r9,r9,r12 1957 addc r9,r9,r12
2048 addze r12,r10 1958 addze r12,r10
2049 $ST r9,0(r3) 1959 $ST r9,0(r3)
2050 1960
2051 » bc» BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios 1961 » bdz» Lppcasm_maw_adios
2052 #mul_add(rp[2],ap[2],w,c1); 1962 #mul_add(rp[2],ap[2],w,c1);
2053 $LDU r8,$BNSZ(r4) 1963 $LDU r8,$BNSZ(r4)
2054 $UMULL r9,r6,r8 1964 $UMULL r9,r6,r8
2055 $UMULH r10,r6,r8 1965 $UMULH r10,r6,r8
2056 $LDU r11,$BNSZ(r3) 1966 $LDU r11,$BNSZ(r3)
2057 addc r9,r9,r11 1967 addc r9,r9,r11
2058 addze r10,r10 1968 addze r10,r10
2059 addc r9,r9,r12 1969 addc r9,r9,r12
2060 addze r12,r10 1970 addze r12,r10
2061 $ST r9,0(r3) 1971 $ST r9,0(r3)
2062 1972
2063 Lppcasm_maw_adios: 1973 Lppcasm_maw_adios:
2064 addi r3,r12,0 1974 addi r3,r12,0
2065 » bclr» BO_ALWAYS,CR0_LT 1975 » blr
2066 .long 0x00000000 1976 .long 0x00000000
2067 .align 4 1977 .align 4
2068 EOF 1978 EOF
2069 » $data =~ s/\`([^\`]*)\`/eval $1/gem; 1979 $data =~ s/\`([^\`]*)\`/eval $1/gem;
2070 1980 print $data;
2071 » # if some assembler chokes on some simplified mnemonic, 1981 close STDOUT;
2072 » # this is the spot to fix it up, e.g.:
2073 » # GNU as doesn't seem to accept cmplw, 32-bit unsigned compare
2074 » $data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm;
2075 » # assembler X doesn't accept li, load immediate value
2076 » #$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm;
2077 » # assembler Y chokes on apostrophes in comments
2078 » $data =~ s/'//gm;
2079 » return($data);
2080 }
OLDNEW
« no previous file with comments | « openssl/crypto/bn/asm/mo-586.pl ('k') | openssl/crypto/bn/asm/ppc-mont.pl » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698