OLD | NEW |
1 #!/usr/bin/env perl | 1 #!/usr/bin/env perl |
2 # | 2 # |
3 # Implemented as a Perl wrapper as we want to support several different | 3 # Implemented as a Perl wrapper as we want to support several different |
4 # architectures with single file. We pick up the target based on the | 4 # architectures with single file. We pick up the target based on the |
5 # file name we are asked to generate. | 5 # file name we are asked to generate. |
6 # | 6 # |
7 # It should be noted though that this perl code is nothing like | 7 # It should be noted though that this perl code is nothing like |
8 # <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much | 8 # <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much |
9 # as pre-processor to cover for platform differences in name decoration, | 9 # as pre-processor to cover for platform differences in name decoration, |
10 # linker tables, 32-/64-bit instruction sets... | 10 # linker tables, 32-/64-bit instruction sets... |
(...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
93 #dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2 | 93 #dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2 |
94 #dsa 1024 bits 0.0018s 0.0023s 545.0 442.2 | 94 #dsa 1024 bits 0.0018s 0.0023s 545.0 442.2 |
95 #dsa 2048 bits 0.0061s 0.0075s 163.5 132.8 | 95 #dsa 2048 bits 0.0061s 0.0075s 163.5 132.8 |
96 # | 96 # |
97 # Performance increase of ~60% | 97 # Performance increase of ~60% |
98 # | 98 # |
99 # If you have comments or suggestions to improve code send | 99 # If you have comments or suggestions to improve code send |
100 # me a note at schari@us.ibm.com | 100 # me a note at schari@us.ibm.com |
101 # | 101 # |
102 | 102 |
103 $opf = shift; | 103 $flavour = shift; |
104 | 104 |
105 if ($opf =~ /32\.s/) { | 105 if ($flavour =~ /32/) { |
106 $BITS= 32; | 106 $BITS= 32; |
107 $BNSZ= $BITS/8; | 107 $BNSZ= $BITS/8; |
108 $ISA= "\"ppc\""; | 108 $ISA= "\"ppc\""; |
109 | 109 |
110 $LD= "lwz"; # load | 110 $LD= "lwz"; # load |
111 $LDU= "lwzu"; # load and update | 111 $LDU= "lwzu"; # load and update |
112 $ST= "stw"; # store | 112 $ST= "stw"; # store |
113 $STU= "stwu"; # store and update | 113 $STU= "stwu"; # store and update |
114 $UMULL= "mullw"; # unsigned multiply low | 114 $UMULL= "mullw"; # unsigned multiply low |
115 $UMULH= "mulhwu"; # unsigned multiply high | 115 $UMULH= "mulhwu"; # unsigned multiply high |
116 $UDIV= "divwu"; # unsigned divide | 116 $UDIV= "divwu"; # unsigned divide |
117 $UCMPI= "cmplwi"; # unsigned compare with immediate | 117 $UCMPI= "cmplwi"; # unsigned compare with immediate |
118 $UCMP= "cmplw"; # unsigned compare | 118 $UCMP= "cmplw"; # unsigned compare |
119 $CNTLZ= "cntlzw"; # count leading zeros | 119 $CNTLZ= "cntlzw"; # count leading zeros |
120 $SHL= "slw"; # shift left | 120 $SHL= "slw"; # shift left |
121 $SHR= "srw"; # unsigned shift right | 121 $SHR= "srw"; # unsigned shift right |
122 $SHRI= "srwi"; # unsigned shift right by immediate | 122 $SHRI= "srwi"; # unsigned shift right by immediate |
123 $SHLI= "slwi"; # shift left by immediate | 123 $SHLI= "slwi"; # shift left by immediate |
124 $CLRU= "clrlwi"; # clear upper bits | 124 $CLRU= "clrlwi"; # clear upper bits |
125 $INSR= "insrwi"; # insert right | 125 $INSR= "insrwi"; # insert right |
126 $ROTL= "rotlwi"; # rotate left by immediate | 126 $ROTL= "rotlwi"; # rotate left by immediate |
127 $TR= "tw"; # conditional trap | 127 $TR= "tw"; # conditional trap |
128 } elsif ($opf =~ /64\.s/) { | 128 } elsif ($flavour =~ /64/) { |
129 $BITS= 64; | 129 $BITS= 64; |
130 $BNSZ= $BITS/8; | 130 $BNSZ= $BITS/8; |
131 $ISA= "\"ppc64\""; | 131 $ISA= "\"ppc64\""; |
132 | 132 |
133 # same as above, but 64-bit mnemonics... | 133 # same as above, but 64-bit mnemonics... |
134 $LD= "ld"; # load | 134 $LD= "ld"; # load |
135 $LDU= "ldu"; # load and update | 135 $LDU= "ldu"; # load and update |
136 $ST= "std"; # store | 136 $ST= "std"; # store |
137 $STU= "stdu"; # store and update | 137 $STU= "stdu"; # store and update |
138 $UMULL= "mulld"; # unsigned multiply low | 138 $UMULL= "mulld"; # unsigned multiply low |
139 $UMULH= "mulhdu"; # unsigned multiply high | 139 $UMULH= "mulhdu"; # unsigned multiply high |
140 $UDIV= "divdu"; # unsigned divide | 140 $UDIV= "divdu"; # unsigned divide |
141 $UCMPI= "cmpldi"; # unsigned compare with immediate | 141 $UCMPI= "cmpldi"; # unsigned compare with immediate |
142 $UCMP= "cmpld"; # unsigned compare | 142 $UCMP= "cmpld"; # unsigned compare |
143 $CNTLZ= "cntlzd"; # count leading zeros | 143 $CNTLZ= "cntlzd"; # count leading zeros |
144 $SHL= "sld"; # shift left | 144 $SHL= "sld"; # shift left |
145 $SHR= "srd"; # unsigned shift right | 145 $SHR= "srd"; # unsigned shift right |
146 $SHRI= "srdi"; # unsigned shift right by immediate | 146 $SHRI= "srdi"; # unsigned shift right by immediate |
147 $SHLI= "sldi"; # shift left by immediate | 147 $SHLI= "sldi"; # shift left by immediate |
148 $CLRU= "clrldi"; # clear upper bits | 148 $CLRU= "clrldi"; # clear upper bits |
149 $INSR= "insrdi"; # insert right | 149 $INSR= "insrdi"; # insert right |
150 $ROTL= "rotldi"; # rotate left by immediate | 150 $ROTL= "rotldi"; # rotate left by immediate |
151 $TR= "td"; # conditional trap | 151 $TR= "td"; # conditional trap |
152 } else { die "nonsense $opf"; } | 152 } else { die "nonsense $flavour"; } |
153 | 153 |
154 ( defined shift || open STDOUT,">$opf" ) || die "can't open $opf: $!"; | 154 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 155 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or |
| 156 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or |
| 157 die "can't locate ppc-xlate.pl"; |
155 | 158 |
156 # function entry points from the AIX code | 159 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; |
157 # | |
158 # There are other, more elegant, ways to handle this. We (IBM) chose | |
159 # this approach as it plays well with scripts we run to 'namespace' | |
160 # OpenSSL .i.e. we add a prefix to all the public symbols so we can | |
161 # co-exist in the same process with other implementations of OpenSSL. | |
162 # 'cleverer' ways of doing these substitutions tend to hide data we | |
163 # need to be obvious. | |
164 # | |
165 my @items = ("bn_sqr_comba4", | |
166 » "bn_sqr_comba8", | |
167 » "bn_mul_comba4", | |
168 » "bn_mul_comba8", | |
169 » "bn_sub_words", | |
170 » "bn_add_words", | |
171 » "bn_div_words", | |
172 » "bn_sqr_words", | |
173 » "bn_mul_words", | |
174 » "bn_mul_add_words"); | |
175 | 160 |
176 if ($opf =~ /linux/)»{ do_linux();» } | 161 $data=<<EOF; |
177 elsif ($opf =~ /aix/)» { do_aix();» } | |
178 elsif ($opf =~ /osx/)» { do_osx();» } | |
179 else» » » { do_bsd();» } | |
180 | |
181 sub do_linux { | |
182 $d=&data(); | |
183 | |
184 if ($BITS==64) { | |
185 foreach $t (@items) { | |
186 $d =~ s/\.$t:/\ | |
187 \t.section\t".opd","aw"\ | |
188 \t.align\t3\ | |
189 \t.globl\t$t\ | |
190 $t:\ | |
191 \t.quad\t.$t,.TOC.\@tocbase,0\ | |
192 \t.size\t$t,24\ | |
193 \t.previous\n\ | |
194 \t.type\t.$t,\@function\ | |
195 \t.globl\t.$t\ | |
196 .$t:/g; | |
197 } | |
198 } | |
199 else { | |
200 foreach $t (@items) { | |
201 $d=~s/\.$t/$t/g; | |
202 } | |
203 } | |
204 # hide internal labels to avoid pollution of name table... | |
205 $d=~s/Lppcasm_/.Lppcasm_/gm; | |
206 print $d; | |
207 } | |
208 | |
209 sub do_aix { | |
210 # AIX assembler is smart enough to please the linker without | |
211 # making us do something special... | |
212 print &data(); | |
213 } | |
214 | |
215 # MacOSX 32 bit | |
216 sub do_osx { | |
217 $d=&data(); | |
218 # Change the bn symbol prefix from '.' to '_' | |
219 foreach $t (@items) { | |
220 $d=~s/\.$t/_$t/g; | |
221 } | |
222 # Change .machine to something OS X asm will accept | |
223 $d=~s/\.machine.*/.text/g; | |
224 $d=~s/\#/;/g; # change comment from '#' to ';' | |
225 print $d; | |
226 } | |
227 | |
228 # BSD (Untested) | |
229 sub do_bsd { | |
230 $d=&data(); | |
231 foreach $t (@items) { | |
232 $d=~s/\.$t/_$t/g; | |
233 } | |
234 print $d; | |
235 } | |
236 | |
237 sub data { | |
238 » local($data)=<<EOF; | |
239 #-------------------------------------------------------------------- | 162 #-------------------------------------------------------------------- |
240 # | 163 # |
241 # | 164 # |
242 # | 165 # |
243 # | 166 # |
244 # File: ppc32.s | 167 # File: ppc32.s |
245 # | 168 # |
246 # Created by: Suresh Chari | 169 # Created by: Suresh Chari |
247 # IBM Thomas J. Watson Research Library | 170 # IBM Thomas J. Watson Research Library |
248 # Hawthorne, NY | 171 # Hawthorne, NY |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
290 # architecture the optimizations in this file do | 213 # architecture the optimizations in this file do |
291 # NOT provide much improvement. | 214 # NOT provide much improvement. |
292 # | 215 # |
293 # If you have comments or suggestions to improve code send | 216 # If you have comments or suggestions to improve code send |
294 # me a note at schari\@us.ibm.com | 217 # me a note at schari\@us.ibm.com |
295 # | 218 # |
296 #-------------------------------------------------------------------------- | 219 #-------------------------------------------------------------------------- |
297 # | 220 # |
298 # Defines to be used in the assembly code. | 221 # Defines to be used in the assembly code. |
299 # | 222 # |
300 .set r0,0» # we use it as storage for value of 0 | 223 #.set r0,0» # we use it as storage for value of 0 |
301 .set SP,1» # preserved | 224 #.set SP,1» # preserved |
302 .set RTOC,2» # preserved | 225 #.set RTOC,2» # preserved |
303 .set r3,3» # 1st argument/return value | 226 #.set r3,3» # 1st argument/return value |
304 .set r4,4» # 2nd argument/volatile register | 227 #.set r4,4» # 2nd argument/volatile register |
305 .set r5,5» # 3rd argument/volatile register | 228 #.set r5,5» # 3rd argument/volatile register |
306 .set r6,6» # ... | 229 #.set r6,6» # ... |
307 .set r7,7 | 230 #.set r7,7 |
308 .set r8,8 | 231 #.set r8,8 |
309 .set r9,9 | 232 #.set r9,9 |
310 .set r10,10 | 233 #.set r10,10 |
311 .set r11,11 | 234 #.set r11,11 |
312 .set r12,12 | 235 #.set r12,12 |
313 .set r13,13» # not used, nor any other "below" it... | 236 #.set r13,13» # not used, nor any other "below" it... |
314 | |
315 .set BO_IF_NOT,4 | |
316 .set BO_IF,12 | |
317 .set BO_dCTR_NZERO,16 | |
318 .set BO_dCTR_ZERO,18 | |
319 .set BO_ALWAYS,20 | |
320 .set CR0_LT,0; | |
321 .set CR0_GT,1; | |
322 .set CR0_EQ,2 | |
323 .set CR1_FX,4; | |
324 .set CR1_FEX,5; | |
325 .set CR1_VX,6 | |
326 .set LR,8 | |
327 | 237 |
328 # Declare function names to be global | 238 # Declare function names to be global |
329 # NOTE: For gcc these names MUST be changed to remove | 239 # NOTE: For gcc these names MUST be changed to remove |
330 # the first . i.e. for example change ".bn_sqr_comba4" | 240 # the first . i.e. for example change ".bn_sqr_comba4" |
331 # to "bn_sqr_comba4". This should be automatically done | 241 # to "bn_sqr_comba4". This should be automatically done |
332 # in the build. | 242 # in the build. |
333 | 243 |
334 .globl .bn_sqr_comba4 | 244 .globl .bn_sqr_comba4 |
335 .globl .bn_sqr_comba8 | 245 .globl .bn_sqr_comba8 |
336 .globl .bn_mul_comba4 | 246 .globl .bn_mul_comba4 |
337 .globl .bn_mul_comba8 | 247 .globl .bn_mul_comba8 |
338 .globl .bn_sub_words | 248 .globl .bn_sub_words |
339 .globl .bn_add_words | 249 .globl .bn_add_words |
340 .globl .bn_div_words | 250 .globl .bn_div_words |
341 .globl .bn_sqr_words | 251 .globl .bn_sqr_words |
342 .globl .bn_mul_words | 252 .globl .bn_mul_words |
343 .globl .bn_mul_add_words | 253 .globl .bn_mul_add_words |
344 | 254 |
345 # .text section | 255 # .text section |
346 | 256 |
347 » .machine» $ISA | 257 » .machine» "any" |
348 | 258 |
349 # | 259 # |
350 # NOTE: The following label name should be changed to | 260 # NOTE: The following label name should be changed to |
351 # "bn_sqr_comba4" i.e. remove the first dot | 261 # "bn_sqr_comba4" i.e. remove the first dot |
352 # for the gcc compiler. This should be automatically | 262 # for the gcc compiler. This should be automatically |
353 # done in the build | 263 # done in the build |
354 # | 264 # |
355 | 265 |
356 .align 4 | 266 .align 4 |
357 .bn_sqr_comba4: | 267 .bn_sqr_comba4: |
(...skipping 113 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
471 addze r10,r10 | 381 addze r10,r10 |
472 $ST r11,`5*$BNSZ`(r3) #r[5] = c3 | 382 $ST r11,`5*$BNSZ`(r3) #r[5] = c3 |
473 #sqr_add_c(a,3,c1,c2,c3); | 383 #sqr_add_c(a,3,c1,c2,c3); |
474 $UMULL r7,r6,r6 | 384 $UMULL r7,r6,r6 |
475 $UMULH r8,r6,r6 | 385 $UMULH r8,r6,r6 |
476 addc r9,r7,r9 | 386 addc r9,r7,r9 |
477 adde r10,r8,r10 | 387 adde r10,r8,r10 |
478 | 388 |
479 $ST r9,`6*$BNSZ`(r3) #r[6]=c1 | 389 $ST r9,`6*$BNSZ`(r3) #r[6]=c1 |
480 $ST r10,`7*$BNSZ`(r3) #r[7]=c2 | 390 $ST r10,`7*$BNSZ`(r3) #r[7]=c2 |
481 » bclr» BO_ALWAYS,CR0_LT | 391 » blr |
482 .long 0x00000000 | 392 .long 0x00000000 |
483 | 393 |
484 # | 394 # |
485 # NOTE: The following label name should be changed to | 395 # NOTE: The following label name should be changed to |
486 # "bn_sqr_comba8" i.e. remove the first dot | 396 # "bn_sqr_comba8" i.e. remove the first dot |
487 # for the gcc compiler. This should be automatically | 397 # for the gcc compiler. This should be automatically |
488 # done in the build | 398 # done in the build |
489 # | 399 # |
490 | 400 |
491 .align 4 | 401 .align 4 |
(...skipping 404 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
896 $ST r10,`13*$BNSZ`(r3) #r[13]=c2; | 806 $ST r10,`13*$BNSZ`(r3) #r[13]=c2; |
897 #sqr_add_c(a,7,c3,c1,c2); | 807 #sqr_add_c(a,7,c3,c1,c2); |
898 $UMULL r7,r6,r6 | 808 $UMULL r7,r6,r6 |
899 $UMULH r8,r6,r6 | 809 $UMULH r8,r6,r6 |
900 addc r11,r7,r11 | 810 addc r11,r7,r11 |
901 adde r9,r8,r9 | 811 adde r9,r8,r9 |
902 $ST r11,`14*$BNSZ`(r3) #r[14]=c3; | 812 $ST r11,`14*$BNSZ`(r3) #r[14]=c3; |
903 $ST r9, `15*$BNSZ`(r3) #r[15]=c1; | 813 $ST r9, `15*$BNSZ`(r3) #r[15]=c1; |
904 | 814 |
905 | 815 |
906 » bclr» BO_ALWAYS,CR0_LT | 816 » blr |
907 | 817 |
908 .long 0x00000000 | 818 .long 0x00000000 |
909 | 819 |
910 # | 820 # |
911 # NOTE: The following label name should be changed to | 821 # NOTE: The following label name should be changed to |
912 # "bn_mul_comba4" i.e. remove the first dot | 822 # "bn_mul_comba4" i.e. remove the first dot |
913 # for the gcc compiler. This should be automatically | 823 # for the gcc compiler. This should be automatically |
914 # done in the build | 824 # done in the build |
915 # | 825 # |
916 | 826 |
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1032 $ST r11,`4*$BNSZ`(r3) #r[4]=c2 | 942 $ST r11,`4*$BNSZ`(r3) #r[4]=c2 |
1033 #mul_add_c(a[2],b[3],c3,c1,c2); | 943 #mul_add_c(a[2],b[3],c3,c1,c2); |
1034 $LD r6,`2*$BNSZ`(r4) | 944 $LD r6,`2*$BNSZ`(r4) |
1035 $UMULL r8,r6,r7 | 945 $UMULL r8,r6,r7 |
1036 $UMULH r9,r6,r7 | 946 $UMULH r9,r6,r7 |
1037 addc r12,r8,r12 | 947 addc r12,r8,r12 |
1038 adde r10,r9,r10 | 948 adde r10,r9,r10 |
1039 addze r11,r0 | 949 addze r11,r0 |
1040 #mul_add_c(a[3],b[2],c3,c1,c2); | 950 #mul_add_c(a[3],b[2],c3,c1,c2); |
1041 $LD r6,`3*$BNSZ`(r4) | 951 $LD r6,`3*$BNSZ`(r4) |
1042 » $LD» r7,`2*$BNSZ`(r4) | 952 » $LD» r7,`2*$BNSZ`(r5) |
1043 $UMULL r8,r6,r7 | 953 $UMULL r8,r6,r7 |
1044 $UMULH r9,r6,r7 | 954 $UMULH r9,r6,r7 |
1045 addc r12,r8,r12 | 955 addc r12,r8,r12 |
1046 adde r10,r9,r10 | 956 adde r10,r9,r10 |
1047 addze r11,r11 | 957 addze r11,r11 |
1048 $ST r12,`5*$BNSZ`(r3) #r[5]=c3 | 958 $ST r12,`5*$BNSZ`(r3) #r[5]=c3 |
1049 #mul_add_c(a[3],b[3],c1,c2,c3); | 959 #mul_add_c(a[3],b[3],c1,c2,c3); |
1050 $LD r7,`3*$BNSZ`(r5) | 960 $LD r7,`3*$BNSZ`(r5) |
1051 $UMULL r8,r6,r7 | 961 $UMULL r8,r6,r7 |
1052 $UMULH r9,r6,r7 | 962 $UMULH r9,r6,r7 |
1053 addc r10,r8,r10 | 963 addc r10,r8,r10 |
1054 adde r11,r9,r11 | 964 adde r11,r9,r11 |
1055 | 965 |
1056 $ST r10,`6*$BNSZ`(r3) #r[6]=c1 | 966 $ST r10,`6*$BNSZ`(r3) #r[6]=c1 |
1057 $ST r11,`7*$BNSZ`(r3) #r[7]=c2 | 967 $ST r11,`7*$BNSZ`(r3) #r[7]=c2 |
1058 » bclr» BO_ALWAYS,CR0_LT | 968 » blr |
1059 .long 0x00000000 | 969 .long 0x00000000 |
1060 | 970 |
1061 # | 971 # |
1062 # NOTE: The following label name should be changed to | 972 # NOTE: The following label name should be changed to |
1063 # "bn_mul_comba8" i.e. remove the first dot | 973 # "bn_mul_comba8" i.e. remove the first dot |
1064 # for the gcc compiler. This should be automatically | 974 # for the gcc compiler. This should be automatically |
1065 # done in the build | 975 # done in the build |
1066 # | 976 # |
1067 | 977 |
1068 .align 4 | 978 .align 4 |
(...skipping 515 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1584 addze r10,r10 | 1494 addze r10,r10 |
1585 $ST r11,`13*$BNSZ`(r3) #r[13]=c2; | 1495 $ST r11,`13*$BNSZ`(r3) #r[13]=c2; |
1586 #mul_add_c(a[7],b[7],c3,c1,c2); | 1496 #mul_add_c(a[7],b[7],c3,c1,c2); |
1587 $LD r7,`7*$BNSZ`(r5) | 1497 $LD r7,`7*$BNSZ`(r5) |
1588 $UMULL r8,r6,r7 | 1498 $UMULL r8,r6,r7 |
1589 $UMULH r9,r6,r7 | 1499 $UMULH r9,r6,r7 |
1590 addc r12,r12,r8 | 1500 addc r12,r12,r8 |
1591 adde r10,r10,r9 | 1501 adde r10,r10,r9 |
1592 $ST r12,`14*$BNSZ`(r3) #r[14]=c3; | 1502 $ST r12,`14*$BNSZ`(r3) #r[14]=c3; |
1593 $ST r10,`15*$BNSZ`(r3) #r[15]=c1; | 1503 $ST r10,`15*$BNSZ`(r3) #r[15]=c1; |
1594 » bclr» BO_ALWAYS,CR0_LT | 1504 » blr |
1595 .long 0x00000000 | 1505 .long 0x00000000 |
1596 | 1506 |
1597 # | 1507 # |
1598 # NOTE: The following label name should be changed to | 1508 # NOTE: The following label name should be changed to |
1599 # "bn_sub_words" i.e. remove the first dot | 1509 # "bn_sub_words" i.e. remove the first dot |
1600 # for the gcc compiler. This should be automatically | 1510 # for the gcc compiler. This should be automatically |
1601 # done in the build | 1511 # done in the build |
1602 # | 1512 # |
1603 # | 1513 # |
1604 .align 4 | 1514 .align 4 |
(...skipping 11 matching lines...) Expand all Loading... |
1616 # Note: No loop unrolling done since this is not a performance | 1526 # Note: No loop unrolling done since this is not a performance |
1617 # critical loop. | 1527 # critical loop. |
1618 | 1528 |
1619 xor r0,r0,r0 #set r0 = 0 | 1529 xor r0,r0,r0 #set r0 = 0 |
1620 # | 1530 # |
1621 # check for r6 = 0 AND set carry bit. | 1531 # check for r6 = 0 AND set carry bit. |
1622 # | 1532 # |
1623 subfc. r7,r0,r6 # If r6 is 0 then result is 0. | 1533 subfc. r7,r0,r6 # If r6 is 0 then result is 0. |
1624 # if r6 > 0 then result !=0 | 1534 # if r6 > 0 then result !=0 |
1625 # In either case carry bit is set. | 1535 # In either case carry bit is set. |
1626 » bc» BO_IF,CR0_EQ,Lppcasm_sub_adios | 1536 » beq» Lppcasm_sub_adios |
1627 addi r4,r4,-$BNSZ | 1537 addi r4,r4,-$BNSZ |
1628 addi r3,r3,-$BNSZ | 1538 addi r3,r3,-$BNSZ |
1629 addi r5,r5,-$BNSZ | 1539 addi r5,r5,-$BNSZ |
1630 mtctr r6 | 1540 mtctr r6 |
1631 Lppcasm_sub_mainloop: | 1541 Lppcasm_sub_mainloop: |
1632 $LDU r7,$BNSZ(r4) | 1542 $LDU r7,$BNSZ(r4) |
1633 $LDU r8,$BNSZ(r5) | 1543 $LDU r8,$BNSZ(r5) |
1634 subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8) | 1544 subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8) |
1635 # if carry = 1 this is r7-r8. Else it | 1545 # if carry = 1 this is r7-r8. Else it |
1636 # is r7-r8 -1 as we need. | 1546 # is r7-r8 -1 as we need. |
1637 $STU r6,$BNSZ(r3) | 1547 $STU r6,$BNSZ(r3) |
1638 » bc» BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop | 1548 » bdnz-» Lppcasm_sub_mainloop |
1639 Lppcasm_sub_adios: | 1549 Lppcasm_sub_adios: |
1640 subfze r3,r0 # if carry bit is set then r3 = 0 else -1 | 1550 subfze r3,r0 # if carry bit is set then r3 = 0 else -1 |
1641 andi. r3,r3,1 # keep only last bit. | 1551 andi. r3,r3,1 # keep only last bit. |
1642 » bclr» BO_ALWAYS,CR0_LT | 1552 » blr |
1643 .long 0x00000000 | 1553 .long 0x00000000 |
1644 | 1554 |
1645 | 1555 |
1646 # | 1556 # |
1647 # NOTE: The following label name should be changed to | 1557 # NOTE: The following label name should be changed to |
1648 # "bn_add_words" i.e. remove the first dot | 1558 # "bn_add_words" i.e. remove the first dot |
1649 # for the gcc compiler. This should be automatically | 1559 # for the gcc compiler. This should be automatically |
1650 # done in the build | 1560 # done in the build |
1651 # | 1561 # |
1652 | 1562 |
(...skipping 10 matching lines...) Expand all Loading... |
1663 # r6 = n | 1573 # r6 = n |
1664 # | 1574 # |
1665 # Note: No loop unrolling done since this is not a performance | 1575 # Note: No loop unrolling done since this is not a performance |
1666 # critical loop. | 1576 # critical loop. |
1667 | 1577 |
1668 xor r0,r0,r0 | 1578 xor r0,r0,r0 |
1669 # | 1579 # |
1670 # check for r6 = 0. Is this needed? | 1580 # check for r6 = 0. Is this needed? |
1671 # | 1581 # |
1672 addic. r6,r6,0 #test r6 and clear carry bit. | 1582 addic. r6,r6,0 #test r6 and clear carry bit. |
1673 » bc» BO_IF,CR0_EQ,Lppcasm_add_adios | 1583 » beq» Lppcasm_add_adios |
1674 addi r4,r4,-$BNSZ | 1584 addi r4,r4,-$BNSZ |
1675 addi r3,r3,-$BNSZ | 1585 addi r3,r3,-$BNSZ |
1676 addi r5,r5,-$BNSZ | 1586 addi r5,r5,-$BNSZ |
1677 mtctr r6 | 1587 mtctr r6 |
1678 Lppcasm_add_mainloop: | 1588 Lppcasm_add_mainloop: |
1679 $LDU r7,$BNSZ(r4) | 1589 $LDU r7,$BNSZ(r4) |
1680 $LDU r8,$BNSZ(r5) | 1590 $LDU r8,$BNSZ(r5) |
1681 adde r8,r7,r8 | 1591 adde r8,r7,r8 |
1682 $STU r8,$BNSZ(r3) | 1592 $STU r8,$BNSZ(r3) |
1683 » bc» BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop | 1593 » bdnz-» Lppcasm_add_mainloop |
1684 Lppcasm_add_adios: | 1594 Lppcasm_add_adios: |
1685 addze r3,r0 #return carry bit. | 1595 addze r3,r0 #return carry bit. |
1686 » bclr» BO_ALWAYS,CR0_LT | 1596 » blr |
1687 .long 0x00000000 | 1597 .long 0x00000000 |
1688 | 1598 |
1689 # | 1599 # |
1690 # NOTE: The following label name should be changed to | 1600 # NOTE: The following label name should be changed to |
1691 # "bn_div_words" i.e. remove the first dot | 1601 # "bn_div_words" i.e. remove the first dot |
1692 # for the gcc compiler. This should be automatically | 1602 # for the gcc compiler. This should be automatically |
1693 # done in the build | 1603 # done in the build |
1694 # | 1604 # |
1695 | 1605 |
1696 .align 4 | 1606 .align 4 |
1697 .bn_div_words: | 1607 .bn_div_words: |
1698 # | 1608 # |
1699 # This is a cleaned up version of code generated by | 1609 # This is a cleaned up version of code generated by |
1700 # the AIX compiler. The only optimization is to use | 1610 # the AIX compiler. The only optimization is to use |
1701 # the PPC instruction to count leading zeros instead | 1611 # the PPC instruction to count leading zeros instead |
1702 # of call to num_bits_word. Since this was compiled | 1612 # of call to num_bits_word. Since this was compiled |
1703 # only at level -O2 we can possibly squeeze it more? | 1613 # only at level -O2 we can possibly squeeze it more? |
1704 # | 1614 # |
1705 # r3 = h | 1615 # r3 = h |
1706 # r4 = l | 1616 # r4 = l |
1707 # r5 = d | 1617 # r5 = d |
1708 | 1618 |
1709 $UCMPI 0,r5,0 # compare r5 and 0 | 1619 $UCMPI 0,r5,0 # compare r5 and 0 |
1710 » bc» BO_IF_NOT,CR0_EQ,Lppcasm_div1» # proceed if d!=0 | 1620 » bne» Lppcasm_div1» » # proceed if d!=0 |
1711 li r3,-1 # d=0 return -1 | 1621 li r3,-1 # d=0 return -1 |
1712 » bclr» BO_ALWAYS,CR0_LT» | 1622 » blr |
1713 Lppcasm_div1: | 1623 Lppcasm_div1: |
1714 xor r0,r0,r0 #r0=0 | 1624 xor r0,r0,r0 #r0=0 |
1715 li r8,$BITS | 1625 li r8,$BITS |
1716 $CNTLZ. r7,r5 #r7 = num leading 0s in d. | 1626 $CNTLZ. r7,r5 #r7 = num leading 0s in d. |
1717 » bc» BO_IF,CR0_EQ,Lppcasm_div2» #proceed if no leading zeros | 1627 » beq» Lppcasm_div2» » #proceed if no leading zeros |
1718 subf r8,r7,r8 #r8 = BN_num_bits_word(d) | 1628 subf r8,r7,r8 #r8 = BN_num_bits_word(d) |
1719 $SHR. r9,r3,r8 #are there any bits above r8'th? | 1629 $SHR. r9,r3,r8 #are there any bits above r8'th? |
1720 $TR 16,r9,r0 #if there're, signal to dump core... | 1630 $TR 16,r9,r0 #if there're, signal to dump core... |
1721 Lppcasm_div2: | 1631 Lppcasm_div2: |
1722 $UCMP 0,r3,r5 #h>=d? | 1632 $UCMP 0,r3,r5 #h>=d? |
1723 » bc» BO_IF,CR0_LT,Lppcasm_div3» #goto Lppcasm_div3 if not | 1633 » blt» Lppcasm_div3» » #goto Lppcasm_div3 if not |
1724 subf r3,r5,r3 #h-=d ; | 1634 subf r3,r5,r3 #h-=d ; |
1725 Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i | 1635 Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i |
1726 cmpi 0,0,r7,0 # is (i == 0)? | 1636 cmpi 0,0,r7,0 # is (i == 0)? |
1727 » bc» BO_IF,CR0_EQ,Lppcasm_div4 | 1637 » beq» Lppcasm_div4 |
1728 $SHL r3,r3,r7 # h = (h<< i) | 1638 $SHL r3,r3,r7 # h = (h<< i) |
1729 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i) | 1639 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i) |
1730 $SHL r5,r5,r7 # d<<=i | 1640 $SHL r5,r5,r7 # d<<=i |
1731 or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i)) | 1641 or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i)) |
1732 $SHL r4,r4,r7 # l <<=i | 1642 $SHL r4,r4,r7 # l <<=i |
1733 Lppcasm_div4: | 1643 Lppcasm_div4: |
1734 $SHRI r9,r5,`$BITS/2` # r9 = dh | 1644 $SHRI r9,r5,`$BITS/2` # r9 = dh |
1735 # dl will be computed when needed | 1645 # dl will be computed when needed |
1736 # as it saves registers. | 1646 # as it saves registers. |
1737 li r6,2 #r6=2 | 1647 li r6,2 #r6=2 |
1738 mtctr r6 #counter will be in count. | 1648 mtctr r6 #counter will be in count. |
1739 Lppcasm_divouterloop: | 1649 Lppcasm_divouterloop: |
1740 $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4) | 1650 $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4) |
1741 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4 | 1651 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4 |
1742 # compute here for innerloop. | 1652 # compute here for innerloop. |
1743 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh | 1653 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh |
1744 » bc» BO_IF_NOT,CR0_EQ,Lppcasm_div5» # goto Lppcasm_div5 if not | 1654 » bne» Lppcasm_div5» » # goto Lppcasm_div5 if not |
1745 | 1655 |
1746 li r8,-1 | 1656 li r8,-1 |
1747 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l | 1657 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l |
1748 b Lppcasm_div6 | 1658 b Lppcasm_div6 |
1749 Lppcasm_div5: | 1659 Lppcasm_div5: |
1750 $UDIV r8,r3,r9 #q = h/dh | 1660 $UDIV r8,r3,r9 #q = h/dh |
1751 Lppcasm_div6: | 1661 Lppcasm_div6: |
1752 $UMULL r12,r9,r8 #th = q*dh | 1662 $UMULL r12,r9,r8 #th = q*dh |
1753 $CLRU r10,r5,`$BITS/2` #r10=dl | 1663 $CLRU r10,r5,`$BITS/2` #r10=dl |
1754 $UMULL r6,r8,r10 #tl = q*dl | 1664 $UMULL r6,r8,r10 #tl = q*dl |
1755 | 1665 |
1756 Lppcasm_divinnerloop: | 1666 Lppcasm_divinnerloop: |
1757 subf r10,r12,r3 #t = h -th | 1667 subf r10,r12,r3 #t = h -th |
1758 $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of... | 1668 $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of... |
1759 addic. r7,r7,0 #test if r7 == 0. used below. | 1669 addic. r7,r7,0 #test if r7 == 0. used below. |
1760 # now want to compute | 1670 # now want to compute |
1761 # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4) | 1671 # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4) |
1762 # the following 2 instructions do that | 1672 # the following 2 instructions do that |
1763 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4) | 1673 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4) |
1764 or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4) | 1674 or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4) |
1765 » $UCMP» 1,r6,r7»» » # compare (tl <= r7) | 1675 » $UCMP» cr1,r6,r7» » # compare (tl <= r7) |
1766 » bc» BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit | 1676 » bne» Lppcasm_divinnerexit |
1767 » bc» BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit | 1677 » ble» cr1,Lppcasm_divinnerexit |
1768 addi r8,r8,-1 #q-- | 1678 addi r8,r8,-1 #q-- |
1769 subf r12,r9,r12 #th -=dh | 1679 subf r12,r9,r12 #th -=dh |
1770 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop. | 1680 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop. |
1771 subf r6,r10,r6 #tl -=dl | 1681 subf r6,r10,r6 #tl -=dl |
1772 b Lppcasm_divinnerloop | 1682 b Lppcasm_divinnerloop |
1773 Lppcasm_divinnerexit: | 1683 Lppcasm_divinnerexit: |
1774 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4) | 1684 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4) |
1775 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h; | 1685 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h; |
1776 » $UCMP» 1,r4,r11» » # compare l and tl | 1686 » $UCMP» cr1,r4,r11» » # compare l and tl |
1777 add r12,r12,r10 # th+=t | 1687 add r12,r12,r10 # th+=t |
1778 » bc» BO_IF_NOT,CR1_FX,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7 | 1688 » bge» cr1,Lppcasm_div7» # if (l>=tl) goto Lppcasm_div7 |
1779 addi r12,r12,1 # th++ | 1689 addi r12,r12,1 # th++ |
1780 Lppcasm_div7: | 1690 Lppcasm_div7: |
1781 subf r11,r11,r4 #r11=l-tl | 1691 subf r11,r11,r4 #r11=l-tl |
1782 » $UCMP» 1,r3,r12» » #compare h and th | 1692 » $UCMP» cr1,r3,r12» » #compare h and th |
1783 » bc» BO_IF_NOT,CR1_FX,Lppcasm_div8» #if (h>=th) goto Lppcasm_div8 | 1693 » bge» cr1,Lppcasm_div8» #if (h>=th) goto Lppcasm_div8 |
1784 addi r8,r8,-1 # q-- | 1694 addi r8,r8,-1 # q-- |
1785 add r3,r5,r3 # h+=d | 1695 add r3,r5,r3 # h+=d |
1786 Lppcasm_div8: | 1696 Lppcasm_div8: |
1787 subf r12,r12,r3 #r12 = h-th | 1697 subf r12,r12,r3 #r12 = h-th |
1788 $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4 | 1698 $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4 |
1789 # want to compute | 1699 # want to compute |
1790 # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2 | 1700 # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2 |
1791 # the following 2 instructions will do this. | 1701 # the following 2 instructions will do this. |
1792 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2. | 1702 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2. |
1793 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3 | 1703 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3 |
1794 » bc» BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9#if (count==0) break ; | 1704 » bdz» Lppcasm_div9» » #if (count==0) break ; |
1795 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4 | 1705 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4 |
1796 b Lppcasm_divouterloop | 1706 b Lppcasm_divouterloop |
1797 Lppcasm_div9: | 1707 Lppcasm_div9: |
1798 or r3,r8,r0 | 1708 or r3,r8,r0 |
1799 » bclr» BO_ALWAYS,CR0_LT | 1709 » blr |
1800 .long 0x00000000 | 1710 .long 0x00000000 |
1801 | 1711 |
1802 # | 1712 # |
1803 # NOTE: The following label name should be changed to | 1713 # NOTE: The following label name should be changed to |
1804 # "bn_sqr_words" i.e. remove the first dot | 1714 # "bn_sqr_words" i.e. remove the first dot |
1805 # for the gcc compiler. This should be automatically | 1715 # for the gcc compiler. This should be automatically |
1806 # done in the build | 1716 # done in the build |
1807 # | 1717 # |
1808 .align 4 | 1718 .align 4 |
1809 .bn_sqr_words: | 1719 .bn_sqr_words: |
1810 # | 1720 # |
1811 # Optimized version of bn_sqr_words | 1721 # Optimized version of bn_sqr_words |
1812 # | 1722 # |
1813 # void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) | 1723 # void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) |
1814 # | 1724 # |
1815 # r3 = r | 1725 # r3 = r |
1816 # r4 = a | 1726 # r4 = a |
1817 # r5 = n | 1727 # r5 = n |
1818 # | 1728 # |
1819 # r6 = a[i]. | 1729 # r6 = a[i]. |
1820 # r7,r8 = product. | 1730 # r7,r8 = product. |
1821 # | 1731 # |
1822 # No unrolling done here. Not performance critical. | 1732 # No unrolling done here. Not performance critical. |
1823 | 1733 |
1824 addic. r5,r5,0 #test r5. | 1734 addic. r5,r5,0 #test r5. |
1825 » bc» BO_IF,CR0_EQ,Lppcasm_sqr_adios | 1735 » beq» Lppcasm_sqr_adios |
1826 addi r4,r4,-$BNSZ | 1736 addi r4,r4,-$BNSZ |
1827 addi r3,r3,-$BNSZ | 1737 addi r3,r3,-$BNSZ |
1828 mtctr r5 | 1738 mtctr r5 |
1829 Lppcasm_sqr_mainloop: | 1739 Lppcasm_sqr_mainloop: |
1830 #sqr(r[0],r[1],a[0]); | 1740 #sqr(r[0],r[1],a[0]); |
1831 $LDU r6,$BNSZ(r4) | 1741 $LDU r6,$BNSZ(r4) |
1832 $UMULL r7,r6,r6 | 1742 $UMULL r7,r6,r6 |
1833 $UMULH r8,r6,r6 | 1743 $UMULH r8,r6,r6 |
1834 $STU r7,$BNSZ(r3) | 1744 $STU r7,$BNSZ(r3) |
1835 $STU r8,$BNSZ(r3) | 1745 $STU r8,$BNSZ(r3) |
1836 » bc» BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop | 1746 » bdnz-» Lppcasm_sqr_mainloop |
1837 Lppcasm_sqr_adios: | 1747 Lppcasm_sqr_adios: |
1838 » bclr» BO_ALWAYS,CR0_LT | 1748 » blr |
1839 .long 0x00000000 | 1749 .long 0x00000000 |
1840 | 1750 |
1841 | 1751 |
1842 # | 1752 # |
1843 # NOTE: The following label name should be changed to | 1753 # NOTE: The following label name should be changed to |
1844 # "bn_mul_words" i.e. remove the first dot | 1754 # "bn_mul_words" i.e. remove the first dot |
1845 # for the gcc compiler. This should be automatically | 1755 # for the gcc compiler. This should be automatically |
1846 # done in the build | 1756 # done in the build |
1847 # | 1757 # |
1848 | 1758 |
1849 .align 4 | 1759 .align 4 |
1850 .bn_mul_words: | 1760 .bn_mul_words: |
1851 # | 1761 # |
1852 # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | 1762 # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) |
1853 # | 1763 # |
1854 # r3 = rp | 1764 # r3 = rp |
1855 # r4 = ap | 1765 # r4 = ap |
1856 # r5 = num | 1766 # r5 = num |
1857 # r6 = w | 1767 # r6 = w |
1858 xor r0,r0,r0 | 1768 xor r0,r0,r0 |
1859 xor r12,r12,r12 # used for carry | 1769 xor r12,r12,r12 # used for carry |
1860 rlwinm. r7,r5,30,2,31 # num >> 2 | 1770 rlwinm. r7,r5,30,2,31 # num >> 2 |
1861 » bc» BO_IF,CR0_EQ,Lppcasm_mw_REM | 1771 » beq» Lppcasm_mw_REM |
1862 mtctr r7 | 1772 mtctr r7 |
1863 Lppcasm_mw_LOOP: | 1773 Lppcasm_mw_LOOP: |
1864 #mul(rp[0],ap[0],w,c1); | 1774 #mul(rp[0],ap[0],w,c1); |
1865 $LD r8,`0*$BNSZ`(r4) | 1775 $LD r8,`0*$BNSZ`(r4) |
1866 $UMULL r9,r6,r8 | 1776 $UMULL r9,r6,r8 |
1867 $UMULH r10,r6,r8 | 1777 $UMULH r10,r6,r8 |
1868 addc r9,r9,r12 | 1778 addc r9,r9,r12 |
1869 #addze r10,r10 #carry is NOT ignored. | 1779 #addze r10,r10 #carry is NOT ignored. |
1870 #will be taken care of | 1780 #will be taken care of |
1871 #in second spin below | 1781 #in second spin below |
(...skipping 17 matching lines...) Expand all Loading... |
1889 $LD r8,`3*$BNSZ`(r4) | 1799 $LD r8,`3*$BNSZ`(r4) |
1890 $UMULL r11,r6,r8 | 1800 $UMULL r11,r6,r8 |
1891 $UMULH r12,r6,r8 | 1801 $UMULH r12,r6,r8 |
1892 adde r11,r11,r10 | 1802 adde r11,r11,r10 |
1893 addze r12,r12 #this spin we collect carry into | 1803 addze r12,r12 #this spin we collect carry into |
1894 #r12 | 1804 #r12 |
1895 $ST r11,`3*$BNSZ`(r3) | 1805 $ST r11,`3*$BNSZ`(r3) |
1896 | 1806 |
1897 addi r3,r3,`4*$BNSZ` | 1807 addi r3,r3,`4*$BNSZ` |
1898 addi r4,r4,`4*$BNSZ` | 1808 addi r4,r4,`4*$BNSZ` |
1899 » bc» BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP | 1809 » bdnz-» Lppcasm_mw_LOOP |
1900 | 1810 |
1901 Lppcasm_mw_REM: | 1811 Lppcasm_mw_REM: |
1902 andi. r5,r5,0x3 | 1812 andi. r5,r5,0x3 |
1903 » bc» BO_IF,CR0_EQ,Lppcasm_mw_OVER | 1813 » beq» Lppcasm_mw_OVER |
1904 #mul(rp[0],ap[0],w,c1); | 1814 #mul(rp[0],ap[0],w,c1); |
1905 $LD r8,`0*$BNSZ`(r4) | 1815 $LD r8,`0*$BNSZ`(r4) |
1906 $UMULL r9,r6,r8 | 1816 $UMULL r9,r6,r8 |
1907 $UMULH r10,r6,r8 | 1817 $UMULH r10,r6,r8 |
1908 addc r9,r9,r12 | 1818 addc r9,r9,r12 |
1909 addze r10,r10 | 1819 addze r10,r10 |
1910 $ST r9,`0*$BNSZ`(r3) | 1820 $ST r9,`0*$BNSZ`(r3) |
1911 addi r12,r10,0 | 1821 addi r12,r10,0 |
1912 | 1822 |
1913 addi r5,r5,-1 | 1823 addi r5,r5,-1 |
1914 cmpli 0,0,r5,0 | 1824 cmpli 0,0,r5,0 |
1915 » bc» BO_IF,CR0_EQ,Lppcasm_mw_OVER | 1825 » beq» Lppcasm_mw_OVER |
1916 | 1826 |
1917 | 1827 |
1918 #mul(rp[1],ap[1],w,c1); | 1828 #mul(rp[1],ap[1],w,c1); |
1919 $LD r8,`1*$BNSZ`(r4) | 1829 $LD r8,`1*$BNSZ`(r4) |
1920 $UMULL r9,r6,r8 | 1830 $UMULL r9,r6,r8 |
1921 $UMULH r10,r6,r8 | 1831 $UMULH r10,r6,r8 |
1922 addc r9,r9,r12 | 1832 addc r9,r9,r12 |
1923 addze r10,r10 | 1833 addze r10,r10 |
1924 $ST r9,`1*$BNSZ`(r3) | 1834 $ST r9,`1*$BNSZ`(r3) |
1925 addi r12,r10,0 | 1835 addi r12,r10,0 |
1926 | 1836 |
1927 addi r5,r5,-1 | 1837 addi r5,r5,-1 |
1928 cmpli 0,0,r5,0 | 1838 cmpli 0,0,r5,0 |
1929 » bc» BO_IF,CR0_EQ,Lppcasm_mw_OVER | 1839 » beq» Lppcasm_mw_OVER |
1930 | 1840 |
1931 #mul(rp[2],ap[2],w,c1); | 1841 #mul(rp[2],ap[2],w,c1); |
1932 $LD r8,`2*$BNSZ`(r4) | 1842 $LD r8,`2*$BNSZ`(r4) |
1933 $UMULL r9,r6,r8 | 1843 $UMULL r9,r6,r8 |
1934 $UMULH r10,r6,r8 | 1844 $UMULH r10,r6,r8 |
1935 addc r9,r9,r12 | 1845 addc r9,r9,r12 |
1936 addze r10,r10 | 1846 addze r10,r10 |
1937 $ST r9,`2*$BNSZ`(r3) | 1847 $ST r9,`2*$BNSZ`(r3) |
1938 addi r12,r10,0 | 1848 addi r12,r10,0 |
1939 | 1849 |
1940 Lppcasm_mw_OVER: | 1850 Lppcasm_mw_OVER: |
1941 addi r3,r12,0 | 1851 addi r3,r12,0 |
1942 » bclr» BO_ALWAYS,CR0_LT | 1852 » blr |
1943 .long 0x00000000 | 1853 .long 0x00000000 |
1944 | 1854 |
1945 # | 1855 # |
1946 # NOTE: The following label name should be changed to | 1856 # NOTE: The following label name should be changed to |
1947 # "bn_mul_add_words" i.e. remove the first dot | 1857 # "bn_mul_add_words" i.e. remove the first dot |
1948 # for the gcc compiler. This should be automatically | 1858 # for the gcc compiler. This should be automatically |
1949 # done in the build | 1859 # done in the build |
1950 # | 1860 # |
1951 | 1861 |
1952 .align 4 | 1862 .align 4 |
1953 .bn_mul_add_words: | 1863 .bn_mul_add_words: |
1954 # | 1864 # |
1955 # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | 1865 # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) |
1956 # | 1866 # |
1957 # r3 = rp | 1867 # r3 = rp |
1958 # r4 = ap | 1868 # r4 = ap |
1959 # r5 = num | 1869 # r5 = num |
1960 # r6 = w | 1870 # r6 = w |
1961 # | 1871 # |
1962 # empirical evidence suggests that unrolled version performs best!! | 1872 # empirical evidence suggests that unrolled version performs best!! |
1963 # | 1873 # |
1964 xor r0,r0,r0 #r0 = 0 | 1874 xor r0,r0,r0 #r0 = 0 |
1965 xor r12,r12,r12 #r12 = 0 . used for carry | 1875 xor r12,r12,r12 #r12 = 0 . used for carry |
1966 rlwinm. r7,r5,30,2,31 # num >> 2 | 1876 rlwinm. r7,r5,30,2,31 # num >> 2 |
1967 » bc» BO_IF,CR0_EQ,Lppcasm_maw_leftover» # if (num < 4) go Lppcasm_maw_leftover | 1877 » beq» Lppcasm_maw_leftover» # if (num < 4) go Lppcasm_maw_leftover |
1968 mtctr r7 | 1878 mtctr r7 |
1969 Lppcasm_maw_mainloop: | 1879 Lppcasm_maw_mainloop: |
1970 #mul_add(rp[0],ap[0],w,c1); | 1880 #mul_add(rp[0],ap[0],w,c1); |
1971 $LD r8,`0*$BNSZ`(r4) | 1881 $LD r8,`0*$BNSZ`(r4) |
1972 $LD r11,`0*$BNSZ`(r3) | 1882 $LD r11,`0*$BNSZ`(r3) |
1973 $UMULL r9,r6,r8 | 1883 $UMULL r9,r6,r8 |
1974 $UMULH r10,r6,r8 | 1884 $UMULH r10,r6,r8 |
1975 addc r9,r9,r12 #r12 is carry. | 1885 addc r9,r9,r12 #r12 is carry. |
1976 addze r10,r10 | 1886 addze r10,r10 |
1977 addc r9,r9,r11 | 1887 addc r9,r9,r11 |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2010 $UMULL r11,r6,r8 | 1920 $UMULL r11,r6,r8 |
2011 $LD r9,`3*$BNSZ`(r3) | 1921 $LD r9,`3*$BNSZ`(r3) |
2012 $UMULH r12,r6,r8 | 1922 $UMULH r12,r6,r8 |
2013 adde r11,r11,r10 | 1923 adde r11,r11,r10 |
2014 addze r12,r12 | 1924 addze r12,r12 |
2015 addc r11,r11,r9 | 1925 addc r11,r11,r9 |
2016 addze r12,r12 | 1926 addze r12,r12 |
2017 $ST r11,`3*$BNSZ`(r3) | 1927 $ST r11,`3*$BNSZ`(r3) |
2018 addi r3,r3,`4*$BNSZ` | 1928 addi r3,r3,`4*$BNSZ` |
2019 addi r4,r4,`4*$BNSZ` | 1929 addi r4,r4,`4*$BNSZ` |
2020 » bc» BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop | 1930 » bdnz-» Lppcasm_maw_mainloop |
2021 | 1931 |
2022 Lppcasm_maw_leftover: | 1932 Lppcasm_maw_leftover: |
2023 andi. r5,r5,0x3 | 1933 andi. r5,r5,0x3 |
2024 » bc» BO_IF,CR0_EQ,Lppcasm_maw_adios | 1934 » beq» Lppcasm_maw_adios |
2025 addi r3,r3,-$BNSZ | 1935 addi r3,r3,-$BNSZ |
2026 addi r4,r4,-$BNSZ | 1936 addi r4,r4,-$BNSZ |
2027 #mul_add(rp[0],ap[0],w,c1); | 1937 #mul_add(rp[0],ap[0],w,c1); |
2028 mtctr r5 | 1938 mtctr r5 |
2029 $LDU r8,$BNSZ(r4) | 1939 $LDU r8,$BNSZ(r4) |
2030 $UMULL r9,r6,r8 | 1940 $UMULL r9,r6,r8 |
2031 $UMULH r10,r6,r8 | 1941 $UMULH r10,r6,r8 |
2032 $LDU r11,$BNSZ(r3) | 1942 $LDU r11,$BNSZ(r3) |
2033 addc r9,r9,r11 | 1943 addc r9,r9,r11 |
2034 addze r10,r10 | 1944 addze r10,r10 |
2035 addc r9,r9,r12 | 1945 addc r9,r9,r12 |
2036 addze r12,r10 | 1946 addze r12,r10 |
2037 $ST r9,0(r3) | 1947 $ST r9,0(r3) |
2038 | 1948 |
2039 » bc» BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios | 1949 » bdz» Lppcasm_maw_adios |
2040 #mul_add(rp[1],ap[1],w,c1); | 1950 #mul_add(rp[1],ap[1],w,c1); |
2041 $LDU r8,$BNSZ(r4) | 1951 $LDU r8,$BNSZ(r4) |
2042 $UMULL r9,r6,r8 | 1952 $UMULL r9,r6,r8 |
2043 $UMULH r10,r6,r8 | 1953 $UMULH r10,r6,r8 |
2044 $LDU r11,$BNSZ(r3) | 1954 $LDU r11,$BNSZ(r3) |
2045 addc r9,r9,r11 | 1955 addc r9,r9,r11 |
2046 addze r10,r10 | 1956 addze r10,r10 |
2047 addc r9,r9,r12 | 1957 addc r9,r9,r12 |
2048 addze r12,r10 | 1958 addze r12,r10 |
2049 $ST r9,0(r3) | 1959 $ST r9,0(r3) |
2050 | 1960 |
2051 » bc» BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios | 1961 » bdz» Lppcasm_maw_adios |
2052 #mul_add(rp[2],ap[2],w,c1); | 1962 #mul_add(rp[2],ap[2],w,c1); |
2053 $LDU r8,$BNSZ(r4) | 1963 $LDU r8,$BNSZ(r4) |
2054 $UMULL r9,r6,r8 | 1964 $UMULL r9,r6,r8 |
2055 $UMULH r10,r6,r8 | 1965 $UMULH r10,r6,r8 |
2056 $LDU r11,$BNSZ(r3) | 1966 $LDU r11,$BNSZ(r3) |
2057 addc r9,r9,r11 | 1967 addc r9,r9,r11 |
2058 addze r10,r10 | 1968 addze r10,r10 |
2059 addc r9,r9,r12 | 1969 addc r9,r9,r12 |
2060 addze r12,r10 | 1970 addze r12,r10 |
2061 $ST r9,0(r3) | 1971 $ST r9,0(r3) |
2062 | 1972 |
2063 Lppcasm_maw_adios: | 1973 Lppcasm_maw_adios: |
2064 addi r3,r12,0 | 1974 addi r3,r12,0 |
2065 » bclr» BO_ALWAYS,CR0_LT | 1975 » blr |
2066 .long 0x00000000 | 1976 .long 0x00000000 |
2067 .align 4 | 1977 .align 4 |
2068 EOF | 1978 EOF |
2069 » $data =~ s/\`([^\`]*)\`/eval $1/gem; | 1979 $data =~ s/\`([^\`]*)\`/eval $1/gem; |
2070 | 1980 print $data; |
2071 » # if some assembler chokes on some simplified mnemonic, | 1981 close STDOUT; |
2072 » # this is the spot to fix it up, e.g.: | |
2073 » # GNU as doesn't seem to accept cmplw, 32-bit unsigned compare | |
2074 » $data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm; | |
2075 » # assembler X doesn't accept li, load immediate value | |
2076 » #$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm; | |
2077 » # assembler Y chokes on apostrophes in comments | |
2078 » $data =~ s/'//gm; | |
2079 » return($data); | |
2080 } | |
OLD | NEW |