OLD | NEW |
| (Empty) |
1 #!/usr/local/bin/perl | |
2 | |
# Driver section: loads the x86asm.pl helper library, initialises it, emits
# each bignum word routine in turn by calling the subs defined below, and
# finally calls asm_finish.
#
# Capture the directory part of $0 (everything up to the last / or \) so the
# helper library can be located relative to this script.
3 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
4 push(@INC,"${dir}","${dir}../../perlasm"); | |
5 require "x86asm.pl"; | |
6 | |
# asm_init receives the first command-line argument and the script name.
7 &asm_init($ARGV[0],$0); | |
8 | |
# $sse2 becomes 1 when any command-line argument contains
# -DOPENSSL_IA32_SSE2; the multiply/square subs below emit their MMX/SSE2
# code paths only when it is set.
9 $sse2=0; | |
10 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |
11 | |
# The capability variable is referenced only by the SSE2 code paths.
12 &external_label("OPENSSL_ia32cap_P") if ($sse2); | |
13 | |
# One call per emitted routine; the argument is the name passed on to the
# function_begin*/function_end* helpers.
14 &bn_mul_add_words("bn_mul_add_words"); | |
15 &bn_mul_words("bn_mul_words"); | |
16 &bn_sqr_words("bn_sqr_words"); | |
17 &bn_div_words("bn_div_words"); | |
18 &bn_add_words("bn_add_words"); | |
19 &bn_sub_words("bn_sub_words"); | |
20 &bn_sub_part_words("bn_sub_part_words"); | |
21 | |
22 &asm_finish(); | |
23 | |
# bn_mul_add_words($name): emits a routine called $name that reads four
# stack parameters: wparam(0)=r, wparam(1)=a, wparam(2)=num, wparam(3)=w.
# Per the inline comments, each word is processed as
#   t = a[i]*w + r[i] + carry;  r[i] = low(t);  carry = high(t)
# and the final carry is left in eax.
# When $sse2 is set an MMX/SSE2 variant is emitted first and selected at run
# time by testing bit 26 of the dword at OPENSSL_ia32cap_P (presumably the
# SSE2 capability flag -- confirm against the definition of that variable);
# the integer mul/adc variant follows and is used otherwise.
24 sub bn_mul_add_words | |
25 { | |
26 local($name)=@_; | |
27 | |
28 &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":""); | |
29 | |
# Register aliases for the SSE2 path (re-assigned for the integer path below).
30 $r="eax"; | |
31 $a="edx"; | |
32 $c="ecx"; | |
33 | |
34 if ($sse2) { | |
# Run-time dispatch: carry clear after bt means the bit is 0, so branch to
# the integer implementation.
35 &picmeup("eax","OPENSSL_ia32cap_P"); | |
36 &bt(&DWP(0,"eax"),26); | |
37 &jnc(&label("maw_non_sse2")); | |
38 | |
39 &mov($r,&wparam(0)); | |
40 &mov($a,&wparam(1)); | |
41 &mov($c,&wparam(2)); | |
42 &movd("mm0",&wparam(3)); # mm0 = w | |
43 &pxor("mm1","mm1"); # mm1 = carry_in | |
# Skip over the unrolled body to the count test at maw_sse2_entry.
44 &jmp(&label("maw_sse2_entry")); | |
45 | |
# Unrolled body: handles 8 words per pass. Loads and multiplies for later
# words are interleaved with the carry chain of earlier words; mm1 always
# holds the running 64-bit sum whose high half is the next carry.
46 &set_label("maw_sse2_unrolled",16); | |
47 &movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0] | |
48 &paddq("mm1","mm3"); # mm1 = carry_in + r[0] | |
49 &movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0] | |
50 &pmuludq("mm2","mm0"); # mm2 = w*a[0] | |
51 &movd("mm4",&DWP(4,$a,"",0)); # mm4 = a[1] | |
52 &pmuludq("mm4","mm0"); # mm4 = w*a[1] | |
53 &movd("mm6",&DWP(8,$a,"",0)); # mm6 = a[2] | |
54 &pmuludq("mm6","mm0"); # mm6 = w*a[2] | |
55 &movd("mm7",&DWP(12,$a,"",0)); # mm7 = a[3] | |
56 &pmuludq("mm7","mm0"); # mm7 = w*a[3] | |
57 &paddq("mm1","mm2"); # mm1 = carry_in + r[0] + w*a[0] | |
58 &movd("mm3",&DWP(4,$r,"",0)); # mm3 = r[1] | |
59 &paddq("mm3","mm4"); # mm3 = r[1] + w*a[1] | |
60 &movd("mm5",&DWP(8,$r,"",0)); # mm5 = r[2] | |
61 &paddq("mm5","mm6"); # mm5 = r[2] + w*a[2] | |
62 &movd("mm4",&DWP(12,$r,"",0)); # mm4 = r[3] | |
63 &paddq("mm7","mm4"); # mm7 = r[3] + w*a[3] | |
64 &movd(&DWP(0,$r,"",0),"mm1"); | |
65 &movd("mm2",&DWP(16,$a,"",0)); # mm2 = a[4] | |
66 &pmuludq("mm2","mm0"); # mm2 = w*a[4] | |
67 &psrlq("mm1",32); # mm1 = carry0 | |
68 &movd("mm4",&DWP(20,$a,"",0)); # mm4 = a[5] | |
69 &pmuludq("mm4","mm0"); # mm4 = w*a[5] | |
70 &paddq("mm1","mm3"); # mm1 = carry0 + r[1] + w*a[1] | |
71 &movd("mm6",&DWP(24,$a,"",0)); # mm6 = a[6] | |
72 &pmuludq("mm6","mm0"); # mm6 = w*a[6] | |
73 &movd(&DWP(4,$r,"",0),"mm1"); | |
74 &psrlq("mm1",32); # mm1 = carry1 | |
75 &movd("mm3",&DWP(28,$a,"",0)); # mm3 = a[7] | |
# All eight a[] words of this pass have been loaded, so a can advance now.
76 &add($a,32); | |
77 &pmuludq("mm3","mm0"); # mm3 = w*a[7] | |
78 &paddq("mm1","mm5"); # mm1 = carry1 + r[2] + w*a[2] | |
79 &movd("mm5",&DWP(16,$r,"",0)); # mm5 = r[4] | |
80 &paddq("mm2","mm5"); # mm2 = r[4] + w*a[4] | |
81 &movd(&DWP(8,$r,"",0),"mm1"); | |
82 &psrlq("mm1",32); # mm1 = carry2 | |
83 &paddq("mm1","mm7"); # mm1 = carry2 + r[3] + w*a[3] | |
84 &movd("mm5",&DWP(20,$r,"",0)); # mm5 = r[5] | |
85 &paddq("mm4","mm5"); # mm4 = r[5] + w*a[5] | |
86 &movd(&DWP(12,$r,"",0),"mm1"); | |
87 &psrlq("mm1",32); # mm1 = carry3 | |
88 &paddq("mm1","mm2"); # mm1 = carry3 + r[4] + w*a[4] | |
89 &movd("mm5",&DWP(24,$r,"",0)); # mm5 = r[6] | |
90 &paddq("mm6","mm5"); # mm6 = r[6] + w*a[6] | |
91 &movd(&DWP(16,$r,"",0),"mm1"); | |
92 &psrlq("mm1",32); # mm1 = carry4 | |
93 &paddq("mm1","mm4"); # mm1 = carry4 + r[5] + w*a[5] | |
94 &movd("mm5",&DWP(28,$r,"",0)); # mm5 = r[7] | |
95 &paddq("mm3","mm5"); # mm3 = r[7] + w*a[7] | |
96 &movd(&DWP(20,$r,"",0),"mm1"); | |
97 &psrlq("mm1",32); # mm1 = carry5 | |
98 &paddq("mm1","mm6"); # mm1 = carry5 + r[6] + w*a[6] | |
99 &movd(&DWP(24,$r,"",0),"mm1"); | |
100 &psrlq("mm1",32); # mm1 = carry6 | |
101 &paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7] | |
102 &movd(&DWP(28,$r,"",0),"mm1"); | |
103 &lea($r,&DWP(32,$r)); | |
104 &psrlq("mm1",32); # mm1 = carry_out | |
105 | |
# Eight words consumed; leave if none remain, otherwise re-test the count.
106 &sub($c,8); | |
107 &jz(&label("maw_sse2_exit")); | |
108 &set_label("maw_sse2_entry"); | |
# Any bit above the low three set means at least 8 words are still pending.
109 &test($c,0xfffffff8); | |
110 &jnz(&label("maw_sse2_unrolled")); | |
111 | |
# Single-word loop for the remaining (fewer than 8) words.
# NOTE(review): when num is 0 on entry this loop is still reached and the
# counter is decremented before the jnz test -- confirm callers never pass
# num == 0 to the SSE2 path.
112 &set_label("maw_sse2_loop",4); | |
113 &movd("mm2",&DWP(0,$a)); # mm2 = a[i] | |
114 &movd("mm3",&DWP(0,$r)); # mm3 = r[i] | |
115 &pmuludq("mm2","mm0"); # a[i] *= w | |
116 &lea($a,&DWP(4,$a)); | |
117 &paddq("mm1","mm3"); # carry += r[i] | |
118 &paddq("mm1","mm2"); # carry += a[i]*w | |
119 &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low | |
120 &sub($c,1); | |
121 &psrlq("mm1",32); # carry = carry_high | |
122 &lea($r,&DWP(4,$r)); | |
# The jnz consumes the flags from the sub above; the two instructions in
# between are expected not to modify them.
123 &jnz(&label("maw_sse2_loop")); | |
124 &set_label("maw_sse2_exit"); | |
125 &movd("eax","mm1"); # c = carry_out | |
126 &emms(); | |
# Direct return: this path pushed nothing, so no epilogue is needed.
127 &ret(); | |
128 | |
129 &set_label("maw_non_sse2",16); | |
130 } | |
131 | |
# Integer path. The four pushes are written out by hand because the routine
# was opened with function_begin_B; function_end at the bottom emits the
# matching epilogue.
132 # function_begin prologue | |
133 &push("ebp"); | |
134 &push("ebx"); | |
135 &push("esi"); | |
136 &push("edi"); | |
137 | |
138 &comment(""); | |
# Register aliases for the integer path. $Low/$High are assigned but the code
# below spells out "eax"/"edx" literally.
139 $Low="eax"; | |
140 $High="edx"; | |
141 $a="ebx"; | |
142 $w="ebp"; | |
143 $r="edi"; | |
144 $c="esi"; | |
145 | |
146 &xor($c,$c); # clear carry | |
147 &mov($r,&wparam(0)); # | |
148 | |
149 &mov("ecx",&wparam(2)); # | |
150 &mov($a,&wparam(1)); # | |
151 | |
# ecx = num with the low three bits cleared: the word count handled by the
# unrolled loop (a multiple of 8).
152 &and("ecx",0xfffffff8); # num / 8 | |
153 &mov($w,&wparam(3)); # | |
154 | |
# The pushed value is not read back in this routine; the slot is released by
# the pop just before function_end.
155 &push("ecx"); # Up the stack for a tmp variable | |
156 | |
# Tests the zero flag left by the "and" above (mov and push leave the flags
# untouched): no full group of 8 words means go straight to the tail.
157 &jz(&label("maw_finish")); | |
158 | |
159 &set_label("maw_loop",16); | |
160 | |
# Unrolled body: $i is the byte offset of each of the 8 words in the group.
161 for ($i=0; $i<32; $i+=4) | |
162 { | |
163 &comment("Round $i"); | |
164 | |
165 &mov("eax",&DWP($i,$a)); # *a | |
166 &mul($w); # *a * w | |
167 &add("eax",$c); # L(t)+= c | |
168 &adc("edx",0); # H(t)+=carry | |
169 &add("eax",&DWP($i,$r)); # L(t)+= *r | |
170 &adc("edx",0); # H(t)+=carry | |
171 &mov(&DWP($i,$r),"eax"); # *r= L(t); | |
172 &mov($c,"edx"); # c= H(t); | |
173 } | |
174 | |
175 &comment(""); | |
# lea is used to advance the pointers so the flags from the sub survive until
# the jnz.
176 &sub("ecx",8); | |
177 &lea($a,&DWP(32,$a)); | |
178 &lea($r,&DWP(32,$r)); | |
179 &jnz(&label("maw_loop")); | |
180 | |
# Tail: handle the remaining num & 7 words.
181 &set_label("maw_finish",0); | |
182 &mov("ecx",&wparam(2)); # get num | |
183 &and("ecx",7); | |
184 &jnz(&label("maw_finish2")); # helps branch prediction | |
185 &jmp(&label("maw_end")); | |
186 | |
187 &set_label("maw_finish2",1); | |
# Up to 7 straight-line rounds. Each round except the last decrements the
# remaining count and leaves for maw_end when it reaches zero; the last round
# simply falls through to maw_end.
188 for ($i=0; $i<7; $i++) | |
189 { | |
190 &comment("Tail Round $i"); | |
191 &mov("eax",&DWP($i*4,$a)); # *a | |
192 &mul($w); # *a * w | |
193 &add("eax",$c); # L(t)+=c | |
194 &adc("edx",0); # H(t)+=carry | |
195 &add("eax",&DWP($i*4,$r)); # L(t)+= *r | |
196 &adc("edx",0); # H(t)+=carry | |
197 &dec("ecx") if ($i != 7-1); | |
198 &mov(&DWP($i*4,$r),"eax"); # *r= L(t); | |
199 &mov($c,"edx"); # c= H(t); | |
200 &jz(&label("maw_end")) if ($i != 7-1); | |
201 } | |
202 &set_label("maw_end",0); | |
# Return value: the final carry word.
203 &mov("eax",$c); | |
204 | |
# Drops the temporary slot pushed after the prologue.
205 &pop("ecx"); # clear variable from | |
206 | |
207 &function_end($name); | |
208 } | |
209 | |
# bn_mul_words($name): emits a routine called $name that reads four stack
# parameters: wparam(0)=r, wparam(1)=a, wparam(2)=num, wparam(3)=w.
# Per the inline comments, each word is processed as
#   t = a[i]*w + carry;  r[i] = low(t);  carry = high(t)
# and the final carry is left in eax.
# When $sse2 is set an MMX/SSE2 variant is emitted first and selected at run
# time by testing bit 26 of the dword at OPENSSL_ia32cap_P (presumably the
# SSE2 capability flag -- confirm); the integer variant follows.
210 sub bn_mul_words | |
211 { | |
212 local($name)=@_; | |
213 | |
214 &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":""); | |
215 | |
# Register aliases for the SSE2 path (re-assigned for the integer path below).
216 $r="eax"; | |
217 $a="edx"; | |
218 $c="ecx"; | |
219 | |
220 if ($sse2) { | |
# Run-time dispatch: carry clear after bt means the bit is 0.
221 &picmeup("eax","OPENSSL_ia32cap_P"); | |
222 &bt(&DWP(0,"eax"),26); | |
223 &jnc(&label("mw_non_sse2")); | |
224 | |
225 &mov($r,&wparam(0)); | |
226 &mov($a,&wparam(1)); | |
227 &mov($c,&wparam(2)); | |
228 &movd("mm0",&wparam(3)); # mm0 = w | |
229 &pxor("mm1","mm1"); # mm1 = carry = 0 | |
230 | |
# One word per iteration; mm1 holds the running 64-bit sum.
# NOTE(review): the loop body runs before the count is tested, so num == 0
# would decrement the counter from 0 -- confirm callers never pass 0.
231 &set_label("mw_sse2_loop",16); | |
232 &movd("mm2",&DWP(0,$a)); # mm2 = a[i] | |
233 &pmuludq("mm2","mm0"); # a[i] *= w | |
234 &lea($a,&DWP(4,$a)); | |
235 &paddq("mm1","mm2"); # carry += a[i]*w | |
236 &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low | |
237 &sub($c,1); | |
238 &psrlq("mm1",32); # carry = carry_high | |
239 &lea($r,&DWP(4,$r)); | |
# The jnz consumes the flags from the sub above.
240 &jnz(&label("mw_sse2_loop")); | |
241 | |
242 &movd("eax","mm1"); # return carry | |
243 &emms(); | |
# Direct return: this path pushed nothing.
244 &ret(); | |
245 &set_label("mw_non_sse2",16); | |
246 } | |
247 | |
# Integer path; prologue written by hand because the routine was opened with
# function_begin_B. function_end below emits the epilogue.
248 # function_begin prologue | |
249 &push("ebp"); | |
250 &push("ebx"); | |
251 &push("esi"); | |
252 &push("edi"); | |
253 | |
254 &comment(""); | |
# Register aliases for the integer path. $Low/$High are assigned but the code
# below spells out "eax"/"edx" literally.
255 $Low="eax"; | |
256 $High="edx"; | |
257 $a="ebx"; | |
258 $w="ecx"; | |
259 $r="edi"; | |
260 $c="esi"; | |
261 $num="ebp"; | |
262 | |
263 &xor($c,$c); # clear carry | |
264 &mov($r,&wparam(0)); # | |
265 &mov($a,&wparam(1)); # | |
266 &mov($num,&wparam(2)); # | |
267 &mov($w,&wparam(3)); # | |
268 | |
# $num = num with the low three bits cleared: the word count handled by the
# unrolled loop. Zero means only the tail has work to do.
269 &and($num,0xfffffff8); # num / 8 | |
270 &jz(&label("mw_finish")); | |
271 | |
272 &set_label("mw_loop",0); | |
# Unrolled body: $i is the byte offset of each of the 8 words in the group.
273 for ($i=0; $i<32; $i+=4) | |
274 { | |
275 &comment("Round $i"); | |
276 | |
277 &mov("eax",&DWP($i,$a,"",0)); # *a | |
278 &mul($w); # *a * w | |
279 &add("eax",$c); # L(t)+=c | |
280 # XXX | |
281 | |
282 &adc("edx",0); # H(t)+=carry | |
283 &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t); | |
284 | |
285 &mov($c,"edx"); # c= H(t); | |
286 } | |
287 | |
288 &comment(""); | |
# Advance both pointers by 8 words and loop while full groups remain.
289 &add($a,32); | |
290 &add($r,32); | |
291 &sub($num,8); | |
292 &jz(&label("mw_finish")); | |
293 &jmp(&label("mw_loop")); | |
294 | |
# Tail: handle the remaining num & 7 words.
295 &set_label("mw_finish",0); | |
296 &mov($num,&wparam(2)); # get num | |
297 &and($num,7); | |
298 &jnz(&label("mw_finish2")); | |
299 &jmp(&label("mw_end")); | |
300 | |
301 &set_label("mw_finish2",1); | |
# Up to 7 straight-line rounds; every round except the last decrements the
# remaining count and exits to mw_end when it reaches zero.
302 for ($i=0; $i<7; $i++) | |
303 { | |
304 &comment("Tail Round $i"); | |
305 &mov("eax",&DWP($i*4,$a,"",0));# *a | |
306 &mul($w); # *a * w | |
307 &add("eax",$c); # L(t)+=c | |
308 # XXX | |
309 &adc("edx",0); # H(t)+=carry | |
310 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t); | |
311 &mov($c,"edx"); # c= H(t); | |
312 &dec($num) if ($i != 7-1); | |
313 &jz(&label("mw_end")) if ($i != 7-1); | |
314 } | |
315 &set_label("mw_end",0); | |
# Return value: the final carry word.
316 &mov("eax",$c); | |
317 | |
318 &function_end($name); | |
319 } | |
320 | |
# bn_sqr_words($name): emits a routine called $name that reads three stack
# parameters: wparam(0)=r, wparam(1)=a, wparam(2)=num.
# Each input word a[i] is squared and the 64-bit product is stored as two
# consecutive words of r (low word first), so r advances twice as fast as a.
# No return value is set up by this code.
# When $sse2 is set an MMX/SSE2 variant is emitted first and selected at run
# time by testing bit 26 of the dword at OPENSSL_ia32cap_P (presumably the
# SSE2 capability flag -- confirm); the integer variant follows.
321 sub bn_sqr_words | |
322 { | |
323 local($name)=@_; | |
324 | |
325 &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":""); | |
326 | |
# Register aliases for the SSE2 path (re-assigned for the integer path below).
327 $r="eax"; | |
328 $a="edx"; | |
329 $c="ecx"; | |
330 | |
331 if ($sse2) { | |
# Run-time dispatch: carry clear after bt means the bit is 0.
332 &picmeup("eax","OPENSSL_ia32cap_P"); | |
333 &bt(&DWP(0,"eax"),26); | |
334 &jnc(&label("sqr_non_sse2")); | |
335 | |
336 &mov($r,&wparam(0)); | |
337 &mov($a,&wparam(1)); | |
338 &mov($c,&wparam(2)); | |
339 | |
# One word per iteration; the whole 64-bit product is stored with one movq.
# NOTE(review): the loop body runs before the count is tested -- confirm
# callers never pass num == 0.
340 &set_label("sqr_sse2_loop",16); | |
341 &movd("mm0",&DWP(0,$a)); # mm0 = a[i] | |
342 &pmuludq("mm0","mm0"); # a[i] *= a[i] | |
343 &lea($a,&DWP(4,$a)); # a++ | |
344 &movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i] | |
345 &sub($c,1); | |
346 &lea($r,&DWP(8,$r)); # r += 2 | |
# The jnz consumes the flags from the sub; lea leaves them untouched.
347 &jnz(&label("sqr_sse2_loop")); | |
348 | |
349 &emms(); | |
# Direct return: this path pushed nothing.
350 &ret(); | |
351 &set_label("sqr_non_sse2",16); | |
352 } | |
353 | |
# Integer path; prologue written by hand because the routine was opened with
# function_begin_B. function_end below emits the epilogue.
354 # function_begin prologue | |
355 &push("ebp"); | |
356 &push("ebx"); | |
357 &push("esi"); | |
358 &push("edi"); | |
359 | |
360 &comment(""); | |
# Register aliases for the integer path.
361 $r="esi"; | |
362 $a="edi"; | |
363 $num="ebx"; | |
364 | |
365 &mov($r,&wparam(0)); # | |
366 &mov($a,&wparam(1)); # | |
367 &mov($num,&wparam(2)); # | |
368 | |
# $num = num with the low three bits cleared: the word count handled by the
# unrolled loop. Zero means only the tail has work to do.
369 &and($num,0xfffffff8); # num / 8 | |
370 &jz(&label("sw_finish")); | |
371 | |
372 &set_label("sw_loop",0); | |
# Unrolled body: $i is the byte offset into a; the result goes to byte offset
# $i*2 (low word) and $i*2+4 (high word) in r.
373 for ($i=0; $i<32; $i+=4) | |
374 { | |
375 &comment("Round $i"); | |
376 &mov("eax",&DWP($i,$a,"",0)); # *a | |
377 # XXX | |
378 &mul("eax"); # *a * *a | |
379 &mov(&DWP($i*2,$r,"",0),"eax"); # | |
380 &mov(&DWP($i*2+4,$r,"",0),"edx");# | |
381 } | |
382 | |
383 &comment(""); | |
# 8 input words consumed, 16 output words produced.
384 &add($a,32); | |
385 &add($r,64); | |
386 &sub($num,8); | |
387 &jnz(&label("sw_loop")); | |
388 | |
# Tail: handle the remaining num & 7 words.
389 &set_label("sw_finish",0); | |
390 &mov($num,&wparam(2)); # get num | |
391 &and($num,7); | |
392 &jz(&label("sw_end")); | |
393 | |
# Up to 7 straight-line rounds; here $i is a word index, so the input offset
# is $i*4 and the output offsets are $i*8 and $i*8+4. The dec sits before the
# second store and the jz after it; the store does not alter the flags.
394 for ($i=0; $i<7; $i++) | |
395 { | |
396 &comment("Tail Round $i"); | |
397 &mov("eax",&DWP($i*4,$a,"",0)); # *a | |
398 # XXX | |
399 &mul("eax"); # *a * *a | |
400 &mov(&DWP($i*8,$r,"",0),"eax"); # | |
401 &dec($num) if ($i != 7-1); | |
402 &mov(&DWP($i*8+4,$r,"",0),"edx"); | |
403 &jz(&label("sw_end")) if ($i != 7-1); | |
404 } | |
405 &set_label("sw_end",0); | |
406 | |
407 &function_end($name); | |
408 } | |
409 | |
# bn_div_words($name): emits a routine called $name that loads
# wparam(0) into edx, wparam(1) into eax and wparam(2) into ecx, executes a
# single "div ecx" and returns. On x86 that divides the 64-bit value edx:eax
# by ecx and leaves the quotient in eax and the remainder in edx.
# No registers are saved: the routine is opened with function_begin_B, issues
# its own ret, and is closed with function_end_B.
# NOTE(review): the emitted code does not check for a zero divisor or for a
# quotient that does not fit in 32 bits -- presumably the caller guarantees
# both; confirm.
410 sub bn_div_words | |
411 { | |
412 local($name)=@_; | |
413 | |
414 &function_begin_B($name,""); | |
415 &mov("edx",&wparam(0)); # | |
416 &mov("eax",&wparam(1)); # | |
417 &mov("ecx",&wparam(2)); # | |
418 &div("ecx"); | |
419 &ret(); | |
420 &function_end_B($name); | |
421 } | |
422 | |
# bn_add_words($name): emits a routine called $name that reads four stack
# parameters: wparam(0)=r, wparam(1)=a, wparam(2)=b, wparam(3)=num.
# For each word it stores r[i] = a[i] + b[i] + carry and keeps the outgoing
# carry in $c. Because $c is eax, the final carry is the return value and no
# extra move is needed at the end.
423 sub bn_add_words | |
424 { | |
425 local($name)=@_; | |
426 | |
427 &function_begin($name,""); | |
428 | |
429 &comment(""); | |
# Register aliases.
430 $a="esi"; | |
431 $b="edi"; | |
432 $c="eax"; | |
433 $r="ebx"; | |
434 $tmp1="ecx"; | |
435 $tmp2="edx"; | |
436 $num="ebp"; | |
437 | |
438 &mov($r,&wparam(0)); # get r | |
439 &mov($a,&wparam(1)); # get a | |
440 &mov($b,&wparam(2)); # get b | |
441 &mov($num,&wparam(3)); # get num | |
442 &xor($c,$c); # clear carry | |
# $num = num with the low three bits cleared: the word count handled by the
# unrolled loop. This "and" is the last flag-setting instruction before jz.
443 &and($num,0xfffffff8); # num / 8 | |
444 | |
445 &jz(&label("aw_finish")); | |
446 | |
447 &set_label("aw_loop",0); | |
# Unrolled body, 8 words per pass. The carry lives in $c as 0 or 1:
#   add tmp1,c ; mov c,0 ; adc c,c  -> tmp1 = a[i]+carry_in, c = carry of that
#   add tmp1,tmp2 ; adc c,0         -> tmp1 += b[i], c += carry of that
# The "mov c,0" must stay a mov so the carry flag survives for the adc.
448 for ($i=0; $i<8; $i++) | |
449 { | |
450 &comment("Round $i"); | |
451 | |
452 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a | |
453 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b | |
454 &add($tmp1,$c); | |
455 &mov($c,0); | |
456 &adc($c,$c); | |
457 &add($tmp1,$tmp2); | |
458 &adc($c,0); | |
459 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r | |
460 } | |
461 | |
462 &comment(""); | |
# Advance all three pointers by 8 words and loop while full groups remain.
463 &add($a,32); | |
464 &add($b,32); | |
465 &add($r,32); | |
466 &sub($num,8); | |
467 &jnz(&label("aw_loop")); | |
468 | |
# Tail: handle the remaining num & 7 words.
469 &set_label("aw_finish",0); | |
470 &mov($num,&wparam(3)); # get num | |
471 &and($num,7); | |
472 &jz(&label("aw_end")); | |
473 | |
# Up to 7 straight-line rounds; every round except the last decrements the
# remaining count (before the store, which leaves the flags alone) and exits
# to aw_end when it reaches zero.
474 for ($i=0; $i<7; $i++) | |
475 { | |
476 &comment("Tail Round $i"); | |
477 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a | |
478 &mov($tmp2,&DWP($i*4,$b,"",0));# *b | |
479 &add($tmp1,$c); | |
480 &mov($c,0); | |
481 &adc($c,$c); | |
482 &add($tmp1,$tmp2); | |
483 &adc($c,0); | |
484 &dec($num) if ($i != 6); | |
485 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r | |
486 &jz(&label("aw_end")) if ($i != 6); | |
487 } | |
488 &set_label("aw_end",0); | |
489 | |
490 # &mov("eax",$c); # $c is "eax" | |
491 | |
492 &function_end($name); | |
493 } | |
494 | |
# bn_sub_words($name): emits a routine called $name that reads four stack
# parameters: wparam(0)=r, wparam(1)=a, wparam(2)=b, wparam(3)=num.
# For each word it stores r[i] = a[i] - b[i] - borrow and keeps the outgoing
# borrow in $c. Because $c is eax, the final borrow is the return value.
# NOTE(review): the label names (aw_loop, aw_finish, aw_end) are the same
# ones bn_add_words uses -- presumably the label helper keeps them distinct
# per emitted function; confirm in x86asm.pl.
495 sub bn_sub_words | |
496 { | |
497 local($name)=@_; | |
498 | |
499 &function_begin($name,""); | |
500 | |
501 &comment(""); | |
# Register aliases.
502 $a="esi"; | |
503 $b="edi"; | |
504 $c="eax"; | |
505 $r="ebx"; | |
506 $tmp1="ecx"; | |
507 $tmp2="edx"; | |
508 $num="ebp"; | |
509 | |
510 &mov($r,&wparam(0)); # get r | |
511 &mov($a,&wparam(1)); # get a | |
512 &mov($b,&wparam(2)); # get b | |
513 &mov($num,&wparam(3)); # get num | |
514 &xor($c,$c); # clear carry | |
# $num = num with the low three bits cleared: the word count handled by the
# unrolled loop. This "and" is the last flag-setting instruction before jz.
515 &and($num,0xfffffff8); # num / 8 | |
516 | |
517 &jz(&label("aw_finish")); | |
518 | |
519 &set_label("aw_loop",0); | |
# Unrolled body, 8 words per pass. The borrow lives in $c as 0 or 1:
#   sub tmp1,c ; mov c,0 ; adc c,c  -> tmp1 = a[i]-borrow_in, c = borrow of that
#   sub tmp1,tmp2 ; adc c,0         -> tmp1 -= b[i], c += borrow of that
# The "mov c,0" must stay a mov so the carry flag survives for the adc.
520 for ($i=0; $i<8; $i++) | |
521 { | |
522 &comment("Round $i"); | |
523 | |
524 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a | |
525 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b | |
526 &sub($tmp1,$c); | |
527 &mov($c,0); | |
528 &adc($c,$c); | |
529 &sub($tmp1,$tmp2); | |
530 &adc($c,0); | |
531 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r | |
532 } | |
533 | |
534 &comment(""); | |
# Advance all three pointers by 8 words and loop while full groups remain.
535 &add($a,32); | |
536 &add($b,32); | |
537 &add($r,32); | |
538 &sub($num,8); | |
539 &jnz(&label("aw_loop")); | |
540 | |
# Tail: handle the remaining num & 7 words.
541 &set_label("aw_finish",0); | |
542 &mov($num,&wparam(3)); # get num | |
543 &and($num,7); | |
544 &jz(&label("aw_end")); | |
545 | |
# Up to 7 straight-line rounds; every round except the last decrements the
# remaining count (before the store, which leaves the flags alone) and exits
# to aw_end when it reaches zero.
546 for ($i=0; $i<7; $i++) | |
547 { | |
548 &comment("Tail Round $i"); | |
549 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a | |
550 &mov($tmp2,&DWP($i*4,$b,"",0));# *b | |
551 &sub($tmp1,$c); | |
552 &mov($c,0); | |
553 &adc($c,$c); | |
554 &sub($tmp1,$tmp2); | |
555 &adc($c,0); | |
556 &dec($num) if ($i != 6); | |
557 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r | |
558 &jz(&label("aw_end")) if ($i != 6); | |
559 } | |
560 &set_label("aw_end",0); | |
561 | |
562 # &mov("eax",$c); # $c is "eax" | |
563 | |
564 &function_end($name); | |
565 } | |
566 | |
# bn_sub_part_words($name): emits a routine called $name that reads five
# stack parameters: wparam(0)=r, wparam(1)=a, wparam(2)=b, wparam(3)=num,
# wparam(4)=dl.
# Phase 1 (aw_* labels): r[i] = a[i] - b[i] - borrow for num words, as in
#   bn_sub_words, but with a, b and r left pointing past the processed words.
# Phase 2 depends on the sign of dl:
#   dl == 0: nothing more to do.
#   dl <  0: for -dl further words, r[i] = 0 - b[i] - borrow.
#   dl >  0: for dl further words, r[i] = a[i] - borrow; as soon as a word
#            produces no borrow, control jumps into a plain copy loop
#            (r[i] = a[i]) for the words that follow.
# The borrow is kept in $c, which is eax, so it is also the return value.
# NOTE(review): the aw_* label names are shared with bn_add_words and
# bn_sub_words -- presumably the label helper keeps them distinct per emitted
# function; confirm in x86asm.pl.
567 sub bn_sub_part_words | |
568 { | |
569 local($name)=@_; | |
570 | |
571 &function_begin($name,""); | |
572 | |
573 &comment(""); | |
# Register aliases.
574 $a="esi"; | |
575 $b="edi"; | |
576 $c="eax"; | |
577 $r="ebx"; | |
578 $tmp1="ecx"; | |
579 $tmp2="edx"; | |
580 $num="ebp"; | |
581 | |
582 &mov($r,&wparam(0)); # get r | |
583 &mov($a,&wparam(1)); # get a | |
584 &mov($b,&wparam(2)); # get b | |
585 &mov($num,&wparam(3)); # get num | |
586 &xor($c,$c); # clear carry | |
# $num = num with the low three bits cleared: the word count handled by the
# unrolled loop. This "and" is the last flag-setting instruction before jz.
587 &and($num,0xfffffff8); # num / 8 | |
588 | |
589 &jz(&label("aw_finish")); | |
590 | |
591 &set_label("aw_loop",0); | |
# Phase 1, unrolled body (8 words per pass). The borrow lives in $c as 0 or 1:
#   sub tmp1,c ; mov c,0 ; adc c,c  -> tmp1 = a[i]-borrow_in, c = borrow of that
#   sub tmp1,tmp2 ; adc c,0         -> tmp1 -= b[i], c += borrow of that
592 for ($i=0; $i<8; $i++) | |
593 { | |
594 &comment("Round $i"); | |
595 | |
596 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a | |
597 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b | |
598 &sub($tmp1,$c); | |
599 &mov($c,0); | |
600 &adc($c,$c); | |
601 &sub($tmp1,$tmp2); | |
602 &adc($c,0); | |
603 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r | |
604 } | |
605 | |
606 &comment(""); | |
607 &add($a,32); | |
608 &add($b,32); | |
609 &add($r,32); | |
610 &sub($num,8); | |
611 &jnz(&label("aw_loop")); | |
612 | |
# Phase 1 tail: the remaining num & 7 words.
613 &set_label("aw_finish",0); | |
614 &mov($num,&wparam(3)); # get num | |
615 &and($num,7); | |
616 &jz(&label("aw_end")); | |
617 | |
# Unlike bn_sub_words, each tail round uses offset 0 and then advances a, b
# and r by one word, so the pointers end up just past the last processed word
# for phase 2. The dec therefore comes after the pointer adds, immediately
# before the jz that tests it.
618 for ($i=0; $i<7; $i++) | |
619 { | |
620 &comment("Tail Round $i"); | |
621 &mov($tmp1,&DWP(0,$a,"",0)); # *a | |
622 &mov($tmp2,&DWP(0,$b,"",0));# *b | |
623 &sub($tmp1,$c); | |
624 &mov($c,0); | |
625 &adc($c,$c); | |
626 &sub($tmp1,$tmp2); | |
627 &adc($c,0); | |
628 &mov(&DWP(0,$r,"",0),$tmp1); # *r | |
629 &add($a, 4); | |
630 &add($b, 4); | |
631 &add($r, 4); | |
632 &dec($num) if ($i != 6); | |
633 &jz(&label("aw_end")) if ($i != 6); | |
634 } | |
635 &set_label("aw_end",0); | |
636 | |
# Phase 2 dispatch on dl. The memory compare and the register compare below
# both test dl against 0.
637 &cmp(&wparam(4),0); | |
638 &je(&label("pw_end")); | |
639 | |
640 &mov($num,&wparam(4)); # get dl | |
641 &cmp($num,0); | |
642 &je(&label("pw_end")); | |
643 &jge(&label("pw_pos")); | |
644 | |
# dl < 0: negate dl into $num (via $tmp2), then keep the multiple-of-8 part
# for the unrolled loop.
645 &comment("pw_neg"); | |
646 &mov($tmp2,0); | |
647 &sub($tmp2,$num); | |
648 &mov($num,$tmp2); | |
649 &and($num,0xfffffff8); # num / 8 | |
650 &jz(&label("pw_neg_finish")); | |
651 | |
652 &set_label("pw_neg_loop",0); | |
# Same borrow chain as phase 1 with a zero minuend: r[i] = 0 - borrow - b[i].
653 for ($i=0; $i<8; $i++) | |
654 { | |
655 &comment("dl<0 Round $i"); | |
656 | |
657 &mov($tmp1,0); | |
658 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b | |
659 &sub($tmp1,$c); | |
660 &mov($c,0); | |
661 &adc($c,$c); | |
662 &sub($tmp1,$tmp2); | |
663 &adc($c,0); | |
664 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r | |
665 } | |
666 | |
667 &comment(""); | |
# Only b and r are used on this path.
668 &add($b,32); | |
669 &add($r,32); | |
670 &sub($num,8); | |
671 &jnz(&label("pw_neg_loop")); | |
672 | |
# dl < 0 tail: recompute -dl and keep its low three bits.
673 &set_label("pw_neg_finish",0); | |
674 &mov($tmp2,&wparam(4)); # get dl | |
675 &mov($num,0); | |
676 &sub($num,$tmp2); | |
677 &and($num,7); | |
678 &jz(&label("pw_end")); | |
679 | |
680 for ($i=0; $i<7; $i++) | |
681 { | |
682 &comment("dl<0 Tail Round $i"); | |
683 &mov($tmp1,0); | |
684 &mov($tmp2,&DWP($i*4,$b,"",0));# *b | |
685 &sub($tmp1,$c); | |
686 &mov($c,0); | |
687 &adc($c,$c); | |
688 &sub($tmp1,$tmp2); | |
689 &adc($c,0); | |
690 &dec($num) if ($i != 6); | |
691 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r | |
692 &jz(&label("pw_end")) if ($i != 6); | |
693 } | |
694 | |
695 &jmp(&label("pw_end")); | |
696 | |
# dl > 0: $num still holds dl here.
697 &set_label("pw_pos",0); | |
698 | |
699 &and($num,0xfffffff8); # num / 8 | |
700 &jz(&label("pw_pos_finish")); | |
701 | |
702 &set_label("pw_pos_loop",0); | |
703 | |
# Borrow-propagation loop: r[i] = a[i] - c. $c is not rewritten here; if the
# subtraction produces no borrow (jnc) control jumps to pw_nc<i>, which sits
# inside the copy loop below right after the copy of word i, so the remaining
# words of this group are plainly copied.
704 for ($i=0; $i<8; $i++) | |
705 { | |
706 &comment("dl>0 Round $i"); | |
707 | |
708 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a | |
709 &sub($tmp1,$c); | |
710 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r | |
711 &jnc(&label("pw_nc".$i)); | |
712 } | |
713 | |
714 &comment(""); | |
715 &add($a,32); | |
716 &add($r,32); | |
717 &sub($num,8); | |
718 &jnz(&label("pw_pos_loop")); | |
719 | |
# dl > 0 tail with a borrow still pending: the remaining dl & 7 words.
720 &set_label("pw_pos_finish",0); | |
721 &mov($num,&wparam(4)); # get dl | |
722 &and($num,7); | |
723 &jz(&label("pw_end")); | |
724 | |
# As above, but a round without borrow jumps to pw_tail_nc<i> in the copy
# tail below. That label precedes the copy tail's own dec/jz, so the count
# for word i is decremented exactly once on either path.
725 for ($i=0; $i<7; $i++) | |
726 { | |
727 &comment("dl>0 Tail Round $i"); | |
728 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a | |
729 &sub($tmp1,$c); | |
730 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r | |
731 &jnc(&label("pw_tail_nc".$i)); | |
732 &dec($num) if ($i != 6); | |
733 &jz(&label("pw_end")) if ($i != 6); | |
734 } | |
# Reached only when the seventh tail word also borrowed: report a borrow.
735 &mov($c,1); | |
736 &jmp(&label("pw_end")); | |
737 | |
# Copy loop (no borrow pending): r[i] = a[i]. Entered only through the
# pw_nc<i> labels or by looping back to pw_nc_loop.
738 &set_label("pw_nc_loop",0); | |
739 for ($i=0; $i<8; $i++) | |
740 { | |
741 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a | |
742 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r | |
743 &set_label("pw_nc".$i,0); | |
744 } | |
745 | |
746 &comment(""); | |
747 &add($a,32); | |
748 &add($r,32); | |
749 &sub($num,8); | |
750 &jnz(&label("pw_nc_loop")); | |
751 | |
# Copy tail: the remaining dl & 7 words.
752 &mov($num,&wparam(4)); # get dl | |
753 &and($num,7); | |
754 &jz(&label("pw_nc_end")); | |
755 | |
756 for ($i=0; $i<7; $i++) | |
757 { | |
758 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a | |
759 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r | |
760 &set_label("pw_tail_nc".$i,0); | |
761 &dec($num) if ($i != 6); | |
762 &jz(&label("pw_nc_end")) if ($i != 6); | |
763 } | |
764 | |
# Every path through the copy loop ends without a borrow.
765 &set_label("pw_nc_end",0); | |
766 &mov($c,0); | |
767 | |
768 &set_label("pw_end",0); | |
769 | |
770 # &mov("eax",$c); # $c is "eax" | |
771 | |
772 &function_end($name); | |
773 } | |
774 | |
OLD | NEW |