OLD | NEW |
| (Empty) |
; Choose the code section according to the assembler's output format.
; "obj" output: 32-bit code segment, class "code", 64-byte aligned.
1 %ifidn __OUTPUT_FORMAT__,obj | |
2 section code use32 class=code align=64 | |
; "win32" output: under Yasm, require version 1.1.0 or later; under any other
; assembler, define the $@feat.00 symbol ourselves (see the comment below).
3 %elifidn __OUTPUT_FORMAT__,win32 | |
4 %ifdef __YASM_VERSION_ID__ | |
5 %if __YASM_VERSION_ID__ < 01010000h | |
6 %error yasm version 1.1.0 or later needed. | |
7 %endif | |
8 ; Yasm automatically includes .00 and complains about redefining it. | |
9 ; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html | |
10 %else | |
11 $@feat.00 equ 1 | |
12 %endif | |
13 section .text code align=64 | |
; Any other output format: plain .text code section, no explicit alignment.
14 %else | |
15 section .text code | |
16 %endif | |
17 ;extern _OPENSSL_ia32cap_P | |
;-----------------------------------------------------------------------
; _bn_mul_mont -- word-serial Montgomery multiplication, 32-bit x86.
;
; Arguments are read from the stack (six dword slots above the return
; address). The comments below use the shorthand
;     arg0 = rp, arg1 = ap, arg2 = bp, arg3 = np, arg4 = n0 (pointer),
;     arg5 = num
; NOTE(review): these names follow the customary C prototype
;     int bn_mul_mont(rp, ap, bp, np, n0, num)
; which is not visible in this file -- confirm against the C declaration.
;
; Returns: eax = 0 when num < 4 (nothing is computed), eax = 1 otherwise.
; Saves/restores ebp, ebx, esi, edi. Uses mm0-mm7 on the SSE2 path (emms
; is executed before leaving that path).
;
; Private frame, relative to the re-based esp:
;     [esp+0]   squaring path only: num-1
;     [esp+4]   rp
;     [esp+8]   ap
;     [esp+12]  bp; later the running pointer into bp (integer multiply
;               path) or the outer index i (squaring path)
;     [esp+16]  np
;     [esp+20]  value loaded through arg4 (n0 word)
;     [esp+24]  esp as it was after the four register pushes
;     [esp+28]  integer multiply path only: &bp[num] (loop end pointer)
;     [esp+32]  tp[0], start of the temporary vector tp[]
;-----------------------------------------------------------------------
18 global _bn_mul_mont | |
19 align 16 | |
20 _bn_mul_mont: | |
21 L$_bn_mul_mont_begin: | |
22 push ebp | |
23 push ebx | |
24 push esi | |
25 push edi | |
; eax = 0 is the return value if we bail out on a too-small num.
26 xor eax,eax | |
; After 4 pushes + return address, arg5 (num) sits at [esp+40].
27 mov edi,DWORD [40+esp] | |
28 cmp edi,4 | |
29 jl NEAR L$000just_leave | |
; esi -> arg0 slot, edx -> arg1 slot; ebp remembers the current esp.
30 lea esi,[20+esp] | |
31 lea edx,[24+esp] | |
32 mov ebp,esp | |
; Reserve (num+2) dwords plus 32 bytes below esp; edi ends up as num+2.
33 add edi,2 | |
34 neg edi | |
35 lea esp,[edi*4+esp-32] | |
36 neg edi | |
; Adjust the frame address: first make esp congruent to edx modulo 2048,
; then subtract 2048 when bit 11 of esp and edx agree, then round down to
; a 64-byte boundary.
; NOTE(review): presumably this controls how the frame aliases with the
; argument area in cache; the intent is not stated in this file.
37 mov eax,esp | |
38 sub eax,edx | |
39 and eax,2047 | |
40 sub esp,eax | |
41 xor edx,esp | |
42 and edx,2048 | |
43 xor edx,2048 | |
44 sub esp,edx | |
45 and esp,-64 | |
; Copy the arguments into the private frame; arg4 is dereferenced so the
; frame holds the n0 word itself.
46 mov eax,DWORD [esi] | |
47 mov ebx,DWORD [4+esi] | |
48 mov ecx,DWORD [8+esi] | |
49 mov edx,DWORD [12+esi] | |
50 mov esi,DWORD [16+esi] | |
51 mov esi,DWORD [esi] | |
52 mov DWORD [4+esp],eax | |
53 mov DWORD [8+esp],ebx | |
54 mov DWORD [12+esp],ecx | |
55 mov DWORD [16+esp],edx | |
56 mov DWORD [20+esp],esi | |
; ebx = (num+2)-3 = num-1, the index of the last word.
57 lea ebx,[edi-3] | |
; Save the pre-frame esp so it can be restored at exit.
58 mov DWORD [24+esp],ebp | |
; Dispatch on bit 26 of the first dword of _OPENSSL_ia32cap_P: clear ->
; integer-only code at L$001non_sse2, set -> the MMX/pmuludq code below.
59 lea eax,[_OPENSSL_ia32cap_P] | |
60 bt DWORD [eax],26 | |
61 jnc NEAR L$001non_sse2 | |
; ---- SSE2 path ------------------------------------------------------
; mm7 = 0x00000000FFFFFFFF, mask used to isolate the low dword.
62 mov eax,-1 | |
63 movd mm7,eax | |
; esi = ap, edi = bp, ebp = np; edx = outer index i, ecx = inner index j.
64 mov esi,DWORD [8+esp] | |
65 mov edi,DWORD [12+esp] | |
66 mov ebp,DWORD [16+esp] | |
67 xor edx,edx | |
68 xor ecx,ecx | |
; First pass (i = 0): mm4 = bp[0], mm5 = ap[0]*bp[0].
69 movd mm4,DWORD [edi] | |
70 movd mm5,DWORD [esi] | |
71 movd mm3,DWORD [ebp] | |
72 pmuludq mm5,mm4 | |
73 movq mm2,mm5 | |
74 movq mm0,mm5 | |
75 pand mm0,mm7 | |
; mm5 = low32(ap[0]*bp[0]) * n0 (pmuludq multiplies the low dwords only);
; its low dword is the reduction multiplier m for this pass.
76 pmuludq mm5,[20+esp] | |
; mm3 = np[0]*m + low32(ap[0]*bp[0]); only the carry (upper half) is kept.
77 pmuludq mm3,mm5 | |
78 paddq mm3,mm0 | |
79 movd mm1,DWORD [4+ebp] | |
80 movd mm0,DWORD [4+esi] | |
; mm2 = carry of the ap*bp chain, mm3 = carry of the np*m chain.
81 psrlq mm2,32 | |
82 psrlq mm3,32 | |
83 inc ecx | |
84 align 16 | |
; Inner loop of the first pass: for j = 1 .. num-2
;   mm2 += ap[j]*bp[0];  mm3 += np[j]*m + low32(mm2);  tp[j-1] = low32(mm3)
; ([28+ecx*4+esp] is tp[ecx-1] because tp[0] lives at [32+esp]).
85 L$0021st: | |
86 pmuludq mm0,mm4 | |
87 pmuludq mm1,mm5 | |
88 paddq mm2,mm0 | |
89 paddq mm3,mm1 | |
90 movq mm0,mm2 | |
91 pand mm0,mm7 | |
92 movd mm1,DWORD [4+ecx*4+ebp] | |
93 paddq mm3,mm0 | |
94 movd mm0,DWORD [4+ecx*4+esi] | |
95 psrlq mm2,32 | |
96 movd DWORD [28+ecx*4+esp],mm3 | |
97 psrlq mm3,32 | |
98 lea ecx,[1+ecx] | |
99 cmp ecx,ebx | |
100 jl NEAR L$0021st | |
; Last word of the first pass (j = num-1), then store the combined carries
; as a 64-bit value covering tp[num-1] and tp[num].
101 pmuludq mm0,mm4 | |
102 pmuludq mm1,mm5 | |
103 paddq mm2,mm0 | |
104 paddq mm3,mm1 | |
105 movq mm0,mm2 | |
106 pand mm0,mm7 | |
107 paddq mm3,mm0 | |
108 movd DWORD [28+ecx*4+esp],mm3 | |
109 psrlq mm2,32 | |
110 psrlq mm3,32 | |
111 paddq mm3,mm2 | |
112 movq [32+ebx*4+esp],mm3 | |
113 inc edx | |
; Outer loop over i = 1 .. num-1: tp = (tp + ap*bp[i] + np*m) >> 32,
; where m = low32(tp[0] + ap[0]*bp[i]) * n0.
114 L$003outer: | |
115 xor ecx,ecx | |
116 movd mm4,DWORD [edx*4+edi] | |
117 movd mm5,DWORD [esi] | |
118 movd mm6,DWORD [32+esp] | |
119 movd mm3,DWORD [ebp] | |
120 pmuludq mm5,mm4 | |
121 paddq mm5,mm6 | |
122 movq mm0,mm5 | |
123 movq mm2,mm5 | |
124 pand mm0,mm7 | |
125 pmuludq mm5,[20+esp] | |
126 pmuludq mm3,mm5 | |
127 paddq mm3,mm0 | |
128 movd mm6,DWORD [36+esp] | |
129 movd mm1,DWORD [4+ebp] | |
130 movd mm0,DWORD [4+esi] | |
131 psrlq mm2,32 | |
132 psrlq mm3,32 | |
133 paddq mm2,mm6 | |
134 inc ecx | |
; ebx doubles as the inner-loop down-counter here; it is reloaded from ecx
; once the inner loop finishes.
135 dec ebx | |
; Inner loop: same recurrence as L$0021st, additionally folding in the
; previous tp[j+1] via mm6.
136 L$004inner: | |
137 pmuludq mm0,mm4 | |
138 pmuludq mm1,mm5 | |
139 paddq mm2,mm0 | |
140 paddq mm3,mm1 | |
141 movq mm0,mm2 | |
142 movd mm6,DWORD [36+ecx*4+esp] | |
143 pand mm0,mm7 | |
144 movd mm1,DWORD [4+ecx*4+ebp] | |
145 paddq mm3,mm0 | |
146 movd mm0,DWORD [4+ecx*4+esi] | |
147 psrlq mm2,32 | |
148 movd DWORD [28+ecx*4+esp],mm3 | |
149 psrlq mm3,32 | |
150 paddq mm2,mm6 | |
; The jnz below tests the flags of this dec; the lea in between leaves
; flags untouched.
151 dec ebx | |
152 lea ecx,[1+ecx] | |
153 jnz NEAR L$004inner | |
; ecx == num-1 here; restore ebx = num-1.
154 mov ebx,ecx | |
155 pmuludq mm0,mm4 | |
156 pmuludq mm1,mm5 | |
157 paddq mm2,mm0 | |
158 paddq mm3,mm1 | |
159 movq mm0,mm2 | |
160 pand mm0,mm7 | |
161 paddq mm3,mm0 | |
162 movd DWORD [28+ecx*4+esp],mm3 | |
163 psrlq mm2,32 | |
164 psrlq mm3,32 | |
; Add the old tp[num] and write tp[num-1]/tp[num] as one 64-bit store.
165 movd mm6,DWORD [36+ebx*4+esp] | |
166 paddq mm3,mm2 | |
167 paddq mm3,mm6 | |
168 movq [32+ebx*4+esp],mm3 | |
169 lea edx,[1+edx] | |
170 cmp edx,ebx | |
171 jle NEAR L$003outer | |
; Leave MMX state before returning to integer code.
172 emms | |
173 jmp NEAR L$005common_tail | |
174 align 16 | |
; ---- Integer-only path ----------------------------------------------
; Decide between general multiplication and dedicated squaring:
; ebp = (num & 1) | (ap - bp), so the jz below is taken only when ap == bp
; and num is even. The mov between the or and the jz does not touch flags.
175 L$001non_sse2: | |
176 mov esi,DWORD [8+esp] | |
177 lea ebp,[1+ebx] | |
178 mov edi,DWORD [12+esp] | |
179 xor ecx,ecx | |
180 mov edx,esi | |
181 and ebp,1 | |
182 sub edx,edi | |
; eax = &bp[num], used as the end-of-bp sentinel in the multiply path.
183 lea eax,[4+ebx*4+edi] | |
184 or ebp,edx | |
; edi = bp[0] (equals ap[0] when the squaring path is taken).
185 mov edi,DWORD [edi] | |
186 jz NEAR L$006bn_sqr_mont | |
187 mov DWORD [28+esp],eax | |
188 mov eax,DWORD [esi] | |
189 xor edx,edx | |
190 align 16 | |
; First pass: tp[j] = low(ap[j]*bp[0] + carry) for j = 0 .. num-2.
; edx carries the high word between iterations; the store after the cmp is
; a mov and therefore does not disturb the flags used by jl.
191 L$007mull: | |
192 mov ebp,edx | |
193 mul edi | |
194 add ebp,eax | |
195 lea ecx,[1+ecx] | |
196 adc edx,0 | |
197 mov eax,DWORD [ecx*4+esi] | |
198 cmp ecx,ebx | |
199 mov DWORD [28+ecx*4+esp],ebp | |
200 jl NEAR L$007mull | |
; Last word (j = num-1); meanwhile switch edi to m = n0 * tp[0] and esi
; to np for the reduction pass.
201 mov ebp,edx | |
202 mul edi | |
203 mov edi,DWORD [20+esp] | |
204 add eax,ebp | |
205 mov esi,DWORD [16+esp] | |
206 adc edx,0 | |
207 imul edi,DWORD [32+esp] | |
; tp[num-1] = low word, tp[num] = high word, tp[num+1] = 0.
208 mov DWORD [32+ebx*4+esp],eax | |
209 xor ecx,ecx | |
210 mov DWORD [36+ebx*4+esp],edx | |
211 mov DWORD [40+ebx*4+esp],ecx | |
; np[0]*m + tp[0]: the sum in eax is overwritten right away, only the
; carry into edx is kept.
212 mov eax,DWORD [esi] | |
213 mul edi | |
214 add eax,DWORD [32+esp] | |
215 mov eax,DWORD [4+esi] | |
216 adc edx,0 | |
217 inc ecx | |
218 jmp NEAR L$0082ndmadd | |
219 align 16 | |
; Multiply-accumulate pass for bp[i], i >= 1:
;   tp[j] = low(tp[j] + ap[j]*bp[i] + carry), j = 0 .. num-2.
220 L$0091stmadd: | |
221 mov ebp,edx | |
222 mul edi | |
223 add ebp,DWORD [32+ecx*4+esp] | |
224 lea ecx,[1+ecx] | |
225 adc edx,0 | |
226 add ebp,eax | |
227 mov eax,DWORD [ecx*4+esi] | |
228 adc edx,0 | |
229 cmp ecx,ebx | |
230 mov DWORD [28+ecx*4+esp],ebp | |
231 jl NEAR L$0091stmadd | |
; Last word of the pass; then set up the reduction: edi = m = n0 * tp[0],
; esi = np, and propagate the carry into tp[num] / tp[num+1].
232 mov ebp,edx | |
233 mul edi | |
234 add eax,DWORD [32+ebx*4+esp] | |
235 mov edi,DWORD [20+esp] | |
236 adc edx,0 | |
237 mov esi,DWORD [16+esp] | |
238 add ebp,eax | |
239 adc edx,0 | |
240 imul edi,DWORD [32+esp] | |
241 xor ecx,ecx | |
242 add edx,DWORD [36+ebx*4+esp] | |
243 mov DWORD [32+ebx*4+esp],ebp | |
244 adc ecx,0 | |
245 mov eax,DWORD [esi] | |
246 mov DWORD [36+ebx*4+esp],edx | |
247 mov DWORD [40+ebx*4+esp],ecx | |
; np[0]*m + tp[0]: only the carry is kept (eax is reloaded next).
248 mul edi | |
249 add eax,DWORD [32+esp] | |
250 mov eax,DWORD [4+esi] | |
251 adc edx,0 | |
252 mov ecx,1 | |
253 align 16 | |
; Reduction pass: tp[j-1] = low(tp[j] + np[j]*m + carry), j = 1 .. num-2.
; The store uses [24+ecx*4+esp] because ecx has already been incremented.
254 L$0082ndmadd: | |
255 mov ebp,edx | |
256 mul edi | |
257 add ebp,DWORD [32+ecx*4+esp] | |
258 lea ecx,[1+ecx] | |
259 adc edx,0 | |
260 add ebp,eax | |
261 mov eax,DWORD [ecx*4+esi] | |
262 adc edx,0 | |
263 cmp ecx,ebx | |
264 mov DWORD [24+ecx*4+esp],ebp | |
265 jl NEAR L$0082ndmadd | |
; Last word (j = num-1) -> tp[num-2]; then fold the carry with tp[num] and
; tp[num+1] to produce the new tp[num-1] and tp[num].
266 mov ebp,edx | |
267 mul edi | |
268 add ebp,DWORD [32+ebx*4+esp] | |
269 adc edx,0 | |
270 add ebp,eax | |
271 adc edx,0 | |
272 mov DWORD [28+ebx*4+esp],ebp | |
273 xor eax,eax | |
; Advance the saved bp pointer by one word and stop once it reaches the
; &bp[num] sentinel stored at [28+esp].
274 mov ecx,DWORD [12+esp] | |
275 add edx,DWORD [36+ebx*4+esp] | |
276 adc eax,DWORD [40+ebx*4+esp] | |
277 lea ecx,[4+ecx] | |
278 mov DWORD [32+ebx*4+esp],edx | |
279 cmp ecx,DWORD [28+esp] | |
280 mov DWORD [36+ebx*4+esp],eax | |
281 je NEAR L$005common_tail | |
; Next outer iteration: edi = next bp word, esi = ap, j = 0, carry = 0.
282 mov edi,DWORD [ecx] | |
283 mov esi,DWORD [8+esp] | |
284 mov DWORD [12+esp],ecx | |
285 xor ecx,ecx | |
286 xor edx,edx | |
287 mov eax,DWORD [esi] | |
288 jmp NEAR L$0091stmadd | |
289 align 16 | |
; ---- Squaring path (ap == bp, num even) -----------------------------
; [esp] = num-1, [12+esp] = outer index i (starts at 0, ecx is 0 here).
; On entry edi = ap[0], esi = ap.
290 L$006bn_sqr_mont: | |
291 mov DWORD [esp],ebx | |
292 mov DWORD [12+esp],ecx | |
; tp[0] = low(ap[0]^2). The high word is split into edx = hi>>1 and
; ebx = hi&1 so that the doubling applied in the loop below (2*x + ebx)
; reconstitutes it.
293 mov eax,edi | |
294 mul edi | |
295 mov DWORD [32+esp],eax | |
296 mov ebx,edx | |
297 shr edx,1 | |
298 and ebx,1 | |
299 inc ecx | |
300 align 16 | |
; First row: for j = 1 .. num-2, tp[j] = 2*low(ap[j]*ap[0] + carry) + ebx,
; with ebx then taking the bit shifted out of the top (shr eax,31).
301 L$010sqr: | |
302 mov eax,DWORD [ecx*4+esi] | |
303 mov ebp,edx | |
304 mul edi | |
305 add eax,ebp | |
306 lea ecx,[1+ecx] | |
307 adc edx,0 | |
308 lea ebp,[eax*2+ebx] | |
309 shr eax,31 | |
310 cmp ecx,DWORD [esp] | |
311 mov ebx,eax | |
312 mov DWORD [28+ecx*4+esp],ebp | |
313 jl NEAR L$010sqr | |
; Last word of the first row (j = num-1): writes tp[num-1], tp[num] and
; tp[num+1]; meanwhile edi becomes m = n0 * tp[0] and esi becomes np.
314 mov eax,DWORD [ecx*4+esi] | |
315 mov ebp,edx | |
316 mul edi | |
317 add eax,ebp | |
318 mov edi,DWORD [20+esp] | |
319 adc edx,0 | |
320 mov esi,DWORD [16+esp] | |
321 lea ebp,[eax*2+ebx] | |
322 imul edi,DWORD [32+esp] | |
323 shr eax,31 | |
324 mov DWORD [32+ecx*4+esp],ebp | |
325 lea ebp,[edx*2+eax] | |
326 mov eax,DWORD [esi] | |
327 shr edx,31 | |
328 mov DWORD [36+ecx*4+esp],ebp | |
329 mov DWORD [40+ecx*4+esp],edx | |
; np[0]*m + tp[0]: only the carry is kept. ebx = num-1 again.
330 mul edi | |
331 add eax,DWORD [32+esp] | |
332 mov ebx,ecx | |
333 adc edx,0 | |
334 mov eax,DWORD [4+esi] | |
335 mov ecx,1 | |
336 align 16 | |
; Reduction pass, two words per iteration:
;   tp[j-1] = low(tp[j]   + np[j]*m   + carry)
;   tp[j]   = low(tp[j+1] + np[j+1]*m + carry)
; then j += 2. The second store uses [24+ecx*4+esp] because ecx has
; already been advanced by 2.
337 L$0113rdmadd: | |
338 mov ebp,edx | |
339 mul edi | |
340 add ebp,DWORD [32+ecx*4+esp] | |
341 adc edx,0 | |
342 add ebp,eax | |
343 mov eax,DWORD [4+ecx*4+esi] | |
344 adc edx,0 | |
345 mov DWORD [28+ecx*4+esp],ebp | |
346 mov ebp,edx | |
347 mul edi | |
348 add ebp,DWORD [36+ecx*4+esp] | |
349 lea ecx,[2+ecx] | |
350 adc edx,0 | |
351 add ebp,eax | |
352 mov eax,DWORD [ecx*4+esi] | |
353 adc edx,0 | |
354 cmp ecx,ebx | |
355 mov DWORD [24+ecx*4+esp],ebp | |
356 jl NEAR L$0113rdmadd | |
; Last reduction word -> tp[num-2]; fold carries into tp[num-1], tp[num].
357 mov ebp,edx | |
358 mul edi | |
359 add ebp,DWORD [32+ebx*4+esp] | |
360 adc edx,0 | |
361 add ebp,eax | |
362 adc edx,0 | |
363 mov DWORD [28+ebx*4+esp],ebp | |
; ecx = outer index i, esi = ap again. Finished once i == num-1.
364 mov ecx,DWORD [12+esp] | |
365 xor eax,eax | |
366 mov esi,DWORD [8+esp] | |
367 add edx,DWORD [36+ebx*4+esp] | |
368 adc eax,DWORD [40+ebx*4+esp] | |
369 mov DWORD [32+ebx*4+esp],edx | |
370 cmp ecx,ebx | |
371 mov DWORD [36+ebx*4+esp],eax | |
372 je NEAR L$005common_tail | |
; Next row: i += 1, edi = ap[i]; add ap[i]^2 into tp[i].
373 mov edi,DWORD [4+ecx*4+esi] | |
374 lea ecx,[1+ecx] | |
375 mov eax,edi | |
376 mov DWORD [12+esp],ecx | |
377 mul edi | |
378 add eax,DWORD [32+ecx*4+esp] | |
379 adc edx,0 | |
380 mov DWORD [32+ecx*4+esp],eax | |
381 xor ebp,ebp | |
; If i == num-1 there are no cross products left in this row; the lea
; keeps the flags of the cmp intact for the je.
382 cmp ecx,ebx | |
383 lea ecx,[1+ecx] | |
384 je NEAR L$012sqrlast | |
; Split the high word as before: edx = hi>>1, ebx = hi&1.
385 mov ebx,edx | |
386 shr edx,1 | |
387 and ebx,1 | |
388 align 16 | |
; Cross products of row i: for j = i+1 .. num-1
;   tp[j] = low(tp[j] + 2*low(ap[j]*ap[i] + carry) + ebx)
; ebx collects the bit shifted out by the doubling plus the addition
; carries. lea is used for the doubling so the carry of the preceding add
; survives until the adc.
389 L$013sqradd: | |
390 mov eax,DWORD [ecx*4+esi] | |
391 mov ebp,edx | |
392 mul edi | |
393 add eax,ebp | |
394 lea ebp,[eax*1+eax] | |
395 adc edx,0 | |
396 shr eax,31 | |
397 add ebp,DWORD [32+ecx*4+esp] | |
398 lea ecx,[1+ecx] | |
399 adc eax,0 | |
400 add ebp,ebx | |
401 adc eax,0 | |
402 cmp ecx,DWORD [esp] | |
403 mov DWORD [28+ecx*4+esp],ebp | |
404 mov ebx,eax | |
405 jle NEAR L$013sqradd | |
; Double the final high word: edx = 2*edx + ebx, ebp = bits carried out.
406 mov ebp,edx | |
407 add edx,edx | |
408 shr ebp,31 | |
409 add edx,ebx | |
410 adc ebp,0 | |
; Row epilogue (ecx == num here): add edx:ebp into tp[num]/tp[num+1], set
; edi = m = n0 * tp[0], esi = np, compute the carry of np[0]*m + tp[0],
; restore ebx = num-1 and rejoin the shared reduction loop with j = 1.
411 L$012sqrlast: | |
412 mov edi,DWORD [20+esp] | |
413 mov esi,DWORD [16+esp] | |
414 imul edi,DWORD [32+esp] | |
415 add edx,DWORD [32+ecx*4+esp] | |
416 mov eax,DWORD [esi] | |
417 adc ebp,0 | |
418 mov DWORD [32+ecx*4+esp],edx | |
419 mov DWORD [36+ecx*4+esp],ebp | |
420 mul edi | |
421 add eax,DWORD [32+esp] | |
422 lea ebx,[ecx-1] | |
423 adc edx,0 | |
424 mov ecx,1 | |
425 mov eax,DWORD [4+esi] | |
426 jmp NEAR L$0113rdmadd | |
427 align 16 | |
; ---- Final subtraction and copy-out (all paths) ---------------------
; ebp = np, edi = rp, esi = &tp[0], ecx = num-1 (down-counter),
; edx = 0 (word index). The xor also clears CF for the first sbb.
428 L$005common_tail: | |
429 mov ebp,DWORD [16+esp] | |
430 mov edi,DWORD [4+esp] | |
431 lea esi,[32+esp] | |
432 mov eax,DWORD [esi] | |
433 mov ecx,ebx | |
434 xor edx,edx | |
435 align 16 | |
; rp[k] = tp[k] - np[k] - borrow for k = 0 .. num-1. The borrow lives in
; CF across iterations: dec, mov and lea all leave CF unchanged.
436 L$014sub: | |
437 sbb eax,DWORD [edx*4+ebp] | |
438 mov DWORD [edx*4+edi],eax | |
439 dec ecx | |
440 mov eax,DWORD [4+edx*4+esi] | |
441 lea edx,[1+edx] | |
442 jge NEAR L$014sub | |
; eax = tp[num] - borrow; used below as a bit mask selecting tp[] over the
; subtracted value. ecx is -1 at this point (the loop exits when dec makes
; it negative).
; NOTE(review): the select is only all-or-nothing if this value is 0 or
; all-ones -- presumably guaranteed by tp[num] being 0 or 1; confirm.
443 sbb eax,0 | |
444 align 16 | |
; For k = num-1 down to 0:
;   rp[k] = ((tp[k] ^ rp[k]) & eax) ^ rp[k]   (branch-free select)
;   tp[k] = ecx                               (overwrite the temporary)
445 L$015copy: | |
446 mov edx,DWORD [ebx*4+esi] | |
447 mov ebp,DWORD [ebx*4+edi] | |
448 xor edx,ebp | |
449 and edx,eax | |
450 xor edx,ebp | |
451 mov DWORD [ebx*4+esi],ecx | |
452 mov DWORD [ebx*4+edi],edx | |
453 dec ebx | |
454 jge NEAR L$015copy | |
; Drop the private frame and report success.
455 mov esp,DWORD [24+esp] | |
456 mov eax,1 | |
; Shared exit; reached directly with eax = 0 when num < 4.
457 L$000just_leave: | |
458 pop edi | |
459 pop esi | |
460 pop ebx | |
461 pop ebp | |
462 ret | |
; NUL-terminated ASCII identification string embedded after the code:
; "Montgomery Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>"
463 db 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 | |
464 db 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 | |
465 db 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 | |
466 db 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 | |
467 db 111,114,103,62,0 | |
; Declare _OPENSSL_ia32cap_P as a 16-byte common symbol in .bss; the code
; above tests bit 26 of its first dword to choose the SSE2 path.
468 segment .bss | |
469 common _OPENSSL_ia32cap_P 16 | |
OLD | NEW |