OLD | NEW |
| (Empty) |
1 #if defined(__aarch64__) | |
2 .text | |
3 | |
// bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
//             const BN_ULONG *np, const BN_ULONG *n0, int num)
// Montgomery multiplication: writes ap*bp*R^-1 mod np to rp[0..num-1]
// (R = 2^(64*num); presumably — standard Montgomery contract, confirm
// against the C caller).
// In:  x0=rp, x1=ap, x2=bp, x3=np (modulus), x4=&n0 (n0 = -np^-1 mod 2^64),
//      x5=num (number of 64-bit limbs).
// Out: x0 = 1 (see "mov x0,#1" in the epilogue); result stored to rp.
// Dispatch: num%8==0 goes to __bn_sqr8x_mont, num%4==0 to __bn_mul4x_mont;
// the code below is the generic any-num path. Temp vector tp[num] lives on
// the stack ("alloca") and is wiped before return.
4 .globl bn_mul_mont | |
5 .hidden bn_mul_mont | |
6 .type bn_mul_mont,%function | |
7 .align 5 | |
8 bn_mul_mont: | |
9 tst x5,#7 | |
10 b.eq __bn_sqr8x_mont | |
11 tst x5,#3 | |
12 b.eq __bn_mul4x_mont | |
13 .Lmul_mont: | |
14 stp x29,x30,[sp,#-64]! | |
15 add x29,sp,#0 | |
16 stp x19,x20,[sp,#16] | |
17 stp x21,x22,[sp,#32] | |
18 stp x23,x24,[sp,#48] | |
19 | |
20 ldr x9,[x2],#8 // bp[0] | |
21 sub x22,sp,x5,lsl#3 | |
22 ldp x7,x8,[x1],#16 // ap[0..1] | |
23 lsl x5,x5,#3 | |
24 ldr x4,[x4] // *n0 | |
25 and x22,x22,#-16 // ABI says so | |
26 ldp x13,x14,[x3],#16 // np[0..1] | |
27 | |
28 mul x6,x7,x9 // ap[0]*bp[0] | |
29 sub x21,x5,#16 // j=num-2 | |
30 umulh x7,x7,x9 | |
31 mul x10,x8,x9 // ap[1]*bp[0] | |
32 umulh x11,x8,x9 | |
33 | |
34 mul x15,x6,x4 // "tp[0]"*n0 | |
35 mov sp,x22 // alloca | |
36 | |
37 // (*) mul x12,x13,x15 // np[0]*m1 | |
38 umulh x13,x13,x15 | |
39 mul x16,x14,x15 // np[1]*m1 | |
40 // (*) adds x12,x12,x6 // discarded | |
41 // (*) As for removal of first multiplication and addition | |
42 // instructions. The outcome of first addition is | |
43 // guaranteed to be zero, which leaves two computationally | |
44 // significant outcomes: it either carries or not. Then | |
45 // question is when does it carry? Is there alternative | |
46 // way to deduce it? If you follow operations, you can | |
47 // observe that condition for carry is quite simple: | |
48 // x6 being non-zero. So that carry can be calculated | |
49 // by adding -1 to x6. That's what next instruction does. | |
50 subs xzr,x6,#1 // (*) | |
51 umulh x17,x14,x15 | |
52 adc x13,x13,xzr | |
53 cbz x21,.L1st_skip | |
54 | |
// First outer iteration: accumulate ap[j]*bp[0] + np[j]*m1 into tp[],
// j counted down in bytes via x21.
55 .L1st: | |
56 ldr x8,[x1],#8 | |
57 adds x6,x10,x7 | |
58 sub x21,x21,#8 // j-- | |
59 adc x7,x11,xzr | |
60 | |
61 ldr x14,[x3],#8 | |
62 adds x12,x16,x13 | |
63 mul x10,x8,x9 // ap[j]*bp[0] | |
64 adc x13,x17,xzr | |
65 umulh x11,x8,x9 | |
66 | |
67 adds x12,x12,x6 | |
68 mul x16,x14,x15 // np[j]*m1 | |
69 adc x13,x13,xzr | |
70 umulh x17,x14,x15 | |
71 str x12,[x22],#8 // tp[j-1] | |
72 cbnz x21,.L1st | |
73 | |
74 .L1st_skip: | |
75 adds x6,x10,x7 | |
76 sub x1,x1,x5 // rewind x1 | |
77 adc x7,x11,xzr | |
78 | |
79 adds x12,x16,x13 | |
80 sub x3,x3,x5 // rewind x3 | |
81 adc x13,x17,xzr | |
82 | |
83 adds x12,x12,x6 | |
84 sub x20,x5,#8 // i=num-1 | |
85 adcs x13,x13,x7 | |
86 | |
87 adc x19,xzr,xzr // upmost overflow bit | |
88 stp x12,x13,[x22] | |
89 | |
// Outer loop over the remaining bp words; x20 holds the remaining
// iteration count in bytes and x19 carries the top overflow bit
// across iterations.
90 .Louter: | |
91 ldr x9,[x2],#8 // bp[i] | |
92 ldp x7,x8,[x1],#16 | |
93 ldr x23,[sp] // tp[0] | |
94 add x22,sp,#8 | |
95 | |
96 mul x6,x7,x9 // ap[0]*bp[i] | |
97 sub x21,x5,#16 // j=num-2 | |
98 umulh x7,x7,x9 | |
99 ldp x13,x14,[x3],#16 | |
100 mul x10,x8,x9 // ap[1]*bp[i] | |
101 adds x6,x6,x23 | |
102 umulh x11,x8,x9 | |
103 adc x7,x7,xzr | |
104 | |
105 mul x15,x6,x4 | |
106 sub x20,x20,#8 // i-- | |
107 | |
108 // (*) mul x12,x13,x15 // np[0]*m1 | |
109 umulh x13,x13,x15 | |
110 mul x16,x14,x15 // np[1]*m1 | |
111 // (*) adds x12,x12,x6 | |
112 subs xzr,x6,#1 // (*) | |
113 umulh x17,x14,x15 | |
114 cbz x21,.Linner_skip | |
115 | |
116 .Linner: | |
117 ldr x8,[x1],#8 | |
118 adc x13,x13,xzr | |
119 ldr x23,[x22],#8 // tp[j] | |
120 adds x6,x10,x7 | |
121 sub x21,x21,#8 // j-- | |
122 adc x7,x11,xzr | |
123 | |
124 adds x12,x16,x13 | |
125 ldr x14,[x3],#8 | |
126 adc x13,x17,xzr | |
127 | |
128 mul x10,x8,x9 // ap[j]*bp[i] | |
129 adds x6,x6,x23 | |
130 umulh x11,x8,x9 | |
131 adc x7,x7,xzr | |
132 | |
133 mul x16,x14,x15 // np[j]*m1 | |
134 adds x12,x12,x6 | |
135 umulh x17,x14,x15 | |
136 str x12,[x22,#-16] // tp[j-1] | |
137 cbnz x21,.Linner | |
138 | |
139 .Linner_skip: | |
140 ldr x23,[x22],#8 // tp[j] | |
141 adc x13,x13,xzr | |
142 adds x6,x10,x7 | |
143 sub x1,x1,x5 // rewind x1 | |
144 adc x7,x11,xzr | |
145 | |
146 adds x12,x16,x13 | |
147 sub x3,x3,x5 // rewind x3 | |
148 adcs x13,x17,x19 | |
149 adc x19,xzr,xzr | |
150 | |
151 adds x6,x6,x23 | |
152 adc x7,x7,xzr | |
153 | |
154 adds x12,x12,x6 | |
155 adcs x13,x13,x7 | |
156 adc x19,x19,xzr // upmost overflow bit | |
157 stp x12,x13,[x22,#-16] | |
158 | |
159 cbnz x20,.Louter | |
160 | |
161 // Final step. We see if result is larger than modulus, and | |
162 // if it is, subtract the modulus. But comparison implies | |
163 // subtraction. So we subtract modulus, see if it borrowed, | |
164 // and conditionally copy original value. | |
165 ldr x23,[sp] // tp[0] | |
166 add x22,sp,#8 | |
167 ldr x14,[x3],#8 // np[0] | |
168 subs x21,x5,#8 // j=num-1 and clear borrow | |
169 mov x1,x0 | |
170 .Lsub: | |
171 sbcs x8,x23,x14 // tp[j]-np[j] | |
172 ldr x23,[x22],#8 | |
173 sub x21,x21,#8 // j-- | |
174 ldr x14,[x3],#8 | |
175 str x8,[x1],#8 // rp[j]=tp[j]-np[j] | |
176 cbnz x21,.Lsub | |
177 | |
178 sbcs x8,x23,x14 | |
179 sbcs x19,x19,xzr // did it borrow? | |
180 str x8,[x1],#8 // rp[num-1] | |
181 | |
182 ldr x23,[sp] // tp[0] | |
183 add x22,sp,#8 | |
184 ldr x8,[x0],#8 // rp[0] | |
185 sub x5,x5,#8 // num-- | |
186 nop | |
// Select tp[] (borrow occurred, "lo") or the subtracted value already in
// rp[], store to rp, and zero the stack temporary as we go.
187 .Lcond_copy: | |
188 sub x5,x5,#8 // num-- | |
189 csel x14,x23,x8,lo // did it borrow? | |
190 ldr x23,[x22],#8 | |
191 ldr x8,[x0],#8 | |
192 str xzr,[x22,#-16] // wipe tp | |
193 str x14,[x0,#-16] | |
194 cbnz x5,.Lcond_copy | |
195 | |
196 csel x14,x23,x8,lo | |
197 str xzr,[x22,#-8] // wipe tp | |
198 str x14,[x0,#-8] | |
199 | |
200 ldp x19,x20,[x29,#16] | |
201 mov sp,x29 | |
202 ldp x21,x22,[x29,#32] | |
203 mov x0,#1 | |
204 ldp x23,x24,[x29,#48] | |
205 ldr x29,[sp],#64 | |
206 ret | |
207 .size bn_mul_mont,.-bn_mul_mont | |
// __bn_sqr8x_mont: squaring-specialized Montgomery path, taken by
// bn_mul_mont when num is a multiple of 8. If ap != bp it is not a real
// squaring and control falls through to __bn_mul4x_mont. Register
// arguments are the same as bn_mul_mont (x0=rp, x1=ap, x2=bp, x3=np,
// x4=&n0, x5=num); rp/np/n0 are offloaded to the frame at [x29,#96..112].
// Phases, as marked by the comments below: (1) compute all off-diagonal
// products a[i]*a[j], i>j, into the stack temp; (2) double that result by
// a 63-bit shift chain (extr) and add the diagonal squares a[i]^2;
// (3) reduce 512 bits (8 limbs) per iteration with n0, tracking the
// top-most carry in x30; (4) conditional subtraction of the modulus and
// constant-choice (csel) copy-out, wiping the stack temp.
208 .type __bn_sqr8x_mont,%function | |
209 .align 5 | |
210 __bn_sqr8x_mont: | |
211 cmp x1,x2 | |
212 b.ne __bn_mul4x_mont | |
213 .Lsqr8x_mont: | |
214 stp x29,x30,[sp,#-128]! | |
215 add x29,sp,#0 | |
216 stp x19,x20,[sp,#16] | |
217 stp x21,x22,[sp,#32] | |
218 stp x23,x24,[sp,#48] | |
219 stp x25,x26,[sp,#64] | |
220 stp x27,x28,[sp,#80] | |
221 stp x0,x3,[sp,#96] // offload rp and np | |
222 | |
223 ldp x6,x7,[x1,#8*0] | |
224 ldp x8,x9,[x1,#8*2] | |
225 ldp x10,x11,[x1,#8*4] | |
226 ldp x12,x13,[x1,#8*6] | |
227 | |
228 sub x2,sp,x5,lsl#4 | |
229 lsl x5,x5,#3 | |
230 ldr x4,[x4] // *n0 | |
231 mov sp,x2 // alloca | |
232 sub x27,x5,#8*8 | |
233 b .Lsqr8x_zero_start | |
234 | |
// Zero the 2*num-limb stack temporary, 16 limbs per pass.
235 .Lsqr8x_zero: | |
236 sub x27,x27,#8*8 | |
237 stp xzr,xzr,[x2,#8*0] | |
238 stp xzr,xzr,[x2,#8*2] | |
239 stp xzr,xzr,[x2,#8*4] | |
240 stp xzr,xzr,[x2,#8*6] | |
241 .Lsqr8x_zero_start: | |
242 stp xzr,xzr,[x2,#8*8] | |
243 stp xzr,xzr,[x2,#8*10] | |
244 stp xzr,xzr,[x2,#8*12] | |
245 stp xzr,xzr,[x2,#8*14] | |
246 add x2,x2,#8*16 | |
247 cbnz x27,.Lsqr8x_zero | |
248 | |
249 add x3,x1,x5 | |
250 add x1,x1,#8*8 | |
251 mov x19,xzr | |
252 mov x20,xzr | |
253 mov x21,xzr | |
254 mov x22,xzr | |
255 mov x23,xzr | |
256 mov x24,xzr | |
257 mov x25,xzr | |
258 mov x26,xzr | |
259 mov x2,sp | |
260 str x4,[x29,#112] // offload n0 | |
261 | |
262 // Multiply everything but a[i]*a[i] | |
263 .align 4 | |
264 .Lsqr8x_outer_loop: | |
265 // a[1]a[0] (i) | |
266 // a[2]a[0] | |
267 // a[3]a[0] | |
268 // a[4]a[0] | |
269 // a[5]a[0] | |
270 // a[6]a[0] | |
271 // a[7]a[0] | |
272 // a[2]a[1] (ii) | |
273 // a[3]a[1] | |
274 // a[4]a[1] | |
275 // a[5]a[1] | |
276 // a[6]a[1] | |
277 // a[7]a[1] | |
278 // a[3]a[2] (iii) | |
279 // a[4]a[2] | |
280 // a[5]a[2] | |
281 // a[6]a[2] | |
282 // a[7]a[2] | |
283 // a[4]a[3] (iv) | |
284 // a[5]a[3] | |
285 // a[6]a[3] | |
286 // a[7]a[3] | |
287 // a[5]a[4] (v) | |
288 // a[6]a[4] | |
289 // a[7]a[4] | |
290 // a[6]a[5] (vi) | |
291 // a[7]a[5] | |
292 // a[7]a[6] (vii) | |
293 | |
294 mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) | |
295 mul x15,x8,x6 | |
296 mul x16,x9,x6 | |
297 mul x17,x10,x6 | |
298 adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) | |
299 mul x14,x11,x6 | |
300 adcs x21,x21,x15 | |
301 mul x15,x12,x6 | |
302 adcs x22,x22,x16 | |
303 mul x16,x13,x6 | |
304 adcs x23,x23,x17 | |
305 umulh x17,x7,x6 // hi(a[1..7]*a[0]) | |
306 adcs x24,x24,x14 | |
307 umulh x14,x8,x6 | |
308 adcs x25,x25,x15 | |
309 umulh x15,x9,x6 | |
310 adcs x26,x26,x16 | |
311 umulh x16,x10,x6 | |
312 stp x19,x20,[x2],#8*2 // t[0..1] | |
313 adc x19,xzr,xzr // t[8] | |
314 adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) | |
315 umulh x17,x11,x6 | |
316 adcs x22,x22,x14 | |
317 umulh x14,x12,x6 | |
318 adcs x23,x23,x15 | |
319 umulh x15,x13,x6 | |
320 adcs x24,x24,x16 | |
321 mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) | |
322 adcs x25,x25,x17 | |
323 mul x17,x9,x7 | |
324 adcs x26,x26,x14 | |
325 mul x14,x10,x7 | |
326 adc x19,x19,x15 | |
327 | |
328 mul x15,x11,x7 | |
329 adds x22,x22,x16 | |
330 mul x16,x12,x7 | |
331 adcs x23,x23,x17 | |
332 mul x17,x13,x7 | |
333 adcs x24,x24,x14 | |
334 umulh x14,x8,x7 // hi(a[2..7]*a[1]) | |
335 adcs x25,x25,x15 | |
336 umulh x15,x9,x7 | |
337 adcs x26,x26,x16 | |
338 umulh x16,x10,x7 | |
339 adcs x19,x19,x17 | |
340 umulh x17,x11,x7 | |
341 stp x21,x22,[x2],#8*2 // t[2..3] | |
342 adc x20,xzr,xzr // t[9] | |
343 adds x23,x23,x14 | |
344 umulh x14,x12,x7 | |
345 adcs x24,x24,x15 | |
346 umulh x15,x13,x7 | |
347 adcs x25,x25,x16 | |
348 mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) | |
349 adcs x26,x26,x17 | |
350 mul x17,x10,x8 | |
351 adcs x19,x19,x14 | |
352 mul x14,x11,x8 | |
353 adc x20,x20,x15 | |
354 | |
355 mul x15,x12,x8 | |
356 adds x24,x24,x16 | |
357 mul x16,x13,x8 | |
358 adcs x25,x25,x17 | |
359 umulh x17,x9,x8 // hi(a[3..7]*a[2]) | |
360 adcs x26,x26,x14 | |
361 umulh x14,x10,x8 | |
362 adcs x19,x19,x15 | |
363 umulh x15,x11,x8 | |
364 adcs x20,x20,x16 | |
365 umulh x16,x12,x8 | |
366 stp x23,x24,[x2],#8*2 // t[4..5] | |
367 adc x21,xzr,xzr // t[10] | |
368 adds x25,x25,x17 | |
369 umulh x17,x13,x8 | |
370 adcs x26,x26,x14 | |
371 mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) | |
372 adcs x19,x19,x15 | |
373 mul x15,x11,x9 | |
374 adcs x20,x20,x16 | |
375 mul x16,x12,x9 | |
376 adc x21,x21,x17 | |
377 | |
378 mul x17,x13,x9 | |
379 adds x26,x26,x14 | |
380 umulh x14,x10,x9 // hi(a[4..7]*a[3]) | |
381 adcs x19,x19,x15 | |
382 umulh x15,x11,x9 | |
383 adcs x20,x20,x16 | |
384 umulh x16,x12,x9 | |
385 adcs x21,x21,x17 | |
386 umulh x17,x13,x9 | |
387 stp x25,x26,[x2],#8*2 // t[6..7] | |
388 adc x22,xzr,xzr // t[11] | |
389 adds x19,x19,x14 | |
390 mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) | |
391 adcs x20,x20,x15 | |
392 mul x15,x12,x10 | |
393 adcs x21,x21,x16 | |
394 mul x16,x13,x10 | |
395 adc x22,x22,x17 | |
396 | |
397 umulh x17,x11,x10 // hi(a[5..7]*a[4]) | |
398 adds x20,x20,x14 | |
399 umulh x14,x12,x10 | |
400 adcs x21,x21,x15 | |
401 umulh x15,x13,x10 | |
402 adcs x22,x22,x16 | |
403 mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) | |
404 adc x23,xzr,xzr // t[12] | |
405 adds x21,x21,x17 | |
406 mul x17,x13,x11 | |
407 adcs x22,x22,x14 | |
408 umulh x14,x12,x11 // hi(a[6..7]*a[5]) | |
409 adc x23,x23,x15 | |
410 | |
411 umulh x15,x13,x11 | |
412 adds x22,x22,x16 | |
413 mul x16,x13,x12 // lo(a[7]*a[6]) (vii) | |
414 adcs x23,x23,x17 | |
415 umulh x17,x13,x12 // hi(a[7]*a[6]) | |
416 adc x24,xzr,xzr // t[13] | |
417 adds x23,x23,x14 | |
418 sub x27,x3,x1 // done yet? | |
419 adc x24,x24,x15 | |
420 | |
421 adds x24,x24,x16 | |
422 sub x14,x3,x5 // rewinded ap | |
423 adc x25,xzr,xzr // t[14] | |
424 add x25,x25,x17 | |
425 | |
426 cbz x27,.Lsqr8x_outer_break | |
427 | |
428 mov x4,x6 | |
429 ldp x6,x7,[x2,#8*0] | |
430 ldp x8,x9,[x2,#8*2] | |
431 ldp x10,x11,[x2,#8*4] | |
432 ldp x12,x13,[x2,#8*6] | |
433 adds x19,x19,x6 | |
434 adcs x20,x20,x7 | |
435 ldp x6,x7,[x1,#8*0] | |
436 adcs x21,x21,x8 | |
437 adcs x22,x22,x9 | |
438 ldp x8,x9,[x1,#8*2] | |
439 adcs x23,x23,x10 | |
440 adcs x24,x24,x11 | |
441 ldp x10,x11,[x1,#8*4] | |
442 adcs x25,x25,x12 | |
443 mov x0,x1 | |
444 adcs x26,xzr,x13 | |
445 ldp x12,x13,[x1,#8*6] | |
446 add x1,x1,#8*8 | |
447 //adc x28,xzr,xzr // moved below | |
448 mov x27,#-8*8 | |
449 | |
450 // a[8]a[0] | |
451 // a[9]a[0] | |
452 // a[a]a[0] | |
453 // a[b]a[0] | |
454 // a[c]a[0] | |
455 // a[d]a[0] | |
456 // a[e]a[0] | |
457 // a[f]a[0] | |
458 // a[8]a[1] | |
459 // a[f]a[1]........................ | |
460 // a[8]a[2] | |
461 // a[f]a[2]........................ | |
462 // a[8]a[3] | |
463 // a[f]a[3]........................ | |
464 // a[8]a[4] | |
465 // a[f]a[4]........................ | |
466 // a[8]a[5] | |
467 // a[f]a[5]........................ | |
468 // a[8]a[6] | |
469 // a[f]a[6]........................ | |
470 // a[8]a[7] | |
471 // a[f]a[7]........................ | |
472 .Lsqr8x_mul: | |
473 mul x14,x6,x4 | |
474 adc x28,xzr,xzr // carry bit, modulo-scheduled | |
475 mul x15,x7,x4 | |
476 add x27,x27,#8 | |
477 mul x16,x8,x4 | |
478 mul x17,x9,x4 | |
479 adds x19,x19,x14 | |
480 mul x14,x10,x4 | |
481 adcs x20,x20,x15 | |
482 mul x15,x11,x4 | |
483 adcs x21,x21,x16 | |
484 mul x16,x12,x4 | |
485 adcs x22,x22,x17 | |
486 mul x17,x13,x4 | |
487 adcs x23,x23,x14 | |
488 umulh x14,x6,x4 | |
489 adcs x24,x24,x15 | |
490 umulh x15,x7,x4 | |
491 adcs x25,x25,x16 | |
492 umulh x16,x8,x4 | |
493 adcs x26,x26,x17 | |
494 umulh x17,x9,x4 | |
495 adc x28,x28,xzr | |
496 str x19,[x2],#8 | |
497 adds x19,x20,x14 | |
498 umulh x14,x10,x4 | |
499 adcs x20,x21,x15 | |
500 umulh x15,x11,x4 | |
501 adcs x21,x22,x16 | |
502 umulh x16,x12,x4 | |
503 adcs x22,x23,x17 | |
504 umulh x17,x13,x4 | |
505 ldr x4,[x0,x27] | |
506 adcs x23,x24,x14 | |
507 adcs x24,x25,x15 | |
508 adcs x25,x26,x16 | |
509 adcs x26,x28,x17 | |
510 //adc x28,xzr,xzr // moved above | |
511 cbnz x27,.Lsqr8x_mul | |
512 // note that carry flag is guaranteed | |
513 // to be zero at this point | |
514 cmp x1,x3 // done yet? | |
515 b.eq .Lsqr8x_break | |
516 | |
517 ldp x6,x7,[x2,#8*0] | |
518 ldp x8,x9,[x2,#8*2] | |
519 ldp x10,x11,[x2,#8*4] | |
520 ldp x12,x13,[x2,#8*6] | |
521 adds x19,x19,x6 | |
522 ldr x4,[x0,#-8*8] | |
523 adcs x20,x20,x7 | |
524 ldp x6,x7,[x1,#8*0] | |
525 adcs x21,x21,x8 | |
526 adcs x22,x22,x9 | |
527 ldp x8,x9,[x1,#8*2] | |
528 adcs x23,x23,x10 | |
529 adcs x24,x24,x11 | |
530 ldp x10,x11,[x1,#8*4] | |
531 adcs x25,x25,x12 | |
532 mov x27,#-8*8 | |
533 adcs x26,x26,x13 | |
534 ldp x12,x13,[x1,#8*6] | |
535 add x1,x1,#8*8 | |
536 //adc x28,xzr,xzr // moved above | |
537 b .Lsqr8x_mul | |
538 | |
539 .align 4 | |
540 .Lsqr8x_break: | |
541 ldp x6,x7,[x0,#8*0] | |
542 add x1,x0,#8*8 | |
543 ldp x8,x9,[x0,#8*2] | |
544 sub x14,x3,x1 // is it last iteration? | |
545 ldp x10,x11,[x0,#8*4] | |
546 sub x15,x2,x14 | |
547 ldp x12,x13,[x0,#8*6] | |
548 cbz x14,.Lsqr8x_outer_loop | |
549 | |
550 stp x19,x20,[x2,#8*0] | |
551 ldp x19,x20,[x15,#8*0] | |
552 stp x21,x22,[x2,#8*2] | |
553 ldp x21,x22,[x15,#8*2] | |
554 stp x23,x24,[x2,#8*4] | |
555 ldp x23,x24,[x15,#8*4] | |
556 stp x25,x26,[x2,#8*6] | |
557 mov x2,x15 | |
558 ldp x25,x26,[x15,#8*6] | |
559 b .Lsqr8x_outer_loop | |
560 | |
561 .align 4 | |
562 .Lsqr8x_outer_break: | |
563 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] | |
564 ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] | |
565 ldp x15,x16,[sp,#8*1] | |
566 ldp x11,x13,[x14,#8*2] | |
567 add x1,x14,#8*4 | |
568 ldp x17,x14,[sp,#8*3] | |
569 | |
570 stp x19,x20,[x2,#8*0] | |
571 mul x19,x7,x7 | |
572 stp x21,x22,[x2,#8*2] | |
573 umulh x7,x7,x7 | |
574 stp x23,x24,[x2,#8*4] | |
575 mul x8,x9,x9 | |
576 stp x25,x26,[x2,#8*6] | |
577 mov x2,sp | |
578 umulh x9,x9,x9 | |
579 adds x20,x7,x15,lsl#1 | |
580 extr x15,x16,x15,#63 | |
581 sub x27,x5,#8*4 | |
582 | |
// Double the off-diagonal sum via 63-bit extr shifts while adding the
// diagonal squares a[i]^2, four limbs of a[] per pass.
583 .Lsqr4x_shift_n_add: | |
584 adcs x21,x8,x15 | |
585 extr x16,x17,x16,#63 | |
586 sub x27,x27,#8*4 | |
587 adcs x22,x9,x16 | |
588 ldp x15,x16,[x2,#8*5] | |
589 mul x10,x11,x11 | |
590 ldp x7,x9,[x1],#8*2 | |
591 umulh x11,x11,x11 | |
592 mul x12,x13,x13 | |
593 umulh x13,x13,x13 | |
594 extr x17,x14,x17,#63 | |
595 stp x19,x20,[x2,#8*0] | |
596 adcs x23,x10,x17 | |
597 extr x14,x15,x14,#63 | |
598 stp x21,x22,[x2,#8*2] | |
599 adcs x24,x11,x14 | |
600 ldp x17,x14,[x2,#8*7] | |
601 extr x15,x16,x15,#63 | |
602 adcs x25,x12,x15 | |
603 extr x16,x17,x16,#63 | |
604 adcs x26,x13,x16 | |
605 ldp x15,x16,[x2,#8*9] | |
606 mul x6,x7,x7 | |
607 ldp x11,x13,[x1],#8*2 | |
608 umulh x7,x7,x7 | |
609 mul x8,x9,x9 | |
610 umulh x9,x9,x9 | |
611 stp x23,x24,[x2,#8*4] | |
612 extr x17,x14,x17,#63 | |
613 stp x25,x26,[x2,#8*6] | |
614 add x2,x2,#8*8 | |
615 adcs x19,x6,x17 | |
616 extr x14,x15,x14,#63 | |
617 adcs x20,x7,x14 | |
618 ldp x17,x14,[x2,#8*3] | |
619 extr x15,x16,x15,#63 | |
620 cbnz x27,.Lsqr4x_shift_n_add | |
621 ldp x1,x4,[x29,#104] // pull np and n0 | |
622 | |
623 adcs x21,x8,x15 | |
624 extr x16,x17,x16,#63 | |
625 adcs x22,x9,x16 | |
626 ldp x15,x16,[x2,#8*5] | |
627 mul x10,x11,x11 | |
628 umulh x11,x11,x11 | |
629 stp x19,x20,[x2,#8*0] | |
630 mul x12,x13,x13 | |
631 umulh x13,x13,x13 | |
632 stp x21,x22,[x2,#8*2] | |
633 extr x17,x14,x17,#63 | |
634 adcs x23,x10,x17 | |
635 extr x14,x15,x14,#63 | |
636 ldp x19,x20,[sp,#8*0] | |
637 adcs x24,x11,x14 | |
638 extr x15,x16,x15,#63 | |
639 ldp x6,x7,[x1,#8*0] | |
640 adcs x25,x12,x15 | |
641 extr x16,xzr,x16,#63 | |
642 ldp x8,x9,[x1,#8*2] | |
643 adc x26,x13,x16 | |
644 ldp x10,x11,[x1,#8*4] | |
645 | |
646 // Reduce by 512 bits per iteration | |
647 mul x28,x4,x19 // t[0]*n0 | |
648 ldp x12,x13,[x1,#8*6] | |
649 add x3,x1,x5 | |
650 ldp x21,x22,[sp,#8*2] | |
651 stp x23,x24,[x2,#8*4] | |
652 ldp x23,x24,[sp,#8*4] | |
653 stp x25,x26,[x2,#8*6] | |
654 ldp x25,x26,[sp,#8*6] | |
655 add x1,x1,#8*8 | |
656 mov x30,xzr // initial top-most carry | |
657 mov x2,sp | |
658 mov x27,#8 | |
659 | |
660 .Lsqr8x_reduction: | |
661 // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) | |
662 mul x15,x7,x28 | |
663 sub x27,x27,#1 | |
664 mul x16,x8,x28 | |
665 str x28,[x2],#8 // put aside t[0]*n0 for tail processing | |
666 mul x17,x9,x28 | |
667 // (*) adds xzr,x19,x14 | |
668 subs xzr,x19,#1 // (*) | |
669 mul x14,x10,x28 | |
670 adcs x19,x20,x15 | |
671 mul x15,x11,x28 | |
672 adcs x20,x21,x16 | |
673 mul x16,x12,x28 | |
674 adcs x21,x22,x17 | |
675 mul x17,x13,x28 | |
676 adcs x22,x23,x14 | |
677 umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) | |
678 adcs x23,x24,x15 | |
679 umulh x15,x7,x28 | |
680 adcs x24,x25,x16 | |
681 umulh x16,x8,x28 | |
682 adcs x25,x26,x17 | |
683 umulh x17,x9,x28 | |
684 adc x26,xzr,xzr | |
685 adds x19,x19,x14 | |
686 umulh x14,x10,x28 | |
687 adcs x20,x20,x15 | |
688 umulh x15,x11,x28 | |
689 adcs x21,x21,x16 | |
690 umulh x16,x12,x28 | |
691 adcs x22,x22,x17 | |
692 umulh x17,x13,x28 | |
693 mul x28,x4,x19 // next t[0]*n0 | |
694 adcs x23,x23,x14 | |
695 adcs x24,x24,x15 | |
696 adcs x25,x25,x16 | |
697 adc x26,x26,x17 | |
698 cbnz x27,.Lsqr8x_reduction | |
699 | |
700 ldp x14,x15,[x2,#8*0] | |
701 ldp x16,x17,[x2,#8*2] | |
702 mov x0,x2 | |
703 sub x27,x3,x1 // done yet? | |
704 adds x19,x19,x14 | |
705 adcs x20,x20,x15 | |
706 ldp x14,x15,[x2,#8*4] | |
707 adcs x21,x21,x16 | |
708 adcs x22,x22,x17 | |
709 ldp x16,x17,[x2,#8*6] | |
710 adcs x23,x23,x14 | |
711 adcs x24,x24,x15 | |
712 adcs x25,x25,x16 | |
713 adcs x26,x26,x17 | |
714 //adc x28,xzr,xzr // moved below | |
715 cbz x27,.Lsqr8x8_post_condition | |
716 | |
717 ldr x4,[x2,#-8*8] | |
718 ldp x6,x7,[x1,#8*0] | |
719 ldp x8,x9,[x1,#8*2] | |
720 ldp x10,x11,[x1,#8*4] | |
721 mov x27,#-8*8 | |
722 ldp x12,x13,[x1,#8*6] | |
723 add x1,x1,#8*8 | |
724 | |
725 .Lsqr8x_tail: | |
726 mul x14,x6,x4 | |
727 adc x28,xzr,xzr // carry bit, modulo-scheduled | |
728 mul x15,x7,x4 | |
729 add x27,x27,#8 | |
730 mul x16,x8,x4 | |
731 mul x17,x9,x4 | |
732 adds x19,x19,x14 | |
733 mul x14,x10,x4 | |
734 adcs x20,x20,x15 | |
735 mul x15,x11,x4 | |
736 adcs x21,x21,x16 | |
737 mul x16,x12,x4 | |
738 adcs x22,x22,x17 | |
739 mul x17,x13,x4 | |
740 adcs x23,x23,x14 | |
741 umulh x14,x6,x4 | |
742 adcs x24,x24,x15 | |
743 umulh x15,x7,x4 | |
744 adcs x25,x25,x16 | |
745 umulh x16,x8,x4 | |
746 adcs x26,x26,x17 | |
747 umulh x17,x9,x4 | |
748 adc x28,x28,xzr | |
749 str x19,[x2],#8 | |
750 adds x19,x20,x14 | |
751 umulh x14,x10,x4 | |
752 adcs x20,x21,x15 | |
753 umulh x15,x11,x4 | |
754 adcs x21,x22,x16 | |
755 umulh x16,x12,x4 | |
756 adcs x22,x23,x17 | |
757 umulh x17,x13,x4 | |
758 ldr x4,[x0,x27] | |
759 adcs x23,x24,x14 | |
760 adcs x24,x25,x15 | |
761 adcs x25,x26,x16 | |
762 adcs x26,x28,x17 | |
763 //adc x28,xzr,xzr // moved above | |
764 cbnz x27,.Lsqr8x_tail | |
765 // note that carry flag is guaranteed | |
766 // to be zero at this point | |
767 ldp x6,x7,[x2,#8*0] | |
768 sub x27,x3,x1 // done yet? | |
769 sub x16,x3,x5 // rewinded np | |
770 ldp x8,x9,[x2,#8*2] | |
771 ldp x10,x11,[x2,#8*4] | |
772 ldp x12,x13,[x2,#8*6] | |
773 cbz x27,.Lsqr8x_tail_break | |
774 | |
775 ldr x4,[x0,#-8*8] | |
776 adds x19,x19,x6 | |
777 adcs x20,x20,x7 | |
778 ldp x6,x7,[x1,#8*0] | |
779 adcs x21,x21,x8 | |
780 adcs x22,x22,x9 | |
781 ldp x8,x9,[x1,#8*2] | |
782 adcs x23,x23,x10 | |
783 adcs x24,x24,x11 | |
784 ldp x10,x11,[x1,#8*4] | |
785 adcs x25,x25,x12 | |
786 mov x27,#-8*8 | |
787 adcs x26,x26,x13 | |
788 ldp x12,x13,[x1,#8*6] | |
789 add x1,x1,#8*8 | |
790 //adc x28,xzr,xzr // moved above | |
791 b .Lsqr8x_tail | |
792 | |
793 .align 4 | |
794 .Lsqr8x_tail_break: | |
795 ldr x4,[x29,#112] // pull n0 | |
796 add x27,x2,#8*8 // end of current t[num] window | |
797 | |
798 subs xzr,x30,#1 // "move" top-most carry to carry bit | |
799 adcs x14,x19,x6 | |
800 adcs x15,x20,x7 | |
801 ldp x19,x20,[x0,#8*0] | |
802 adcs x21,x21,x8 | |
803 ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] | |
804 adcs x22,x22,x9 | |
805 ldp x8,x9,[x16,#8*2] | |
806 adcs x23,x23,x10 | |
807 adcs x24,x24,x11 | |
808 ldp x10,x11,[x16,#8*4] | |
809 adcs x25,x25,x12 | |
810 adcs x26,x26,x13 | |
811 ldp x12,x13,[x16,#8*6] | |
812 add x1,x16,#8*8 | |
813 adc x30,xzr,xzr // top-most carry | |
814 mul x28,x4,x19 | |
815 stp x14,x15,[x2,#8*0] | |
816 stp x21,x22,[x2,#8*2] | |
817 ldp x21,x22,[x0,#8*2] | |
818 stp x23,x24,[x2,#8*4] | |
819 ldp x23,x24,[x0,#8*4] | |
820 cmp x27,x29 // did we hit the bottom? | |
821 stp x25,x26,[x2,#8*6] | |
822 mov x2,x0 // slide the window | |
823 ldp x25,x26,[x0,#8*6] | |
824 mov x27,#8 | |
825 b.ne .Lsqr8x_reduction | |
826 | |
827 // Final step. We see if result is larger than modulus, and | |
828 // if it is, subtract the modulus. But comparison implies | |
829 // subtraction. So we subtract modulus, see if it borrowed, | |
830 // and conditionally copy original value. | |
831 ldr x0,[x29,#96] // pull rp | |
832 add x2,x2,#8*8 | |
833 subs x14,x19,x6 | |
834 sbcs x15,x20,x7 | |
835 sub x27,x5,#8*8 | |
836 mov x3,x0 // x0 copy | |
837 | |
838 .Lsqr8x_sub: | |
839 sbcs x16,x21,x8 | |
840 ldp x6,x7,[x1,#8*0] | |
841 sbcs x17,x22,x9 | |
842 stp x14,x15,[x0,#8*0] | |
843 sbcs x14,x23,x10 | |
844 ldp x8,x9,[x1,#8*2] | |
845 sbcs x15,x24,x11 | |
846 stp x16,x17,[x0,#8*2] | |
847 sbcs x16,x25,x12 | |
848 ldp x10,x11,[x1,#8*4] | |
849 sbcs x17,x26,x13 | |
850 ldp x12,x13,[x1,#8*6] | |
851 add x1,x1,#8*8 | |
852 ldp x19,x20,[x2,#8*0] | |
853 sub x27,x27,#8*8 | |
854 ldp x21,x22,[x2,#8*2] | |
855 ldp x23,x24,[x2,#8*4] | |
856 ldp x25,x26,[x2,#8*6] | |
857 add x2,x2,#8*8 | |
858 stp x14,x15,[x0,#8*4] | |
859 sbcs x14,x19,x6 | |
860 stp x16,x17,[x0,#8*6] | |
861 add x0,x0,#8*8 | |
862 sbcs x15,x20,x7 | |
863 cbnz x27,.Lsqr8x_sub | |
864 | |
865 sbcs x16,x21,x8 | |
866 mov x2,sp | |
867 add x1,sp,x5 | |
868 ldp x6,x7,[x3,#8*0] | |
869 sbcs x17,x22,x9 | |
870 stp x14,x15,[x0,#8*0] | |
871 sbcs x14,x23,x10 | |
872 ldp x8,x9,[x3,#8*2] | |
873 sbcs x15,x24,x11 | |
874 stp x16,x17,[x0,#8*2] | |
875 sbcs x16,x25,x12 | |
876 ldp x19,x20,[x1,#8*0] | |
877 sbcs x17,x26,x13 | |
878 ldp x21,x22,[x1,#8*2] | |
879 sbcs xzr,x30,xzr // did it borrow? | |
880 ldr x30,[x29,#8] // pull return address | |
881 stp x14,x15,[x0,#8*4] | |
882 stp x16,x17,[x0,#8*6] | |
883 | |
884 sub x27,x5,#8*4 | |
// Select between t[] and t[]-n[] based on the borrow flag and wipe the
// stack temporary while copying out.
885 .Lsqr4x_cond_copy: | |
886 sub x27,x27,#8*4 | |
887 csel x14,x19,x6,lo | |
888 stp xzr,xzr,[x2,#8*0] | |
889 csel x15,x20,x7,lo | |
890 ldp x6,x7,[x3,#8*4] | |
891 ldp x19,x20,[x1,#8*4] | |
892 csel x16,x21,x8,lo | |
893 stp xzr,xzr,[x2,#8*2] | |
894 add x2,x2,#8*4 | |
895 csel x17,x22,x9,lo | |
896 ldp x8,x9,[x3,#8*6] | |
897 ldp x21,x22,[x1,#8*6] | |
898 add x1,x1,#8*4 | |
899 stp x14,x15,[x3,#8*0] | |
900 stp x16,x17,[x3,#8*2] | |
901 add x3,x3,#8*4 | |
902 stp xzr,xzr,[x1,#8*0] | |
903 stp xzr,xzr,[x1,#8*2] | |
904 cbnz x27,.Lsqr4x_cond_copy | |
905 | |
906 csel x14,x19,x6,lo | |
907 stp xzr,xzr,[x2,#8*0] | |
908 csel x15,x20,x7,lo | |
909 stp xzr,xzr,[x2,#8*2] | |
910 csel x16,x21,x8,lo | |
911 csel x17,x22,x9,lo | |
912 stp x14,x15,[x3,#8*0] | |
913 stp x16,x17,[x3,#8*2] | |
914 | |
915 b .Lsqr8x_done | |
916 | |
917 .align 4 | |
// Fast path for num==8: the whole result is already in registers.
918 .Lsqr8x8_post_condition: | |
919 adc x28,xzr,xzr | |
920 ldr x30,[x29,#8] // pull return address | |
921 // x19-7,x28 hold result, x6-7 hold modulus | |
922 subs x6,x19,x6 | |
923 ldr x1,[x29,#96] // pull rp | |
924 sbcs x7,x20,x7 | |
925 stp xzr,xzr,[sp,#8*0] | |
926 sbcs x8,x21,x8 | |
927 stp xzr,xzr,[sp,#8*2] | |
928 sbcs x9,x22,x9 | |
929 stp xzr,xzr,[sp,#8*4] | |
930 sbcs x10,x23,x10 | |
931 stp xzr,xzr,[sp,#8*6] | |
932 sbcs x11,x24,x11 | |
933 stp xzr,xzr,[sp,#8*8] | |
934 sbcs x12,x25,x12 | |
935 stp xzr,xzr,[sp,#8*10] | |
936 sbcs x13,x26,x13 | |
937 stp xzr,xzr,[sp,#8*12] | |
938 sbcs x28,x28,xzr // did it borrow? | |
939 stp xzr,xzr,[sp,#8*14] | |
940 | |
941 // x6-7 hold result-modulus | |
942 csel x6,x19,x6,lo | |
943 csel x7,x20,x7,lo | |
944 csel x8,x21,x8,lo | |
945 csel x9,x22,x9,lo | |
946 stp x6,x7,[x1,#8*0] | |
947 csel x10,x23,x10,lo | |
948 csel x11,x24,x11,lo | |
949 stp x8,x9,[x1,#8*2] | |
950 csel x12,x25,x12,lo | |
951 csel x13,x26,x13,lo | |
952 stp x10,x11,[x1,#8*4] | |
953 stp x12,x13,[x1,#8*6] | |
954 | |
955 .Lsqr8x_done: | |
956 ldp x19,x20,[x29,#16] | |
957 mov sp,x29 | |
958 ldp x21,x22,[x29,#32] | |
959 mov x0,#1 | |
960 ldp x23,x24,[x29,#48] | |
961 ldp x25,x26,[x29,#64] | |
962 ldp x27,x28,[x29,#80] | |
963 ldr x29,[sp],#128 | |
964 ret | |
965 .size __bn_sqr8x_mont,.-__bn_sqr8x_mont | |
966 .type __bn_mul4x_mont,%function | |
967 .align 5 | |
968 __bn_mul4x_mont: | |
969 stp x29,x30,[sp,#-128]! | |
970 add x29,sp,#0 | |
971 stp x19,x20,[sp,#16] | |
972 stp x21,x22,[sp,#32] | |
973 stp x23,x24,[sp,#48] | |
974 stp x25,x26,[sp,#64] | |
975 stp x27,x28,[sp,#80] | |
976 | |
977 sub x26,sp,x5,lsl#3 | |
978 lsl x5,x5,#3 | |
979 ldr x4,[x4] // *n0 | |
980 sub sp,x26,#8*4 // alloca | |
981 | |
982 add x10,x2,x5 | |
983 add x27,x1,x5 | |
984 stp x0,x10,[x29,#96] // offload rp and &b[num] | |
985 | |
986 ldr x24,[x2,#8*0] // b[0] | |
987 ldp x6,x7,[x1,#8*0] // a[0..3] | |
988 ldp x8,x9,[x1,#8*2] | |
989 add x1,x1,#8*4 | |
990 mov x19,xzr | |
991 mov x20,xzr | |
992 mov x21,xzr | |
993 mov x22,xzr | |
994 ldp x14,x15,[x3,#8*0] // n[0..3] | |
995 ldp x16,x17,[x3,#8*2] | |
996 adds x3,x3,#8*4 // clear carry bit | |
997 mov x0,xzr | |
998 mov x28,#0 | |
999 mov x26,sp | |
1000 | |
1001 .Loop_mul4x_1st_reduction: | |
1002 mul x10,x6,x24 // lo(a[0..3]*b[0]) | |
1003 adc x0,x0,xzr // modulo-scheduled | |
1004 mul x11,x7,x24 | |
1005 add x28,x28,#8 | |
1006 mul x12,x8,x24 | |
1007 and x28,x28,#31 | |
1008 mul x13,x9,x24 | |
1009 adds x19,x19,x10 | |
1010 umulh x10,x6,x24 // hi(a[0..3]*b[0]) | |
1011 adcs x20,x20,x11 | |
1012 mul x25,x19,x4 // t[0]*n0 | |
1013 adcs x21,x21,x12 | |
1014 umulh x11,x7,x24 | |
1015 adcs x22,x22,x13 | |
1016 umulh x12,x8,x24 | |
1017 adc x23,xzr,xzr | |
1018 umulh x13,x9,x24 | |
1019 ldr x24,[x2,x28] // next b[i] (or b[0]) | |
1020 adds x20,x20,x10 | |
1021 // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) | |
1022 str x25,[x26],#8 // put aside t[0]*n0 for tail processing | |
1023 adcs x21,x21,x11 | |
1024 mul x11,x15,x25 | |
1025 adcs x22,x22,x12 | |
1026 mul x12,x16,x25 | |
1027 adc x23,x23,x13 // can't overflow | |
1028 mul x13,x17,x25 | |
1029 // (*) adds xzr,x19,x10 | |
1030 subs xzr,x19,#1 // (*) | |
1031 umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) | |
1032 adcs x19,x20,x11 | |
1033 umulh x11,x15,x25 | |
1034 adcs x20,x21,x12 | |
1035 umulh x12,x16,x25 | |
1036 adcs x21,x22,x13 | |
1037 umulh x13,x17,x25 | |
1038 adcs x22,x23,x0 | |
1039 adc x0,xzr,xzr | |
1040 adds x19,x19,x10 | |
1041 sub x10,x27,x1 | |
1042 adcs x20,x20,x11 | |
1043 adcs x21,x21,x12 | |
1044 adcs x22,x22,x13 | |
1045 //adc x0,x0,xzr | |
1046 cbnz x28,.Loop_mul4x_1st_reduction | |
1047 | |
1048 cbz x10,.Lmul4x4_post_condition | |
1049 | |
1050 ldp x6,x7,[x1,#8*0] // a[4..7] | |
1051 ldp x8,x9,[x1,#8*2] | |
1052 add x1,x1,#8*4 | |
1053 ldr x25,[sp] // a[0]*n0 | |
1054 ldp x14,x15,[x3,#8*0] // n[4..7] | |
1055 ldp x16,x17,[x3,#8*2] | |
1056 add x3,x3,#8*4 | |
1057 | |
1058 .Loop_mul4x_1st_tail: | |
1059 mul x10,x6,x24 // lo(a[4..7]*b[i]) | |
1060 adc x0,x0,xzr // modulo-scheduled | |
1061 mul x11,x7,x24 | |
1062 add x28,x28,#8 | |
1063 mul x12,x8,x24 | |
1064 and x28,x28,#31 | |
1065 mul x13,x9,x24 | |
1066 adds x19,x19,x10 | |
1067 umulh x10,x6,x24 // hi(a[4..7]*b[i]) | |
1068 adcs x20,x20,x11 | |
1069 umulh x11,x7,x24 | |
1070 adcs x21,x21,x12 | |
1071 umulh x12,x8,x24 | |
1072 adcs x22,x22,x13 | |
1073 umulh x13,x9,x24 | |
1074 adc x23,xzr,xzr | |
1075 ldr x24,[x2,x28] // next b[i] (or b[0]) | |
1076 adds x20,x20,x10 | |
1077 mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) | |
1078 adcs x21,x21,x11 | |
1079 mul x11,x15,x25 | |
1080 adcs x22,x22,x12 | |
1081 mul x12,x16,x25 | |
1082 adc x23,x23,x13 // can't overflow | |
1083 mul x13,x17,x25 | |
1084 adds x19,x19,x10 | |
1085 umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) | |
1086 adcs x20,x20,x11 | |
1087 umulh x11,x15,x25 | |
1088 adcs x21,x21,x12 | |
1089 umulh x12,x16,x25 | |
1090 adcs x22,x22,x13 | |
1091 adcs x23,x23,x0 | |
1092 umulh x13,x17,x25 | |
1093 adc x0,xzr,xzr | |
1094 ldr x25,[sp,x28] // next t[0]*n0 | |
1095 str x19,[x26],#8 // result!!! | |
1096 adds x19,x20,x10 | |
1097 sub x10,x27,x1 // done yet? | |
1098 adcs x20,x21,x11 | |
1099 adcs x21,x22,x12 | |
1100 adcs x22,x23,x13 | |
1101 //adc x0,x0,xzr | |
1102 cbnz x28,.Loop_mul4x_1st_tail | |
1103 | |
1104 sub x11,x27,x5 // rewinded x1 | |
1105 cbz x10,.Lmul4x_proceed | |
1106 | |
1107 ldp x6,x7,[x1,#8*0] | |
1108 ldp x8,x9,[x1,#8*2] | |
1109 add x1,x1,#8*4 | |
1110 ldp x14,x15,[x3,#8*0] | |
1111 ldp x16,x17,[x3,#8*2] | |
1112 add x3,x3,#8*4 | |
1113 b .Loop_mul4x_1st_tail | |
1114 | |
1115 .align 5 | |
1116 .Lmul4x_proceed: | |
1117 ldr x24,[x2,#8*4]! // *++b | |
1118 adc x30,x0,xzr | |
1119 ldp x6,x7,[x11,#8*0] // a[0..3] | |
1120 sub x3,x3,x5 // rewind np | |
1121 ldp x8,x9,[x11,#8*2] | |
1122 add x1,x11,#8*4 | |
1123 | |
1124 stp x19,x20,[x26,#8*0] // result!!! | |
1125 ldp x19,x20,[sp,#8*4] // t[0..3] | |
1126 stp x21,x22,[x26,#8*2] // result!!! | |
1127 ldp x21,x22,[sp,#8*6] | |
1128 | |
1129 ldp x14,x15,[x3,#8*0] // n[0..3] | |
1130 mov x26,sp | |
1131 ldp x16,x17,[x3,#8*2] | |
1132 adds x3,x3,#8*4 // clear carry bit | |
1133 mov x0,xzr | |
1134 | |
// Multiply-and-reduce inner loop: accumulate a[0..3]*b[i] into the
// t[0..3] window (x19-x22) and immediately fold in one Montgomery
// reduction step with m1 = t[0]*n0 against n[0..3] (x14-x17).
// x28 steps 8,16,24,0 (mod 32) and indexes the next b word within the
// current 4-word window; the loop exits after four b words (x28==0).
// The (*) lines replace the discarded low-limb multiply/add with the
// carry-deduction trick documented near .Lmul_mont at the top of this
// file: t[0]+lo(n[0]*m1) is zero mod 2^64 by construction, so it
// carries iff t[0] (x19) is non-zero — which `subs xzr,x19,#1` sets C for.
.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8		// advance b index within window
	mul	x12,x8,x24
	and	x28,x28,#31		// ... wrapping mod 32 bytes
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr		// capture carry out of the window
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*) mul	x10,x14,x25	// low limb of n[0]*m1 — provably cancels t[0]
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*) adds	xzr,x19,x10	// discarded; its carry is recovered below
	subs	xzr,x19,#1		// (*) C = (t[0] != 0)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11		// window shifts down one limb here
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0		// fold in carry from previous pass
	adc	x0,xzr,xzr
	adds	x19,x19,x10		// add the high limbs of n[]*m1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr		// deferred: modulo-scheduled at loop top
	cbnz	x28,.Loop_mul4x_reduction

	// Between reduction and tail loops: absorb the deferred carry,
	// fold the previously stored t[4..7] into the live window, and
	// preload a[4..7]/n[4..7] plus the first saved t[0]*n0 factor.
	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4		// ap += 4 limbs
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr		// deferred into the tail loop

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4		// np += 4 limbs
// Tail loop: continue a[j..j+3]*b[i] accumulation for the upper limbs
// and apply the Montgomery reduction factors (t[0]*n0 values saved by
// .Loop_mul4x_reduction, reloaded via x25) against n[j..j+3].
// Finished low limbs are streamed out through x26; x28 again cycles
// 8,16,24,0 selecting the b word and the saved factor.
.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8		// advance window index
	mul	x12,x8,x24
	and	x28,x28,#31		// ... wrapping mod 32 bytes
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr		// capture carry out of the window
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0		// fold in deferred carry
	ldr	x25,[sp,x28]		// next a[0]*n0, i.e. saved t[0]*n0 factor
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10		// shift window down and add hi limbs
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr		// deferred: modulo-scheduled at loop top
	cbnz	x28,.Loop_mul4x_tail

	// After four tail rounds: if ap has not reached the end (x10 from
	// `sub x10,x27,x1` above is non-zero), reload the next t/a/n quads
	// and keep going; otherwise fall through to the break-out code.
	sub	x11,x3,x5		// rewinded np? (x11 = np base for break-out)
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]	// next t[] quad
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]		// next a[] quad
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10		// fold t[] into the live window
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr		// deferred into the tail loop
	ldp	x14,x15,[x3,#8*0]	// next n[] quad
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

// End of one full pass over a[]/n[]: commit the top window, decide
// whether all of b[] has been consumed (compare bp against &b[num]
// saved in the frame at [x29,#96]), and either finish or restart
// .Loop_mul4x_reduction for the next b window.
.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30		// add top word carried in x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr		// preserve final carry
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3] (x11 = rewound np)
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]		// next b[0]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit (C=0 for the reduction loop)
	mov	x0,xzr
	mov	x26,sp			// reset tp cursor
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	// The subtracted value is written to rp as we go; the borrow
	// decision and conditional copy-back happen after .Lmul4x_sub.
	mov	x0,x12			// x0 = rp (write cursor)
	mov	x27,x12			// x0 copy (rp base, reused below)
	subs	x10,x19,x14		// t[0] - n[0], starts the borrow chain
	add	x26,sp,#8*8		// x26 = tp read cursor past t[0..7]
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4		// x28 = remaining byte count

.Lmul4x_sub:
	// Software-pipelined t[] - n[] over the whole number; four limbs
	// of the difference are stored per iteration while the next t/n
	// quads are loaded. The sbcs chain is never broken.
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub
	// Flush the last difference limbs, include the top carry (x30) in
	// the borrow chain, then conditionally copy: if the subtraction
	// borrowed (flags `lo`, i.e. result < modulus), keep the original
	// t[] from the stack; otherwise keep the subtracted value already
	// in rp. The stack copy of t[] is zeroed as it is consumed.
	sbcs	x12,x21,x16
	mov	x26,sp			// x26 = tp (to be wiped)
	add	x1,sp,#8*4		// x1 = unreduced t[] read cursor
	ldp	x6,x7,[x27,#8*0]	// subtracted value from rp
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]	// unreduced t[]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo		// borrow: keep t[], else keep t[]-n[]
	stp	xzr,xzr,[x26,#8*0]	// wipe tp (secret-dependent scratch)
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]	// commit selected limbs to rp
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	// Last four limbs outside the loop; flags are still live from the
	// `sbcs xzr,x30,xzr` above (csel/stp/ldp do not touch them).
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]	// NOTE(review): #8*3/#8*4 overlap the
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]	// pair above by one word — matches upstream
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

// Final reduction for the minimal case where the whole result fits in
// one 4-limb window (presumably num==4 — reached from code above this
// view; confirm against the function head). Same subtract-and-select
// scheme as .Lmul4x_post, with the scratch stack zeroed in passing.
.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr		// x0 = top carry of the result
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22 plus carry in x0 hold result, x14-x17 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]	// wipe scratch
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo		// borrow => result < modulus: keep it
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	// Epilogue: restore callee-saved x19-x28, unwind the 128-byte
	// frame anchored at x29, and return 1 (success) in x0.
	ldp	x19,x20,[x29,#16]
	mov	sp,x29			// discard the alloca'd tp area
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value: success
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128		// pop frame (x30 was reloaded earlier)
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
1404 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105
,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79
,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,4
6,111,114,103,62,0 | |
1405 .align 2 | |
1406 .align 4 | |
1407 #endif | |
OLD | NEW |