OLD | NEW |
(Empty) | |
| 1 #if defined(__aarch64__) |
| 2 .text |
| 3 |
| 4 .globl bn_mul_mont |
| 5 .hidden bn_mul_mont |
| 6 .type bn_mul_mont,%function |
| 7 .align 5 |
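| // bn_mul_mont dispatches on the word count in x5: a multiple of 8 |
| // takes the squaring-friendly 8x path, any other multiple of 4 the |
| // 4x path, and everything else the generic path below. |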
| 8 bn_mul_mont: |
| 9 tst x5,#7 |
| 10 b.eq __bn_sqr8x_mont |
| 11 tst x5,#3 |
| 12 b.eq __bn_mul4x_mont |
| 13 .Lmul_mont: |
| 14 stp x29,x30,[sp,#-64]! |
| 15 add x29,sp,#0 |
| 16 stp x19,x20,[sp,#16] |
| 17 stp x21,x22,[sp,#32] |
| 18 stp x23,x24,[sp,#48] |
| 19 |
| 20 ldr x9,[x2],#8 // bp[0] |
| 21 sub x22,sp,x5,lsl#3 |
| 22 ldp x7,x8,[x1],#16 // ap[0..1] |
| 23 lsl x5,x5,#3 |
| 24 ldr x4,[x4] // *n0 |
| 25 and x22,x22,#-16 // ABI says so |
| 26 ldp x13,x14,[x3],#16 // np[0..1] |
| 27 |
| 28 mul x6,x7,x9 // ap[0]*bp[0] |
| 29 sub x21,x5,#16 // j=num-2 |
| 30 umulh x7,x7,x9 |
| 31 mul x10,x8,x9 // ap[1]*bp[0] |
| 32 umulh x11,x8,x9 |
| 33 |
| 34 mul x15,x6,x4 // "tp[0]"*n0 |
| 35 mov sp,x22 // alloca |
| 36 |
| 37 // (*) mul x12,x13,x15 // np[0]*m1 |
| 38 umulh x13,x13,x15 |
| 39 mul x16,x14,x15 // np[1]*m1 |
| 40 // (*) adds x12,x12,x6 // discarded |
| 41 // (*) On the removal of the first multiplication and addition |
| 42 // instructions: the 64-bit result of that first addition is |
| 43 // guaranteed to be zero, which leaves only one computationally |
| 44 // significant outcome, namely whether it carries or not. So |
| 45 // when does it carry? And is there an alternative way to |
| 46 // deduce it? Following the operations shows that the condition |
| 47 // for the carry is quite simple: x6 being non-zero. The carry |
| 48 // can therefore be calculated by adding -1 to x6, which is |
| 49 // what the next instruction does. |
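| // A worked example of the shortcut, assuming the usual Montgomery |
| // n0 == -np[0]^-1 mod 2^64: x15 = x6*n0 mod 2^64, so the discarded |
| // product np[0]*x15 mod 2^64 equals -x6 mod 2^64. Adding x6 back |
| // therefore yields 0 when x6 == 0 (no carry) and exactly 2^64 |
| // otherwise (carry); "subs xzr,x6,#1" sets the carry flag exactly |
| // when x6 is non-zero. |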
| 50 subs xzr,x6,#1 // (*) |
| 51 umulh x17,x14,x15 |
| 52 adc x13,x13,xzr |
| 53 cbz x21,.L1st_skip |
| 54 |
| 55 .L1st: |
| 56 ldr x8,[x1],#8 |
| 57 adds x6,x10,x7 |
| 58 sub x21,x21,#8 // j-- |
| 59 adc x7,x11,xzr |
| 60 |
| 61 ldr x14,[x3],#8 |
| 62 adds x12,x16,x13 |
| 63 mul x10,x8,x9 // ap[j]*bp[0] |
| 64 adc x13,x17,xzr |
| 65 umulh x11,x8,x9 |
| 66 |
| 67 adds x12,x12,x6 |
| 68 mul x16,x14,x15 // np[j]*m1 |
| 69 adc x13,x13,xzr |
| 70 umulh x17,x14,x15 |
| 71 str x12,[x22],#8 // tp[j-1] |
| 72 cbnz x21,.L1st |
| 73 |
| 74 .L1st_skip: |
| 75 adds x6,x10,x7 |
| 76 sub x1,x1,x5 // rewind x1 |
| 77 adc x7,x11,xzr |
| 78 |
| 79 adds x12,x16,x13 |
| 80 sub x3,x3,x5 // rewind x3 |
| 81 adc x13,x17,xzr |
| 82 |
| 83 adds x12,x12,x6 |
| 84 sub x20,x5,#8 // i=num-1 |
| 85 adcs x13,x13,x7 |
| 86 |
| 87 adc x19,xzr,xzr // topmost overflow bit |
| 88 stp x12,x13,[x22] |
| 89 |
| 90 .Louter: |
| 91 ldr x9,[x2],#8 // bp[i] |
| 92 ldp x7,x8,[x1],#16 |
| 93 ldr x23,[sp] // tp[0] |
| 94 add x22,sp,#8 |
| 95 |
| 96 mul x6,x7,x9 // ap[0]*bp[i] |
| 97 sub x21,x5,#16 // j=num-2 |
| 98 umulh x7,x7,x9 |
| 99 ldp x13,x14,[x3],#16 |
| 100 mul x10,x8,x9 // ap[1]*bp[i] |
| 101 adds x6,x6,x23 |
| 102 umulh x11,x8,x9 |
| 103 adc x7,x7,xzr |
| 104 |
| 105 mul x15,x6,x4 |
| 106 sub x20,x20,#8 // i-- |
| 107 |
| 108 // (*) mul x12,x13,x15 // np[0]*m1 |
| 109 umulh x13,x13,x15 |
| 110 mul x16,x14,x15 // np[1]*m1 |
| 111 // (*) adds x12,x12,x6 |
| 112 subs xzr,x6,#1 // (*) |
| 113 umulh x17,x14,x15 |
| 114 cbz x21,.Linner_skip |
| 115 |
| 116 .Linner: |
| 117 ldr x8,[x1],#8 |
| 118 adc x13,x13,xzr |
| 119 ldr x23,[x22],#8 // tp[j] |
| 120 adds x6,x10,x7 |
| 121 sub x21,x21,#8 // j-- |
| 122 adc x7,x11,xzr |
| 123 |
| 124 adds x12,x16,x13 |
| 125 ldr x14,[x3],#8 |
| 126 adc x13,x17,xzr |
| 127 |
| 128 mul x10,x8,x9 // ap[j]*bp[i] |
| 129 adds x6,x6,x23 |
| 130 umulh x11,x8,x9 |
| 131 adc x7,x7,xzr |
| 132 |
| 133 mul x16,x14,x15 // np[j]*m1 |
| 134 adds x12,x12,x6 |
| 135 umulh x17,x14,x15 |
| 136 str x12,[x22,#-16] // tp[j-1] |
| 137 cbnz x21,.Linner |
| 138 |
| 139 .Linner_skip: |
| 140 ldr x23,[x22],#8 // tp[j] |
| 141 adc x13,x13,xzr |
| 142 adds x6,x10,x7 |
| 143 sub x1,x1,x5 // rewind x1 |
| 144 adc x7,x11,xzr |
| 145 |
| 146 adds x12,x16,x13 |
| 147 sub x3,x3,x5 // rewind x3 |
| 148 adcs x13,x17,x19 |
| 149 adc x19,xzr,xzr |
| 150 |
| 151 adds x6,x6,x23 |
| 152 adc x7,x7,xzr |
| 153 |
| 154 adds x12,x12,x6 |
| 155 adcs x13,x13,x7 |
| 156 adc x19,x19,xzr // topmost overflow bit |
| 157 stp x12,x13,[x22,#-16] |
| 158 |
| 159 cbnz x20,.Louter |
| 160 |
| 161 // Final step. If the result is larger than the modulus, we must |
| 162 // subtract the modulus. But a comparison is itself a subtraction, |
| 163 // so we subtract the modulus unconditionally, check whether it |
| 164 // borrowed, and conditionally copy the original value back. |
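| // Concretely: the loop below stores tp[]-np[] into rp[] while the |
| // borrow chain (extended through the topmost overflow bit in x19) |
| // records whether the subtraction borrowed; the "lo"-predicated csel |
| // copy that follows then keeps tp[] when it did borrow (result was |
| // smaller than the modulus) and keeps the stored difference otherwise. |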
| 165 ldr x23,[sp] // tp[0] |
| 166 add x22,sp,#8 |
| 167 ldr x14,[x3],#8 // np[0] |
| 168 subs x21,x5,#8 // j=num-1 and clear borrow |
| 169 mov x1,x0 |
| 170 .Lsub: |
| 171 sbcs x8,x23,x14 // tp[j]-np[j] |
| 172 ldr x23,[x22],#8 |
| 173 sub x21,x21,#8 // j-- |
| 174 ldr x14,[x3],#8 |
| 175 str x8,[x1],#8 // rp[j]=tp[j]-np[j] |
| 176 cbnz x21,.Lsub |
| 177 |
| 178 sbcs x8,x23,x14 |
| 179 sbcs x19,x19,xzr // did it borrow? |
| 180 str x8,[x1],#8 // rp[num-1] |
| 181 |
| 182 ldr x23,[sp] // tp[0] |
| 183 add x22,sp,#8 |
| 184 ldr x8,[x0],#8 // rp[0] |
| 185 sub x5,x5,#8 // num-- |
| 186 nop |
| 187 .Lcond_copy: |
| 188 sub x5,x5,#8 // num-- |
| 189 csel x14,x23,x8,lo // did it borrow? |
| 190 ldr x23,[x22],#8 |
| 191 ldr x8,[x0],#8 |
| 192 str xzr,[x22,#-16] // wipe tp |
| 193 str x14,[x0,#-16] |
| 194 cbnz x5,.Lcond_copy |
| 195 |
| 196 csel x14,x23,x8,lo |
| 197 str xzr,[x22,#-8] // wipe tp |
| 198 str x14,[x0,#-8] |
| 199 |
| 200 ldp x19,x20,[x29,#16] |
| 201 mov sp,x29 |
| 202 ldp x21,x22,[x29,#32] |
| 203 mov x0,#1 |
| 204 ldp x23,x24,[x29,#48] |
| 205 ldr x29,[sp],#64 |
| 206 ret |
| 207 .size bn_mul_mont,.-bn_mul_mont |
| 208 .type __bn_sqr8x_mont,%function |
| 209 .align 5 |
| 210 __bn_sqr8x_mont: |
| 211 cmp x1,x2 |
| 212 b.ne __bn_mul4x_mont |
| 213 .Lsqr8x_mont: |
| 214 stp x29,x30,[sp,#-128]! |
| 215 add x29,sp,#0 |
| 216 stp x19,x20,[sp,#16] |
| 217 stp x21,x22,[sp,#32] |
| 218 stp x23,x24,[sp,#48] |
| 219 stp x25,x26,[sp,#64] |
| 220 stp x27,x28,[sp,#80] |
| 221 stp x0,x3,[sp,#96] // offload rp and np |
| 222 |
| 223 ldp x6,x7,[x1,#8*0] |
| 224 ldp x8,x9,[x1,#8*2] |
| 225 ldp x10,x11,[x1,#8*4] |
| 226 ldp x12,x13,[x1,#8*6] |
| 227 |
| 228 sub x2,sp,x5,lsl#4 |
| 229 lsl x5,x5,#3 |
| 230 ldr x4,[x4] // *n0 |
| 231 mov sp,x2 // alloca |
| 232 sub x27,x5,#8*8 |
| 233 b .Lsqr8x_zero_start |
| 234 |
| 235 .Lsqr8x_zero: |
| 236 sub x27,x27,#8*8 |
| 237 stp xzr,xzr,[x2,#8*0] |
| 238 stp xzr,xzr,[x2,#8*2] |
| 239 stp xzr,xzr,[x2,#8*4] |
| 240 stp xzr,xzr,[x2,#8*6] |
| 241 .Lsqr8x_zero_start: |
| 242 stp xzr,xzr,[x2,#8*8] |
| 243 stp xzr,xzr,[x2,#8*10] |
| 244 stp xzr,xzr,[x2,#8*12] |
| 245 stp xzr,xzr,[x2,#8*14] |
| 246 add x2,x2,#8*16 |
| 247 cbnz x27,.Lsqr8x_zero |
| 248 |
| 249 add x3,x1,x5 |
| 250 add x1,x1,#8*8 |
| 251 mov x19,xzr |
| 252 mov x20,xzr |
| 253 mov x21,xzr |
| 254 mov x22,xzr |
| 255 mov x23,xzr |
| 256 mov x24,xzr |
| 257 mov x25,xzr |
| 258 mov x26,xzr |
| 259 mov x2,sp |
| 260 str x4,[x29,#112] // offload n0 |
| 261 |
| 262 // Multiply everything but a[i]*a[i] |
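| // This relies on a^2 = sum(a[i]^2 * 2^(128*i)) |
| //                    + 2*sum(a[i]*a[j] * 2^(64*(i+j))), i<j: |
| // only the cross products are accumulated here; they are doubled |
| // and the a[i]*a[i] diagonal added at .Lsqr8x_outer_break. |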
| 263 .align 4 |
| 264 .Lsqr8x_outer_loop: |
| 265 // a[1]a[0] (i) |
| 266 // a[2]a[0] |
| 267 // a[3]a[0] |
| 268 // a[4]a[0] |
| 269 // a[5]a[0] |
| 270 // a[6]a[0] |
| 271 // a[7]a[0] |
| 272 // a[2]a[1] (ii) |
| 273 // a[3]a[1] |
| 274 // a[4]a[1] |
| 275 // a[5]a[1] |
| 276 // a[6]a[1] |
| 277 // a[7]a[1] |
| 278 // a[3]a[2] (iii) |
| 279 // a[4]a[2] |
| 280 // a[5]a[2] |
| 281 // a[6]a[2] |
| 282 // a[7]a[2] |
| 283 // a[4]a[3] (iv) |
| 284 // a[5]a[3] |
| 285 // a[6]a[3] |
| 286 // a[7]a[3] |
| 287 // a[5]a[4] (v) |
| 288 // a[6]a[4] |
| 289 // a[7]a[4] |
| 290 // a[6]a[5] (vi) |
| 291 // a[7]a[5] |
| 292 // a[7]a[6] (vii) |
| 293 |
| 294 mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) |
| 295 mul x15,x8,x6 |
| 296 mul x16,x9,x6 |
| 297 mul x17,x10,x6 |
| 298 adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) |
| 299 mul x14,x11,x6 |
| 300 adcs x21,x21,x15 |
| 301 mul x15,x12,x6 |
| 302 adcs x22,x22,x16 |
| 303 mul x16,x13,x6 |
| 304 adcs x23,x23,x17 |
| 305 umulh x17,x7,x6 // hi(a[1..7]*a[0]) |
| 306 adcs x24,x24,x14 |
| 307 umulh x14,x8,x6 |
| 308 adcs x25,x25,x15 |
| 309 umulh x15,x9,x6 |
| 310 adcs x26,x26,x16 |
| 311 umulh x16,x10,x6 |
| 312 stp x19,x20,[x2],#8*2 // t[0..1] |
| 313 adc x19,xzr,xzr // t[8] |
| 314 adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) |
| 315 umulh x17,x11,x6 |
| 316 adcs x22,x22,x14 |
| 317 umulh x14,x12,x6 |
| 318 adcs x23,x23,x15 |
| 319 umulh x15,x13,x6 |
| 320 adcs x24,x24,x16 |
| 321 mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) |
| 322 adcs x25,x25,x17 |
| 323 mul x17,x9,x7 |
| 324 adcs x26,x26,x14 |
| 325 mul x14,x10,x7 |
| 326 adc x19,x19,x15 |
| 327 |
| 328 mul x15,x11,x7 |
| 329 adds x22,x22,x16 |
| 330 mul x16,x12,x7 |
| 331 adcs x23,x23,x17 |
| 332 mul x17,x13,x7 |
| 333 adcs x24,x24,x14 |
| 334 umulh x14,x8,x7 // hi(a[2..7]*a[1]) |
| 335 adcs x25,x25,x15 |
| 336 umulh x15,x9,x7 |
| 337 adcs x26,x26,x16 |
| 338 umulh x16,x10,x7 |
| 339 adcs x19,x19,x17 |
| 340 umulh x17,x11,x7 |
| 341 stp x21,x22,[x2],#8*2 // t[2..3] |
| 342 adc x20,xzr,xzr // t[9] |
| 343 adds x23,x23,x14 |
| 344 umulh x14,x12,x7 |
| 345 adcs x24,x24,x15 |
| 346 umulh x15,x13,x7 |
| 347 adcs x25,x25,x16 |
| 348 mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) |
| 349 adcs x26,x26,x17 |
| 350 mul x17,x10,x8 |
| 351 adcs x19,x19,x14 |
| 352 mul x14,x11,x8 |
| 353 adc x20,x20,x15 |
| 354 |
| 355 mul x15,x12,x8 |
| 356 adds x24,x24,x16 |
| 357 mul x16,x13,x8 |
| 358 adcs x25,x25,x17 |
| 359 umulh x17,x9,x8 // hi(a[3..7]*a[2]) |
| 360 adcs x26,x26,x14 |
| 361 umulh x14,x10,x8 |
| 362 adcs x19,x19,x15 |
| 363 umulh x15,x11,x8 |
| 364 adcs x20,x20,x16 |
| 365 umulh x16,x12,x8 |
| 366 stp x23,x24,[x2],#8*2 // t[4..5] |
| 367 adc x21,xzr,xzr // t[10] |
| 368 adds x25,x25,x17 |
| 369 umulh x17,x13,x8 |
| 370 adcs x26,x26,x14 |
| 371 mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) |
| 372 adcs x19,x19,x15 |
| 373 mul x15,x11,x9 |
| 374 adcs x20,x20,x16 |
| 375 mul x16,x12,x9 |
| 376 adc x21,x21,x17 |
| 377 |
| 378 mul x17,x13,x9 |
| 379 adds x26,x26,x14 |
| 380 umulh x14,x10,x9 // hi(a[4..7]*a[3]) |
| 381 adcs x19,x19,x15 |
| 382 umulh x15,x11,x9 |
| 383 adcs x20,x20,x16 |
| 384 umulh x16,x12,x9 |
| 385 adcs x21,x21,x17 |
| 386 umulh x17,x13,x9 |
| 387 stp x25,x26,[x2],#8*2 // t[6..7] |
| 388 adc x22,xzr,xzr // t[11] |
| 389 adds x19,x19,x14 |
| 390 mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) |
| 391 adcs x20,x20,x15 |
| 392 mul x15,x12,x10 |
| 393 adcs x21,x21,x16 |
| 394 mul x16,x13,x10 |
| 395 adc x22,x22,x17 |
| 396 |
| 397 umulh x17,x11,x10 // hi(a[5..7]*a[4]) |
| 398 adds x20,x20,x14 |
| 399 umulh x14,x12,x10 |
| 400 adcs x21,x21,x15 |
| 401 umulh x15,x13,x10 |
| 402 adcs x22,x22,x16 |
| 403 mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) |
| 404 adc x23,xzr,xzr // t[12] |
| 405 adds x21,x21,x17 |
| 406 mul x17,x13,x11 |
| 407 adcs x22,x22,x14 |
| 408 umulh x14,x12,x11 // hi(a[6..7]*a[5]) |
| 409 adc x23,x23,x15 |
| 410 |
| 411 umulh x15,x13,x11 |
| 412 adds x22,x22,x16 |
| 413 mul x16,x13,x12 // lo(a[7]*a[6]) (vii) |
| 414 adcs x23,x23,x17 |
| 415 umulh x17,x13,x12 // hi(a[7]*a[6]) |
| 416 adc x24,xzr,xzr // t[13] |
| 417 adds x23,x23,x14 |
| 418 sub x27,x3,x1 // done yet? |
| 419 adc x24,x24,x15 |
| 420 |
| 421 adds x24,x24,x16 |
| 422 sub x14,x3,x5 // rewound ap |
| 423 adc x25,xzr,xzr // t[14] |
| 424 add x25,x25,x17 |
| 425 |
| 426 cbz x27,.Lsqr8x_outer_break |
| 427 |
| 428 mov x4,x6 |
| 429 ldp x6,x7,[x2,#8*0] |
| 430 ldp x8,x9,[x2,#8*2] |
| 431 ldp x10,x11,[x2,#8*4] |
| 432 ldp x12,x13,[x2,#8*6] |
| 433 adds x19,x19,x6 |
| 434 adcs x20,x20,x7 |
| 435 ldp x6,x7,[x1,#8*0] |
| 436 adcs x21,x21,x8 |
| 437 adcs x22,x22,x9 |
| 438 ldp x8,x9,[x1,#8*2] |
| 439 adcs x23,x23,x10 |
| 440 adcs x24,x24,x11 |
| 441 ldp x10,x11,[x1,#8*4] |
| 442 adcs x25,x25,x12 |
| 443 mov x0,x1 |
| 444 adcs x26,xzr,x13 |
| 445 ldp x12,x13,[x1,#8*6] |
| 446 add x1,x1,#8*8 |
| 447 //adc x28,xzr,xzr // moved below |
| 448 mov x27,#-8*8 |
| 449 |
| 450 // a[8]a[0] |
| 451 // a[9]a[0] |
| 452 // a[a]a[0] |
| 453 // a[b]a[0] |
| 454 // a[c]a[0] |
| 455 // a[d]a[0] |
| 456 // a[e]a[0] |
| 457 // a[f]a[0] |
| 458 // a[8]a[1] |
| 459 // a[f]a[1]........................ |
| 460 // a[8]a[2] |
| 461 // a[f]a[2]........................ |
| 462 // a[8]a[3] |
| 463 // a[f]a[3]........................ |
| 464 // a[8]a[4] |
| 465 // a[f]a[4]........................ |
| 466 // a[8]a[5] |
| 467 // a[f]a[5]........................ |
| 468 // a[8]a[6] |
| 469 // a[f]a[6]........................ |
| 470 // a[8]a[7] |
| 471 // a[f]a[7]........................ |
| 472 .Lsqr8x_mul: |
| 473 mul x14,x6,x4 |
| 474 adc x28,xzr,xzr // carry bit, modulo-scheduled |
| 475 mul x15,x7,x4 |
| 476 add x27,x27,#8 |
| 477 mul x16,x8,x4 |
| 478 mul x17,x9,x4 |
| 479 adds x19,x19,x14 |
| 480 mul x14,x10,x4 |
| 481 adcs x20,x20,x15 |
| 482 mul x15,x11,x4 |
| 483 adcs x21,x21,x16 |
| 484 mul x16,x12,x4 |
| 485 adcs x22,x22,x17 |
| 486 mul x17,x13,x4 |
| 487 adcs x23,x23,x14 |
| 488 umulh x14,x6,x4 |
| 489 adcs x24,x24,x15 |
| 490 umulh x15,x7,x4 |
| 491 adcs x25,x25,x16 |
| 492 umulh x16,x8,x4 |
| 493 adcs x26,x26,x17 |
| 494 umulh x17,x9,x4 |
| 495 adc x28,x28,xzr |
| 496 str x19,[x2],#8 |
| 497 adds x19,x20,x14 |
| 498 umulh x14,x10,x4 |
| 499 adcs x20,x21,x15 |
| 500 umulh x15,x11,x4 |
| 501 adcs x21,x22,x16 |
| 502 umulh x16,x12,x4 |
| 503 adcs x22,x23,x17 |
| 504 umulh x17,x13,x4 |
| 505 ldr x4,[x0,x27] |
| 506 adcs x23,x24,x14 |
| 507 adcs x24,x25,x15 |
| 508 adcs x25,x26,x16 |
| 509 adcs x26,x28,x17 |
| 510 //adc x28,xzr,xzr // moved above |
| 511 cbnz x27,.Lsqr8x_mul |
| 512 // note that carry flag is guaranteed |
| 513 // to be zero at this point |
| 514 cmp x1,x3 // done yet? |
| 515 b.eq .Lsqr8x_break |
| 516 |
| 517 ldp x6,x7,[x2,#8*0] |
| 518 ldp x8,x9,[x2,#8*2] |
| 519 ldp x10,x11,[x2,#8*4] |
| 520 ldp x12,x13,[x2,#8*6] |
| 521 adds x19,x19,x6 |
| 522 ldr x4,[x0,#-8*8] |
| 523 adcs x20,x20,x7 |
| 524 ldp x6,x7,[x1,#8*0] |
| 525 adcs x21,x21,x8 |
| 526 adcs x22,x22,x9 |
| 527 ldp x8,x9,[x1,#8*2] |
| 528 adcs x23,x23,x10 |
| 529 adcs x24,x24,x11 |
| 530 ldp x10,x11,[x1,#8*4] |
| 531 adcs x25,x25,x12 |
| 532 mov x27,#-8*8 |
| 533 adcs x26,x26,x13 |
| 534 ldp x12,x13,[x1,#8*6] |
| 535 add x1,x1,#8*8 |
| 536 //adc x28,xzr,xzr // moved above |
| 537 b .Lsqr8x_mul |
| 538 |
| 539 .align 4 |
| 540 .Lsqr8x_break: |
| 541 ldp x6,x7,[x0,#8*0] |
| 542 add x1,x0,#8*8 |
| 543 ldp x8,x9,[x0,#8*2] |
| 544 sub x14,x3,x1 // is it last iteration? |
| 545 ldp x10,x11,[x0,#8*4] |
| 546 sub x15,x2,x14 |
| 547 ldp x12,x13,[x0,#8*6] |
| 548 cbz x14,.Lsqr8x_outer_loop |
| 549 |
| 550 stp x19,x20,[x2,#8*0] |
| 551 ldp x19,x20,[x15,#8*0] |
| 552 stp x21,x22,[x2,#8*2] |
| 553 ldp x21,x22,[x15,#8*2] |
| 554 stp x23,x24,[x2,#8*4] |
| 555 ldp x23,x24,[x15,#8*4] |
| 556 stp x25,x26,[x2,#8*6] |
| 557 mov x2,x15 |
| 558 ldp x25,x26,[x15,#8*6] |
| 559 b .Lsqr8x_outer_loop |
| 560 |
| 561 .align 4 |
| 562 .Lsqr8x_outer_break: |
| 563 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] |
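| // The doubling is a one-bit left shift: "lsl#1" for the lowest word |
| // and the chain of "extr ...,#63" instructions to carry the |
| // shifted-out bit into each following word. |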
| 564 ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] |
| 565 ldp x15,x16,[sp,#8*1] |
| 566 ldp x11,x13,[x14,#8*2] |
| 567 add x1,x14,#8*4 |
| 568 ldp x17,x14,[sp,#8*3] |
| 569 |
| 570 stp x19,x20,[x2,#8*0] |
| 571 mul x19,x7,x7 |
| 572 stp x21,x22,[x2,#8*2] |
| 573 umulh x7,x7,x7 |
| 574 stp x23,x24,[x2,#8*4] |
| 575 mul x8,x9,x9 |
| 576 stp x25,x26,[x2,#8*6] |
| 577 mov x2,sp |
| 578 umulh x9,x9,x9 |
| 579 adds x20,x7,x15,lsl#1 |
| 580 extr x15,x16,x15,#63 |
| 581 sub x27,x5,#8*4 |
| 582 |
| 583 .Lsqr4x_shift_n_add: |
| 584 adcs x21,x8,x15 |
| 585 extr x16,x17,x16,#63 |
| 586 sub x27,x27,#8*4 |
| 587 adcs x22,x9,x16 |
| 588 ldp x15,x16,[x2,#8*5] |
| 589 mul x10,x11,x11 |
| 590 ldp x7,x9,[x1],#8*2 |
| 591 umulh x11,x11,x11 |
| 592 mul x12,x13,x13 |
| 593 umulh x13,x13,x13 |
| 594 extr x17,x14,x17,#63 |
| 595 stp x19,x20,[x2,#8*0] |
| 596 adcs x23,x10,x17 |
| 597 extr x14,x15,x14,#63 |
| 598 stp x21,x22,[x2,#8*2] |
| 599 adcs x24,x11,x14 |
| 600 ldp x17,x14,[x2,#8*7] |
| 601 extr x15,x16,x15,#63 |
| 602 adcs x25,x12,x15 |
| 603 extr x16,x17,x16,#63 |
| 604 adcs x26,x13,x16 |
| 605 ldp x15,x16,[x2,#8*9] |
| 606 mul x6,x7,x7 |
| 607 ldp x11,x13,[x1],#8*2 |
| 608 umulh x7,x7,x7 |
| 609 mul x8,x9,x9 |
| 610 umulh x9,x9,x9 |
| 611 stp x23,x24,[x2,#8*4] |
| 612 extr x17,x14,x17,#63 |
| 613 stp x25,x26,[x2,#8*6] |
| 614 add x2,x2,#8*8 |
| 615 adcs x19,x6,x17 |
| 616 extr x14,x15,x14,#63 |
| 617 adcs x20,x7,x14 |
| 618 ldp x17,x14,[x2,#8*3] |
| 619 extr x15,x16,x15,#63 |
| 620 cbnz x27,.Lsqr4x_shift_n_add |
| 621 ldp x1,x4,[x29,#104] // pull np and n0 |
| 622 |
| 623 adcs x21,x8,x15 |
| 624 extr x16,x17,x16,#63 |
| 625 adcs x22,x9,x16 |
| 626 ldp x15,x16,[x2,#8*5] |
| 627 mul x10,x11,x11 |
| 628 umulh x11,x11,x11 |
| 629 stp x19,x20,[x2,#8*0] |
| 630 mul x12,x13,x13 |
| 631 umulh x13,x13,x13 |
| 632 stp x21,x22,[x2,#8*2] |
| 633 extr x17,x14,x17,#63 |
| 634 adcs x23,x10,x17 |
| 635 extr x14,x15,x14,#63 |
| 636 ldp x19,x20,[sp,#8*0] |
| 637 adcs x24,x11,x14 |
| 638 extr x15,x16,x15,#63 |
| 639 ldp x6,x7,[x1,#8*0] |
| 640 adcs x25,x12,x15 |
| 641 extr x16,xzr,x16,#63 |
| 642 ldp x8,x9,[x1,#8*2] |
| 643 adc x26,x13,x16 |
| 644 ldp x10,x11,[x1,#8*4] |
| 645 |
| 646 // Reduce by 512 bits per iteration |
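| // Each pass of .Lsqr8x_reduction multiplies the 8-word window |
| // n[0..7] by the current t[0]*n0 and folds the product into the |
| // accumulator so that the bottom word vanishes; eight passes |
| // therefore retire 512 bits of t[] before the window slides. |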
| 647 mul x28,x4,x19 // t[0]*n0 |
| 648 ldp x12,x13,[x1,#8*6] |
| 649 add x3,x1,x5 |
| 650 ldp x21,x22,[sp,#8*2] |
| 651 stp x23,x24,[x2,#8*4] |
| 652 ldp x23,x24,[sp,#8*4] |
| 653 stp x25,x26,[x2,#8*6] |
| 654 ldp x25,x26,[sp,#8*6] |
| 655 add x1,x1,#8*8 |
| 656 mov x30,xzr // initial top-most carry |
| 657 mov x2,sp |
| 658 mov x27,#8 |
| 659 |
| 660 .Lsqr8x_reduction: |
| 661 // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) |
| 662 mul x15,x7,x28 |
| 663 sub x27,x27,#1 |
| 664 mul x16,x8,x28 |
| 665 str x28,[x2],#8 // put aside t[0]*n0 for tail processing |
| 666 mul x17,x9,x28 |
| 667 // (*) adds xzr,x19,x14 |
| 668 subs xzr,x19,#1 // (*) |
| 669 mul x14,x10,x28 |
| 670 adcs x19,x20,x15 |
| 671 mul x15,x11,x28 |
| 672 adcs x20,x21,x16 |
| 673 mul x16,x12,x28 |
| 674 adcs x21,x22,x17 |
| 675 mul x17,x13,x28 |
| 676 adcs x22,x23,x14 |
| 677 umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) |
| 678 adcs x23,x24,x15 |
| 679 umulh x15,x7,x28 |
| 680 adcs x24,x25,x16 |
| 681 umulh x16,x8,x28 |
| 682 adcs x25,x26,x17 |
| 683 umulh x17,x9,x28 |
| 684 adc x26,xzr,xzr |
| 685 adds x19,x19,x14 |
| 686 umulh x14,x10,x28 |
| 687 adcs x20,x20,x15 |
| 688 umulh x15,x11,x28 |
| 689 adcs x21,x21,x16 |
| 690 umulh x16,x12,x28 |
| 691 adcs x22,x22,x17 |
| 692 umulh x17,x13,x28 |
| 693 mul x28,x4,x19 // next t[0]*n0 |
| 694 adcs x23,x23,x14 |
| 695 adcs x24,x24,x15 |
| 696 adcs x25,x25,x16 |
| 697 adc x26,x26,x17 |
| 698 cbnz x27,.Lsqr8x_reduction |
| 699 |
| 700 ldp x14,x15,[x2,#8*0] |
| 701 ldp x16,x17,[x2,#8*2] |
| 702 mov x0,x2 |
| 703 sub x27,x3,x1 // done yet? |
| 704 adds x19,x19,x14 |
| 705 adcs x20,x20,x15 |
| 706 ldp x14,x15,[x2,#8*4] |
| 707 adcs x21,x21,x16 |
| 708 adcs x22,x22,x17 |
| 709 ldp x16,x17,[x2,#8*6] |
| 710 adcs x23,x23,x14 |
| 711 adcs x24,x24,x15 |
| 712 adcs x25,x25,x16 |
| 713 adcs x26,x26,x17 |
| 714 //adc x28,xzr,xzr // moved below |
| 715 cbz x27,.Lsqr8x8_post_condition |
| 716 |
| 717 ldr x4,[x2,#-8*8] |
| 718 ldp x6,x7,[x1,#8*0] |
| 719 ldp x8,x9,[x1,#8*2] |
| 720 ldp x10,x11,[x1,#8*4] |
| 721 mov x27,#-8*8 |
| 722 ldp x12,x13,[x1,#8*6] |
| 723 add x1,x1,#8*8 |
| 724 |
| 725 .Lsqr8x_tail: |
| 726 mul x14,x6,x4 |
| 727 adc x28,xzr,xzr // carry bit, modulo-scheduled |
| 728 mul x15,x7,x4 |
| 729 add x27,x27,#8 |
| 730 mul x16,x8,x4 |
| 731 mul x17,x9,x4 |
| 732 adds x19,x19,x14 |
| 733 mul x14,x10,x4 |
| 734 adcs x20,x20,x15 |
| 735 mul x15,x11,x4 |
| 736 adcs x21,x21,x16 |
| 737 mul x16,x12,x4 |
| 738 adcs x22,x22,x17 |
| 739 mul x17,x13,x4 |
| 740 adcs x23,x23,x14 |
| 741 umulh x14,x6,x4 |
| 742 adcs x24,x24,x15 |
| 743 umulh x15,x7,x4 |
| 744 adcs x25,x25,x16 |
| 745 umulh x16,x8,x4 |
| 746 adcs x26,x26,x17 |
| 747 umulh x17,x9,x4 |
| 748 adc x28,x28,xzr |
| 749 str x19,[x2],#8 |
| 750 adds x19,x20,x14 |
| 751 umulh x14,x10,x4 |
| 752 adcs x20,x21,x15 |
| 753 umulh x15,x11,x4 |
| 754 adcs x21,x22,x16 |
| 755 umulh x16,x12,x4 |
| 756 adcs x22,x23,x17 |
| 757 umulh x17,x13,x4 |
| 758 ldr x4,[x0,x27] |
| 759 adcs x23,x24,x14 |
| 760 adcs x24,x25,x15 |
| 761 adcs x25,x26,x16 |
| 762 adcs x26,x28,x17 |
| 763 //adc x28,xzr,xzr // moved above |
| 764 cbnz x27,.Lsqr8x_tail |
| 765 // note that carry flag is guaranteed |
| 766 // to be zero at this point |
| 767 ldp x6,x7,[x2,#8*0] |
| 768 sub x27,x3,x1 // done yet? |
| 769 sub x16,x3,x5 // rewound np |
| 770 ldp x8,x9,[x2,#8*2] |
| 771 ldp x10,x11,[x2,#8*4] |
| 772 ldp x12,x13,[x2,#8*6] |
| 773 cbz x27,.Lsqr8x_tail_break |
| 774 |
| 775 ldr x4,[x0,#-8*8] |
| 776 adds x19,x19,x6 |
| 777 adcs x20,x20,x7 |
| 778 ldp x6,x7,[x1,#8*0] |
| 779 adcs x21,x21,x8 |
| 780 adcs x22,x22,x9 |
| 781 ldp x8,x9,[x1,#8*2] |
| 782 adcs x23,x23,x10 |
| 783 adcs x24,x24,x11 |
| 784 ldp x10,x11,[x1,#8*4] |
| 785 adcs x25,x25,x12 |
| 786 mov x27,#-8*8 |
| 787 adcs x26,x26,x13 |
| 788 ldp x12,x13,[x1,#8*6] |
| 789 add x1,x1,#8*8 |
| 790 //adc x28,xzr,xzr // moved above |
| 791 b .Lsqr8x_tail |
| 792 |
| 793 .align 4 |
| 794 .Lsqr8x_tail_break: |
| 795 ldr x4,[x29,#112] // pull n0 |
| 796 add x27,x2,#8*8 // end of current t[num] window |
| 797 |
| 798 subs xzr,x30,#1 // "move" top-most carry to carry bit |
| 799 adcs x14,x19,x6 |
| 800 adcs x15,x20,x7 |
| 801 ldp x19,x20,[x0,#8*0] |
| 802 adcs x21,x21,x8 |
| 803 ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] |
| 804 adcs x22,x22,x9 |
| 805 ldp x8,x9,[x16,#8*2] |
| 806 adcs x23,x23,x10 |
| 807 adcs x24,x24,x11 |
| 808 ldp x10,x11,[x16,#8*4] |
| 809 adcs x25,x25,x12 |
| 810 adcs x26,x26,x13 |
| 811 ldp x12,x13,[x16,#8*6] |
| 812 add x1,x16,#8*8 |
| 813 adc x30,xzr,xzr // top-most carry |
| 814 mul x28,x4,x19 |
| 815 stp x14,x15,[x2,#8*0] |
| 816 stp x21,x22,[x2,#8*2] |
| 817 ldp x21,x22,[x0,#8*2] |
| 818 stp x23,x24,[x2,#8*4] |
| 819 ldp x23,x24,[x0,#8*4] |
| 820 cmp x27,x29 // did we hit the bottom? |
| 821 stp x25,x26,[x2,#8*6] |
| 822 mov x2,x0 // slide the window |
| 823 ldp x25,x26,[x0,#8*6] |
| 824 mov x27,#8 |
| 825 b.ne .Lsqr8x_reduction |
| 826 |
| 827 // Final step. If the result is larger than the modulus, we must |
| 828 // subtract the modulus. But a comparison is itself a subtraction, |
| 829 // so we subtract the modulus unconditionally, check whether it |
| 830 // borrowed, and conditionally copy the original value back. |
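| // Same subtract-then-conditionally-copy pattern as in the generic |
| // path above, with the topmost carry held in x30. |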
| 831 ldr x0,[x29,#96] // pull rp |
| 832 add x2,x2,#8*8 |
| 833 subs x14,x19,x6 |
| 834 sbcs x15,x20,x7 |
| 835 sub x27,x5,#8*8 |
| 836 mov x3,x0 // x0 copy |
| 837 |
| 838 .Lsqr8x_sub: |
| 839 sbcs x16,x21,x8 |
| 840 ldp x6,x7,[x1,#8*0] |
| 841 sbcs x17,x22,x9 |
| 842 stp x14,x15,[x0,#8*0] |
| 843 sbcs x14,x23,x10 |
| 844 ldp x8,x9,[x1,#8*2] |
| 845 sbcs x15,x24,x11 |
| 846 stp x16,x17,[x0,#8*2] |
| 847 sbcs x16,x25,x12 |
| 848 ldp x10,x11,[x1,#8*4] |
| 849 sbcs x17,x26,x13 |
| 850 ldp x12,x13,[x1,#8*6] |
| 851 add x1,x1,#8*8 |
| 852 ldp x19,x20,[x2,#8*0] |
| 853 sub x27,x27,#8*8 |
| 854 ldp x21,x22,[x2,#8*2] |
| 855 ldp x23,x24,[x2,#8*4] |
| 856 ldp x25,x26,[x2,#8*6] |
| 857 add x2,x2,#8*8 |
| 858 stp x14,x15,[x0,#8*4] |
| 859 sbcs x14,x19,x6 |
| 860 stp x16,x17,[x0,#8*6] |
| 861 add x0,x0,#8*8 |
| 862 sbcs x15,x20,x7 |
| 863 cbnz x27,.Lsqr8x_sub |
| 864 |
| 865 sbcs x16,x21,x8 |
| 866 mov x2,sp |
| 867 add x1,sp,x5 |
| 868 ldp x6,x7,[x3,#8*0] |
| 869 sbcs x17,x22,x9 |
| 870 stp x14,x15,[x0,#8*0] |
| 871 sbcs x14,x23,x10 |
| 872 ldp x8,x9,[x3,#8*2] |
| 873 sbcs x15,x24,x11 |
| 874 stp x16,x17,[x0,#8*2] |
| 875 sbcs x16,x25,x12 |
| 876 ldp x19,x20,[x1,#8*0] |
| 877 sbcs x17,x26,x13 |
| 878 ldp x21,x22,[x1,#8*2] |
| 879 sbcs xzr,x30,xzr // did it borrow? |
| 880 ldr x30,[x29,#8] // pull return address |
| 881 stp x14,x15,[x0,#8*4] |
| 882 stp x16,x17,[x0,#8*6] |
| 883 |
| 884 sub x27,x5,#8*4 |
| 885 .Lsqr4x_cond_copy: |
| 886 sub x27,x27,#8*4 |
| 887 csel x14,x19,x6,lo |
| 888 stp xzr,xzr,[x2,#8*0] |
| 889 csel x15,x20,x7,lo |
| 890 ldp x6,x7,[x3,#8*4] |
| 891 ldp x19,x20,[x1,#8*4] |
| 892 csel x16,x21,x8,lo |
| 893 stp xzr,xzr,[x2,#8*2] |
| 894 add x2,x2,#8*4 |
| 895 csel x17,x22,x9,lo |
| 896 ldp x8,x9,[x3,#8*6] |
| 897 ldp x21,x22,[x1,#8*6] |
| 898 add x1,x1,#8*4 |
| 899 stp x14,x15,[x3,#8*0] |
| 900 stp x16,x17,[x3,#8*2] |
| 901 add x3,x3,#8*4 |
| 902 stp xzr,xzr,[x1,#8*0] |
| 903 stp xzr,xzr,[x1,#8*2] |
| 904 cbnz x27,.Lsqr4x_cond_copy |
| 905 |
| 906 csel x14,x19,x6,lo |
| 907 stp xzr,xzr,[x2,#8*0] |
| 908 csel x15,x20,x7,lo |
| 909 stp xzr,xzr,[x2,#8*2] |
| 910 csel x16,x21,x8,lo |
| 911 csel x17,x22,x9,lo |
| 912 stp x14,x15,[x3,#8*0] |
| 913 stp x16,x17,[x3,#8*2] |
| 914 |
| 915 b .Lsqr8x_done |
| 916 |
| 917 .align 4 |
| 918 .Lsqr8x8_post_condition: |
| 919 adc x28,xzr,xzr |
| 920 ldr x30,[x29,#8] // pull return address |
| 921 // x19-x26,x28 hold result, x6-x13 hold modulus |
| 922 subs x6,x19,x6 |
| 923 ldr x1,[x29,#96] // pull rp |
| 924 sbcs x7,x20,x7 |
| 925 stp xzr,xzr,[sp,#8*0] |
| 926 sbcs x8,x21,x8 |
| 927 stp xzr,xzr,[sp,#8*2] |
| 928 sbcs x9,x22,x9 |
| 929 stp xzr,xzr,[sp,#8*4] |
| 930 sbcs x10,x23,x10 |
| 931 stp xzr,xzr,[sp,#8*6] |
| 932 sbcs x11,x24,x11 |
| 933 stp xzr,xzr,[sp,#8*8] |
| 934 sbcs x12,x25,x12 |
| 935 stp xzr,xzr,[sp,#8*10] |
| 936 sbcs x13,x26,x13 |
| 937 stp xzr,xzr,[sp,#8*12] |
| 938 sbcs x28,x28,xzr // did it borrow? |
| 939 stp xzr,xzr,[sp,#8*14] |
| 940 |
| 941 // x6-x13 hold result-modulus |
| 942 csel x6,x19,x6,lo |
| 943 csel x7,x20,x7,lo |
| 944 csel x8,x21,x8,lo |
| 945 csel x9,x22,x9,lo |
| 946 stp x6,x7,[x1,#8*0] |
| 947 csel x10,x23,x10,lo |
| 948 csel x11,x24,x11,lo |
| 949 stp x8,x9,[x1,#8*2] |
| 950 csel x12,x25,x12,lo |
| 951 csel x13,x26,x13,lo |
| 952 stp x10,x11,[x1,#8*4] |
| 953 stp x12,x13,[x1,#8*6] |
| 954 |
| 955 .Lsqr8x_done: |
| 956 ldp x19,x20,[x29,#16] |
| 957 mov sp,x29 |
| 958 ldp x21,x22,[x29,#32] |
| 959 mov x0,#1 |
| 960 ldp x23,x24,[x29,#48] |
| 961 ldp x25,x26,[x29,#64] |
| 962 ldp x27,x28,[x29,#80] |
| 963 ldr x29,[sp],#128 |
| 964 ret |
| 965 .size __bn_sqr8x_mont,.-__bn_sqr8x_mont |
| 966 .type __bn_mul4x_mont,%function |
| 967 .align 5 |
| 968 __bn_mul4x_mont: |
| 969 stp x29,x30,[sp,#-128]! |
| 970 add x29,sp,#0 |
| 971 stp x19,x20,[sp,#16] |
| 972 stp x21,x22,[sp,#32] |
| 973 stp x23,x24,[sp,#48] |
| 974 stp x25,x26,[sp,#64] |
| 975 stp x27,x28,[sp,#80] |
| 976 |
| 977 sub x26,sp,x5,lsl#3 |
| 978 lsl x5,x5,#3 |
| 979 ldr x4,[x4] // *n0 |
| 980 sub sp,x26,#8*4 // alloca |
| 981 |
| 982 add x10,x2,x5 |
| 983 add x27,x1,x5 |
| 984 stp x0,x10,[x29,#96] // offload rp and &b[num] |
| 985 |
| 986 ldr x24,[x2,#8*0] // b[0] |
| 987 ldp x6,x7,[x1,#8*0] // a[0..3] |
| 988 ldp x8,x9,[x1,#8*2] |
| 989 add x1,x1,#8*4 |
| 990 mov x19,xzr |
| 991 mov x20,xzr |
| 992 mov x21,xzr |
| 993 mov x22,xzr |
| 994 ldp x14,x15,[x3,#8*0] // n[0..3] |
| 995 ldp x16,x17,[x3,#8*2] |
| 996 adds x3,x3,#8*4 // clear carry bit |
| 997 mov x0,xzr |
| 998 mov x28,#0 |
| 999 mov x26,sp |
| 1000 |
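| // The loop below interleaves a[0..3]*b[i] with one Montgomery |
| // reduction step per word of b; "and x28,x28,#31" cycles x28 |
| // through byte offsets 8,16,24,0 so that "ldr x24,[x2,x28]" walks |
| // b[1..3] and then wraps back to b[0]. |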
| 1001 .Loop_mul4x_1st_reduction: |
| 1002 mul x10,x6,x24 // lo(a[0..3]*b[0]) |
| 1003 adc x0,x0,xzr // modulo-scheduled |
| 1004 mul x11,x7,x24 |
| 1005 add x28,x28,#8 |
| 1006 mul x12,x8,x24 |
| 1007 and x28,x28,#31 |
| 1008 mul x13,x9,x24 |
| 1009 adds x19,x19,x10 |
| 1010 umulh x10,x6,x24 // hi(a[0..3]*b[0]) |
| 1011 adcs x20,x20,x11 |
| 1012 mul x25,x19,x4 // t[0]*n0 |
| 1013 adcs x21,x21,x12 |
| 1014 umulh x11,x7,x24 |
| 1015 adcs x22,x22,x13 |
| 1016 umulh x12,x8,x24 |
| 1017 adc x23,xzr,xzr |
| 1018 umulh x13,x9,x24 |
| 1019 ldr x24,[x2,x28] // next b[i] (or b[0]) |
| 1020 adds x20,x20,x10 |
| 1021 // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) |
| 1022 str x25,[x26],#8 // put aside t[0]*n0 for tail processing |
| 1023 adcs x21,x21,x11 |
| 1024 mul x11,x15,x25 |
| 1025 adcs x22,x22,x12 |
| 1026 mul x12,x16,x25 |
| 1027 adc x23,x23,x13 // can't overflow |
| 1028 mul x13,x17,x25 |
| 1029 // (*) adds xzr,x19,x10 |
| 1030 subs xzr,x19,#1 // (*) |
| 1031 umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) |
| 1032 adcs x19,x20,x11 |
| 1033 umulh x11,x15,x25 |
| 1034 adcs x20,x21,x12 |
| 1035 umulh x12,x16,x25 |
| 1036 adcs x21,x22,x13 |
| 1037 umulh x13,x17,x25 |
| 1038 adcs x22,x23,x0 |
| 1039 adc x0,xzr,xzr |
| 1040 adds x19,x19,x10 |
| 1041 sub x10,x27,x1 |
| 1042 adcs x20,x20,x11 |
| 1043 adcs x21,x21,x12 |
| 1044 adcs x22,x22,x13 |
| 1045 //adc x0,x0,xzr |
| 1046 cbnz x28,.Loop_mul4x_1st_reduction |
| 1047 |
| 1048 cbz x10,.Lmul4x4_post_condition |
| 1049 |
| 1050 ldp x6,x7,[x1,#8*0] // a[4..7] |
| 1051 ldp x8,x9,[x1,#8*2] |
| 1052 add x1,x1,#8*4 |
| 1053 ldr x25,[sp] // a[0]*n0 |
| 1054 ldp x14,x15,[x3,#8*0] // n[4..7] |
| 1055 ldp x16,x17,[x3,#8*2] |
| 1056 add x3,x3,#8*4 |
| 1057 |
| 1058 .Loop_mul4x_1st_tail: |
| 1059 mul x10,x6,x24 // lo(a[4..7]*b[i]) |
| 1060 adc x0,x0,xzr // modulo-scheduled |
| 1061 mul x11,x7,x24 |
| 1062 add x28,x28,#8 |
| 1063 mul x12,x8,x24 |
| 1064 and x28,x28,#31 |
| 1065 mul x13,x9,x24 |
| 1066 adds x19,x19,x10 |
| 1067 umulh x10,x6,x24 // hi(a[4..7]*b[i]) |
| 1068 adcs x20,x20,x11 |
| 1069 umulh x11,x7,x24 |
| 1070 adcs x21,x21,x12 |
| 1071 umulh x12,x8,x24 |
| 1072 adcs x22,x22,x13 |
| 1073 umulh x13,x9,x24 |
| 1074 adc x23,xzr,xzr |
| 1075 ldr x24,[x2,x28] // next b[i] (or b[0]) |
| 1076 adds x20,x20,x10 |
| 1077 mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) |
| 1078 adcs x21,x21,x11 |
| 1079 mul x11,x15,x25 |
| 1080 adcs x22,x22,x12 |
| 1081 mul x12,x16,x25 |
| 1082 adc x23,x23,x13 // can't overflow |
| 1083 mul x13,x17,x25 |
| 1084 adds x19,x19,x10 |
| 1085 umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) |
| 1086 adcs x20,x20,x11 |
| 1087 umulh x11,x15,x25 |
| 1088 adcs x21,x21,x12 |
| 1089 umulh x12,x16,x25 |
| 1090 adcs x22,x22,x13 |
| 1091 adcs x23,x23,x0 |
| 1092 umulh x13,x17,x25 |
| 1093 adc x0,xzr,xzr |
| 1094 ldr x25,[sp,x28] // next t[0]*n0 |
| 1095 str x19,[x26],#8 // result!!! |
| 1096 adds x19,x20,x10 |
| 1097 sub x10,x27,x1 // done yet? |
| 1098 adcs x20,x21,x11 |
| 1099 adcs x21,x22,x12 |
| 1100 adcs x22,x23,x13 |
| 1101 //adc x0,x0,xzr |
| 1102 cbnz x28,.Loop_mul4x_1st_tail |
| 1103 |
| 1104 sub x11,x27,x5 // rewound x1 |
| 1105 cbz x10,.Lmul4x_proceed |
| 1106 |
| 1107 ldp x6,x7,[x1,#8*0] |
| 1108 ldp x8,x9,[x1,#8*2] |
| 1109 add x1,x1,#8*4 |
| 1110 ldp x14,x15,[x3,#8*0] |
| 1111 ldp x16,x17,[x3,#8*2] |
| 1112 add x3,x3,#8*4 |
| 1113 b .Loop_mul4x_1st_tail |
| 1114 |
| 1115 .align 5 |
| 1116 .Lmul4x_proceed: |
| 1117 ldr x24,[x2,#8*4]! // *++b |
| 1118 adc x30,x0,xzr |
| 1119 ldp x6,x7,[x11,#8*0] // a[0..3] |
| 1120 sub x3,x3,x5 // rewind np |
| 1121 ldp x8,x9,[x11,#8*2] |
| 1122 add x1,x11,#8*4 |
| 1123 |
| 1124 stp x19,x20,[x26,#8*0] // result!!! |
| 1125 ldp x19,x20,[sp,#8*4] // t[0..3] |
| 1126 stp x21,x22,[x26,#8*2] // result!!! |
| 1127 ldp x21,x22,[sp,#8*6] |
| 1128 |
| 1129 ldp x14,x15,[x3,#8*0] // n[0..3] |
| 1130 mov x26,sp |
| 1131 ldp x16,x17,[x3,#8*2] |
| 1132 adds x3,x3,#8*4 // clear carry bit |
| 1133 mov x0,xzr |
| 1134 |
| 1135 .align 4 |
| 1136 .Loop_mul4x_reduction: |
| 1137 mul x10,x6,x24 // lo(a[0..3]*b[4]) |
| 1138 adc x0,x0,xzr // modulo-scheduled |
| 1139 mul x11,x7,x24 |
| 1140 add x28,x28,#8 |
| 1141 mul x12,x8,x24 |
| 1142 and x28,x28,#31 |
| 1143 mul x13,x9,x24 |
| 1144 adds x19,x19,x10 |
| 1145 umulh x10,x6,x24 // hi(a[0..3]*b[4]) |
| 1146 adcs x20,x20,x11 |
| 1147 mul x25,x19,x4 // t[0]*n0 |
| 1148 adcs x21,x21,x12 |
| 1149 umulh x11,x7,x24 |
| 1150 adcs x22,x22,x13 |
| 1151 umulh x12,x8,x24 |
| 1152 adc x23,xzr,xzr |
| 1153 umulh x13,x9,x24 |
| 1154 ldr x24,[x2,x28] // next b[i] |
| 1155 adds x20,x20,x10 |
| 1156 // (*) mul x10,x14,x25 |
| 1157 str x25,[x26],#8 // put aside t[0]*n0 for tail processing |
| 1158 adcs x21,x21,x11 |
| 1159 mul x11,x15,x25 // lo(n[0..3]*t[0]*n0) |
| 1160 adcs x22,x22,x12 |
| 1161 mul x12,x16,x25 |
| 1162 adc x23,x23,x13 // can't overflow |
| 1163 mul x13,x17,x25 |
| 1164 // (*) adds xzr,x19,x10 |
| 1165 subs xzr,x19,#1 // (*) |
| 1166 umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) |
| 1167 adcs x19,x20,x11 |
| 1168 umulh x11,x15,x25 |
| 1169 adcs x20,x21,x12 |
| 1170 umulh x12,x16,x25 |
| 1171 adcs x21,x22,x13 |
| 1172 umulh x13,x17,x25 |
| 1173 adcs x22,x23,x0 |
| 1174 adc x0,xzr,xzr |
| 1175 adds x19,x19,x10 |
| 1176 adcs x20,x20,x11 |
| 1177 adcs x21,x21,x12 |
| 1178 adcs x22,x22,x13 |
| 1179 //adc x0,x0,xzr |
| 1180 cbnz x28,.Loop_mul4x_reduction |
| 1181 |
| 1182 adc x0,x0,xzr |
| 1183 ldp x10,x11,[x26,#8*4] // t[4..7] |
| 1184 ldp x12,x13,[x26,#8*6] |
| 1185 ldp x6,x7,[x1,#8*0] // a[4..7] |
| 1186 ldp x8,x9,[x1,#8*2] |
| 1187 add x1,x1,#8*4 |
| 1188 adds x19,x19,x10 |
| 1189 adcs x20,x20,x11 |
| 1190 adcs x21,x21,x12 |
| 1191 adcs x22,x22,x13 |
| 1192 //adc x0,x0,xzr |
| 1193 |
| 1194 ldr x25,[sp] // t[0]*n0 |
| 1195 ldp x14,x15,[x3,#8*0] // n[4..7] |
| 1196 ldp x16,x17,[x3,#8*2] |
| 1197 add x3,x3,#8*4 |
| 1198 |
| 1199 .align 4 |
| 1200 .Loop_mul4x_tail: |
| 1201 mul x10,x6,x24 // lo(a[4..7]*b[4]) |
| 1202 adc x0,x0,xzr // modulo-scheduled |
| 1203 mul x11,x7,x24 |
| 1204 add x28,x28,#8 |
| 1205 mul x12,x8,x24 |
| 1206 and x28,x28,#31 |
| 1207 mul x13,x9,x24 |
| 1208 adds x19,x19,x10 |
| 1209 umulh x10,x6,x24 // hi(a[4..7]*b[4]) |
| 1210 adcs x20,x20,x11 |
| 1211 umulh x11,x7,x24 |
| 1212 adcs x21,x21,x12 |
| 1213 umulh x12,x8,x24 |
| 1214 adcs x22,x22,x13 |
| 1215 umulh x13,x9,x24 |
| 1216 adc x23,xzr,xzr |
| 1217 ldr x24,[x2,x28] // next b[i] |
| 1218 adds x20,x20,x10 |
| 1219 mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) |
| 1220 adcs x21,x21,x11 |
| 1221 mul x11,x15,x25 |
| 1222 adcs x22,x22,x12 |
| 1223 mul x12,x16,x25 |
| 1224 adc x23,x23,x13 // can't overflow |
| 1225 mul x13,x17,x25 |
| 1226 adds x19,x19,x10 |
| 1227 umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) |
| 1228 adcs x20,x20,x11 |
| 1229 umulh x11,x15,x25 |
| 1230 adcs x21,x21,x12 |
| 1231 umulh x12,x16,x25 |
| 1232 adcs x22,x22,x13 |
| 1233 umulh x13,x17,x25 |
| 1234 adcs x23,x23,x0 |
| 1235 ldr x25,[sp,x28] // next a[0]*n0 |
| 1236 adc x0,xzr,xzr |
| 1237 str x19,[x26],#8 // result!!! |
| 1238 adds x19,x20,x10 |
| 1239 sub x10,x27,x1 // done yet? |
| 1240 adcs x20,x21,x11 |
| 1241 adcs x21,x22,x12 |
| 1242 adcs x22,x23,x13 |
| 1243 //adc x0,x0,xzr |
| 1244 cbnz x28,.Loop_mul4x_tail |
| 1245 |
| 1246 sub x11,x3,x5 // rewound np? |
| 1247 adc x0,x0,xzr |
| 1248 cbz x10,.Loop_mul4x_break |
| 1249 |
| 1250 ldp x10,x11,[x26,#8*4] |
| 1251 ldp x12,x13,[x26,#8*6] |
| 1252 ldp x6,x7,[x1,#8*0] |
| 1253 ldp x8,x9,[x1,#8*2] |
| 1254 add x1,x1,#8*4 |
| 1255 adds x19,x19,x10 |
| 1256 adcs x20,x20,x11 |
| 1257 adcs x21,x21,x12 |
| 1258 adcs x22,x22,x13 |
| 1259 //adc x0,x0,xzr |
| 1260 ldp x14,x15,[x3,#8*0] |
| 1261 ldp x16,x17,[x3,#8*2] |
| 1262 add x3,x3,#8*4 |
| 1263 b .Loop_mul4x_tail |
| 1264 |
| 1265 .align 4 |
| 1266 .Loop_mul4x_break: |
| 1267 ldp x12,x13,[x29,#96] // pull rp and &b[num] |
| 1268 adds x19,x19,x30 |
| 1269 add x2,x2,#8*4 // bp++ |
| 1270 adcs x20,x20,xzr |
| 1271 sub x1,x1,x5 // rewind ap |
| 1272 adcs x21,x21,xzr |
| 1273 stp x19,x20,[x26,#8*0] // result!!! |
| 1274 adcs x22,x22,xzr |
| 1275 ldp x19,x20,[sp,#8*4] // t[0..3] |
| 1276 adc x30,x0,xzr |
| 1277 stp x21,x22,[x26,#8*2] // result!!! |
| 1278 cmp x2,x13 // done yet? |
| 1279 ldp x21,x22,[sp,#8*6] |
| 1280 ldp x14,x15,[x11,#8*0] // n[0..3] |
| 1281 ldp x16,x17,[x11,#8*2] |
| 1282 add x3,x11,#8*4 |
| 1283 b.eq .Lmul4x_post |
| 1284 |
| 1285 ldr x24,[x2] |
| 1286 ldp x6,x7,[x1,#8*0] // a[0..3] |
| 1287 ldp x8,x9,[x1,#8*2] |
| 1288 adds x1,x1,#8*4 // clear carry bit |
| 1289 mov x0,xzr |
| 1290 mov x26,sp |
| 1291 b .Loop_mul4x_reduction |
| 1292 |
| 1293 .align 4 |
| 1294 .Lmul4x_post: |
| 1295 // Final step. If the result is larger than the modulus, we must |
| 1296 // subtract the modulus. But a comparison is itself a subtraction, |
| 1297 // so we subtract the modulus unconditionally, check whether it |
| 1298 // borrowed, and conditionally copy the original value back. |
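| // Same subtract-then-conditionally-copy pattern once more; here too |
| // the topmost carry is held in x30. |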
| 1299 mov x0,x12 |
| 1300 mov x27,x12 // x0 copy |
| 1301 subs x10,x19,x14 |
| 1302 add x26,sp,#8*8 |
| 1303 sbcs x11,x20,x15 |
| 1304 sub x28,x5,#8*4 |
| 1305 |
| 1306 .Lmul4x_sub: |
| 1307 sbcs x12,x21,x16 |
| 1308 ldp x14,x15,[x3,#8*0] |
| 1309 sub x28,x28,#8*4 |
| 1310 ldp x19,x20,[x26,#8*0] |
| 1311 sbcs x13,x22,x17 |
| 1312 ldp x16,x17,[x3,#8*2] |
| 1313 add x3,x3,#8*4 |
| 1314 ldp x21,x22,[x26,#8*2] |
| 1315 add x26,x26,#8*4 |
| 1316 stp x10,x11,[x0,#8*0] |
| 1317 sbcs x10,x19,x14 |
| 1318 stp x12,x13,[x0,#8*2] |
| 1319 add x0,x0,#8*4 |
| 1320 sbcs x11,x20,x15 |
| 1321 cbnz x28,.Lmul4x_sub |
| 1322 |
| 1323 sbcs x12,x21,x16 |
| 1324 mov x26,sp |
| 1325 add x1,sp,#8*4 |
| 1326 ldp x6,x7,[x27,#8*0] |
| 1327 sbcs x13,x22,x17 |
| 1328 stp x10,x11,[x0,#8*0] |
| 1329 ldp x8,x9,[x27,#8*2] |
| 1330 stp x12,x13,[x0,#8*2] |
| 1331 ldp x19,x20,[x1,#8*0] |
| 1332 ldp x21,x22,[x1,#8*2] |
| 1333 sbcs xzr,x30,xzr // did it borrow? |
| 1334 ldr x30,[x29,#8] // pull return address |
| 1335 |
| 1336 sub x28,x5,#8*4 |
| 1337 .Lmul4x_cond_copy: |
| 1338 sub x28,x28,#8*4 |
| 1339 csel x10,x19,x6,lo |
| 1340 stp xzr,xzr,[x26,#8*0] |
| 1341 csel x11,x20,x7,lo |
| 1342 ldp x6,x7,[x27,#8*4] |
| 1343 ldp x19,x20,[x1,#8*4] |
| 1344 csel x12,x21,x8,lo |
| 1345 stp xzr,xzr,[x26,#8*2] |
| 1346 add x26,x26,#8*4 |
| 1347 csel x13,x22,x9,lo |
| 1348 ldp x8,x9,[x27,#8*6] |
| 1349 ldp x21,x22,[x1,#8*6] |
| 1350 add x1,x1,#8*4 |
| 1351 stp x10,x11,[x27,#8*0] |
| 1352 stp x12,x13,[x27,#8*2] |
| 1353 add x27,x27,#8*4 |
| 1354 cbnz x28,.Lmul4x_cond_copy |
| 1355 |
| 1356 csel x10,x19,x6,lo |
| 1357 stp xzr,xzr,[x26,#8*0] |
| 1358 csel x11,x20,x7,lo |
| 1359 stp xzr,xzr,[x26,#8*2] |
| 1360 csel x12,x21,x8,lo |
| 1361 stp xzr,xzr,[x26,#8*3] |
| 1362 csel x13,x22,x9,lo |
| 1363 stp xzr,xzr,[x26,#8*4] |
| 1364 stp x10,x11,[x27,#8*0] |
| 1365 stp x12,x13,[x27,#8*2] |
| 1366 |
| 1367 b .Lmul4x_done |
| 1368 |
| 1369 .align 4 |
| 1370 .Lmul4x4_post_condition: |
| 1371 adc x0,x0,xzr |
| 1372 ldr x1,[x29,#96] // pull rp |
| 1373 // x19-x22,x0 hold result, x14-x17 hold modulus |
| 1374 subs x6,x19,x14 |
| 1375 ldr x30,[x29,#8] // pull return address |
| 1376 sbcs x7,x20,x15 |
| 1377 stp xzr,xzr,[sp,#8*0] |
| 1378 sbcs x8,x21,x16 |
| 1379 stp xzr,xzr,[sp,#8*2] |
| 1380 sbcs x9,x22,x17 |
| 1381 stp xzr,xzr,[sp,#8*4] |
| 1382 sbcs xzr,x0,xzr // did it borrow? |
| 1383 stp xzr,xzr,[sp,#8*6] |
| 1384 |
| 1385 // x6-x9 hold result-modulus |
| 1386 csel x6,x19,x6,lo |
| 1387 csel x7,x20,x7,lo |
| 1388 csel x8,x21,x8,lo |
| 1389 csel x9,x22,x9,lo |
| 1390 stp x6,x7,[x1,#8*0] |
| 1391 stp x8,x9,[x1,#8*2] |
| 1392 |
| 1393 .Lmul4x_done: |
| 1394 ldp x19,x20,[x29,#16] |
| 1395 mov sp,x29 |
| 1396 ldp x21,x22,[x29,#32] |
| 1397 mov x0,#1 |
| 1398 ldp x23,x24,[x29,#48] |
| 1399 ldp x25,x26,[x29,#64] |
| 1400 ldp x27,x28,[x29,#80] |
| 1401 ldr x29,[sp],#128 |
| 1402 ret |
| 1403 .size __bn_mul4x_mont,.-__bn_mul4x_mont |
| 1404 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 |
| 1405 .align 2 |
| 1406 .align 4 |
| 1407 #endif |