Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(346)

Side by Side Diff: third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont5.S

Issue 2219933002: Land BoringSSL roll on master (Closed) Base URL: git@github.com:dart-lang/sdk.git@master
Patch Set: Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 #if defined(__x86_64__) 1 #if defined(__x86_64__)
2 .text 2 .text
3 3
4 4
5 5
6 .globl _bn_mul_mont_gather5 6 .globl _bn_mul_mont_gather5
7 .private_extern _bn_mul_mont_gather5 7 .private_extern _bn_mul_mont_gather5
8 8
9 .p2align 6 9 .p2align 6
10 _bn_mul_mont_gather5: 10 _bn_mul_mont_gather5:
11 testl $7,%r9d 11 testl $7,%r9d
12 jnz L$mul_enter 12 jnz L$mul_enter
13 jmp L$mul4x_enter 13 jmp L$mul4x_enter
14 14
15 .p2align 4 15 .p2align 4
16 L$mul_enter: 16 L$mul_enter:
17 movl %r9d,%r9d 17 movl %r9d,%r9d
18 movq %rsp,%rax 18 movq %rsp,%rax
19 » movl» 8(%rsp),%r10d 19 » movd» 8(%rsp),%xmm5
20 » leaq» L$inc(%rip),%r10
20 pushq %rbx 21 pushq %rbx
21 pushq %rbp 22 pushq %rbp
22 pushq %r12 23 pushq %r12
23 pushq %r13 24 pushq %r13
24 pushq %r14 25 pushq %r14
25 pushq %r15 26 pushq %r15
27
26 leaq 2(%r9),%r11 28 leaq 2(%r9),%r11
27 negq %r11 29 negq %r11
28 » leaq» (%rsp,%r11,8),%rsp 30 » leaq» -264(%rsp,%r11,8),%rsp
29 andq $-1024,%rsp 31 andq $-1024,%rsp
30 32
31 movq %rax,8(%rsp,%r9,8) 33 movq %rax,8(%rsp,%r9,8)
32 L$mul_body: 34 L$mul_body:
33 » movq» %rdx,%r12 35 » leaq» 128(%rdx),%r12
34 » movq» %r10,%r11 36 » movdqa» 0(%r10),%xmm0
35 » shrq» $3,%r10 37 » movdqa» 16(%r10),%xmm1
36 » andq» $7,%r11 38 » leaq» 24-112(%rsp,%r9,8),%r10
37 » notq» %r10 39 » andq» $-16,%r10
38 » leaq» L$magic_masks(%rip),%rax
39 » andq» $3,%r10
40 » leaq» 96(%r12,%r11,8),%r12
41 » movq» 0(%rax,%r10,8),%xmm4
42 » movq» 8(%rax,%r10,8),%xmm5
43 » movq» 16(%rax,%r10,8),%xmm6
44 » movq» 24(%rax,%r10,8),%xmm7
45 40
46 » movq» -96(%r12),%xmm0 41 » pshufd» $0,%xmm5,%xmm5
47 » movq» -32(%r12),%xmm1 42 » movdqa» %xmm1,%xmm4
48 » pand» %xmm4,%xmm0 43 » movdqa» %xmm1,%xmm2
49 » movq» 32(%r12),%xmm2 44 » paddd» %xmm0,%xmm1
50 » pand» %xmm5,%xmm1 45 » pcmpeqd»%xmm5,%xmm0
51 » movq» 96(%r12),%xmm3 46 .byte» 0x67
52 » pand» %xmm6,%xmm2 47 » movdqa» %xmm4,%xmm3
48 » paddd» %xmm1,%xmm2
49 » pcmpeqd»%xmm5,%xmm1
50 » movdqa» %xmm0,112(%r10)
51 » movdqa» %xmm4,%xmm0
52
53 » paddd» %xmm2,%xmm3
54 » pcmpeqd»%xmm5,%xmm2
55 » movdqa» %xmm1,128(%r10)
56 » movdqa» %xmm4,%xmm1
57
58 » paddd» %xmm3,%xmm0
59 » pcmpeqd»%xmm5,%xmm3
60 » movdqa» %xmm2,144(%r10)
61 » movdqa» %xmm4,%xmm2
62
63 » paddd» %xmm0,%xmm1
64 » pcmpeqd»%xmm5,%xmm0
65 » movdqa» %xmm3,160(%r10)
66 » movdqa» %xmm4,%xmm3
67 » paddd» %xmm1,%xmm2
68 » pcmpeqd»%xmm5,%xmm1
69 » movdqa» %xmm0,176(%r10)
70 » movdqa» %xmm4,%xmm0
71
72 » paddd» %xmm2,%xmm3
73 » pcmpeqd»%xmm5,%xmm2
74 » movdqa» %xmm1,192(%r10)
75 » movdqa» %xmm4,%xmm1
76
77 » paddd» %xmm3,%xmm0
78 » pcmpeqd»%xmm5,%xmm3
79 » movdqa» %xmm2,208(%r10)
80 » movdqa» %xmm4,%xmm2
81
82 » paddd» %xmm0,%xmm1
83 » pcmpeqd»%xmm5,%xmm0
84 » movdqa» %xmm3,224(%r10)
85 » movdqa» %xmm4,%xmm3
86 » paddd» %xmm1,%xmm2
87 » pcmpeqd»%xmm5,%xmm1
88 » movdqa» %xmm0,240(%r10)
89 » movdqa» %xmm4,%xmm0
90
91 » paddd» %xmm2,%xmm3
92 » pcmpeqd»%xmm5,%xmm2
93 » movdqa» %xmm1,256(%r10)
94 » movdqa» %xmm4,%xmm1
95
96 » paddd» %xmm3,%xmm0
97 » pcmpeqd»%xmm5,%xmm3
98 » movdqa» %xmm2,272(%r10)
99 » movdqa» %xmm4,%xmm2
100
101 » paddd» %xmm0,%xmm1
102 » pcmpeqd»%xmm5,%xmm0
103 » movdqa» %xmm3,288(%r10)
104 » movdqa» %xmm4,%xmm3
105 » paddd» %xmm1,%xmm2
106 » pcmpeqd»%xmm5,%xmm1
107 » movdqa» %xmm0,304(%r10)
108
109 » paddd» %xmm2,%xmm3
110 .byte» 0x67
111 » pcmpeqd»%xmm5,%xmm2
112 » movdqa» %xmm1,320(%r10)
113
114 » pcmpeqd»%xmm5,%xmm3
115 » movdqa» %xmm2,336(%r10)
116 » pand» 64(%r12),%xmm0
117
118 » pand» 80(%r12),%xmm1
119 » pand» 96(%r12),%xmm2
120 » movdqa» %xmm3,352(%r10)
121 » pand» 112(%r12),%xmm3
122 » por» %xmm2,%xmm0
123 » por» %xmm3,%xmm1
124 » movdqa» -128(%r12),%xmm4
125 » movdqa» -112(%r12),%xmm5
126 » movdqa» -96(%r12),%xmm2
127 » pand» 112(%r10),%xmm4
128 » movdqa» -80(%r12),%xmm3
129 » pand» 128(%r10),%xmm5
130 » por» %xmm4,%xmm0
131 » pand» 144(%r10),%xmm2
132 » por» %xmm5,%xmm1
133 » pand» 160(%r10),%xmm3
134 » por» %xmm2,%xmm0
135 » por» %xmm3,%xmm1
136 » movdqa» -64(%r12),%xmm4
137 » movdqa» -48(%r12),%xmm5
138 » movdqa» -32(%r12),%xmm2
139 » pand» 176(%r10),%xmm4
140 » movdqa» -16(%r12),%xmm3
141 » pand» 192(%r10),%xmm5
142 » por» %xmm4,%xmm0
143 » pand» 208(%r10),%xmm2
144 » por» %xmm5,%xmm1
145 » pand» 224(%r10),%xmm3
146 » por» %xmm2,%xmm0
147 » por» %xmm3,%xmm1
148 » movdqa» 0(%r12),%xmm4
149 » movdqa» 16(%r12),%xmm5
150 » movdqa» 32(%r12),%xmm2
151 » pand» 240(%r10),%xmm4
152 » movdqa» 48(%r12),%xmm3
153 » pand» 256(%r10),%xmm5
154 » por» %xmm4,%xmm0
155 » pand» 272(%r10),%xmm2
156 » por» %xmm5,%xmm1
157 » pand» 288(%r10),%xmm3
158 » por» %xmm2,%xmm0
159 » por» %xmm3,%xmm1
53 por %xmm1,%xmm0 160 por %xmm1,%xmm0
54 » pand» %xmm7,%xmm3 161 » pshufd» $0x4e,%xmm0,%xmm1
55 » por» %xmm2,%xmm0 162 » por» %xmm1,%xmm0
56 leaq 256(%r12),%r12 163 leaq 256(%r12),%r12
57 por %xmm3,%xmm0
58
59 .byte 102,72,15,126,195 164 .byte 102,72,15,126,195
60 165
61 movq (%r8),%r8 166 movq (%r8),%r8
62 movq (%rsi),%rax 167 movq (%rsi),%rax
63 168
64 xorq %r14,%r14 169 xorq %r14,%r14
65 xorq %r15,%r15 170 xorq %r15,%r15
66 171
67 movq -96(%r12),%xmm0
68 movq -32(%r12),%xmm1
69 pand %xmm4,%xmm0
70 movq 32(%r12),%xmm2
71 pand %xmm5,%xmm1
72
73 movq %r8,%rbp 172 movq %r8,%rbp
74 mulq %rbx 173 mulq %rbx
75 movq %rax,%r10 174 movq %rax,%r10
76 movq (%rcx),%rax 175 movq (%rcx),%rax
77 176
78 movq 96(%r12),%xmm3
79 pand %xmm6,%xmm2
80 por %xmm1,%xmm0
81 pand %xmm7,%xmm3
82
83 imulq %r10,%rbp 177 imulq %r10,%rbp
84 movq %rdx,%r11 178 movq %rdx,%r11
85 179
86 por %xmm2,%xmm0
87 leaq 256(%r12),%r12
88 por %xmm3,%xmm0
89
90 mulq %rbp 180 mulq %rbp
91 addq %rax,%r10 181 addq %rax,%r10
92 movq 8(%rsi),%rax 182 movq 8(%rsi),%rax
93 adcq $0,%rdx 183 adcq $0,%rdx
94 movq %rdx,%r13 184 movq %rdx,%r13
95 185
96 leaq 1(%r15),%r15 186 leaq 1(%r15),%r15
97 jmp L$1st_enter 187 jmp L$1st_enter
98 188
99 .p2align 4 189 .p2align 4
(...skipping 12 matching lines...) Expand all
112 addq %rax,%r11 202 addq %rax,%r11
113 movq (%rcx,%r15,8),%rax 203 movq (%rcx,%r15,8),%rax
114 adcq $0,%rdx 204 adcq $0,%rdx
115 leaq 1(%r15),%r15 205 leaq 1(%r15),%r15
116 movq %rdx,%r10 206 movq %rdx,%r10
117 207
118 mulq %rbp 208 mulq %rbp
119 cmpq %r9,%r15 209 cmpq %r9,%r15
120 jne L$1st 210 jne L$1st
121 211
122 .byte 102,72,15,126,195
123 212
124 addq %rax,%r13 213 addq %rax,%r13
125 movq (%rsi),%rax
126 adcq $0,%rdx 214 adcq $0,%rdx
127 addq %r11,%r13 215 addq %r11,%r13
128 adcq $0,%rdx 216 adcq $0,%rdx
129 » movq» %r13,-16(%rsp,%r15,8) 217 » movq» %r13,-16(%rsp,%r9,8)
130 movq %rdx,%r13 218 movq %rdx,%r13
131 movq %r10,%r11 219 movq %r10,%r11
132 220
133 xorq %rdx,%rdx 221 xorq %rdx,%rdx
134 addq %r11,%r13 222 addq %r11,%r13
135 adcq $0,%rdx 223 adcq $0,%rdx
136 movq %r13,-8(%rsp,%r9,8) 224 movq %r13,-8(%rsp,%r9,8)
137 movq %rdx,(%rsp,%r9,8) 225 movq %rdx,(%rsp,%r9,8)
138 226
139 leaq 1(%r14),%r14 227 leaq 1(%r14),%r14
140 jmp L$outer 228 jmp L$outer
141 .p2align 4 229 .p2align 4
142 L$outer: 230 L$outer:
231 leaq 24+128(%rsp,%r9,8),%rdx
232 andq $-16,%rdx
233 pxor %xmm4,%xmm4
234 pxor %xmm5,%xmm5
235 movdqa -128(%r12),%xmm0
236 movdqa -112(%r12),%xmm1
237 movdqa -96(%r12),%xmm2
238 movdqa -80(%r12),%xmm3
239 pand -128(%rdx),%xmm0
240 pand -112(%rdx),%xmm1
241 por %xmm0,%xmm4
242 pand -96(%rdx),%xmm2
243 por %xmm1,%xmm5
244 pand -80(%rdx),%xmm3
245 por %xmm2,%xmm4
246 por %xmm3,%xmm5
247 movdqa -64(%r12),%xmm0
248 movdqa -48(%r12),%xmm1
249 movdqa -32(%r12),%xmm2
250 movdqa -16(%r12),%xmm3
251 pand -64(%rdx),%xmm0
252 pand -48(%rdx),%xmm1
253 por %xmm0,%xmm4
254 pand -32(%rdx),%xmm2
255 por %xmm1,%xmm5
256 pand -16(%rdx),%xmm3
257 por %xmm2,%xmm4
258 por %xmm3,%xmm5
259 movdqa 0(%r12),%xmm0
260 movdqa 16(%r12),%xmm1
261 movdqa 32(%r12),%xmm2
262 movdqa 48(%r12),%xmm3
263 pand 0(%rdx),%xmm0
264 pand 16(%rdx),%xmm1
265 por %xmm0,%xmm4
266 pand 32(%rdx),%xmm2
267 por %xmm1,%xmm5
268 pand 48(%rdx),%xmm3
269 por %xmm2,%xmm4
270 por %xmm3,%xmm5
271 movdqa 64(%r12),%xmm0
272 movdqa 80(%r12),%xmm1
273 movdqa 96(%r12),%xmm2
274 movdqa 112(%r12),%xmm3
275 pand 64(%rdx),%xmm0
276 pand 80(%rdx),%xmm1
277 por %xmm0,%xmm4
278 pand 96(%rdx),%xmm2
279 por %xmm1,%xmm5
280 pand 112(%rdx),%xmm3
281 por %xmm2,%xmm4
282 por %xmm3,%xmm5
283 por %xmm5,%xmm4
284 pshufd $0x4e,%xmm4,%xmm0
285 por %xmm4,%xmm0
286 leaq 256(%r12),%r12
287
288 movq (%rsi),%rax
289 .byte 102,72,15,126,195
290
143 xorq %r15,%r15 291 xorq %r15,%r15
144 movq %r8,%rbp 292 movq %r8,%rbp
145 movq (%rsp),%r10 293 movq (%rsp),%r10
146 294
147 movq -96(%r12),%xmm0
148 movq -32(%r12),%xmm1
149 pand %xmm4,%xmm0
150 movq 32(%r12),%xmm2
151 pand %xmm5,%xmm1
152
153 mulq %rbx 295 mulq %rbx
154 addq %rax,%r10 296 addq %rax,%r10
155 movq (%rcx),%rax 297 movq (%rcx),%rax
156 adcq $0,%rdx 298 adcq $0,%rdx
157 299
158 movq 96(%r12),%xmm3
159 pand %xmm6,%xmm2
160 por %xmm1,%xmm0
161 pand %xmm7,%xmm3
162
163 imulq %r10,%rbp 300 imulq %r10,%rbp
164 movq %rdx,%r11 301 movq %rdx,%r11
165 302
166 por %xmm2,%xmm0
167 leaq 256(%r12),%r12
168 por %xmm3,%xmm0
169
170 mulq %rbp 303 mulq %rbp
171 addq %rax,%r10 304 addq %rax,%r10
172 movq 8(%rsi),%rax 305 movq 8(%rsi),%rax
173 adcq $0,%rdx 306 adcq $0,%rdx
174 movq 8(%rsp),%r10 307 movq 8(%rsp),%r10
175 movq %rdx,%r13 308 movq %rdx,%r13
176 309
177 leaq 1(%r15),%r15 310 leaq 1(%r15),%r15
178 jmp L$inner_enter 311 jmp L$inner_enter
179 312
(...skipping 15 matching lines...) Expand all
195 adcq $0,%rdx 328 adcq $0,%rdx
196 addq %r11,%r10 329 addq %r11,%r10
197 movq %rdx,%r11 330 movq %rdx,%r11
198 adcq $0,%r11 331 adcq $0,%r11
199 leaq 1(%r15),%r15 332 leaq 1(%r15),%r15
200 333
201 mulq %rbp 334 mulq %rbp
202 cmpq %r9,%r15 335 cmpq %r9,%r15
203 jne L$inner 336 jne L$inner
204 337
205 .byte 102,72,15,126,195
206
207 addq %rax,%r13 338 addq %rax,%r13
208 movq (%rsi),%rax
209 adcq $0,%rdx 339 adcq $0,%rdx
210 addq %r10,%r13 340 addq %r10,%r13
211 » movq» (%rsp,%r15,8),%r10 341 » movq» (%rsp,%r9,8),%r10
212 adcq $0,%rdx 342 adcq $0,%rdx
213 » movq» %r13,-16(%rsp,%r15,8) 343 » movq» %r13,-16(%rsp,%r9,8)
214 movq %rdx,%r13 344 movq %rdx,%r13
215 345
216 xorq %rdx,%rdx 346 xorq %rdx,%rdx
217 addq %r11,%r13 347 addq %r11,%r13
218 adcq $0,%rdx 348 adcq $0,%rdx
219 addq %r10,%r13 349 addq %r10,%r13
220 adcq $0,%rdx 350 adcq $0,%rdx
221 movq %r13,-8(%rsp,%r9,8) 351 movq %r13,-8(%rsp,%r9,8)
222 movq %rdx,(%rsp,%r9,8) 352 movq %rdx,(%rsp,%r9,8)
223 353
(...skipping 25 matching lines...) Expand all
249 andq %rax,%rsi 379 andq %rax,%rsi
250 xorq %rcx,%rsi 380 xorq %rcx,%rsi
251 movq %r14,(%rsp,%r14,8) 381 movq %r14,(%rsp,%r14,8)
252 movq %rsi,(%rdi,%r14,8) 382 movq %rsi,(%rdi,%r14,8)
253 leaq 1(%r14),%r14 383 leaq 1(%r14),%r14
254 subq $1,%r15 384 subq $1,%r15
255 jnz L$copy 385 jnz L$copy
256 386
257 movq 8(%rsp,%r9,8),%rsi 387 movq 8(%rsp,%r9,8),%rsi
258 movq $1,%rax 388 movq $1,%rax
389
259 movq -48(%rsi),%r15 390 movq -48(%rsi),%r15
260 movq -40(%rsi),%r14 391 movq -40(%rsi),%r14
261 movq -32(%rsi),%r13 392 movq -32(%rsi),%r13
262 movq -24(%rsi),%r12 393 movq -24(%rsi),%r12
263 movq -16(%rsi),%rbp 394 movq -16(%rsi),%rbp
264 movq -8(%rsi),%rbx 395 movq -8(%rsi),%rbx
265 leaq (%rsi),%rsp 396 leaq (%rsi),%rsp
266 L$mul_epilogue: 397 L$mul_epilogue:
267 .byte 0xf3,0xc3 398 .byte 0xf3,0xc3
268 399
269 400
270 .p2align 5 401 .p2align 5
271 bn_mul4x_mont_gather5: 402 bn_mul4x_mont_gather5:
272 L$mul4x_enter: 403 L$mul4x_enter:
273 .byte 0x67 404 .byte 0x67
274 movq %rsp,%rax 405 movq %rsp,%rax
275 pushq %rbx 406 pushq %rbx
276 pushq %rbp 407 pushq %rbp
277 pushq %r12 408 pushq %r12
278 pushq %r13 409 pushq %r13
279 pushq %r14 410 pushq %r14
280 pushq %r15 411 pushq %r15
412
281 .byte 0x67 413 .byte 0x67
282 movl %r9d,%r10d
283 shll $3,%r9d 414 shll $3,%r9d
284 » shll» $3+2,%r10d 415 » leaq» (%r9,%r9,2),%r10
285 negq %r9 416 negq %r9
286 417
287 418
288 419
289 420
290 421
291 422
292 423
293 424
294 » leaq» -64(%rsp,%r9,2),%r11 425
295 » subq» %rsi,%r11 426
427 » leaq» -320(%rsp,%r9,2),%r11
428 » subq» %rdi,%r11
296 andq $4095,%r11 429 andq $4095,%r11
297 cmpq %r11,%r10 430 cmpq %r11,%r10
298 jb L$mul4xsp_alt 431 jb L$mul4xsp_alt
299 subq %r11,%rsp 432 subq %r11,%rsp
300 » leaq» -64(%rsp,%r9,2),%rsp 433 » leaq» -320(%rsp,%r9,2),%rsp
301 jmp L$mul4xsp_done 434 jmp L$mul4xsp_done
302 435
303 .p2align 5 436 .p2align 5
304 L$mul4xsp_alt: 437 L$mul4xsp_alt:
305 » leaq» 4096-64(,%r9,2),%r10 438 » leaq» 4096-320(,%r9,2),%r10
306 » leaq» -64(%rsp,%r9,2),%rsp 439 » leaq» -320(%rsp,%r9,2),%rsp
307 subq %r10,%r11 440 subq %r10,%r11
308 movq $0,%r10 441 movq $0,%r10
309 cmovcq %r10,%r11 442 cmovcq %r10,%r11
310 subq %r11,%rsp 443 subq %r11,%rsp
311 L$mul4xsp_done: 444 L$mul4xsp_done:
312 andq $-64,%rsp 445 andq $-64,%rsp
313 negq %r9 446 negq %r9
314 447
315 movq %rax,40(%rsp) 448 movq %rax,40(%rsp)
316 L$mul4x_body: 449 L$mul4x_body:
317 450
318 call mul4x_internal 451 call mul4x_internal
319 452
320 movq 40(%rsp),%rsi 453 movq 40(%rsp),%rsi
321 movq $1,%rax 454 movq $1,%rax
455
322 movq -48(%rsi),%r15 456 movq -48(%rsi),%r15
323 movq -40(%rsi),%r14 457 movq -40(%rsi),%r14
324 movq -32(%rsi),%r13 458 movq -32(%rsi),%r13
325 movq -24(%rsi),%r12 459 movq -24(%rsi),%r12
326 movq -16(%rsi),%rbp 460 movq -16(%rsi),%rbp
327 movq -8(%rsi),%rbx 461 movq -8(%rsi),%rbx
328 leaq (%rsi),%rsp 462 leaq (%rsi),%rsp
329 L$mul4x_epilogue: 463 L$mul4x_epilogue:
330 .byte 0xf3,0xc3 464 .byte 0xf3,0xc3
331 465
332 466
333 467
334 .p2align 5 468 .p2align 5
335 mul4x_internal: 469 mul4x_internal:
336 shlq $5,%r9 470 shlq $5,%r9
337 » movl» 8(%rax),%r10d 471 » movd» 8(%rax),%xmm5
338 » leaq» 256(%rdx,%r9,1),%r13 472 » leaq» L$inc(%rip),%rax
473 » leaq» 128(%rdx,%r9,1),%r13
339 shrq $5,%r9 474 shrq $5,%r9
340 » movq» %r10,%r11 475 » movdqa» 0(%rax),%xmm0
341 » shrq» $3,%r10 476 » movdqa» 16(%rax),%xmm1
342 » andq» $7,%r11 477 » leaq» 88-112(%rsp,%r9,1),%r10
343 » notq» %r10 478 » leaq» 128(%rdx),%r12
344 » leaq» L$magic_masks(%rip),%rax
345 » andq» $3,%r10
346 » leaq» 96(%rdx,%r11,8),%r12
347 » movq» 0(%rax,%r10,8),%xmm4
348 » movq» 8(%rax,%r10,8),%xmm5
349 » addq» $7,%r11
350 » movq» 16(%rax,%r10,8),%xmm6
351 » movq» 24(%rax,%r10,8),%xmm7
352 » andq» $7,%r11
353 479
354 » movq» -96(%r12),%xmm0 480 » pshufd» $0,%xmm5,%xmm5
355 » leaq» 256(%r12),%r14 481 » movdqa» %xmm1,%xmm4
356 » movq» -32(%r12),%xmm1 482 .byte» 0x67,0x67
357 » pand» %xmm4,%xmm0 483 » movdqa» %xmm1,%xmm2
358 » movq» 32(%r12),%xmm2 484 » paddd» %xmm0,%xmm1
359 » pand» %xmm5,%xmm1 485 » pcmpeqd»%xmm5,%xmm0
360 » movq» 96(%r12),%xmm3
361 » pand» %xmm6,%xmm2
362 .byte 0x67 486 .byte 0x67
487 movdqa %xmm4,%xmm3
488 paddd %xmm1,%xmm2
489 pcmpeqd %xmm5,%xmm1
490 movdqa %xmm0,112(%r10)
491 movdqa %xmm4,%xmm0
492
493 paddd %xmm2,%xmm3
494 pcmpeqd %xmm5,%xmm2
495 movdqa %xmm1,128(%r10)
496 movdqa %xmm4,%xmm1
497
498 paddd %xmm3,%xmm0
499 pcmpeqd %xmm5,%xmm3
500 movdqa %xmm2,144(%r10)
501 movdqa %xmm4,%xmm2
502
503 paddd %xmm0,%xmm1
504 pcmpeqd %xmm5,%xmm0
505 movdqa %xmm3,160(%r10)
506 movdqa %xmm4,%xmm3
507 paddd %xmm1,%xmm2
508 pcmpeqd %xmm5,%xmm1
509 movdqa %xmm0,176(%r10)
510 movdqa %xmm4,%xmm0
511
512 paddd %xmm2,%xmm3
513 pcmpeqd %xmm5,%xmm2
514 movdqa %xmm1,192(%r10)
515 movdqa %xmm4,%xmm1
516
517 paddd %xmm3,%xmm0
518 pcmpeqd %xmm5,%xmm3
519 movdqa %xmm2,208(%r10)
520 movdqa %xmm4,%xmm2
521
522 paddd %xmm0,%xmm1
523 pcmpeqd %xmm5,%xmm0
524 movdqa %xmm3,224(%r10)
525 movdqa %xmm4,%xmm3
526 paddd %xmm1,%xmm2
527 pcmpeqd %xmm5,%xmm1
528 movdqa %xmm0,240(%r10)
529 movdqa %xmm4,%xmm0
530
531 paddd %xmm2,%xmm3
532 pcmpeqd %xmm5,%xmm2
533 movdqa %xmm1,256(%r10)
534 movdqa %xmm4,%xmm1
535
536 paddd %xmm3,%xmm0
537 pcmpeqd %xmm5,%xmm3
538 movdqa %xmm2,272(%r10)
539 movdqa %xmm4,%xmm2
540
541 paddd %xmm0,%xmm1
542 pcmpeqd %xmm5,%xmm0
543 movdqa %xmm3,288(%r10)
544 movdqa %xmm4,%xmm3
545 paddd %xmm1,%xmm2
546 pcmpeqd %xmm5,%xmm1
547 movdqa %xmm0,304(%r10)
548
549 paddd %xmm2,%xmm3
550 .byte 0x67
551 pcmpeqd %xmm5,%xmm2
552 movdqa %xmm1,320(%r10)
553
554 pcmpeqd %xmm5,%xmm3
555 movdqa %xmm2,336(%r10)
556 pand 64(%r12),%xmm0
557
558 pand 80(%r12),%xmm1
559 pand 96(%r12),%xmm2
560 movdqa %xmm3,352(%r10)
561 pand 112(%r12),%xmm3
562 por %xmm2,%xmm0
563 por %xmm3,%xmm1
564 movdqa -128(%r12),%xmm4
565 movdqa -112(%r12),%xmm5
566 movdqa -96(%r12),%xmm2
567 pand 112(%r10),%xmm4
568 movdqa -80(%r12),%xmm3
569 pand 128(%r10),%xmm5
570 por %xmm4,%xmm0
571 pand 144(%r10),%xmm2
572 por %xmm5,%xmm1
573 pand 160(%r10),%xmm3
574 por %xmm2,%xmm0
575 por %xmm3,%xmm1
576 movdqa -64(%r12),%xmm4
577 movdqa -48(%r12),%xmm5
578 movdqa -32(%r12),%xmm2
579 pand 176(%r10),%xmm4
580 movdqa -16(%r12),%xmm3
581 pand 192(%r10),%xmm5
582 por %xmm4,%xmm0
583 pand 208(%r10),%xmm2
584 por %xmm5,%xmm1
585 pand 224(%r10),%xmm3
586 por %xmm2,%xmm0
587 por %xmm3,%xmm1
588 movdqa 0(%r12),%xmm4
589 movdqa 16(%r12),%xmm5
590 movdqa 32(%r12),%xmm2
591 pand 240(%r10),%xmm4
592 movdqa 48(%r12),%xmm3
593 pand 256(%r10),%xmm5
594 por %xmm4,%xmm0
595 pand 272(%r10),%xmm2
596 por %xmm5,%xmm1
597 pand 288(%r10),%xmm3
598 por %xmm2,%xmm0
599 por %xmm3,%xmm1
363 por %xmm1,%xmm0 600 por %xmm1,%xmm0
364 » movq» -96(%r14),%xmm1 601 » pshufd» $0x4e,%xmm0,%xmm1
365 .byte» 0x67 602 » por» %xmm1,%xmm0
366 » pand» %xmm7,%xmm3 603 » leaq» 256(%r12),%r12
367 .byte» 0x67 604 .byte» 102,72,15,126,195
368 » por» %xmm2,%xmm0
369 » movq» -32(%r14),%xmm2
370 .byte» 0x67
371 » pand» %xmm4,%xmm1
372 .byte» 0x67
373 » por» %xmm3,%xmm0
374 » movq» 32(%r14),%xmm3
375 605
376 .byte 102,72,15,126,195
377 movq 96(%r14),%xmm0
378 movq %r13,16+8(%rsp) 606 movq %r13,16+8(%rsp)
379 movq %rdi,56+8(%rsp) 607 movq %rdi,56+8(%rsp)
380 608
381 movq (%r8),%r8 609 movq (%r8),%r8
382 movq (%rsi),%rax 610 movq (%rsi),%rax
383 leaq (%rsi,%r9,1),%rsi 611 leaq (%rsi,%r9,1),%rsi
384 negq %r9 612 negq %r9
385 613
386 movq %r8,%rbp 614 movq %r8,%rbp
387 mulq %rbx 615 mulq %rbx
388 movq %rax,%r10 616 movq %rax,%r10
389 movq (%rcx),%rax 617 movq (%rcx),%rax
390 618
391 pand %xmm5,%xmm2
392 pand %xmm6,%xmm3
393 por %xmm2,%xmm1
394
395 imulq %r10,%rbp 619 imulq %r10,%rbp
396 620 » leaq» 64+8(%rsp),%r14
397
398
399
400
401
402
403 » leaq» 64+8(%rsp,%r11,8),%r14
404 movq %rdx,%r11 621 movq %rdx,%r11
405 622
406 pand %xmm7,%xmm0
407 por %xmm3,%xmm1
408 leaq 512(%r12),%r12
409 por %xmm1,%xmm0
410
411 mulq %rbp 623 mulq %rbp
412 addq %rax,%r10 624 addq %rax,%r10
413 movq 8(%rsi,%r9,1),%rax 625 movq 8(%rsi,%r9,1),%rax
414 adcq $0,%rdx 626 adcq $0,%rdx
415 movq %rdx,%rdi 627 movq %rdx,%rdi
416 628
417 mulq %rbx 629 mulq %rbx
418 addq %rax,%r11 630 addq %rax,%r11
419 » movq» 16(%rcx),%rax 631 » movq» 8(%rcx),%rax
420 adcq $0,%rdx 632 adcq $0,%rdx
421 movq %rdx,%r10 633 movq %rdx,%r10
422 634
423 mulq %rbp 635 mulq %rbp
424 addq %rax,%rdi 636 addq %rax,%rdi
425 movq 16(%rsi,%r9,1),%rax 637 movq 16(%rsi,%r9,1),%rax
426 adcq $0,%rdx 638 adcq $0,%rdx
427 addq %r11,%rdi 639 addq %r11,%rdi
428 leaq 32(%r9),%r15 640 leaq 32(%r9),%r15
429 » leaq» 64(%rcx),%rcx 641 » leaq» 32(%rcx),%rcx
430 adcq $0,%rdx 642 adcq $0,%rdx
431 movq %rdi,(%r14) 643 movq %rdi,(%r14)
432 movq %rdx,%r13 644 movq %rdx,%r13
433 jmp L$1st4x 645 jmp L$1st4x
434 646
435 .p2align 5 647 .p2align 5
436 L$1st4x: 648 L$1st4x:
437 mulq %rbx 649 mulq %rbx
438 addq %rax,%r10 650 addq %rax,%r10
439 » movq» -32(%rcx),%rax 651 » movq» -16(%rcx),%rax
440 leaq 32(%r14),%r14 652 leaq 32(%r14),%r14
441 adcq $0,%rdx 653 adcq $0,%rdx
442 movq %rdx,%r11 654 movq %rdx,%r11
443 655
444 mulq %rbp 656 mulq %rbp
445 addq %rax,%r13 657 addq %rax,%r13
446 movq -8(%rsi,%r15,1),%rax 658 movq -8(%rsi,%r15,1),%rax
447 adcq $0,%rdx 659 adcq $0,%rdx
448 addq %r10,%r13 660 addq %r10,%r13
449 adcq $0,%rdx 661 adcq $0,%rdx
450 movq %r13,-24(%r14) 662 movq %r13,-24(%r14)
451 movq %rdx,%rdi 663 movq %rdx,%rdi
452 664
453 mulq %rbx 665 mulq %rbx
454 addq %rax,%r11 666 addq %rax,%r11
455 » movq» -16(%rcx),%rax 667 » movq» -8(%rcx),%rax
456 adcq $0,%rdx 668 adcq $0,%rdx
457 movq %rdx,%r10 669 movq %rdx,%r10
458 670
459 mulq %rbp 671 mulq %rbp
460 addq %rax,%rdi 672 addq %rax,%rdi
461 movq (%rsi,%r15,1),%rax 673 movq (%rsi,%r15,1),%rax
462 adcq $0,%rdx 674 adcq $0,%rdx
463 addq %r11,%rdi 675 addq %r11,%rdi
464 adcq $0,%rdx 676 adcq $0,%rdx
465 movq %rdi,-16(%r14) 677 movq %rdi,-16(%r14)
466 movq %rdx,%r13 678 movq %rdx,%r13
467 679
468 mulq %rbx 680 mulq %rbx
469 addq %rax,%r10 681 addq %rax,%r10
470 movq 0(%rcx),%rax 682 movq 0(%rcx),%rax
471 adcq $0,%rdx 683 adcq $0,%rdx
472 movq %rdx,%r11 684 movq %rdx,%r11
473 685
474 mulq %rbp 686 mulq %rbp
475 addq %rax,%r13 687 addq %rax,%r13
476 movq 8(%rsi,%r15,1),%rax 688 movq 8(%rsi,%r15,1),%rax
477 adcq $0,%rdx 689 adcq $0,%rdx
478 addq %r10,%r13 690 addq %r10,%r13
479 adcq $0,%rdx 691 adcq $0,%rdx
480 movq %r13,-8(%r14) 692 movq %r13,-8(%r14)
481 movq %rdx,%rdi 693 movq %rdx,%rdi
482 694
483 mulq %rbx 695 mulq %rbx
484 addq %rax,%r11 696 addq %rax,%r11
485 » movq» 16(%rcx),%rax 697 » movq» 8(%rcx),%rax
486 adcq $0,%rdx 698 adcq $0,%rdx
487 movq %rdx,%r10 699 movq %rdx,%r10
488 700
489 mulq %rbp 701 mulq %rbp
490 addq %rax,%rdi 702 addq %rax,%rdi
491 movq 16(%rsi,%r15,1),%rax 703 movq 16(%rsi,%r15,1),%rax
492 adcq $0,%rdx 704 adcq $0,%rdx
493 addq %r11,%rdi 705 addq %r11,%rdi
494 » leaq» 64(%rcx),%rcx 706 » leaq» 32(%rcx),%rcx
495 adcq $0,%rdx 707 adcq $0,%rdx
496 movq %rdi,(%r14) 708 movq %rdi,(%r14)
497 movq %rdx,%r13 709 movq %rdx,%r13
498 710
499 addq $32,%r15 711 addq $32,%r15
500 jnz L$1st4x 712 jnz L$1st4x
501 713
502 mulq %rbx 714 mulq %rbx
503 addq %rax,%r10 715 addq %rax,%r10
504 » movq» -32(%rcx),%rax 716 » movq» -16(%rcx),%rax
505 leaq 32(%r14),%r14 717 leaq 32(%r14),%r14
506 adcq $0,%rdx 718 adcq $0,%rdx
507 movq %rdx,%r11 719 movq %rdx,%r11
508 720
509 mulq %rbp 721 mulq %rbp
510 addq %rax,%r13 722 addq %rax,%r13
511 movq -8(%rsi),%rax 723 movq -8(%rsi),%rax
512 adcq $0,%rdx 724 adcq $0,%rdx
513 addq %r10,%r13 725 addq %r10,%r13
514 adcq $0,%rdx 726 adcq $0,%rdx
515 movq %r13,-24(%r14) 727 movq %r13,-24(%r14)
516 movq %rdx,%rdi 728 movq %rdx,%rdi
517 729
518 mulq %rbx 730 mulq %rbx
519 addq %rax,%r11 731 addq %rax,%r11
520 » movq» -16(%rcx),%rax 732 » movq» -8(%rcx),%rax
521 adcq $0,%rdx 733 adcq $0,%rdx
522 movq %rdx,%r10 734 movq %rdx,%r10
523 735
524 mulq %rbp 736 mulq %rbp
525 addq %rax,%rdi 737 addq %rax,%rdi
526 movq (%rsi,%r9,1),%rax 738 movq (%rsi,%r9,1),%rax
527 adcq $0,%rdx 739 adcq $0,%rdx
528 addq %r11,%rdi 740 addq %r11,%rdi
529 adcq $0,%rdx 741 adcq $0,%rdx
530 movq %rdi,-16(%r14) 742 movq %rdi,-16(%r14)
531 movq %rdx,%r13 743 movq %rdx,%r13
532 744
533 .byte» 102,72,15,126,195 745 » leaq» (%rcx,%r9,1),%rcx
534 » leaq» (%rcx,%r9,2),%rcx
535 746
536 xorq %rdi,%rdi 747 xorq %rdi,%rdi
537 addq %r10,%r13 748 addq %r10,%r13
538 adcq $0,%rdi 749 adcq $0,%rdi
539 movq %r13,-8(%r14) 750 movq %r13,-8(%r14)
540 751
541 jmp L$outer4x 752 jmp L$outer4x
542 753
543 .p2align 5 754 .p2align 5
544 L$outer4x: 755 L$outer4x:
756 leaq 16+128(%r14),%rdx
757 pxor %xmm4,%xmm4
758 pxor %xmm5,%xmm5
759 movdqa -128(%r12),%xmm0
760 movdqa -112(%r12),%xmm1
761 movdqa -96(%r12),%xmm2
762 movdqa -80(%r12),%xmm3
763 pand -128(%rdx),%xmm0
764 pand -112(%rdx),%xmm1
765 por %xmm0,%xmm4
766 pand -96(%rdx),%xmm2
767 por %xmm1,%xmm5
768 pand -80(%rdx),%xmm3
769 por %xmm2,%xmm4
770 por %xmm3,%xmm5
771 movdqa -64(%r12),%xmm0
772 movdqa -48(%r12),%xmm1
773 movdqa -32(%r12),%xmm2
774 movdqa -16(%r12),%xmm3
775 pand -64(%rdx),%xmm0
776 pand -48(%rdx),%xmm1
777 por %xmm0,%xmm4
778 pand -32(%rdx),%xmm2
779 por %xmm1,%xmm5
780 pand -16(%rdx),%xmm3
781 por %xmm2,%xmm4
782 por %xmm3,%xmm5
783 movdqa 0(%r12),%xmm0
784 movdqa 16(%r12),%xmm1
785 movdqa 32(%r12),%xmm2
786 movdqa 48(%r12),%xmm3
787 pand 0(%rdx),%xmm0
788 pand 16(%rdx),%xmm1
789 por %xmm0,%xmm4
790 pand 32(%rdx),%xmm2
791 por %xmm1,%xmm5
792 pand 48(%rdx),%xmm3
793 por %xmm2,%xmm4
794 por %xmm3,%xmm5
795 movdqa 64(%r12),%xmm0
796 movdqa 80(%r12),%xmm1
797 movdqa 96(%r12),%xmm2
798 movdqa 112(%r12),%xmm3
799 pand 64(%rdx),%xmm0
800 pand 80(%rdx),%xmm1
801 por %xmm0,%xmm4
802 pand 96(%rdx),%xmm2
803 por %xmm1,%xmm5
804 pand 112(%rdx),%xmm3
805 por %xmm2,%xmm4
806 por %xmm3,%xmm5
807 por %xmm5,%xmm4
808 pshufd $0x4e,%xmm4,%xmm0
809 por %xmm4,%xmm0
810 leaq 256(%r12),%r12
811 .byte 102,72,15,126,195
812
545 movq (%r14,%r9,1),%r10 813 movq (%r14,%r9,1),%r10
546 movq %r8,%rbp 814 movq %r8,%rbp
547 mulq %rbx 815 mulq %rbx
548 addq %rax,%r10 816 addq %rax,%r10
549 movq (%rcx),%rax 817 movq (%rcx),%rax
550 adcq $0,%rdx 818 adcq $0,%rdx
551 819
552 movq -96(%r12),%xmm0
553 movq -32(%r12),%xmm1
554 pand %xmm4,%xmm0
555 movq 32(%r12),%xmm2
556 pand %xmm5,%xmm1
557 movq 96(%r12),%xmm3
558
559 imulq %r10,%rbp 820 imulq %r10,%rbp
560 .byte 0x67
561 movq %rdx,%r11 821 movq %rdx,%r11
562 movq %rdi,(%r14) 822 movq %rdi,(%r14)
563 823
564 pand %xmm6,%xmm2
565 por %xmm1,%xmm0
566 pand %xmm7,%xmm3
567 por %xmm2,%xmm0
568 leaq (%r14,%r9,1),%r14 824 leaq (%r14,%r9,1),%r14
569 leaq 256(%r12),%r12
570 por %xmm3,%xmm0
571 825
572 mulq %rbp 826 mulq %rbp
573 addq %rax,%r10 827 addq %rax,%r10
574 movq 8(%rsi,%r9,1),%rax 828 movq 8(%rsi,%r9,1),%rax
575 adcq $0,%rdx 829 adcq $0,%rdx
576 movq %rdx,%rdi 830 movq %rdx,%rdi
577 831
578 mulq %rbx 832 mulq %rbx
579 addq %rax,%r11 833 addq %rax,%r11
580 » movq» 16(%rcx),%rax 834 » movq» 8(%rcx),%rax
581 adcq $0,%rdx 835 adcq $0,%rdx
582 addq 8(%r14),%r11 836 addq 8(%r14),%r11
583 adcq $0,%rdx 837 adcq $0,%rdx
584 movq %rdx,%r10 838 movq %rdx,%r10
585 839
586 mulq %rbp 840 mulq %rbp
587 addq %rax,%rdi 841 addq %rax,%rdi
588 movq 16(%rsi,%r9,1),%rax 842 movq 16(%rsi,%r9,1),%rax
589 adcq $0,%rdx 843 adcq $0,%rdx
590 addq %r11,%rdi 844 addq %r11,%rdi
591 leaq 32(%r9),%r15 845 leaq 32(%r9),%r15
592 » leaq» 64(%rcx),%rcx 846 » leaq» 32(%rcx),%rcx
593 adcq $0,%rdx 847 adcq $0,%rdx
594 movq %rdx,%r13 848 movq %rdx,%r13
595 jmp L$inner4x 849 jmp L$inner4x
596 850
597 .p2align 5 851 .p2align 5
598 L$inner4x: 852 L$inner4x:
599 mulq %rbx 853 mulq %rbx
600 addq %rax,%r10 854 addq %rax,%r10
601 » movq» -32(%rcx),%rax 855 » movq» -16(%rcx),%rax
602 adcq $0,%rdx 856 adcq $0,%rdx
603 addq 16(%r14),%r10 857 addq 16(%r14),%r10
604 leaq 32(%r14),%r14 858 leaq 32(%r14),%r14
605 adcq $0,%rdx 859 adcq $0,%rdx
606 movq %rdx,%r11 860 movq %rdx,%r11
607 861
608 mulq %rbp 862 mulq %rbp
609 addq %rax,%r13 863 addq %rax,%r13
610 movq -8(%rsi,%r15,1),%rax 864 movq -8(%rsi,%r15,1),%rax
611 adcq $0,%rdx 865 adcq $0,%rdx
612 addq %r10,%r13 866 addq %r10,%r13
613 adcq $0,%rdx 867 adcq $0,%rdx
614 movq %rdi,-32(%r14) 868 movq %rdi,-32(%r14)
615 movq %rdx,%rdi 869 movq %rdx,%rdi
616 870
617 mulq %rbx 871 mulq %rbx
618 addq %rax,%r11 872 addq %rax,%r11
619 » movq» -16(%rcx),%rax 873 » movq» -8(%rcx),%rax
620 adcq $0,%rdx 874 adcq $0,%rdx
621 addq -8(%r14),%r11 875 addq -8(%r14),%r11
622 adcq $0,%rdx 876 adcq $0,%rdx
623 movq %rdx,%r10 877 movq %rdx,%r10
624 878
625 mulq %rbp 879 mulq %rbp
626 addq %rax,%rdi 880 addq %rax,%rdi
627 movq (%rsi,%r15,1),%rax 881 movq (%rsi,%r15,1),%rax
628 adcq $0,%rdx 882 adcq $0,%rdx
629 addq %r11,%rdi 883 addq %r11,%rdi
(...skipping 13 matching lines...) Expand all
643 addq %rax,%r13 897 addq %rax,%r13
644 movq 8(%rsi,%r15,1),%rax 898 movq 8(%rsi,%r15,1),%rax
645 adcq $0,%rdx 899 adcq $0,%rdx
646 addq %r10,%r13 900 addq %r10,%r13
647 adcq $0,%rdx 901 adcq $0,%rdx
648 movq %rdi,-16(%r14) 902 movq %rdi,-16(%r14)
649 movq %rdx,%rdi 903 movq %rdx,%rdi
650 904
651 mulq %rbx 905 mulq %rbx
652 addq %rax,%r11 906 addq %rax,%r11
653 » movq» 16(%rcx),%rax 907 » movq» 8(%rcx),%rax
654 adcq $0,%rdx 908 adcq $0,%rdx
655 addq 8(%r14),%r11 909 addq 8(%r14),%r11
656 adcq $0,%rdx 910 adcq $0,%rdx
657 movq %rdx,%r10 911 movq %rdx,%r10
658 912
659 mulq %rbp 913 mulq %rbp
660 addq %rax,%rdi 914 addq %rax,%rdi
661 movq 16(%rsi,%r15,1),%rax 915 movq 16(%rsi,%r15,1),%rax
662 adcq $0,%rdx 916 adcq $0,%rdx
663 addq %r11,%rdi 917 addq %r11,%rdi
664 » leaq» 64(%rcx),%rcx 918 » leaq» 32(%rcx),%rcx
665 adcq $0,%rdx 919 adcq $0,%rdx
666 movq %r13,-8(%r14) 920 movq %r13,-8(%r14)
667 movq %rdx,%r13 921 movq %rdx,%r13
668 922
669 addq $32,%r15 923 addq $32,%r15
670 jnz L$inner4x 924 jnz L$inner4x
671 925
672 mulq %rbx 926 mulq %rbx
673 addq %rax,%r10 927 addq %rax,%r10
674 » movq» -32(%rcx),%rax 928 » movq» -16(%rcx),%rax
675 adcq $0,%rdx 929 adcq $0,%rdx
676 addq 16(%r14),%r10 930 addq 16(%r14),%r10
677 leaq 32(%r14),%r14 931 leaq 32(%r14),%r14
678 adcq $0,%rdx 932 adcq $0,%rdx
679 movq %rdx,%r11 933 movq %rdx,%r11
680 934
681 mulq %rbp 935 mulq %rbp
682 addq %rax,%r13 936 addq %rax,%r13
683 movq -8(%rsi),%rax 937 movq -8(%rsi),%rax
684 adcq $0,%rdx 938 adcq $0,%rdx
685 addq %r10,%r13 939 addq %r10,%r13
686 adcq $0,%rdx 940 adcq $0,%rdx
687 movq %rdi,-32(%r14) 941 movq %rdi,-32(%r14)
688 movq %rdx,%rdi 942 movq %rdx,%rdi
689 943
690 mulq %rbx 944 mulq %rbx
691 addq %rax,%r11 945 addq %rax,%r11
692 movq %rbp,%rax 946 movq %rbp,%rax
693 » movq» -16(%rcx),%rbp 947 » movq» -8(%rcx),%rbp
694 adcq $0,%rdx 948 adcq $0,%rdx
695 addq -8(%r14),%r11 949 addq -8(%r14),%r11
696 adcq $0,%rdx 950 adcq $0,%rdx
697 movq %rdx,%r10 951 movq %rdx,%r10
698 952
699 mulq %rbp 953 mulq %rbp
700 addq %rax,%rdi 954 addq %rax,%rdi
701 movq (%rsi,%r9,1),%rax 955 movq (%rsi,%r9,1),%rax
702 adcq $0,%rdx 956 adcq $0,%rdx
703 addq %r11,%rdi 957 addq %r11,%rdi
704 adcq $0,%rdx 958 adcq $0,%rdx
705 movq %r13,-24(%r14) 959 movq %r13,-24(%r14)
706 movq %rdx,%r13 960 movq %rdx,%r13
707 961
708 .byte 102,72,15,126,195
709 movq %rdi,-16(%r14) 962 movq %rdi,-16(%r14)
710 » leaq» (%rcx,%r9,2),%rcx 963 » leaq» (%rcx,%r9,1),%rcx
711 964
712 xorq %rdi,%rdi 965 xorq %rdi,%rdi
713 addq %r10,%r13 966 addq %r10,%r13
714 adcq $0,%rdi 967 adcq $0,%rdi
715 addq (%r14),%r13 968 addq (%r14),%r13
716 adcq $0,%rdi 969 adcq $0,%rdi
717 movq %r13,-8(%r14) 970 movq %r13,-8(%r14)
718 971
719 cmpq 16+8(%rsp),%r12 972 cmpq 16+8(%rsp),%r12
720 jb L$outer4x 973 jb L$outer4x
974 xorq %rax,%rax
721 subq %r13,%rbp 975 subq %r13,%rbp
722 adcq %r15,%r15 976 adcq %r15,%r15
723 orq %r15,%rdi 977 orq %r15,%rdi
724 » xorq» $1,%rdi 978 » subq» %rdi,%rax
725 leaq (%r14,%r9,1),%rbx 979 leaq (%r14,%r9,1),%rbx
726 » leaq» (%rcx,%rdi,8),%rbp 980 » movq» (%rcx),%r12
981 » leaq» (%rcx),%rbp
727 movq %r9,%rcx 982 movq %r9,%rcx
728 sarq $3+2,%rcx 983 sarq $3+2,%rcx
729 movq 56+8(%rsp),%rdi 984 movq 56+8(%rsp),%rdi
730 » jmp» L$sqr4x_sub 985 » decq» %r12
986 » xorq» %r10,%r10
987 » movq» 8(%rbp),%r13
988 » movq» 16(%rbp),%r14
989 » movq» 24(%rbp),%r15
990 » jmp» L$sqr4x_sub_entry
731 991
732 .globl _bn_power5 992 .globl _bn_power5
733 .private_extern _bn_power5 993 .private_extern _bn_power5
734 994
735 .p2align 5 995 .p2align 5
736 _bn_power5: 996 _bn_power5:
737 movq %rsp,%rax 997 movq %rsp,%rax
738 pushq %rbx 998 pushq %rbx
739 pushq %rbp 999 pushq %rbp
740 pushq %r12 1000 pushq %r12
741 pushq %r13 1001 pushq %r13
742 pushq %r14 1002 pushq %r14
743 pushq %r15 1003 pushq %r15
744 » movl» %r9d,%r10d 1004
745 shll $3,%r9d 1005 shll $3,%r9d
746 » shll» $3+2,%r10d 1006 » leal» (%r9,%r9,2),%r10d
747 negq %r9 1007 negq %r9
748 movq (%r8),%r8 1008 movq (%r8),%r8
749 1009
750 1010
751 1011
752 1012
753 1013
754 1014
755 1015
756 » leaq» -64(%rsp,%r9,2),%r11 1016
757 » subq» %rsi,%r11 1017 » leaq» -320(%rsp,%r9,2),%r11
1018 » subq» %rdi,%r11
758 andq $4095,%r11 1019 andq $4095,%r11
759 cmpq %r11,%r10 1020 cmpq %r11,%r10
760 jb L$pwr_sp_alt 1021 jb L$pwr_sp_alt
761 subq %r11,%rsp 1022 subq %r11,%rsp
762 » leaq» -64(%rsp,%r9,2),%rsp 1023 » leaq» -320(%rsp,%r9,2),%rsp
763 jmp L$pwr_sp_done 1024 jmp L$pwr_sp_done
764 1025
765 .p2align 5 1026 .p2align 5
766 L$pwr_sp_alt: 1027 L$pwr_sp_alt:
767 » leaq» 4096-64(,%r9,2),%r10 1028 » leaq» 4096-320(,%r9,2),%r10
768 » leaq» -64(%rsp,%r9,2),%rsp 1029 » leaq» -320(%rsp,%r9,2),%rsp
769 subq %r10,%r11 1030 subq %r10,%r11
770 movq $0,%r10 1031 movq $0,%r10
771 cmovcq %r10,%r11 1032 cmovcq %r10,%r11
772 subq %r11,%rsp 1033 subq %r11,%rsp
773 L$pwr_sp_done: 1034 L$pwr_sp_done:
774 andq $-64,%rsp 1035 andq $-64,%rsp
775 movq %r9,%r10 1036 movq %r9,%r10
776 negq %r9 1037 negq %r9
777 1038
778 1039
779 1040
780 1041
781 1042
782 1043
783 1044
784 1045
785 1046
786 1047
787 movq %r8,32(%rsp) 1048 movq %r8,32(%rsp)
788 movq %rax,40(%rsp) 1049 movq %rax,40(%rsp)
789 L$power5_body: 1050 L$power5_body:
790 .byte 102,72,15,110,207 1051 .byte 102,72,15,110,207
791 .byte 102,72,15,110,209 1052 .byte 102,72,15,110,209
792 .byte 102,73,15,110,218 1053 .byte 102,73,15,110,218
793 .byte 102,72,15,110,226 1054 .byte 102,72,15,110,226
794 1055
795 call __bn_sqr8x_internal 1056 call __bn_sqr8x_internal
1057 call __bn_post4x_internal
796 call __bn_sqr8x_internal 1058 call __bn_sqr8x_internal
1059 call __bn_post4x_internal
797 call __bn_sqr8x_internal 1060 call __bn_sqr8x_internal
1061 call __bn_post4x_internal
798 call __bn_sqr8x_internal 1062 call __bn_sqr8x_internal
1063 call __bn_post4x_internal
799 call __bn_sqr8x_internal 1064 call __bn_sqr8x_internal
1065 call __bn_post4x_internal
800 1066
801 .byte 102,72,15,126,209 1067 .byte 102,72,15,126,209
802 .byte 102,72,15,126,226 1068 .byte 102,72,15,126,226
803 movq %rsi,%rdi 1069 movq %rsi,%rdi
804 movq 40(%rsp),%rax 1070 movq 40(%rsp),%rax
805 leaq 32(%rsp),%r8 1071 leaq 32(%rsp),%r8
806 1072
807 call mul4x_internal 1073 call mul4x_internal
808 1074
809 movq 40(%rsp),%rsi 1075 movq 40(%rsp),%rsi
(...skipping 524 matching lines...) Expand 10 before | Expand all | Expand 10 after
1334 leaq (%rcx,%r11,2),%r8 1600 leaq (%rcx,%r11,2),%r8
1335 shrq $63,%r11 1601 shrq $63,%r11
1336 orq %r10,%r8 1602 orq %r10,%r8
1337 mulq %rax 1603 mulq %rax
1338 negq %r15 1604 negq %r15
1339 adcq %rax,%rbx 1605 adcq %rax,%rbx
1340 adcq %rdx,%r8 1606 adcq %rdx,%r8
1341 movq %rbx,-16(%rdi) 1607 movq %rbx,-16(%rdi)
1342 movq %r8,-8(%rdi) 1608 movq %r8,-8(%rdi)
1343 .byte 102,72,15,126,213 1609 .byte 102,72,15,126,213
1344 sqr8x_reduction: 1610 __bn_sqr8x_reduction:
1345 xorq %rax,%rax 1611 xorq %rax,%rax
1346 » leaq» (%rbp,%r9,2),%rcx 1612 » leaq» (%r9,%rbp,1),%rcx
1347 leaq 48+8(%rsp,%r9,2),%rdx 1613 leaq 48+8(%rsp,%r9,2),%rdx
1348 movq %rcx,0+8(%rsp) 1614 movq %rcx,0+8(%rsp)
1349 leaq 48+8(%rsp,%r9,1),%rdi 1615 leaq 48+8(%rsp,%r9,1),%rdi
1350 movq %rdx,8+8(%rsp) 1616 movq %rdx,8+8(%rsp)
1351 negq %r9 1617 negq %r9
1352 jmp L$8x_reduction_loop 1618 jmp L$8x_reduction_loop
1353 1619
1354 .p2align 5 1620 .p2align 5
1355 L$8x_reduction_loop: 1621 L$8x_reduction_loop:
1356 leaq (%rdi,%r9,1),%rdi 1622 leaq (%rdi,%r9,1),%rdi
(...skipping 12 matching lines...) Expand all
1369 .byte 0x67 1635 .byte 0x67
1370 movq %rbx,%r8 1636 movq %rbx,%r8
1371 imulq 32+8(%rsp),%rbx 1637 imulq 32+8(%rsp),%rbx
1372 movq 0(%rbp),%rax 1638 movq 0(%rbp),%rax
1373 movl $8,%ecx 1639 movl $8,%ecx
1374 jmp L$8x_reduce 1640 jmp L$8x_reduce
1375 1641
1376 .p2align 5 1642 .p2align 5
1377 L$8x_reduce: 1643 L$8x_reduce:
1378 mulq %rbx 1644 mulq %rbx
1379 » movq» 16(%rbp),%rax 1645 » movq» 8(%rbp),%rax
1380 negq %r8 1646 negq %r8
1381 movq %rdx,%r8 1647 movq %rdx,%r8
1382 adcq $0,%r8 1648 adcq $0,%r8
1383 1649
1384 mulq %rbx 1650 mulq %rbx
1385 addq %rax,%r9 1651 addq %rax,%r9
1386 » movq» 32(%rbp),%rax 1652 » movq» 16(%rbp),%rax
1387 adcq $0,%rdx 1653 adcq $0,%rdx
1388 addq %r9,%r8 1654 addq %r9,%r8
1389 movq %rbx,48-8+8(%rsp,%rcx,8) 1655 movq %rbx,48-8+8(%rsp,%rcx,8)
1390 movq %rdx,%r9 1656 movq %rdx,%r9
1391 adcq $0,%r9 1657 adcq $0,%r9
1392 1658
1393 mulq %rbx 1659 mulq %rbx
1394 addq %rax,%r10 1660 addq %rax,%r10
1395 » movq» 48(%rbp),%rax 1661 » movq» 24(%rbp),%rax
1396 adcq $0,%rdx 1662 adcq $0,%rdx
1397 addq %r10,%r9 1663 addq %r10,%r9
1398 movq 32+8(%rsp),%rsi 1664 movq 32+8(%rsp),%rsi
1399 movq %rdx,%r10 1665 movq %rdx,%r10
1400 adcq $0,%r10 1666 adcq $0,%r10
1401 1667
1402 mulq %rbx 1668 mulq %rbx
1403 addq %rax,%r11 1669 addq %rax,%r11
1404 » movq» 64(%rbp),%rax 1670 » movq» 32(%rbp),%rax
1405 adcq $0,%rdx 1671 adcq $0,%rdx
1406 imulq %r8,%rsi 1672 imulq %r8,%rsi
1407 addq %r11,%r10 1673 addq %r11,%r10
1408 movq %rdx,%r11 1674 movq %rdx,%r11
1409 adcq $0,%r11 1675 adcq $0,%r11
1410 1676
1411 mulq %rbx 1677 mulq %rbx
1412 addq %rax,%r12 1678 addq %rax,%r12
1413 » movq» 80(%rbp),%rax 1679 » movq» 40(%rbp),%rax
1414 adcq $0,%rdx 1680 adcq $0,%rdx
1415 addq %r12,%r11 1681 addq %r12,%r11
1416 movq %rdx,%r12 1682 movq %rdx,%r12
1417 adcq $0,%r12 1683 adcq $0,%r12
1418 1684
1419 mulq %rbx 1685 mulq %rbx
1420 addq %rax,%r13 1686 addq %rax,%r13
1421 » movq» 96(%rbp),%rax 1687 » movq» 48(%rbp),%rax
1422 adcq $0,%rdx 1688 adcq $0,%rdx
1423 addq %r13,%r12 1689 addq %r13,%r12
1424 movq %rdx,%r13 1690 movq %rdx,%r13
1425 adcq $0,%r13 1691 adcq $0,%r13
1426 1692
1427 mulq %rbx 1693 mulq %rbx
1428 addq %rax,%r14 1694 addq %rax,%r14
1429 » movq» 112(%rbp),%rax 1695 » movq» 56(%rbp),%rax
1430 adcq $0,%rdx 1696 adcq $0,%rdx
1431 addq %r14,%r13 1697 addq %r14,%r13
1432 movq %rdx,%r14 1698 movq %rdx,%r14
1433 adcq $0,%r14 1699 adcq $0,%r14
1434 1700
1435 mulq %rbx 1701 mulq %rbx
1436 movq %rsi,%rbx 1702 movq %rsi,%rbx
1437 addq %rax,%r15 1703 addq %rax,%r15
1438 movq 0(%rbp),%rax 1704 movq 0(%rbp),%rax
1439 adcq $0,%rdx 1705 adcq $0,%rdx
1440 addq %r15,%r14 1706 addq %r15,%r14
1441 movq %rdx,%r15 1707 movq %rdx,%r15
1442 adcq $0,%r15 1708 adcq $0,%r15
1443 1709
1444 decl %ecx 1710 decl %ecx
1445 jnz L$8x_reduce 1711 jnz L$8x_reduce
1446 1712
1447 » leaq» 128(%rbp),%rbp 1713 » leaq» 64(%rbp),%rbp
1448 xorq %rax,%rax 1714 xorq %rax,%rax
1449 movq 8+8(%rsp),%rdx 1715 movq 8+8(%rsp),%rdx
1450 cmpq 0+8(%rsp),%rbp 1716 cmpq 0+8(%rsp),%rbp
1451 jae L$8x_no_tail 1717 jae L$8x_no_tail
1452 1718
1453 .byte 0x66 1719 .byte 0x66
1454 addq 0(%rdi),%r8 1720 addq 0(%rdi),%r8
1455 adcq 8(%rdi),%r9 1721 adcq 8(%rdi),%r9
1456 adcq 16(%rdi),%r10 1722 adcq 16(%rdi),%r10
1457 adcq 24(%rdi),%r11 1723 adcq 24(%rdi),%r11
1458 adcq 32(%rdi),%r12 1724 adcq 32(%rdi),%r12
1459 adcq 40(%rdi),%r13 1725 adcq 40(%rdi),%r13
1460 adcq 48(%rdi),%r14 1726 adcq 48(%rdi),%r14
1461 adcq 56(%rdi),%r15 1727 adcq 56(%rdi),%r15
1462 sbbq %rsi,%rsi 1728 sbbq %rsi,%rsi
1463 1729
1464 movq 48+56+8(%rsp),%rbx 1730 movq 48+56+8(%rsp),%rbx
1465 movl $8,%ecx 1731 movl $8,%ecx
1466 movq 0(%rbp),%rax 1732 movq 0(%rbp),%rax
1467 jmp L$8x_tail 1733 jmp L$8x_tail
1468 1734
1469 .p2align 5 1735 .p2align 5
1470 L$8x_tail: 1736 L$8x_tail:
1471 mulq %rbx 1737 mulq %rbx
1472 addq %rax,%r8 1738 addq %rax,%r8
1473 » movq» 16(%rbp),%rax 1739 » movq» 8(%rbp),%rax
1474 movq %r8,(%rdi) 1740 movq %r8,(%rdi)
1475 movq %rdx,%r8 1741 movq %rdx,%r8
1476 adcq $0,%r8 1742 adcq $0,%r8
1477 1743
1478 mulq %rbx 1744 mulq %rbx
1479 addq %rax,%r9 1745 addq %rax,%r9
1480 » movq» 32(%rbp),%rax 1746 » movq» 16(%rbp),%rax
1481 adcq $0,%rdx 1747 adcq $0,%rdx
1482 addq %r9,%r8 1748 addq %r9,%r8
1483 leaq 8(%rdi),%rdi 1749 leaq 8(%rdi),%rdi
1484 movq %rdx,%r9 1750 movq %rdx,%r9
1485 adcq $0,%r9 1751 adcq $0,%r9
1486 1752
1487 mulq %rbx 1753 mulq %rbx
1488 addq %rax,%r10 1754 addq %rax,%r10
1489 » movq» 48(%rbp),%rax 1755 » movq» 24(%rbp),%rax
1490 adcq $0,%rdx 1756 adcq $0,%rdx
1491 addq %r10,%r9 1757 addq %r10,%r9
1492 movq %rdx,%r10 1758 movq %rdx,%r10
1493 adcq $0,%r10 1759 adcq $0,%r10
1494 1760
1495 mulq %rbx 1761 mulq %rbx
1496 addq %rax,%r11 1762 addq %rax,%r11
1497 » movq» 64(%rbp),%rax 1763 » movq» 32(%rbp),%rax
1498 adcq $0,%rdx 1764 adcq $0,%rdx
1499 addq %r11,%r10 1765 addq %r11,%r10
1500 movq %rdx,%r11 1766 movq %rdx,%r11
1501 adcq $0,%r11 1767 adcq $0,%r11
1502 1768
1503 mulq %rbx 1769 mulq %rbx
1504 addq %rax,%r12 1770 addq %rax,%r12
1505 » movq» 80(%rbp),%rax 1771 » movq» 40(%rbp),%rax
1506 adcq $0,%rdx 1772 adcq $0,%rdx
1507 addq %r12,%r11 1773 addq %r12,%r11
1508 movq %rdx,%r12 1774 movq %rdx,%r12
1509 adcq $0,%r12 1775 adcq $0,%r12
1510 1776
1511 mulq %rbx 1777 mulq %rbx
1512 addq %rax,%r13 1778 addq %rax,%r13
1513 » movq» 96(%rbp),%rax 1779 » movq» 48(%rbp),%rax
1514 adcq $0,%rdx 1780 adcq $0,%rdx
1515 addq %r13,%r12 1781 addq %r13,%r12
1516 movq %rdx,%r13 1782 movq %rdx,%r13
1517 adcq $0,%r13 1783 adcq $0,%r13
1518 1784
1519 mulq %rbx 1785 mulq %rbx
1520 addq %rax,%r14 1786 addq %rax,%r14
1521 » movq» 112(%rbp),%rax 1787 » movq» 56(%rbp),%rax
1522 adcq $0,%rdx 1788 adcq $0,%rdx
1523 addq %r14,%r13 1789 addq %r14,%r13
1524 movq %rdx,%r14 1790 movq %rdx,%r14
1525 adcq $0,%r14 1791 adcq $0,%r14
1526 1792
1527 mulq %rbx 1793 mulq %rbx
1528 movq 48-16+8(%rsp,%rcx,8),%rbx 1794 movq 48-16+8(%rsp,%rcx,8),%rbx
1529 addq %rax,%r15 1795 addq %rax,%r15
1530 adcq $0,%rdx 1796 adcq $0,%rdx
1531 addq %r15,%r14 1797 addq %r15,%r14
1532 movq 0(%rbp),%rax 1798 movq 0(%rbp),%rax
1533 movq %rdx,%r15 1799 movq %rdx,%r15
1534 adcq $0,%r15 1800 adcq $0,%r15
1535 1801
1536 decl %ecx 1802 decl %ecx
1537 jnz L$8x_tail 1803 jnz L$8x_tail
1538 1804
1539 » leaq» 128(%rbp),%rbp 1805 » leaq» 64(%rbp),%rbp
1540 movq 8+8(%rsp),%rdx 1806 movq 8+8(%rsp),%rdx
1541 cmpq 0+8(%rsp),%rbp 1807 cmpq 0+8(%rsp),%rbp
1542 jae L$8x_tail_done 1808 jae L$8x_tail_done
1543 1809
1544 movq 48+56+8(%rsp),%rbx 1810 movq 48+56+8(%rsp),%rbx
1545 negq %rsi 1811 negq %rsi
1546 movq 0(%rbp),%rax 1812 movq 0(%rbp),%rax
1547 adcq 0(%rdi),%r8 1813 adcq 0(%rdi),%r8
1548 adcq 8(%rdi),%r9 1814 adcq 8(%rdi),%r9
1549 adcq 16(%rdi),%r10 1815 adcq 16(%rdi),%r10
1550 adcq 24(%rdi),%r11 1816 adcq 24(%rdi),%r11
1551 adcq 32(%rdi),%r12 1817 adcq 32(%rdi),%r12
1552 adcq 40(%rdi),%r13 1818 adcq 40(%rdi),%r13
1553 adcq 48(%rdi),%r14 1819 adcq 48(%rdi),%r14
1554 adcq 56(%rdi),%r15 1820 adcq 56(%rdi),%r15
1555 sbbq %rsi,%rsi 1821 sbbq %rsi,%rsi
1556 1822
1557 movl $8,%ecx 1823 movl $8,%ecx
1558 jmp L$8x_tail 1824 jmp L$8x_tail
1559 1825
1560 .p2align 5 1826 .p2align 5
1561 L$8x_tail_done: 1827 L$8x_tail_done:
1562 addq (%rdx),%r8 1828 addq (%rdx),%r8
1829 adcq $0,%r9
1830 adcq $0,%r10
1831 adcq $0,%r11
1832 adcq $0,%r12
1833 adcq $0,%r13
1834 adcq $0,%r14
1835 adcq $0,%r15
1836
1837
1563 xorq %rax,%rax 1838 xorq %rax,%rax
1564 1839
1565 negq %rsi 1840 negq %rsi
1566 L$8x_no_tail: 1841 L$8x_no_tail:
1567 adcq 0(%rdi),%r8 1842 adcq 0(%rdi),%r8
1568 adcq 8(%rdi),%r9 1843 adcq 8(%rdi),%r9
1569 adcq 16(%rdi),%r10 1844 adcq 16(%rdi),%r10
1570 adcq 24(%rdi),%r11 1845 adcq 24(%rdi),%r11
1571 adcq 32(%rdi),%r12 1846 adcq 32(%rdi),%r12
1572 adcq 40(%rdi),%r13 1847 adcq 40(%rdi),%r13
1573 adcq 48(%rdi),%r14 1848 adcq 48(%rdi),%r14
1574 adcq 56(%rdi),%r15 1849 adcq 56(%rdi),%r15
1575 adcq $0,%rax 1850 adcq $0,%rax
1576 » movq» -16(%rbp),%rcx 1851 » movq» -8(%rbp),%rcx
1577 xorq %rsi,%rsi 1852 xorq %rsi,%rsi
1578 1853
1579 .byte 102,72,15,126,213 1854 .byte 102,72,15,126,213
1580 1855
1581 movq %r8,0(%rdi) 1856 movq %r8,0(%rdi)
1582 movq %r9,8(%rdi) 1857 movq %r9,8(%rdi)
1583 .byte 102,73,15,126,217 1858 .byte 102,73,15,126,217
1584 movq %r10,16(%rdi) 1859 movq %r10,16(%rdi)
1585 movq %r11,24(%rdi) 1860 movq %r11,24(%rdi)
1586 movq %r12,32(%rdi) 1861 movq %r12,32(%rdi)
1587 movq %r13,40(%rdi) 1862 movq %r13,40(%rdi)
1588 movq %r14,48(%rdi) 1863 movq %r14,48(%rdi)
1589 movq %r15,56(%rdi) 1864 movq %r15,56(%rdi)
1590 leaq 64(%rdi),%rdi 1865 leaq 64(%rdi),%rdi
1591 1866
1592 cmpq %rdx,%rdi 1867 cmpq %rdx,%rdi
1593 jb L$8x_reduction_loop 1868 jb L$8x_reduction_loop
1869 .byte 0xf3,0xc3
1594 1870
1595 subq %r15,%rcx
1596 leaq (%rdi,%r9,1),%rbx
1597 adcq %rsi,%rsi
1598 movq %r9,%rcx
1599 orq %rsi,%rax
1600 .byte 102,72,15,126,207
1601 xorq $1,%rax
1602 .byte 102,72,15,126,206
1603 leaq (%rbp,%rax,8),%rbp
1604 sarq $3+2,%rcx
1605 jmp L$sqr4x_sub
1606 1871
1607 .p2align 5 1872 .p2align 5
1873 __bn_post4x_internal:
1874 movq 0(%rbp),%r12
1875 leaq (%rdi,%r9,1),%rbx
1876 movq %r9,%rcx
1877 .byte 102,72,15,126,207
1878 negq %rax
1879 .byte 102,72,15,126,206
1880 sarq $3+2,%rcx
1881 decq %r12
1882 xorq %r10,%r10
1883 movq 8(%rbp),%r13
1884 movq 16(%rbp),%r14
1885 movq 24(%rbp),%r15
1886 jmp L$sqr4x_sub_entry
1887
1888 .p2align 4
1608 L$sqr4x_sub: 1889 L$sqr4x_sub:
1609 .byte» 0x66 1890 » movq» 0(%rbp),%r12
1610 » movq» 0(%rbx),%r12 1891 » movq» 8(%rbp),%r13
1611 » movq» 8(%rbx),%r13 1892 » movq» 16(%rbp),%r14
1612 » sbbq» 0(%rbp),%r12 1893 » movq» 24(%rbp),%r15
1613 » movq» 16(%rbx),%r14 1894 L$sqr4x_sub_entry:
1614 » sbbq» 16(%rbp),%r13 1895 » leaq» 32(%rbp),%rbp
1615 » movq» 24(%rbx),%r15 1896 » notq» %r12
1897 » notq» %r13
1898 » notq» %r14
1899 » notq» %r15
1900 » andq» %rax,%r12
1901 » andq» %rax,%r13
1902 » andq» %rax,%r14
1903 » andq» %rax,%r15
1904
1905 » negq» %r10
1906 » adcq» 0(%rbx),%r12
1907 » adcq» 8(%rbx),%r13
1908 » adcq» 16(%rbx),%r14
1909 » adcq» 24(%rbx),%r15
1910 » movq» %r12,0(%rdi)
1616 leaq 32(%rbx),%rbx 1911 leaq 32(%rbx),%rbx
1617 sbbq 32(%rbp),%r14
1618 movq %r12,0(%rdi)
1619 sbbq 48(%rbp),%r15
1620 leaq 64(%rbp),%rbp
1621 movq %r13,8(%rdi) 1912 movq %r13,8(%rdi)
1913 sbbq %r10,%r10
1622 movq %r14,16(%rdi) 1914 movq %r14,16(%rdi)
1623 movq %r15,24(%rdi) 1915 movq %r15,24(%rdi)
1624 leaq 32(%rdi),%rdi 1916 leaq 32(%rdi),%rdi
1625 1917
1626 incq %rcx 1918 incq %rcx
1627 jnz L$sqr4x_sub 1919 jnz L$sqr4x_sub
1920
1628 movq %r9,%r10 1921 movq %r9,%r10
1629 negq %r9 1922 negq %r9
1630 .byte 0xf3,0xc3 1923 .byte 0xf3,0xc3
1631 1924
1632 .globl _bn_from_montgomery 1925 .globl _bn_from_montgomery
1633 .private_extern _bn_from_montgomery 1926 .private_extern _bn_from_montgomery
1634 1927
1635 .p2align 5 1928 .p2align 5
1636 _bn_from_montgomery: 1929 _bn_from_montgomery:
1637 testl $7,%r9d 1930 testl $7,%r9d
1638 jz bn_from_mont8x 1931 jz bn_from_mont8x
1639 xorl %eax,%eax 1932 xorl %eax,%eax
1640 .byte 0xf3,0xc3 1933 .byte 0xf3,0xc3
1641 1934
1642 1935
1643 1936
1644 .p2align 5 1937 .p2align 5
1645 bn_from_mont8x: 1938 bn_from_mont8x:
1646 .byte 0x67 1939 .byte 0x67
1647 movq %rsp,%rax 1940 movq %rsp,%rax
1648 pushq %rbx 1941 pushq %rbx
1649 pushq %rbp 1942 pushq %rbp
1650 pushq %r12 1943 pushq %r12
1651 pushq %r13 1944 pushq %r13
1652 pushq %r14 1945 pushq %r14
1653 pushq %r15 1946 pushq %r15
1654 .byte» 0x67 1947
1655 » movl» %r9d,%r10d
1656 shll $3,%r9d 1948 shll $3,%r9d
1657 » shll» $3+2,%r10d 1949 » leaq» (%r9,%r9,2),%r10
1658 negq %r9 1950 negq %r9
1659 movq (%r8),%r8 1951 movq (%r8),%r8
1660 1952
1661 1953
1662 1954
1663 1955
1664 1956
1665 1957
1666 1958
1667 » leaq» -64(%rsp,%r9,2),%r11 1959
1668 » subq» %rsi,%r11 1960 » leaq» -320(%rsp,%r9,2),%r11
1961 » subq» %rdi,%r11
1669 andq $4095,%r11 1962 andq $4095,%r11
1670 cmpq %r11,%r10 1963 cmpq %r11,%r10
1671 jb L$from_sp_alt 1964 jb L$from_sp_alt
1672 subq %r11,%rsp 1965 subq %r11,%rsp
1673 » leaq» -64(%rsp,%r9,2),%rsp 1966 » leaq» -320(%rsp,%r9,2),%rsp
1674 jmp L$from_sp_done 1967 jmp L$from_sp_done
1675 1968
1676 .p2align 5 1969 .p2align 5
1677 L$from_sp_alt: 1970 L$from_sp_alt:
1678 » leaq» 4096-64(,%r9,2),%r10 1971 » leaq» 4096-320(,%r9,2),%r10
1679 » leaq» -64(%rsp,%r9,2),%rsp 1972 » leaq» -320(%rsp,%r9,2),%rsp
1680 subq %r10,%r11 1973 subq %r10,%r11
1681 movq $0,%r10 1974 movq $0,%r10
1682 cmovcq %r10,%r11 1975 cmovcq %r10,%r11
1683 subq %r11,%rsp 1976 subq %r11,%rsp
1684 L$from_sp_done: 1977 L$from_sp_done:
1685 andq $-64,%rsp 1978 andq $-64,%rsp
1686 movq %r9,%r10 1979 movq %r9,%r10
1687 negq %r9 1980 negq %r9
1688 1981
1689 1982
(...skipping 30 matching lines...) Expand all
1720 movdqa %xmm4,48(%rax) 2013 movdqa %xmm4,48(%rax)
1721 leaq 64(%rax),%rax 2014 leaq 64(%rax),%rax
1722 subq $64,%r11 2015 subq $64,%r11
1723 jnz L$mul_by_1 2016 jnz L$mul_by_1
1724 2017
1725 .byte 102,72,15,110,207 2018 .byte 102,72,15,110,207
1726 .byte 102,72,15,110,209 2019 .byte 102,72,15,110,209
1727 .byte 0x67 2020 .byte 0x67
1728 movq %rcx,%rbp 2021 movq %rcx,%rbp
1729 .byte 102,73,15,110,218 2022 .byte 102,73,15,110,218
1730 » call» sqr8x_reduction 2023 » call» __bn_sqr8x_reduction
2024 » call» __bn_post4x_internal
1731 2025
1732 pxor %xmm0,%xmm0 2026 pxor %xmm0,%xmm0
1733 leaq 48(%rsp),%rax 2027 leaq 48(%rsp),%rax
1734 movq 40(%rsp),%rsi 2028 movq 40(%rsp),%rsi
1735 jmp L$from_mont_zero 2029 jmp L$from_mont_zero
1736 2030
1737 .p2align 5 2031 .p2align 5
1738 L$from_mont_zero: 2032 L$from_mont_zero:
1739 movdqa %xmm0,0(%rax) 2033 movdqa %xmm0,0(%rax)
1740 movdqa %xmm0,16(%rax) 2034 movdqa %xmm0,16(%rax)
(...skipping 29 matching lines...) Expand all
1770 leaq 256(%rdx),%rdx 2064 leaq 256(%rdx),%rdx
1771 subl $1,%esi 2065 subl $1,%esi
1772 jnz L$scatter 2066 jnz L$scatter
1773 L$scatter_epilogue: 2067 L$scatter_epilogue:
1774 .byte 0xf3,0xc3 2068 .byte 0xf3,0xc3
1775 2069
1776 2070
1777 .globl _bn_gather5 2071 .globl _bn_gather5
1778 .private_extern _bn_gather5 2072 .private_extern _bn_gather5
1779 2073
1780 .p2align» 4 2074 .p2align» 5
1781 _bn_gather5: 2075 _bn_gather5:
1782 » movl» %ecx,%r11d 2076 L$SEH_begin_bn_gather5:
1783 » shrl» $3,%ecx 2077
1784 » andq» $7,%r11 2078 .byte» 0x4c,0x8d,0x14,0x24
1785 » notl» %ecx 2079 .byte» 0x48,0x81,0xec,0x08,0x01,0x00,0x00
1786 » leaq» L$magic_masks(%rip),%rax 2080 » leaq» L$inc(%rip),%rax
1787 » andl» $3,%ecx 2081 » andq» $-16,%rsp
1788 » leaq» 128(%rdx,%r11,8),%rdx 2082
1789 » movq» 0(%rax,%rcx,8),%xmm4 2083 » movd» %ecx,%xmm5
1790 » movq» 8(%rax,%rcx,8),%xmm5 2084 » movdqa» 0(%rax),%xmm0
1791 » movq» 16(%rax,%rcx,8),%xmm6 2085 » movdqa» 16(%rax),%xmm1
1792 » movq» 24(%rax,%rcx,8),%xmm7 2086 » leaq» 128(%rdx),%r11
2087 » leaq» 128(%rsp),%rax
2088
2089 » pshufd» $0,%xmm5,%xmm5
2090 » movdqa» %xmm1,%xmm4
2091 » movdqa» %xmm1,%xmm2
2092 » paddd» %xmm0,%xmm1
2093 » pcmpeqd»%xmm5,%xmm0
2094 » movdqa» %xmm4,%xmm3
2095
2096 » paddd» %xmm1,%xmm2
2097 » pcmpeqd»%xmm5,%xmm1
2098 » movdqa» %xmm0,-128(%rax)
2099 » movdqa» %xmm4,%xmm0
2100
2101 » paddd» %xmm2,%xmm3
2102 » pcmpeqd»%xmm5,%xmm2
2103 » movdqa» %xmm1,-112(%rax)
2104 » movdqa» %xmm4,%xmm1
2105
2106 » paddd» %xmm3,%xmm0
2107 » pcmpeqd»%xmm5,%xmm3
2108 » movdqa» %xmm2,-96(%rax)
2109 » movdqa» %xmm4,%xmm2
2110 » paddd» %xmm0,%xmm1
2111 » pcmpeqd»%xmm5,%xmm0
2112 » movdqa» %xmm3,-80(%rax)
2113 » movdqa» %xmm4,%xmm3
2114
2115 » paddd» %xmm1,%xmm2
2116 » pcmpeqd»%xmm5,%xmm1
2117 » movdqa» %xmm0,-64(%rax)
2118 » movdqa» %xmm4,%xmm0
2119
2120 » paddd» %xmm2,%xmm3
2121 » pcmpeqd»%xmm5,%xmm2
2122 » movdqa» %xmm1,-48(%rax)
2123 » movdqa» %xmm4,%xmm1
2124
2125 » paddd» %xmm3,%xmm0
2126 » pcmpeqd»%xmm5,%xmm3
2127 » movdqa» %xmm2,-32(%rax)
2128 » movdqa» %xmm4,%xmm2
2129 » paddd» %xmm0,%xmm1
2130 » pcmpeqd»%xmm5,%xmm0
2131 » movdqa» %xmm3,-16(%rax)
2132 » movdqa» %xmm4,%xmm3
2133
2134 » paddd» %xmm1,%xmm2
2135 » pcmpeqd»%xmm5,%xmm1
2136 » movdqa» %xmm0,0(%rax)
2137 » movdqa» %xmm4,%xmm0
2138
2139 » paddd» %xmm2,%xmm3
2140 » pcmpeqd»%xmm5,%xmm2
2141 » movdqa» %xmm1,16(%rax)
2142 » movdqa» %xmm4,%xmm1
2143
2144 » paddd» %xmm3,%xmm0
2145 » pcmpeqd»%xmm5,%xmm3
2146 » movdqa» %xmm2,32(%rax)
2147 » movdqa» %xmm4,%xmm2
2148 » paddd» %xmm0,%xmm1
2149 » pcmpeqd»%xmm5,%xmm0
2150 » movdqa» %xmm3,48(%rax)
2151 » movdqa» %xmm4,%xmm3
2152
2153 » paddd» %xmm1,%xmm2
2154 » pcmpeqd»%xmm5,%xmm1
2155 » movdqa» %xmm0,64(%rax)
2156 » movdqa» %xmm4,%xmm0
2157
2158 » paddd» %xmm2,%xmm3
2159 » pcmpeqd»%xmm5,%xmm2
2160 » movdqa» %xmm1,80(%rax)
2161 » movdqa» %xmm4,%xmm1
2162
2163 » paddd» %xmm3,%xmm0
2164 » pcmpeqd»%xmm5,%xmm3
2165 » movdqa» %xmm2,96(%rax)
2166 » movdqa» %xmm4,%xmm2
2167 » movdqa» %xmm3,112(%rax)
1793 jmp L$gather 2168 jmp L$gather
1794 .p2align» 4 2169
2170 .p2align» 5
1795 L$gather: 2171 L$gather:
1796 » movq» -128(%rdx),%xmm0 2172 » pxor» %xmm4,%xmm4
1797 » movq» -64(%rdx),%xmm1 2173 » pxor» %xmm5,%xmm5
1798 » pand» %xmm4,%xmm0 2174 » movdqa» -128(%r11),%xmm0
1799 » movq» 0(%rdx),%xmm2 2175 » movdqa» -112(%r11),%xmm1
1800 » pand» %xmm5,%xmm1 2176 » movdqa» -96(%r11),%xmm2
1801 » movq» 64(%rdx),%xmm3 2177 » pand» -128(%rax),%xmm0
1802 » pand» %xmm6,%xmm2 2178 » movdqa» -80(%r11),%xmm3
1803 » por» %xmm1,%xmm0 2179 » pand» -112(%rax),%xmm1
1804 » pand» %xmm7,%xmm3 2180 » por» %xmm0,%xmm4
1805 .byte» 0x67,0x67 2181 » pand» -96(%rax),%xmm2
1806 » por» %xmm2,%xmm0 2182 » por» %xmm1,%xmm5
1807 » leaq» 256(%rdx),%rdx 2183 » pand» -80(%rax),%xmm3
1808 » por» %xmm3,%xmm0 2184 » por» %xmm2,%xmm4
1809 2185 » por» %xmm3,%xmm5
2186 » movdqa» -64(%r11),%xmm0
2187 » movdqa» -48(%r11),%xmm1
2188 » movdqa» -32(%r11),%xmm2
2189 » pand» -64(%rax),%xmm0
2190 » movdqa» -16(%r11),%xmm3
2191 » pand» -48(%rax),%xmm1
2192 » por» %xmm0,%xmm4
2193 » pand» -32(%rax),%xmm2
2194 » por» %xmm1,%xmm5
2195 » pand» -16(%rax),%xmm3
2196 » por» %xmm2,%xmm4
2197 » por» %xmm3,%xmm5
2198 » movdqa» 0(%r11),%xmm0
2199 » movdqa» 16(%r11),%xmm1
2200 » movdqa» 32(%r11),%xmm2
2201 » pand» 0(%rax),%xmm0
2202 » movdqa» 48(%r11),%xmm3
2203 » pand» 16(%rax),%xmm1
2204 » por» %xmm0,%xmm4
2205 » pand» 32(%rax),%xmm2
2206 » por» %xmm1,%xmm5
2207 » pand» 48(%rax),%xmm3
2208 » por» %xmm2,%xmm4
2209 » por» %xmm3,%xmm5
2210 » movdqa» 64(%r11),%xmm0
2211 » movdqa» 80(%r11),%xmm1
2212 » movdqa» 96(%r11),%xmm2
2213 » pand» 64(%rax),%xmm0
2214 » movdqa» 112(%r11),%xmm3
2215 » pand» 80(%rax),%xmm1
2216 » por» %xmm0,%xmm4
2217 » pand» 96(%rax),%xmm2
2218 » por» %xmm1,%xmm5
2219 » pand» 112(%rax),%xmm3
2220 » por» %xmm2,%xmm4
2221 » por» %xmm3,%xmm5
2222 » por» %xmm5,%xmm4
2223 » leaq» 256(%r11),%r11
2224 » pshufd» $0x4e,%xmm4,%xmm0
2225 » por» %xmm4,%xmm0
1810 movq %xmm0,(%rdi) 2226 movq %xmm0,(%rdi)
1811 leaq 8(%rdi),%rdi 2227 leaq 8(%rdi),%rdi
1812 subl $1,%esi 2228 subl $1,%esi
1813 jnz L$gather 2229 jnz L$gather
2230
2231 leaq (%r10),%rsp
1814 .byte 0xf3,0xc3 2232 .byte 0xf3,0xc3
1815 L$SEH_end_bn_gather5: 2233 L$SEH_end_bn_gather5:
1816 2234
1817 .p2align 6 2235 .p2align 6
1818 L$magic_masks: 2236 L$inc:
1819 .long» 0,0, 0,0, 0,0, -1,-1 2237 .long» 0,0, 1,1
1820 .long» 0,0, 0,0, 0,0, 0,0 2238 .long» 2,2, 2,2
1821 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105 ,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97 ,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71 ,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,1 11,114,103,62,0 2239 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105 ,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97 ,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71 ,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,1 11,114,103,62,0
1822 #endif 2240 #endif
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698