Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(135)

Side by Side Diff: third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont5.S

Issue 2219933002: Land BoringSSL roll on master (Closed) Base URL: git@github.com:dart-lang/sdk.git@master
Patch Set: Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 #if defined(__x86_64__) 1 #if defined(__x86_64__)
2 .text 2 .text
3 3
4 .extern OPENSSL_ia32cap_P 4 .extern OPENSSL_ia32cap_P
5 .hidden OPENSSL_ia32cap_P 5 .hidden OPENSSL_ia32cap_P
6 6
7 .globl bn_mul_mont_gather5 7 .globl bn_mul_mont_gather5
8 .hidden bn_mul_mont_gather5 8 .hidden bn_mul_mont_gather5
9 .type bn_mul_mont_gather5,@function 9 .type bn_mul_mont_gather5,@function
10 .align 64 10 .align 64
11 bn_mul_mont_gather5: 11 bn_mul_mont_gather5:
12 testl $7,%r9d 12 testl $7,%r9d
13 jnz .Lmul_enter 13 jnz .Lmul_enter
14 jmp .Lmul4x_enter 14 jmp .Lmul4x_enter
15 15
16 .align 16 16 .align 16
17 .Lmul_enter: 17 .Lmul_enter:
18 movl %r9d,%r9d 18 movl %r9d,%r9d
19 movq %rsp,%rax 19 movq %rsp,%rax
20 » movl» 8(%rsp),%r10d 20 » movd» 8(%rsp),%xmm5
21 » leaq» .Linc(%rip),%r10
21 pushq %rbx 22 pushq %rbx
22 pushq %rbp 23 pushq %rbp
23 pushq %r12 24 pushq %r12
24 pushq %r13 25 pushq %r13
25 pushq %r14 26 pushq %r14
26 pushq %r15 27 pushq %r15
28
27 leaq 2(%r9),%r11 29 leaq 2(%r9),%r11
28 negq %r11 30 negq %r11
29 » leaq» (%rsp,%r11,8),%rsp 31 » leaq» -264(%rsp,%r11,8),%rsp
30 andq $-1024,%rsp 32 andq $-1024,%rsp
31 33
32 movq %rax,8(%rsp,%r9,8) 34 movq %rax,8(%rsp,%r9,8)
33 .Lmul_body: 35 .Lmul_body:
34 » movq» %rdx,%r12 36 » leaq» 128(%rdx),%r12
35 » movq» %r10,%r11 37 » movdqa» 0(%r10),%xmm0
36 » shrq» $3,%r10 38 » movdqa» 16(%r10),%xmm1
37 » andq» $7,%r11 39 » leaq» 24-112(%rsp,%r9,8),%r10
38 » notq» %r10 40 » andq» $-16,%r10
39 » leaq» .Lmagic_masks(%rip),%rax
40 » andq» $3,%r10
41 » leaq» 96(%r12,%r11,8),%r12
42 » movq» 0(%rax,%r10,8),%xmm4
43 » movq» 8(%rax,%r10,8),%xmm5
44 » movq» 16(%rax,%r10,8),%xmm6
45 » movq» 24(%rax,%r10,8),%xmm7
46 41
47 » movq» -96(%r12),%xmm0 42 » pshufd» $0,%xmm5,%xmm5
48 » movq» -32(%r12),%xmm1 43 » movdqa» %xmm1,%xmm4
49 » pand» %xmm4,%xmm0 44 » movdqa» %xmm1,%xmm2
50 » movq» 32(%r12),%xmm2 45 » paddd» %xmm0,%xmm1
51 » pand» %xmm5,%xmm1 46 » pcmpeqd»%xmm5,%xmm0
52 » movq» 96(%r12),%xmm3 47 .byte» 0x67
53 » pand» %xmm6,%xmm2 48 » movdqa» %xmm4,%xmm3
49 » paddd» %xmm1,%xmm2
50 » pcmpeqd»%xmm5,%xmm1
51 » movdqa» %xmm0,112(%r10)
52 » movdqa» %xmm4,%xmm0
53
54 » paddd» %xmm2,%xmm3
55 » pcmpeqd»%xmm5,%xmm2
56 » movdqa» %xmm1,128(%r10)
57 » movdqa» %xmm4,%xmm1
58
59 » paddd» %xmm3,%xmm0
60 » pcmpeqd»%xmm5,%xmm3
61 » movdqa» %xmm2,144(%r10)
62 » movdqa» %xmm4,%xmm2
63
64 » paddd» %xmm0,%xmm1
65 » pcmpeqd»%xmm5,%xmm0
66 » movdqa» %xmm3,160(%r10)
67 » movdqa» %xmm4,%xmm3
68 » paddd» %xmm1,%xmm2
69 » pcmpeqd»%xmm5,%xmm1
70 » movdqa» %xmm0,176(%r10)
71 » movdqa» %xmm4,%xmm0
72
73 » paddd» %xmm2,%xmm3
74 » pcmpeqd»%xmm5,%xmm2
75 » movdqa» %xmm1,192(%r10)
76 » movdqa» %xmm4,%xmm1
77
78 » paddd» %xmm3,%xmm0
79 » pcmpeqd»%xmm5,%xmm3
80 » movdqa» %xmm2,208(%r10)
81 » movdqa» %xmm4,%xmm2
82
83 » paddd» %xmm0,%xmm1
84 » pcmpeqd»%xmm5,%xmm0
85 » movdqa» %xmm3,224(%r10)
86 » movdqa» %xmm4,%xmm3
87 » paddd» %xmm1,%xmm2
88 » pcmpeqd»%xmm5,%xmm1
89 » movdqa» %xmm0,240(%r10)
90 » movdqa» %xmm4,%xmm0
91
92 » paddd» %xmm2,%xmm3
93 » pcmpeqd»%xmm5,%xmm2
94 » movdqa» %xmm1,256(%r10)
95 » movdqa» %xmm4,%xmm1
96
97 » paddd» %xmm3,%xmm0
98 » pcmpeqd»%xmm5,%xmm3
99 » movdqa» %xmm2,272(%r10)
100 » movdqa» %xmm4,%xmm2
101
102 » paddd» %xmm0,%xmm1
103 » pcmpeqd»%xmm5,%xmm0
104 » movdqa» %xmm3,288(%r10)
105 » movdqa» %xmm4,%xmm3
106 » paddd» %xmm1,%xmm2
107 » pcmpeqd»%xmm5,%xmm1
108 » movdqa» %xmm0,304(%r10)
109
110 » paddd» %xmm2,%xmm3
111 .byte» 0x67
112 » pcmpeqd»%xmm5,%xmm2
113 » movdqa» %xmm1,320(%r10)
114
115 » pcmpeqd»%xmm5,%xmm3
116 » movdqa» %xmm2,336(%r10)
117 » pand» 64(%r12),%xmm0
118
119 » pand» 80(%r12),%xmm1
120 » pand» 96(%r12),%xmm2
121 » movdqa» %xmm3,352(%r10)
122 » pand» 112(%r12),%xmm3
123 » por» %xmm2,%xmm0
124 » por» %xmm3,%xmm1
125 » movdqa» -128(%r12),%xmm4
126 » movdqa» -112(%r12),%xmm5
127 » movdqa» -96(%r12),%xmm2
128 » pand» 112(%r10),%xmm4
129 » movdqa» -80(%r12),%xmm3
130 » pand» 128(%r10),%xmm5
131 » por» %xmm4,%xmm0
132 » pand» 144(%r10),%xmm2
133 » por» %xmm5,%xmm1
134 » pand» 160(%r10),%xmm3
135 » por» %xmm2,%xmm0
136 » por» %xmm3,%xmm1
137 » movdqa» -64(%r12),%xmm4
138 » movdqa» -48(%r12),%xmm5
139 » movdqa» -32(%r12),%xmm2
140 » pand» 176(%r10),%xmm4
141 » movdqa» -16(%r12),%xmm3
142 » pand» 192(%r10),%xmm5
143 » por» %xmm4,%xmm0
144 » pand» 208(%r10),%xmm2
145 » por» %xmm5,%xmm1
146 » pand» 224(%r10),%xmm3
147 » por» %xmm2,%xmm0
148 » por» %xmm3,%xmm1
149 » movdqa» 0(%r12),%xmm4
150 » movdqa» 16(%r12),%xmm5
151 » movdqa» 32(%r12),%xmm2
152 » pand» 240(%r10),%xmm4
153 » movdqa» 48(%r12),%xmm3
154 » pand» 256(%r10),%xmm5
155 » por» %xmm4,%xmm0
156 » pand» 272(%r10),%xmm2
157 » por» %xmm5,%xmm1
158 » pand» 288(%r10),%xmm3
159 » por» %xmm2,%xmm0
160 » por» %xmm3,%xmm1
54 por %xmm1,%xmm0 161 por %xmm1,%xmm0
55 » pand» %xmm7,%xmm3 162 » pshufd» $0x4e,%xmm0,%xmm1
56 » por» %xmm2,%xmm0 163 » por» %xmm1,%xmm0
57 leaq 256(%r12),%r12 164 leaq 256(%r12),%r12
58 por %xmm3,%xmm0
59
60 .byte 102,72,15,126,195 165 .byte 102,72,15,126,195
61 166
62 movq (%r8),%r8 167 movq (%r8),%r8
63 movq (%rsi),%rax 168 movq (%rsi),%rax
64 169
65 xorq %r14,%r14 170 xorq %r14,%r14
66 xorq %r15,%r15 171 xorq %r15,%r15
67 172
68 movq -96(%r12),%xmm0
69 movq -32(%r12),%xmm1
70 pand %xmm4,%xmm0
71 movq 32(%r12),%xmm2
72 pand %xmm5,%xmm1
73
74 movq %r8,%rbp 173 movq %r8,%rbp
75 mulq %rbx 174 mulq %rbx
76 movq %rax,%r10 175 movq %rax,%r10
77 movq (%rcx),%rax 176 movq (%rcx),%rax
78 177
79 movq 96(%r12),%xmm3
80 pand %xmm6,%xmm2
81 por %xmm1,%xmm0
82 pand %xmm7,%xmm3
83
84 imulq %r10,%rbp 178 imulq %r10,%rbp
85 movq %rdx,%r11 179 movq %rdx,%r11
86 180
87 por %xmm2,%xmm0
88 leaq 256(%r12),%r12
89 por %xmm3,%xmm0
90
91 mulq %rbp 181 mulq %rbp
92 addq %rax,%r10 182 addq %rax,%r10
93 movq 8(%rsi),%rax 183 movq 8(%rsi),%rax
94 adcq $0,%rdx 184 adcq $0,%rdx
95 movq %rdx,%r13 185 movq %rdx,%r13
96 186
97 leaq 1(%r15),%r15 187 leaq 1(%r15),%r15
98 jmp .L1st_enter 188 jmp .L1st_enter
99 189
100 .align 16 190 .align 16
(...skipping 12 matching lines...) Expand all
113 addq %rax,%r11 203 addq %rax,%r11
114 movq (%rcx,%r15,8),%rax 204 movq (%rcx,%r15,8),%rax
115 adcq $0,%rdx 205 adcq $0,%rdx
116 leaq 1(%r15),%r15 206 leaq 1(%r15),%r15
117 movq %rdx,%r10 207 movq %rdx,%r10
118 208
119 mulq %rbp 209 mulq %rbp
120 cmpq %r9,%r15 210 cmpq %r9,%r15
121 jne .L1st 211 jne .L1st
122 212
123 .byte 102,72,15,126,195
124 213
125 addq %rax,%r13 214 addq %rax,%r13
126 movq (%rsi),%rax
127 adcq $0,%rdx 215 adcq $0,%rdx
128 addq %r11,%r13 216 addq %r11,%r13
129 adcq $0,%rdx 217 adcq $0,%rdx
130 » movq» %r13,-16(%rsp,%r15,8) 218 » movq» %r13,-16(%rsp,%r9,8)
131 movq %rdx,%r13 219 movq %rdx,%r13
132 movq %r10,%r11 220 movq %r10,%r11
133 221
134 xorq %rdx,%rdx 222 xorq %rdx,%rdx
135 addq %r11,%r13 223 addq %r11,%r13
136 adcq $0,%rdx 224 adcq $0,%rdx
137 movq %r13,-8(%rsp,%r9,8) 225 movq %r13,-8(%rsp,%r9,8)
138 movq %rdx,(%rsp,%r9,8) 226 movq %rdx,(%rsp,%r9,8)
139 227
140 leaq 1(%r14),%r14 228 leaq 1(%r14),%r14
141 jmp .Louter 229 jmp .Louter
142 .align 16 230 .align 16
143 .Louter: 231 .Louter:
232 leaq 24+128(%rsp,%r9,8),%rdx
233 andq $-16,%rdx
234 pxor %xmm4,%xmm4
235 pxor %xmm5,%xmm5
236 movdqa -128(%r12),%xmm0
237 movdqa -112(%r12),%xmm1
238 movdqa -96(%r12),%xmm2
239 movdqa -80(%r12),%xmm3
240 pand -128(%rdx),%xmm0
241 pand -112(%rdx),%xmm1
242 por %xmm0,%xmm4
243 pand -96(%rdx),%xmm2
244 por %xmm1,%xmm5
245 pand -80(%rdx),%xmm3
246 por %xmm2,%xmm4
247 por %xmm3,%xmm5
248 movdqa -64(%r12),%xmm0
249 movdqa -48(%r12),%xmm1
250 movdqa -32(%r12),%xmm2
251 movdqa -16(%r12),%xmm3
252 pand -64(%rdx),%xmm0
253 pand -48(%rdx),%xmm1
254 por %xmm0,%xmm4
255 pand -32(%rdx),%xmm2
256 por %xmm1,%xmm5
257 pand -16(%rdx),%xmm3
258 por %xmm2,%xmm4
259 por %xmm3,%xmm5
260 movdqa 0(%r12),%xmm0
261 movdqa 16(%r12),%xmm1
262 movdqa 32(%r12),%xmm2
263 movdqa 48(%r12),%xmm3
264 pand 0(%rdx),%xmm0
265 pand 16(%rdx),%xmm1
266 por %xmm0,%xmm4
267 pand 32(%rdx),%xmm2
268 por %xmm1,%xmm5
269 pand 48(%rdx),%xmm3
270 por %xmm2,%xmm4
271 por %xmm3,%xmm5
272 movdqa 64(%r12),%xmm0
273 movdqa 80(%r12),%xmm1
274 movdqa 96(%r12),%xmm2
275 movdqa 112(%r12),%xmm3
276 pand 64(%rdx),%xmm0
277 pand 80(%rdx),%xmm1
278 por %xmm0,%xmm4
279 pand 96(%rdx),%xmm2
280 por %xmm1,%xmm5
281 pand 112(%rdx),%xmm3
282 por %xmm2,%xmm4
283 por %xmm3,%xmm5
284 por %xmm5,%xmm4
285 pshufd $0x4e,%xmm4,%xmm0
286 por %xmm4,%xmm0
287 leaq 256(%r12),%r12
288
289 movq (%rsi),%rax
290 .byte 102,72,15,126,195
291
144 xorq %r15,%r15 292 xorq %r15,%r15
145 movq %r8,%rbp 293 movq %r8,%rbp
146 movq (%rsp),%r10 294 movq (%rsp),%r10
147 295
148 movq -96(%r12),%xmm0
149 movq -32(%r12),%xmm1
150 pand %xmm4,%xmm0
151 movq 32(%r12),%xmm2
152 pand %xmm5,%xmm1
153
154 mulq %rbx 296 mulq %rbx
155 addq %rax,%r10 297 addq %rax,%r10
156 movq (%rcx),%rax 298 movq (%rcx),%rax
157 adcq $0,%rdx 299 adcq $0,%rdx
158 300
159 movq 96(%r12),%xmm3
160 pand %xmm6,%xmm2
161 por %xmm1,%xmm0
162 pand %xmm7,%xmm3
163
164 imulq %r10,%rbp 301 imulq %r10,%rbp
165 movq %rdx,%r11 302 movq %rdx,%r11
166 303
167 por %xmm2,%xmm0
168 leaq 256(%r12),%r12
169 por %xmm3,%xmm0
170
171 mulq %rbp 304 mulq %rbp
172 addq %rax,%r10 305 addq %rax,%r10
173 movq 8(%rsi),%rax 306 movq 8(%rsi),%rax
174 adcq $0,%rdx 307 adcq $0,%rdx
175 movq 8(%rsp),%r10 308 movq 8(%rsp),%r10
176 movq %rdx,%r13 309 movq %rdx,%r13
177 310
178 leaq 1(%r15),%r15 311 leaq 1(%r15),%r15
179 jmp .Linner_enter 312 jmp .Linner_enter
180 313
(...skipping 15 matching lines...) Expand all
196 adcq $0,%rdx 329 adcq $0,%rdx
197 addq %r11,%r10 330 addq %r11,%r10
198 movq %rdx,%r11 331 movq %rdx,%r11
199 adcq $0,%r11 332 adcq $0,%r11
200 leaq 1(%r15),%r15 333 leaq 1(%r15),%r15
201 334
202 mulq %rbp 335 mulq %rbp
203 cmpq %r9,%r15 336 cmpq %r9,%r15
204 jne .Linner 337 jne .Linner
205 338
206 .byte 102,72,15,126,195
207
208 addq %rax,%r13 339 addq %rax,%r13
209 movq (%rsi),%rax
210 adcq $0,%rdx 340 adcq $0,%rdx
211 addq %r10,%r13 341 addq %r10,%r13
212 » movq» (%rsp,%r15,8),%r10 342 » movq» (%rsp,%r9,8),%r10
213 adcq $0,%rdx 343 adcq $0,%rdx
214 » movq» %r13,-16(%rsp,%r15,8) 344 » movq» %r13,-16(%rsp,%r9,8)
215 movq %rdx,%r13 345 movq %rdx,%r13
216 346
217 xorq %rdx,%rdx 347 xorq %rdx,%rdx
218 addq %r11,%r13 348 addq %r11,%r13
219 adcq $0,%rdx 349 adcq $0,%rdx
220 addq %r10,%r13 350 addq %r10,%r13
221 adcq $0,%rdx 351 adcq $0,%rdx
222 movq %r13,-8(%rsp,%r9,8) 352 movq %r13,-8(%rsp,%r9,8)
223 movq %rdx,(%rsp,%r9,8) 353 movq %rdx,(%rsp,%r9,8)
224 354
(...skipping 25 matching lines...) Expand all
250 andq %rax,%rsi 380 andq %rax,%rsi
251 xorq %rcx,%rsi 381 xorq %rcx,%rsi
252 movq %r14,(%rsp,%r14,8) 382 movq %r14,(%rsp,%r14,8)
253 movq %rsi,(%rdi,%r14,8) 383 movq %rsi,(%rdi,%r14,8)
254 leaq 1(%r14),%r14 384 leaq 1(%r14),%r14
255 subq $1,%r15 385 subq $1,%r15
256 jnz .Lcopy 386 jnz .Lcopy
257 387
258 movq 8(%rsp,%r9,8),%rsi 388 movq 8(%rsp,%r9,8),%rsi
259 movq $1,%rax 389 movq $1,%rax
390
260 movq -48(%rsi),%r15 391 movq -48(%rsi),%r15
261 movq -40(%rsi),%r14 392 movq -40(%rsi),%r14
262 movq -32(%rsi),%r13 393 movq -32(%rsi),%r13
263 movq -24(%rsi),%r12 394 movq -24(%rsi),%r12
264 movq -16(%rsi),%rbp 395 movq -16(%rsi),%rbp
265 movq -8(%rsi),%rbx 396 movq -8(%rsi),%rbx
266 leaq (%rsi),%rsp 397 leaq (%rsi),%rsp
267 .Lmul_epilogue: 398 .Lmul_epilogue:
268 .byte 0xf3,0xc3 399 .byte 0xf3,0xc3
269 .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 400 .size bn_mul_mont_gather5,.-bn_mul_mont_gather5
270 .type bn_mul4x_mont_gather5,@function 401 .type bn_mul4x_mont_gather5,@function
271 .align 32 402 .align 32
272 bn_mul4x_mont_gather5: 403 bn_mul4x_mont_gather5:
273 .Lmul4x_enter: 404 .Lmul4x_enter:
274 .byte 0x67 405 .byte 0x67
275 movq %rsp,%rax 406 movq %rsp,%rax
276 pushq %rbx 407 pushq %rbx
277 pushq %rbp 408 pushq %rbp
278 pushq %r12 409 pushq %r12
279 pushq %r13 410 pushq %r13
280 pushq %r14 411 pushq %r14
281 pushq %r15 412 pushq %r15
413
282 .byte 0x67 414 .byte 0x67
283 movl %r9d,%r10d
284 shll $3,%r9d 415 shll $3,%r9d
285 » shll» $3+2,%r10d 416 » leaq» (%r9,%r9,2),%r10
286 negq %r9 417 negq %r9
287 418
288 419
289 420
290 421
291 422
292 423
293 424
294 425
295 » leaq» -64(%rsp,%r9,2),%r11 426
296 » subq» %rsi,%r11 427
428 » leaq» -320(%rsp,%r9,2),%r11
429 » subq» %rdi,%r11
297 andq $4095,%r11 430 andq $4095,%r11
298 cmpq %r11,%r10 431 cmpq %r11,%r10
299 jb .Lmul4xsp_alt 432 jb .Lmul4xsp_alt
300 subq %r11,%rsp 433 subq %r11,%rsp
301 » leaq» -64(%rsp,%r9,2),%rsp 434 » leaq» -320(%rsp,%r9,2),%rsp
302 jmp .Lmul4xsp_done 435 jmp .Lmul4xsp_done
303 436
304 .align 32 437 .align 32
305 .Lmul4xsp_alt: 438 .Lmul4xsp_alt:
306 » leaq» 4096-64(,%r9,2),%r10 439 » leaq» 4096-320(,%r9,2),%r10
307 » leaq» -64(%rsp,%r9,2),%rsp 440 » leaq» -320(%rsp,%r9,2),%rsp
308 subq %r10,%r11 441 subq %r10,%r11
309 movq $0,%r10 442 movq $0,%r10
310 cmovcq %r10,%r11 443 cmovcq %r10,%r11
311 subq %r11,%rsp 444 subq %r11,%rsp
312 .Lmul4xsp_done: 445 .Lmul4xsp_done:
313 andq $-64,%rsp 446 andq $-64,%rsp
314 negq %r9 447 negq %r9
315 448
316 movq %rax,40(%rsp) 449 movq %rax,40(%rsp)
317 .Lmul4x_body: 450 .Lmul4x_body:
318 451
319 call mul4x_internal 452 call mul4x_internal
320 453
321 movq 40(%rsp),%rsi 454 movq 40(%rsp),%rsi
322 movq $1,%rax 455 movq $1,%rax
456
323 movq -48(%rsi),%r15 457 movq -48(%rsi),%r15
324 movq -40(%rsi),%r14 458 movq -40(%rsi),%r14
325 movq -32(%rsi),%r13 459 movq -32(%rsi),%r13
326 movq -24(%rsi),%r12 460 movq -24(%rsi),%r12
327 movq -16(%rsi),%rbp 461 movq -16(%rsi),%rbp
328 movq -8(%rsi),%rbx 462 movq -8(%rsi),%rbx
329 leaq (%rsi),%rsp 463 leaq (%rsi),%rsp
330 .Lmul4x_epilogue: 464 .Lmul4x_epilogue:
331 .byte 0xf3,0xc3 465 .byte 0xf3,0xc3
332 .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 466 .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
333 467
334 .type mul4x_internal,@function 468 .type mul4x_internal,@function
335 .align 32 469 .align 32
336 mul4x_internal: 470 mul4x_internal:
337 shlq $5,%r9 471 shlq $5,%r9
338 » movl» 8(%rax),%r10d 472 » movd» 8(%rax),%xmm5
339 » leaq» 256(%rdx,%r9,1),%r13 473 » leaq» .Linc(%rip),%rax
474 » leaq» 128(%rdx,%r9,1),%r13
340 shrq $5,%r9 475 shrq $5,%r9
341 » movq» %r10,%r11 476 » movdqa» 0(%rax),%xmm0
342 » shrq» $3,%r10 477 » movdqa» 16(%rax),%xmm1
343 » andq» $7,%r11 478 » leaq» 88-112(%rsp,%r9,1),%r10
344 » notq» %r10 479 » leaq» 128(%rdx),%r12
345 » leaq» .Lmagic_masks(%rip),%rax
346 » andq» $3,%r10
347 » leaq» 96(%rdx,%r11,8),%r12
348 » movq» 0(%rax,%r10,8),%xmm4
349 » movq» 8(%rax,%r10,8),%xmm5
350 » addq» $7,%r11
351 » movq» 16(%rax,%r10,8),%xmm6
352 » movq» 24(%rax,%r10,8),%xmm7
353 » andq» $7,%r11
354 480
355 » movq» -96(%r12),%xmm0 481 » pshufd» $0,%xmm5,%xmm5
356 » leaq» 256(%r12),%r14 482 » movdqa» %xmm1,%xmm4
357 » movq» -32(%r12),%xmm1 483 .byte» 0x67,0x67
358 » pand» %xmm4,%xmm0 484 » movdqa» %xmm1,%xmm2
359 » movq» 32(%r12),%xmm2 485 » paddd» %xmm0,%xmm1
360 » pand» %xmm5,%xmm1 486 » pcmpeqd»%xmm5,%xmm0
361 » movq» 96(%r12),%xmm3
362 » pand» %xmm6,%xmm2
363 .byte 0x67 487 .byte 0x67
488 movdqa %xmm4,%xmm3
489 paddd %xmm1,%xmm2
490 pcmpeqd %xmm5,%xmm1
491 movdqa %xmm0,112(%r10)
492 movdqa %xmm4,%xmm0
493
494 paddd %xmm2,%xmm3
495 pcmpeqd %xmm5,%xmm2
496 movdqa %xmm1,128(%r10)
497 movdqa %xmm4,%xmm1
498
499 paddd %xmm3,%xmm0
500 pcmpeqd %xmm5,%xmm3
501 movdqa %xmm2,144(%r10)
502 movdqa %xmm4,%xmm2
503
504 paddd %xmm0,%xmm1
505 pcmpeqd %xmm5,%xmm0
506 movdqa %xmm3,160(%r10)
507 movdqa %xmm4,%xmm3
508 paddd %xmm1,%xmm2
509 pcmpeqd %xmm5,%xmm1
510 movdqa %xmm0,176(%r10)
511 movdqa %xmm4,%xmm0
512
513 paddd %xmm2,%xmm3
514 pcmpeqd %xmm5,%xmm2
515 movdqa %xmm1,192(%r10)
516 movdqa %xmm4,%xmm1
517
518 paddd %xmm3,%xmm0
519 pcmpeqd %xmm5,%xmm3
520 movdqa %xmm2,208(%r10)
521 movdqa %xmm4,%xmm2
522
523 paddd %xmm0,%xmm1
524 pcmpeqd %xmm5,%xmm0
525 movdqa %xmm3,224(%r10)
526 movdqa %xmm4,%xmm3
527 paddd %xmm1,%xmm2
528 pcmpeqd %xmm5,%xmm1
529 movdqa %xmm0,240(%r10)
530 movdqa %xmm4,%xmm0
531
532 paddd %xmm2,%xmm3
533 pcmpeqd %xmm5,%xmm2
534 movdqa %xmm1,256(%r10)
535 movdqa %xmm4,%xmm1
536
537 paddd %xmm3,%xmm0
538 pcmpeqd %xmm5,%xmm3
539 movdqa %xmm2,272(%r10)
540 movdqa %xmm4,%xmm2
541
542 paddd %xmm0,%xmm1
543 pcmpeqd %xmm5,%xmm0
544 movdqa %xmm3,288(%r10)
545 movdqa %xmm4,%xmm3
546 paddd %xmm1,%xmm2
547 pcmpeqd %xmm5,%xmm1
548 movdqa %xmm0,304(%r10)
549
550 paddd %xmm2,%xmm3
551 .byte 0x67
552 pcmpeqd %xmm5,%xmm2
553 movdqa %xmm1,320(%r10)
554
555 pcmpeqd %xmm5,%xmm3
556 movdqa %xmm2,336(%r10)
557 pand 64(%r12),%xmm0
558
559 pand 80(%r12),%xmm1
560 pand 96(%r12),%xmm2
561 movdqa %xmm3,352(%r10)
562 pand 112(%r12),%xmm3
563 por %xmm2,%xmm0
564 por %xmm3,%xmm1
565 movdqa -128(%r12),%xmm4
566 movdqa -112(%r12),%xmm5
567 movdqa -96(%r12),%xmm2
568 pand 112(%r10),%xmm4
569 movdqa -80(%r12),%xmm3
570 pand 128(%r10),%xmm5
571 por %xmm4,%xmm0
572 pand 144(%r10),%xmm2
573 por %xmm5,%xmm1
574 pand 160(%r10),%xmm3
575 por %xmm2,%xmm0
576 por %xmm3,%xmm1
577 movdqa -64(%r12),%xmm4
578 movdqa -48(%r12),%xmm5
579 movdqa -32(%r12),%xmm2
580 pand 176(%r10),%xmm4
581 movdqa -16(%r12),%xmm3
582 pand 192(%r10),%xmm5
583 por %xmm4,%xmm0
584 pand 208(%r10),%xmm2
585 por %xmm5,%xmm1
586 pand 224(%r10),%xmm3
587 por %xmm2,%xmm0
588 por %xmm3,%xmm1
589 movdqa 0(%r12),%xmm4
590 movdqa 16(%r12),%xmm5
591 movdqa 32(%r12),%xmm2
592 pand 240(%r10),%xmm4
593 movdqa 48(%r12),%xmm3
594 pand 256(%r10),%xmm5
595 por %xmm4,%xmm0
596 pand 272(%r10),%xmm2
597 por %xmm5,%xmm1
598 pand 288(%r10),%xmm3
599 por %xmm2,%xmm0
600 por %xmm3,%xmm1
364 por %xmm1,%xmm0 601 por %xmm1,%xmm0
365 » movq» -96(%r14),%xmm1 602 » pshufd» $0x4e,%xmm0,%xmm1
366 .byte» 0x67 603 » por» %xmm1,%xmm0
367 » pand» %xmm7,%xmm3 604 » leaq» 256(%r12),%r12
368 .byte» 0x67 605 .byte» 102,72,15,126,195
369 » por» %xmm2,%xmm0
370 » movq» -32(%r14),%xmm2
371 .byte» 0x67
372 » pand» %xmm4,%xmm1
373 .byte» 0x67
374 » por» %xmm3,%xmm0
375 » movq» 32(%r14),%xmm3
376 606
377 .byte 102,72,15,126,195
378 movq 96(%r14),%xmm0
379 movq %r13,16+8(%rsp) 607 movq %r13,16+8(%rsp)
380 movq %rdi,56+8(%rsp) 608 movq %rdi,56+8(%rsp)
381 609
382 movq (%r8),%r8 610 movq (%r8),%r8
383 movq (%rsi),%rax 611 movq (%rsi),%rax
384 leaq (%rsi,%r9,1),%rsi 612 leaq (%rsi,%r9,1),%rsi
385 negq %r9 613 negq %r9
386 614
387 movq %r8,%rbp 615 movq %r8,%rbp
388 mulq %rbx 616 mulq %rbx
389 movq %rax,%r10 617 movq %rax,%r10
390 movq (%rcx),%rax 618 movq (%rcx),%rax
391 619
392 pand %xmm5,%xmm2
393 pand %xmm6,%xmm3
394 por %xmm2,%xmm1
395
396 imulq %r10,%rbp 620 imulq %r10,%rbp
397 621 » leaq» 64+8(%rsp),%r14
398
399
400
401
402
403
404 » leaq» 64+8(%rsp,%r11,8),%r14
405 movq %rdx,%r11 622 movq %rdx,%r11
406 623
407 pand %xmm7,%xmm0
408 por %xmm3,%xmm1
409 leaq 512(%r12),%r12
410 por %xmm1,%xmm0
411
412 mulq %rbp 624 mulq %rbp
413 addq %rax,%r10 625 addq %rax,%r10
414 movq 8(%rsi,%r9,1),%rax 626 movq 8(%rsi,%r9,1),%rax
415 adcq $0,%rdx 627 adcq $0,%rdx
416 movq %rdx,%rdi 628 movq %rdx,%rdi
417 629
418 mulq %rbx 630 mulq %rbx
419 addq %rax,%r11 631 addq %rax,%r11
420 » movq» 16(%rcx),%rax 632 » movq» 8(%rcx),%rax
421 adcq $0,%rdx 633 adcq $0,%rdx
422 movq %rdx,%r10 634 movq %rdx,%r10
423 635
424 mulq %rbp 636 mulq %rbp
425 addq %rax,%rdi 637 addq %rax,%rdi
426 movq 16(%rsi,%r9,1),%rax 638 movq 16(%rsi,%r9,1),%rax
427 adcq $0,%rdx 639 adcq $0,%rdx
428 addq %r11,%rdi 640 addq %r11,%rdi
429 leaq 32(%r9),%r15 641 leaq 32(%r9),%r15
430 » leaq» 64(%rcx),%rcx 642 » leaq» 32(%rcx),%rcx
431 adcq $0,%rdx 643 adcq $0,%rdx
432 movq %rdi,(%r14) 644 movq %rdi,(%r14)
433 movq %rdx,%r13 645 movq %rdx,%r13
434 jmp .L1st4x 646 jmp .L1st4x
435 647
436 .align 32 648 .align 32
437 .L1st4x: 649 .L1st4x:
438 mulq %rbx 650 mulq %rbx
439 addq %rax,%r10 651 addq %rax,%r10
440 » movq» -32(%rcx),%rax 652 » movq» -16(%rcx),%rax
441 leaq 32(%r14),%r14 653 leaq 32(%r14),%r14
442 adcq $0,%rdx 654 adcq $0,%rdx
443 movq %rdx,%r11 655 movq %rdx,%r11
444 656
445 mulq %rbp 657 mulq %rbp
446 addq %rax,%r13 658 addq %rax,%r13
447 movq -8(%rsi,%r15,1),%rax 659 movq -8(%rsi,%r15,1),%rax
448 adcq $0,%rdx 660 adcq $0,%rdx
449 addq %r10,%r13 661 addq %r10,%r13
450 adcq $0,%rdx 662 adcq $0,%rdx
451 movq %r13,-24(%r14) 663 movq %r13,-24(%r14)
452 movq %rdx,%rdi 664 movq %rdx,%rdi
453 665
454 mulq %rbx 666 mulq %rbx
455 addq %rax,%r11 667 addq %rax,%r11
456 » movq» -16(%rcx),%rax 668 » movq» -8(%rcx),%rax
457 adcq $0,%rdx 669 adcq $0,%rdx
458 movq %rdx,%r10 670 movq %rdx,%r10
459 671
460 mulq %rbp 672 mulq %rbp
461 addq %rax,%rdi 673 addq %rax,%rdi
462 movq (%rsi,%r15,1),%rax 674 movq (%rsi,%r15,1),%rax
463 adcq $0,%rdx 675 adcq $0,%rdx
464 addq %r11,%rdi 676 addq %r11,%rdi
465 adcq $0,%rdx 677 adcq $0,%rdx
466 movq %rdi,-16(%r14) 678 movq %rdi,-16(%r14)
467 movq %rdx,%r13 679 movq %rdx,%r13
468 680
469 mulq %rbx 681 mulq %rbx
470 addq %rax,%r10 682 addq %rax,%r10
471 movq 0(%rcx),%rax 683 movq 0(%rcx),%rax
472 adcq $0,%rdx 684 adcq $0,%rdx
473 movq %rdx,%r11 685 movq %rdx,%r11
474 686
475 mulq %rbp 687 mulq %rbp
476 addq %rax,%r13 688 addq %rax,%r13
477 movq 8(%rsi,%r15,1),%rax 689 movq 8(%rsi,%r15,1),%rax
478 adcq $0,%rdx 690 adcq $0,%rdx
479 addq %r10,%r13 691 addq %r10,%r13
480 adcq $0,%rdx 692 adcq $0,%rdx
481 movq %r13,-8(%r14) 693 movq %r13,-8(%r14)
482 movq %rdx,%rdi 694 movq %rdx,%rdi
483 695
484 mulq %rbx 696 mulq %rbx
485 addq %rax,%r11 697 addq %rax,%r11
486 » movq» 16(%rcx),%rax 698 » movq» 8(%rcx),%rax
487 adcq $0,%rdx 699 adcq $0,%rdx
488 movq %rdx,%r10 700 movq %rdx,%r10
489 701
490 mulq %rbp 702 mulq %rbp
491 addq %rax,%rdi 703 addq %rax,%rdi
492 movq 16(%rsi,%r15,1),%rax 704 movq 16(%rsi,%r15,1),%rax
493 adcq $0,%rdx 705 adcq $0,%rdx
494 addq %r11,%rdi 706 addq %r11,%rdi
495 » leaq» 64(%rcx),%rcx 707 » leaq» 32(%rcx),%rcx
496 adcq $0,%rdx 708 adcq $0,%rdx
497 movq %rdi,(%r14) 709 movq %rdi,(%r14)
498 movq %rdx,%r13 710 movq %rdx,%r13
499 711
500 addq $32,%r15 712 addq $32,%r15
501 jnz .L1st4x 713 jnz .L1st4x
502 714
503 mulq %rbx 715 mulq %rbx
504 addq %rax,%r10 716 addq %rax,%r10
505 » movq» -32(%rcx),%rax 717 » movq» -16(%rcx),%rax
506 leaq 32(%r14),%r14 718 leaq 32(%r14),%r14
507 adcq $0,%rdx 719 adcq $0,%rdx
508 movq %rdx,%r11 720 movq %rdx,%r11
509 721
510 mulq %rbp 722 mulq %rbp
511 addq %rax,%r13 723 addq %rax,%r13
512 movq -8(%rsi),%rax 724 movq -8(%rsi),%rax
513 adcq $0,%rdx 725 adcq $0,%rdx
514 addq %r10,%r13 726 addq %r10,%r13
515 adcq $0,%rdx 727 adcq $0,%rdx
516 movq %r13,-24(%r14) 728 movq %r13,-24(%r14)
517 movq %rdx,%rdi 729 movq %rdx,%rdi
518 730
519 mulq %rbx 731 mulq %rbx
520 addq %rax,%r11 732 addq %rax,%r11
521 » movq» -16(%rcx),%rax 733 » movq» -8(%rcx),%rax
522 adcq $0,%rdx 734 adcq $0,%rdx
523 movq %rdx,%r10 735 movq %rdx,%r10
524 736
525 mulq %rbp 737 mulq %rbp
526 addq %rax,%rdi 738 addq %rax,%rdi
527 movq (%rsi,%r9,1),%rax 739 movq (%rsi,%r9,1),%rax
528 adcq $0,%rdx 740 adcq $0,%rdx
529 addq %r11,%rdi 741 addq %r11,%rdi
530 adcq $0,%rdx 742 adcq $0,%rdx
531 movq %rdi,-16(%r14) 743 movq %rdi,-16(%r14)
532 movq %rdx,%r13 744 movq %rdx,%r13
533 745
534 .byte» 102,72,15,126,195 746 » leaq» (%rcx,%r9,1),%rcx
535 » leaq» (%rcx,%r9,2),%rcx
536 747
537 xorq %rdi,%rdi 748 xorq %rdi,%rdi
538 addq %r10,%r13 749 addq %r10,%r13
539 adcq $0,%rdi 750 adcq $0,%rdi
540 movq %r13,-8(%r14) 751 movq %r13,-8(%r14)
541 752
542 jmp .Louter4x 753 jmp .Louter4x
543 754
544 .align 32 755 .align 32
545 .Louter4x: 756 .Louter4x:
757 leaq 16+128(%r14),%rdx
758 pxor %xmm4,%xmm4
759 pxor %xmm5,%xmm5
760 movdqa -128(%r12),%xmm0
761 movdqa -112(%r12),%xmm1
762 movdqa -96(%r12),%xmm2
763 movdqa -80(%r12),%xmm3
764 pand -128(%rdx),%xmm0
765 pand -112(%rdx),%xmm1
766 por %xmm0,%xmm4
767 pand -96(%rdx),%xmm2
768 por %xmm1,%xmm5
769 pand -80(%rdx),%xmm3
770 por %xmm2,%xmm4
771 por %xmm3,%xmm5
772 movdqa -64(%r12),%xmm0
773 movdqa -48(%r12),%xmm1
774 movdqa -32(%r12),%xmm2
775 movdqa -16(%r12),%xmm3
776 pand -64(%rdx),%xmm0
777 pand -48(%rdx),%xmm1
778 por %xmm0,%xmm4
779 pand -32(%rdx),%xmm2
780 por %xmm1,%xmm5
781 pand -16(%rdx),%xmm3
782 por %xmm2,%xmm4
783 por %xmm3,%xmm5
784 movdqa 0(%r12),%xmm0
785 movdqa 16(%r12),%xmm1
786 movdqa 32(%r12),%xmm2
787 movdqa 48(%r12),%xmm3
788 pand 0(%rdx),%xmm0
789 pand 16(%rdx),%xmm1
790 por %xmm0,%xmm4
791 pand 32(%rdx),%xmm2
792 por %xmm1,%xmm5
793 pand 48(%rdx),%xmm3
794 por %xmm2,%xmm4
795 por %xmm3,%xmm5
796 movdqa 64(%r12),%xmm0
797 movdqa 80(%r12),%xmm1
798 movdqa 96(%r12),%xmm2
799 movdqa 112(%r12),%xmm3
800 pand 64(%rdx),%xmm0
801 pand 80(%rdx),%xmm1
802 por %xmm0,%xmm4
803 pand 96(%rdx),%xmm2
804 por %xmm1,%xmm5
805 pand 112(%rdx),%xmm3
806 por %xmm2,%xmm4
807 por %xmm3,%xmm5
808 por %xmm5,%xmm4
809 pshufd $0x4e,%xmm4,%xmm0
810 por %xmm4,%xmm0
811 leaq 256(%r12),%r12
812 .byte 102,72,15,126,195
813
546 movq (%r14,%r9,1),%r10 814 movq (%r14,%r9,1),%r10
547 movq %r8,%rbp 815 movq %r8,%rbp
548 mulq %rbx 816 mulq %rbx
549 addq %rax,%r10 817 addq %rax,%r10
550 movq (%rcx),%rax 818 movq (%rcx),%rax
551 adcq $0,%rdx 819 adcq $0,%rdx
552 820
553 movq -96(%r12),%xmm0
554 movq -32(%r12),%xmm1
555 pand %xmm4,%xmm0
556 movq 32(%r12),%xmm2
557 pand %xmm5,%xmm1
558 movq 96(%r12),%xmm3
559
560 imulq %r10,%rbp 821 imulq %r10,%rbp
561 .byte 0x67
562 movq %rdx,%r11 822 movq %rdx,%r11
563 movq %rdi,(%r14) 823 movq %rdi,(%r14)
564 824
565 pand %xmm6,%xmm2
566 por %xmm1,%xmm0
567 pand %xmm7,%xmm3
568 por %xmm2,%xmm0
569 leaq (%r14,%r9,1),%r14 825 leaq (%r14,%r9,1),%r14
570 leaq 256(%r12),%r12
571 por %xmm3,%xmm0
572 826
573 mulq %rbp 827 mulq %rbp
574 addq %rax,%r10 828 addq %rax,%r10
575 movq 8(%rsi,%r9,1),%rax 829 movq 8(%rsi,%r9,1),%rax
576 adcq $0,%rdx 830 adcq $0,%rdx
577 movq %rdx,%rdi 831 movq %rdx,%rdi
578 832
579 mulq %rbx 833 mulq %rbx
580 addq %rax,%r11 834 addq %rax,%r11
581 » movq» 16(%rcx),%rax 835 » movq» 8(%rcx),%rax
582 adcq $0,%rdx 836 adcq $0,%rdx
583 addq 8(%r14),%r11 837 addq 8(%r14),%r11
584 adcq $0,%rdx 838 adcq $0,%rdx
585 movq %rdx,%r10 839 movq %rdx,%r10
586 840
587 mulq %rbp 841 mulq %rbp
588 addq %rax,%rdi 842 addq %rax,%rdi
589 movq 16(%rsi,%r9,1),%rax 843 movq 16(%rsi,%r9,1),%rax
590 adcq $0,%rdx 844 adcq $0,%rdx
591 addq %r11,%rdi 845 addq %r11,%rdi
592 leaq 32(%r9),%r15 846 leaq 32(%r9),%r15
593 » leaq» 64(%rcx),%rcx 847 » leaq» 32(%rcx),%rcx
594 adcq $0,%rdx 848 adcq $0,%rdx
595 movq %rdx,%r13 849 movq %rdx,%r13
596 jmp .Linner4x 850 jmp .Linner4x
597 851
598 .align 32 852 .align 32
599 .Linner4x: 853 .Linner4x:
600 mulq %rbx 854 mulq %rbx
601 addq %rax,%r10 855 addq %rax,%r10
602 » movq» -32(%rcx),%rax 856 » movq» -16(%rcx),%rax
603 adcq $0,%rdx 857 adcq $0,%rdx
604 addq 16(%r14),%r10 858 addq 16(%r14),%r10
605 leaq 32(%r14),%r14 859 leaq 32(%r14),%r14
606 adcq $0,%rdx 860 adcq $0,%rdx
607 movq %rdx,%r11 861 movq %rdx,%r11
608 862
609 mulq %rbp 863 mulq %rbp
610 addq %rax,%r13 864 addq %rax,%r13
611 movq -8(%rsi,%r15,1),%rax 865 movq -8(%rsi,%r15,1),%rax
612 adcq $0,%rdx 866 adcq $0,%rdx
613 addq %r10,%r13 867 addq %r10,%r13
614 adcq $0,%rdx 868 adcq $0,%rdx
615 movq %rdi,-32(%r14) 869 movq %rdi,-32(%r14)
616 movq %rdx,%rdi 870 movq %rdx,%rdi
617 871
618 mulq %rbx 872 mulq %rbx
619 addq %rax,%r11 873 addq %rax,%r11
620 » movq» -16(%rcx),%rax 874 » movq» -8(%rcx),%rax
621 adcq $0,%rdx 875 adcq $0,%rdx
622 addq -8(%r14),%r11 876 addq -8(%r14),%r11
623 adcq $0,%rdx 877 adcq $0,%rdx
624 movq %rdx,%r10 878 movq %rdx,%r10
625 879
626 mulq %rbp 880 mulq %rbp
627 addq %rax,%rdi 881 addq %rax,%rdi
628 movq (%rsi,%r15,1),%rax 882 movq (%rsi,%r15,1),%rax
629 adcq $0,%rdx 883 adcq $0,%rdx
630 addq %r11,%rdi 884 addq %r11,%rdi
(...skipping 13 matching lines...) Expand all
644 addq %rax,%r13 898 addq %rax,%r13
645 movq 8(%rsi,%r15,1),%rax 899 movq 8(%rsi,%r15,1),%rax
646 adcq $0,%rdx 900 adcq $0,%rdx
647 addq %r10,%r13 901 addq %r10,%r13
648 adcq $0,%rdx 902 adcq $0,%rdx
649 movq %rdi,-16(%r14) 903 movq %rdi,-16(%r14)
650 movq %rdx,%rdi 904 movq %rdx,%rdi
651 905
652 mulq %rbx 906 mulq %rbx
653 addq %rax,%r11 907 addq %rax,%r11
654 » movq» 16(%rcx),%rax 908 » movq» 8(%rcx),%rax
655 adcq $0,%rdx 909 adcq $0,%rdx
656 addq 8(%r14),%r11 910 addq 8(%r14),%r11
657 adcq $0,%rdx 911 adcq $0,%rdx
658 movq %rdx,%r10 912 movq %rdx,%r10
659 913
660 mulq %rbp 914 mulq %rbp
661 addq %rax,%rdi 915 addq %rax,%rdi
662 movq 16(%rsi,%r15,1),%rax 916 movq 16(%rsi,%r15,1),%rax
663 adcq $0,%rdx 917 adcq $0,%rdx
664 addq %r11,%rdi 918 addq %r11,%rdi
665 » leaq» 64(%rcx),%rcx 919 » leaq» 32(%rcx),%rcx
666 adcq $0,%rdx 920 adcq $0,%rdx
667 movq %r13,-8(%r14) 921 movq %r13,-8(%r14)
668 movq %rdx,%r13 922 movq %rdx,%r13
669 923
670 addq $32,%r15 924 addq $32,%r15
671 jnz .Linner4x 925 jnz .Linner4x
672 926
673 mulq %rbx 927 mulq %rbx
674 addq %rax,%r10 928 addq %rax,%r10
675 » movq» -32(%rcx),%rax 929 » movq» -16(%rcx),%rax
676 adcq $0,%rdx 930 adcq $0,%rdx
677 addq 16(%r14),%r10 931 addq 16(%r14),%r10
678 leaq 32(%r14),%r14 932 leaq 32(%r14),%r14
679 adcq $0,%rdx 933 adcq $0,%rdx
680 movq %rdx,%r11 934 movq %rdx,%r11
681 935
682 mulq %rbp 936 mulq %rbp
683 addq %rax,%r13 937 addq %rax,%r13
684 movq -8(%rsi),%rax 938 movq -8(%rsi),%rax
685 adcq $0,%rdx 939 adcq $0,%rdx
686 addq %r10,%r13 940 addq %r10,%r13
687 adcq $0,%rdx 941 adcq $0,%rdx
688 movq %rdi,-32(%r14) 942 movq %rdi,-32(%r14)
689 movq %rdx,%rdi 943 movq %rdx,%rdi
690 944
691 mulq %rbx 945 mulq %rbx
692 addq %rax,%r11 946 addq %rax,%r11
693 movq %rbp,%rax 947 movq %rbp,%rax
694 » movq» -16(%rcx),%rbp 948 » movq» -8(%rcx),%rbp
695 adcq $0,%rdx 949 adcq $0,%rdx
696 addq -8(%r14),%r11 950 addq -8(%r14),%r11
697 adcq $0,%rdx 951 adcq $0,%rdx
698 movq %rdx,%r10 952 movq %rdx,%r10
699 953
700 mulq %rbp 954 mulq %rbp
701 addq %rax,%rdi 955 addq %rax,%rdi
702 movq (%rsi,%r9,1),%rax 956 movq (%rsi,%r9,1),%rax
703 adcq $0,%rdx 957 adcq $0,%rdx
704 addq %r11,%rdi 958 addq %r11,%rdi
705 adcq $0,%rdx 959 adcq $0,%rdx
706 movq %r13,-24(%r14) 960 movq %r13,-24(%r14)
707 movq %rdx,%r13 961 movq %rdx,%r13
708 962
709 .byte 102,72,15,126,195
710 movq %rdi,-16(%r14) 963 movq %rdi,-16(%r14)
711 » leaq» (%rcx,%r9,2),%rcx 964 » leaq» (%rcx,%r9,1),%rcx
712 965
713 xorq %rdi,%rdi 966 xorq %rdi,%rdi
714 addq %r10,%r13 967 addq %r10,%r13
715 adcq $0,%rdi 968 adcq $0,%rdi
716 addq (%r14),%r13 969 addq (%r14),%r13
717 adcq $0,%rdi 970 adcq $0,%rdi
718 movq %r13,-8(%r14) 971 movq %r13,-8(%r14)
719 972
720 cmpq 16+8(%rsp),%r12 973 cmpq 16+8(%rsp),%r12
721 jb .Louter4x 974 jb .Louter4x
975 xorq %rax,%rax
722 subq %r13,%rbp 976 subq %r13,%rbp
723 adcq %r15,%r15 977 adcq %r15,%r15
724 orq %r15,%rdi 978 orq %r15,%rdi
725 » xorq» $1,%rdi 979 » subq» %rdi,%rax
726 leaq (%r14,%r9,1),%rbx 980 leaq (%r14,%r9,1),%rbx
727 » leaq» (%rcx,%rdi,8),%rbp 981 » movq» (%rcx),%r12
982 » leaq» (%rcx),%rbp
728 movq %r9,%rcx 983 movq %r9,%rcx
729 sarq $3+2,%rcx 984 sarq $3+2,%rcx
730 movq 56+8(%rsp),%rdi 985 movq 56+8(%rsp),%rdi
731 » jmp» .Lsqr4x_sub 986 » decq» %r12
987 » xorq» %r10,%r10
988 » movq» 8(%rbp),%r13
989 » movq» 16(%rbp),%r14
990 » movq» 24(%rbp),%r15
991 » jmp» .Lsqr4x_sub_entry
732 .size mul4x_internal,.-mul4x_internal 992 .size mul4x_internal,.-mul4x_internal
733 .globl bn_power5 993 .globl bn_power5
734 .hidden bn_power5 994 .hidden bn_power5
735 .type bn_power5,@function 995 .type bn_power5,@function
736 .align 32 996 .align 32
737 bn_power5: 997 bn_power5:
738 movq %rsp,%rax 998 movq %rsp,%rax
739 pushq %rbx 999 pushq %rbx
740 pushq %rbp 1000 pushq %rbp
741 pushq %r12 1001 pushq %r12
742 pushq %r13 1002 pushq %r13
743 pushq %r14 1003 pushq %r14
744 pushq %r15 1004 pushq %r15
745 » movl» %r9d,%r10d 1005
746 shll $3,%r9d 1006 shll $3,%r9d
747 » shll» $3+2,%r10d 1007 » leal» (%r9,%r9,2),%r10d
748 negq %r9 1008 negq %r9
749 movq (%r8),%r8 1009 movq (%r8),%r8
750 1010
751 1011
752 1012
753 1013
754 1014
755 1015
756 1016
757 » leaq» -64(%rsp,%r9,2),%r11 1017
758 » subq» %rsi,%r11 1018 » leaq» -320(%rsp,%r9,2),%r11
1019 » subq» %rdi,%r11
759 andq $4095,%r11 1020 andq $4095,%r11
760 cmpq %r11,%r10 1021 cmpq %r11,%r10
761 jb .Lpwr_sp_alt 1022 jb .Lpwr_sp_alt
762 subq %r11,%rsp 1023 subq %r11,%rsp
763 » leaq» -64(%rsp,%r9,2),%rsp 1024 » leaq» -320(%rsp,%r9,2),%rsp
764 jmp .Lpwr_sp_done 1025 jmp .Lpwr_sp_done
765 1026
766 .align 32 1027 .align 32
767 .Lpwr_sp_alt: 1028 .Lpwr_sp_alt:
768 » leaq» 4096-64(,%r9,2),%r10 1029 » leaq» 4096-320(,%r9,2),%r10
769 » leaq» -64(%rsp,%r9,2),%rsp 1030 » leaq» -320(%rsp,%r9,2),%rsp
770 subq %r10,%r11 1031 subq %r10,%r11
771 movq $0,%r10 1032 movq $0,%r10
772 cmovcq %r10,%r11 1033 cmovcq %r10,%r11
773 subq %r11,%rsp 1034 subq %r11,%rsp
774 .Lpwr_sp_done: 1035 .Lpwr_sp_done:
775 andq $-64,%rsp 1036 andq $-64,%rsp
776 movq %r9,%r10 1037 movq %r9,%r10
777 negq %r9 1038 negq %r9
778 1039
779 1040
780 1041
781 1042
782 1043
783 1044
784 1045
785 1046
786 1047
787 1048
788 movq %r8,32(%rsp) 1049 movq %r8,32(%rsp)
789 movq %rax,40(%rsp) 1050 movq %rax,40(%rsp)
790 .Lpower5_body: 1051 .Lpower5_body:
791 .byte 102,72,15,110,207 1052 .byte 102,72,15,110,207
792 .byte 102,72,15,110,209 1053 .byte 102,72,15,110,209
793 .byte 102,73,15,110,218 1054 .byte 102,73,15,110,218
794 .byte 102,72,15,110,226 1055 .byte 102,72,15,110,226
795 1056
796 call __bn_sqr8x_internal 1057 call __bn_sqr8x_internal
1058 call __bn_post4x_internal
797 call __bn_sqr8x_internal 1059 call __bn_sqr8x_internal
1060 call __bn_post4x_internal
798 call __bn_sqr8x_internal 1061 call __bn_sqr8x_internal
1062 call __bn_post4x_internal
799 call __bn_sqr8x_internal 1063 call __bn_sqr8x_internal
1064 call __bn_post4x_internal
800 call __bn_sqr8x_internal 1065 call __bn_sqr8x_internal
1066 call __bn_post4x_internal
801 1067
802 .byte 102,72,15,126,209 1068 .byte 102,72,15,126,209
803 .byte 102,72,15,126,226 1069 .byte 102,72,15,126,226
804 movq %rsi,%rdi 1070 movq %rsi,%rdi
805 movq 40(%rsp),%rax 1071 movq 40(%rsp),%rax
806 leaq 32(%rsp),%r8 1072 leaq 32(%rsp),%r8
807 1073
808 call mul4x_internal 1074 call mul4x_internal
809 1075
810 movq 40(%rsp),%rsi 1076 movq 40(%rsp),%rsi
(...skipping 524 matching lines...) Expand 10 before | Expand all | Expand 10 after
1335 leaq (%rcx,%r11,2),%r8 1601 leaq (%rcx,%r11,2),%r8
1336 shrq $63,%r11 1602 shrq $63,%r11
1337 orq %r10,%r8 1603 orq %r10,%r8
1338 mulq %rax 1604 mulq %rax
1339 negq %r15 1605 negq %r15
1340 adcq %rax,%rbx 1606 adcq %rax,%rbx
1341 adcq %rdx,%r8 1607 adcq %rdx,%r8
1342 movq %rbx,-16(%rdi) 1608 movq %rbx,-16(%rdi)
1343 movq %r8,-8(%rdi) 1609 movq %r8,-8(%rdi)
1344 .byte 102,72,15,126,213 1610 .byte 102,72,15,126,213
1345 sqr8x_reduction: 1611 __bn_sqr8x_reduction:
1346 xorq %rax,%rax 1612 xorq %rax,%rax
1347 » leaq» (%rbp,%r9,2),%rcx 1613 » leaq» (%r9,%rbp,1),%rcx
1348 leaq 48+8(%rsp,%r9,2),%rdx 1614 leaq 48+8(%rsp,%r9,2),%rdx
1349 movq %rcx,0+8(%rsp) 1615 movq %rcx,0+8(%rsp)
1350 leaq 48+8(%rsp,%r9,1),%rdi 1616 leaq 48+8(%rsp,%r9,1),%rdi
1351 movq %rdx,8+8(%rsp) 1617 movq %rdx,8+8(%rsp)
1352 negq %r9 1618 negq %r9
1353 jmp .L8x_reduction_loop 1619 jmp .L8x_reduction_loop
1354 1620
1355 .align 32 1621 .align 32
1356 .L8x_reduction_loop: 1622 .L8x_reduction_loop:
1357 leaq (%rdi,%r9,1),%rdi 1623 leaq (%rdi,%r9,1),%rdi
(...skipping 12 matching lines...) Expand all
1370 .byte 0x67 1636 .byte 0x67
1371 movq %rbx,%r8 1637 movq %rbx,%r8
1372 imulq 32+8(%rsp),%rbx 1638 imulq 32+8(%rsp),%rbx
1373 movq 0(%rbp),%rax 1639 movq 0(%rbp),%rax
1374 movl $8,%ecx 1640 movl $8,%ecx
1375 jmp .L8x_reduce 1641 jmp .L8x_reduce
1376 1642
1377 .align 32 1643 .align 32
1378 .L8x_reduce: 1644 .L8x_reduce:
1379 mulq %rbx 1645 mulq %rbx
1380 » movq» 16(%rbp),%rax 1646 » movq» 8(%rbp),%rax
1381 negq %r8 1647 negq %r8
1382 movq %rdx,%r8 1648 movq %rdx,%r8
1383 adcq $0,%r8 1649 adcq $0,%r8
1384 1650
1385 mulq %rbx 1651 mulq %rbx
1386 addq %rax,%r9 1652 addq %rax,%r9
1387 » movq» 32(%rbp),%rax 1653 » movq» 16(%rbp),%rax
1388 adcq $0,%rdx 1654 adcq $0,%rdx
1389 addq %r9,%r8 1655 addq %r9,%r8
1390 movq %rbx,48-8+8(%rsp,%rcx,8) 1656 movq %rbx,48-8+8(%rsp,%rcx,8)
1391 movq %rdx,%r9 1657 movq %rdx,%r9
1392 adcq $0,%r9 1658 adcq $0,%r9
1393 1659
1394 mulq %rbx 1660 mulq %rbx
1395 addq %rax,%r10 1661 addq %rax,%r10
1396 » movq» 48(%rbp),%rax 1662 » movq» 24(%rbp),%rax
1397 adcq $0,%rdx 1663 adcq $0,%rdx
1398 addq %r10,%r9 1664 addq %r10,%r9
1399 movq 32+8(%rsp),%rsi 1665 movq 32+8(%rsp),%rsi
1400 movq %rdx,%r10 1666 movq %rdx,%r10
1401 adcq $0,%r10 1667 adcq $0,%r10
1402 1668
1403 mulq %rbx 1669 mulq %rbx
1404 addq %rax,%r11 1670 addq %rax,%r11
1405 » movq» 64(%rbp),%rax 1671 » movq» 32(%rbp),%rax
1406 adcq $0,%rdx 1672 adcq $0,%rdx
1407 imulq %r8,%rsi 1673 imulq %r8,%rsi
1408 addq %r11,%r10 1674 addq %r11,%r10
1409 movq %rdx,%r11 1675 movq %rdx,%r11
1410 adcq $0,%r11 1676 adcq $0,%r11
1411 1677
1412 mulq %rbx 1678 mulq %rbx
1413 addq %rax,%r12 1679 addq %rax,%r12
1414 » movq» 80(%rbp),%rax 1680 » movq» 40(%rbp),%rax
1415 adcq $0,%rdx 1681 adcq $0,%rdx
1416 addq %r12,%r11 1682 addq %r12,%r11
1417 movq %rdx,%r12 1683 movq %rdx,%r12
1418 adcq $0,%r12 1684 adcq $0,%r12
1419 1685
1420 mulq %rbx 1686 mulq %rbx
1421 addq %rax,%r13 1687 addq %rax,%r13
1422 » movq» 96(%rbp),%rax 1688 » movq» 48(%rbp),%rax
1423 adcq $0,%rdx 1689 adcq $0,%rdx
1424 addq %r13,%r12 1690 addq %r13,%r12
1425 movq %rdx,%r13 1691 movq %rdx,%r13
1426 adcq $0,%r13 1692 adcq $0,%r13
1427 1693
1428 mulq %rbx 1694 mulq %rbx
1429 addq %rax,%r14 1695 addq %rax,%r14
1430 » movq» 112(%rbp),%rax 1696 » movq» 56(%rbp),%rax
1431 adcq $0,%rdx 1697 adcq $0,%rdx
1432 addq %r14,%r13 1698 addq %r14,%r13
1433 movq %rdx,%r14 1699 movq %rdx,%r14
1434 adcq $0,%r14 1700 adcq $0,%r14
1435 1701
1436 mulq %rbx 1702 mulq %rbx
1437 movq %rsi,%rbx 1703 movq %rsi,%rbx
1438 addq %rax,%r15 1704 addq %rax,%r15
1439 movq 0(%rbp),%rax 1705 movq 0(%rbp),%rax
1440 adcq $0,%rdx 1706 adcq $0,%rdx
1441 addq %r15,%r14 1707 addq %r15,%r14
1442 movq %rdx,%r15 1708 movq %rdx,%r15
1443 adcq $0,%r15 1709 adcq $0,%r15
1444 1710
1445 decl %ecx 1711 decl %ecx
1446 jnz .L8x_reduce 1712 jnz .L8x_reduce
1447 1713
1448 » leaq» 128(%rbp),%rbp 1714 » leaq» 64(%rbp),%rbp
1449 xorq %rax,%rax 1715 xorq %rax,%rax
1450 movq 8+8(%rsp),%rdx 1716 movq 8+8(%rsp),%rdx
1451 cmpq 0+8(%rsp),%rbp 1717 cmpq 0+8(%rsp),%rbp
1452 jae .L8x_no_tail 1718 jae .L8x_no_tail
1453 1719
1454 .byte 0x66 1720 .byte 0x66
1455 addq 0(%rdi),%r8 1721 addq 0(%rdi),%r8
1456 adcq 8(%rdi),%r9 1722 adcq 8(%rdi),%r9
1457 adcq 16(%rdi),%r10 1723 adcq 16(%rdi),%r10
1458 adcq 24(%rdi),%r11 1724 adcq 24(%rdi),%r11
1459 adcq 32(%rdi),%r12 1725 adcq 32(%rdi),%r12
1460 adcq 40(%rdi),%r13 1726 adcq 40(%rdi),%r13
1461 adcq 48(%rdi),%r14 1727 adcq 48(%rdi),%r14
1462 adcq 56(%rdi),%r15 1728 adcq 56(%rdi),%r15
1463 sbbq %rsi,%rsi 1729 sbbq %rsi,%rsi
1464 1730
1465 movq 48+56+8(%rsp),%rbx 1731 movq 48+56+8(%rsp),%rbx
1466 movl $8,%ecx 1732 movl $8,%ecx
1467 movq 0(%rbp),%rax 1733 movq 0(%rbp),%rax
1468 jmp .L8x_tail 1734 jmp .L8x_tail
1469 1735
1470 .align 32 1736 .align 32
1471 .L8x_tail: 1737 .L8x_tail:
1472 mulq %rbx 1738 mulq %rbx
1473 addq %rax,%r8 1739 addq %rax,%r8
1474 » movq» 16(%rbp),%rax 1740 » movq» 8(%rbp),%rax
1475 movq %r8,(%rdi) 1741 movq %r8,(%rdi)
1476 movq %rdx,%r8 1742 movq %rdx,%r8
1477 adcq $0,%r8 1743 adcq $0,%r8
1478 1744
1479 mulq %rbx 1745 mulq %rbx
1480 addq %rax,%r9 1746 addq %rax,%r9
1481 » movq» 32(%rbp),%rax 1747 » movq» 16(%rbp),%rax
1482 adcq $0,%rdx 1748 adcq $0,%rdx
1483 addq %r9,%r8 1749 addq %r9,%r8
1484 leaq 8(%rdi),%rdi 1750 leaq 8(%rdi),%rdi
1485 movq %rdx,%r9 1751 movq %rdx,%r9
1486 adcq $0,%r9 1752 adcq $0,%r9
1487 1753
1488 mulq %rbx 1754 mulq %rbx
1489 addq %rax,%r10 1755 addq %rax,%r10
1490 » movq» 48(%rbp),%rax 1756 » movq» 24(%rbp),%rax
1491 adcq $0,%rdx 1757 adcq $0,%rdx
1492 addq %r10,%r9 1758 addq %r10,%r9
1493 movq %rdx,%r10 1759 movq %rdx,%r10
1494 adcq $0,%r10 1760 adcq $0,%r10
1495 1761
1496 mulq %rbx 1762 mulq %rbx
1497 addq %rax,%r11 1763 addq %rax,%r11
1498 » movq» 64(%rbp),%rax 1764 » movq» 32(%rbp),%rax
1499 adcq $0,%rdx 1765 adcq $0,%rdx
1500 addq %r11,%r10 1766 addq %r11,%r10
1501 movq %rdx,%r11 1767 movq %rdx,%r11
1502 adcq $0,%r11 1768 adcq $0,%r11
1503 1769
1504 mulq %rbx 1770 mulq %rbx
1505 addq %rax,%r12 1771 addq %rax,%r12
1506 » movq» 80(%rbp),%rax 1772 » movq» 40(%rbp),%rax
1507 adcq $0,%rdx 1773 adcq $0,%rdx
1508 addq %r12,%r11 1774 addq %r12,%r11
1509 movq %rdx,%r12 1775 movq %rdx,%r12
1510 adcq $0,%r12 1776 adcq $0,%r12
1511 1777
1512 mulq %rbx 1778 mulq %rbx
1513 addq %rax,%r13 1779 addq %rax,%r13
1514 » movq» 96(%rbp),%rax 1780 » movq» 48(%rbp),%rax
1515 adcq $0,%rdx 1781 adcq $0,%rdx
1516 addq %r13,%r12 1782 addq %r13,%r12
1517 movq %rdx,%r13 1783 movq %rdx,%r13
1518 adcq $0,%r13 1784 adcq $0,%r13
1519 1785
1520 mulq %rbx 1786 mulq %rbx
1521 addq %rax,%r14 1787 addq %rax,%r14
1522 » movq» 112(%rbp),%rax 1788 » movq» 56(%rbp),%rax
1523 adcq $0,%rdx 1789 adcq $0,%rdx
1524 addq %r14,%r13 1790 addq %r14,%r13
1525 movq %rdx,%r14 1791 movq %rdx,%r14
1526 adcq $0,%r14 1792 adcq $0,%r14
1527 1793
1528 mulq %rbx 1794 mulq %rbx
1529 movq 48-16+8(%rsp,%rcx,8),%rbx 1795 movq 48-16+8(%rsp,%rcx,8),%rbx
1530 addq %rax,%r15 1796 addq %rax,%r15
1531 adcq $0,%rdx 1797 adcq $0,%rdx
1532 addq %r15,%r14 1798 addq %r15,%r14
1533 movq 0(%rbp),%rax 1799 movq 0(%rbp),%rax
1534 movq %rdx,%r15 1800 movq %rdx,%r15
1535 adcq $0,%r15 1801 adcq $0,%r15
1536 1802
1537 decl %ecx 1803 decl %ecx
1538 jnz .L8x_tail 1804 jnz .L8x_tail
1539 1805
1540 » leaq» 128(%rbp),%rbp 1806 » leaq» 64(%rbp),%rbp
1541 movq 8+8(%rsp),%rdx 1807 movq 8+8(%rsp),%rdx
1542 cmpq 0+8(%rsp),%rbp 1808 cmpq 0+8(%rsp),%rbp
1543 jae .L8x_tail_done 1809 jae .L8x_tail_done
1544 1810
1545 movq 48+56+8(%rsp),%rbx 1811 movq 48+56+8(%rsp),%rbx
1546 negq %rsi 1812 negq %rsi
1547 movq 0(%rbp),%rax 1813 movq 0(%rbp),%rax
1548 adcq 0(%rdi),%r8 1814 adcq 0(%rdi),%r8
1549 adcq 8(%rdi),%r9 1815 adcq 8(%rdi),%r9
1550 adcq 16(%rdi),%r10 1816 adcq 16(%rdi),%r10
1551 adcq 24(%rdi),%r11 1817 adcq 24(%rdi),%r11
1552 adcq 32(%rdi),%r12 1818 adcq 32(%rdi),%r12
1553 adcq 40(%rdi),%r13 1819 adcq 40(%rdi),%r13
1554 adcq 48(%rdi),%r14 1820 adcq 48(%rdi),%r14
1555 adcq 56(%rdi),%r15 1821 adcq 56(%rdi),%r15
1556 sbbq %rsi,%rsi 1822 sbbq %rsi,%rsi
1557 1823
1558 movl $8,%ecx 1824 movl $8,%ecx
1559 jmp .L8x_tail 1825 jmp .L8x_tail
1560 1826
1561 .align 32 1827 .align 32
1562 .L8x_tail_done: 1828 .L8x_tail_done:
1563 addq (%rdx),%r8 1829 addq (%rdx),%r8
1830 adcq $0,%r9
1831 adcq $0,%r10
1832 adcq $0,%r11
1833 adcq $0,%r12
1834 adcq $0,%r13
1835 adcq $0,%r14
1836 adcq $0,%r15
1837
1838
1564 xorq %rax,%rax 1839 xorq %rax,%rax
1565 1840
1566 negq %rsi 1841 negq %rsi
1567 .L8x_no_tail: 1842 .L8x_no_tail:
1568 adcq 0(%rdi),%r8 1843 adcq 0(%rdi),%r8
1569 adcq 8(%rdi),%r9 1844 adcq 8(%rdi),%r9
1570 adcq 16(%rdi),%r10 1845 adcq 16(%rdi),%r10
1571 adcq 24(%rdi),%r11 1846 adcq 24(%rdi),%r11
1572 adcq 32(%rdi),%r12 1847 adcq 32(%rdi),%r12
1573 adcq 40(%rdi),%r13 1848 adcq 40(%rdi),%r13
1574 adcq 48(%rdi),%r14 1849 adcq 48(%rdi),%r14
1575 adcq 56(%rdi),%r15 1850 adcq 56(%rdi),%r15
1576 adcq $0,%rax 1851 adcq $0,%rax
1577 » movq» -16(%rbp),%rcx 1852 » movq» -8(%rbp),%rcx
1578 xorq %rsi,%rsi 1853 xorq %rsi,%rsi
1579 1854
1580 .byte 102,72,15,126,213 1855 .byte 102,72,15,126,213
1581 1856
1582 movq %r8,0(%rdi) 1857 movq %r8,0(%rdi)
1583 movq %r9,8(%rdi) 1858 movq %r9,8(%rdi)
1584 .byte 102,73,15,126,217 1859 .byte 102,73,15,126,217
1585 movq %r10,16(%rdi) 1860 movq %r10,16(%rdi)
1586 movq %r11,24(%rdi) 1861 movq %r11,24(%rdi)
1587 movq %r12,32(%rdi) 1862 movq %r12,32(%rdi)
1588 movq %r13,40(%rdi) 1863 movq %r13,40(%rdi)
1589 movq %r14,48(%rdi) 1864 movq %r14,48(%rdi)
1590 movq %r15,56(%rdi) 1865 movq %r15,56(%rdi)
1591 leaq 64(%rdi),%rdi 1866 leaq 64(%rdi),%rdi
1592 1867
1593 cmpq %rdx,%rdi 1868 cmpq %rdx,%rdi
1594 jb .L8x_reduction_loop 1869 jb .L8x_reduction_loop
1870 .byte 0xf3,0xc3
1871 .size bn_sqr8x_internal,.-bn_sqr8x_internal
1872 .type __bn_post4x_internal,@function
1873 .align 32
1874 __bn_post4x_internal:
1875 movq 0(%rbp),%r12
1876 leaq (%rdi,%r9,1),%rbx
1877 movq %r9,%rcx
1878 .byte 102,72,15,126,207
1879 negq %rax
1880 .byte 102,72,15,126,206
1881 sarq $3+2,%rcx
1882 decq %r12
1883 xorq %r10,%r10
1884 movq 8(%rbp),%r13
1885 movq 16(%rbp),%r14
1886 movq 24(%rbp),%r15
1887 jmp .Lsqr4x_sub_entry
1595 1888
1596 » subq» %r15,%rcx 1889 .align» 16
1597 » leaq» (%rdi,%r9,1),%rbx 1890 .Lsqr4x_sub:
1598 » adcq» %rsi,%rsi 1891 » movq» 0(%rbp),%r12
1599 » movq» %r9,%rcx 1892 » movq» 8(%rbp),%r13
1600 » orq» %rsi,%rax 1893 » movq» 16(%rbp),%r14
1601 .byte» 102,72,15,126,207 1894 » movq» 24(%rbp),%r15
1602 » xorq» $1,%rax 1895 .Lsqr4x_sub_entry:
1603 .byte» 102,72,15,126,206 1896 » leaq» 32(%rbp),%rbp
1604 » leaq» (%rbp,%rax,8),%rbp 1897 » notq» %r12
1605 » sarq» $3+2,%rcx 1898 » notq» %r13
1606 » jmp» .Lsqr4x_sub 1899 » notq» %r14
1900 » notq» %r15
1901 » andq» %rax,%r12
1902 » andq» %rax,%r13
1903 » andq» %rax,%r14
1904 » andq» %rax,%r15
1607 1905
1608 .align» 32 1906 » negq» %r10
1609 .Lsqr4x_sub: 1907 » adcq» 0(%rbx),%r12
1610 .byte» 0x66 1908 » adcq» 8(%rbx),%r13
1611 » movq» 0(%rbx),%r12 1909 » adcq» 16(%rbx),%r14
1612 » movq» 8(%rbx),%r13 1910 » adcq» 24(%rbx),%r15
1613 » sbbq» 0(%rbp),%r12 1911 » movq» %r12,0(%rdi)
1614 » movq» 16(%rbx),%r14
1615 » sbbq» 16(%rbp),%r13
1616 » movq» 24(%rbx),%r15
1617 leaq 32(%rbx),%rbx 1912 leaq 32(%rbx),%rbx
1618 sbbq 32(%rbp),%r14
1619 movq %r12,0(%rdi)
1620 sbbq 48(%rbp),%r15
1621 leaq 64(%rbp),%rbp
1622 movq %r13,8(%rdi) 1913 movq %r13,8(%rdi)
1914 sbbq %r10,%r10
1623 movq %r14,16(%rdi) 1915 movq %r14,16(%rdi)
1624 movq %r15,24(%rdi) 1916 movq %r15,24(%rdi)
1625 leaq 32(%rdi),%rdi 1917 leaq 32(%rdi),%rdi
1626 1918
1627 incq %rcx 1919 incq %rcx
1628 jnz .Lsqr4x_sub 1920 jnz .Lsqr4x_sub
1921
1629 movq %r9,%r10 1922 movq %r9,%r10
1630 negq %r9 1923 negq %r9
1631 .byte 0xf3,0xc3 1924 .byte 0xf3,0xc3
1632 .size» bn_sqr8x_internal,.-bn_sqr8x_internal 1925 .size» __bn_post4x_internal,.-__bn_post4x_internal
1633 .globl bn_from_montgomery 1926 .globl bn_from_montgomery
1634 .hidden bn_from_montgomery 1927 .hidden bn_from_montgomery
1635 .type bn_from_montgomery,@function 1928 .type bn_from_montgomery,@function
1636 .align 32 1929 .align 32
1637 bn_from_montgomery: 1930 bn_from_montgomery:
1638 testl $7,%r9d 1931 testl $7,%r9d
1639 jz bn_from_mont8x 1932 jz bn_from_mont8x
1640 xorl %eax,%eax 1933 xorl %eax,%eax
1641 .byte 0xf3,0xc3 1934 .byte 0xf3,0xc3
1642 .size bn_from_montgomery,.-bn_from_montgomery 1935 .size bn_from_montgomery,.-bn_from_montgomery
1643 1936
1644 .type bn_from_mont8x,@function 1937 .type bn_from_mont8x,@function
1645 .align 32 1938 .align 32
1646 bn_from_mont8x: 1939 bn_from_mont8x:
1647 .byte 0x67 1940 .byte 0x67
1648 movq %rsp,%rax 1941 movq %rsp,%rax
1649 pushq %rbx 1942 pushq %rbx
1650 pushq %rbp 1943 pushq %rbp
1651 pushq %r12 1944 pushq %r12
1652 pushq %r13 1945 pushq %r13
1653 pushq %r14 1946 pushq %r14
1654 pushq %r15 1947 pushq %r15
1655 .byte» 0x67 1948
1656 » movl» %r9d,%r10d
1657 shll $3,%r9d 1949 shll $3,%r9d
1658 » shll» $3+2,%r10d 1950 » leaq» (%r9,%r9,2),%r10
1659 negq %r9 1951 negq %r9
1660 movq (%r8),%r8 1952 movq (%r8),%r8
1661 1953
1662 1954
1663 1955
1664 1956
1665 1957
1666 1958
1667 1959
1668 » leaq» -64(%rsp,%r9,2),%r11 1960
1669 » subq» %rsi,%r11 1961 » leaq» -320(%rsp,%r9,2),%r11
1962 » subq» %rdi,%r11
1670 andq $4095,%r11 1963 andq $4095,%r11
1671 cmpq %r11,%r10 1964 cmpq %r11,%r10
1672 jb .Lfrom_sp_alt 1965 jb .Lfrom_sp_alt
1673 subq %r11,%rsp 1966 subq %r11,%rsp
1674 » leaq» -64(%rsp,%r9,2),%rsp 1967 » leaq» -320(%rsp,%r9,2),%rsp
1675 jmp .Lfrom_sp_done 1968 jmp .Lfrom_sp_done
1676 1969
1677 .align 32 1970 .align 32
1678 .Lfrom_sp_alt: 1971 .Lfrom_sp_alt:
1679 » leaq» 4096-64(,%r9,2),%r10 1972 » leaq» 4096-320(,%r9,2),%r10
1680 » leaq» -64(%rsp,%r9,2),%rsp 1973 » leaq» -320(%rsp,%r9,2),%rsp
1681 subq %r10,%r11 1974 subq %r10,%r11
1682 movq $0,%r10 1975 movq $0,%r10
1683 cmovcq %r10,%r11 1976 cmovcq %r10,%r11
1684 subq %r11,%rsp 1977 subq %r11,%rsp
1685 .Lfrom_sp_done: 1978 .Lfrom_sp_done:
1686 andq $-64,%rsp 1979 andq $-64,%rsp
1687 movq %r9,%r10 1980 movq %r9,%r10
1688 negq %r9 1981 negq %r9
1689 1982
1690 1983
(...skipping 30 matching lines...) Expand all
1721 movdqa %xmm4,48(%rax) 2014 movdqa %xmm4,48(%rax)
1722 leaq 64(%rax),%rax 2015 leaq 64(%rax),%rax
1723 subq $64,%r11 2016 subq $64,%r11
1724 jnz .Lmul_by_1 2017 jnz .Lmul_by_1
1725 2018
1726 .byte 102,72,15,110,207 2019 .byte 102,72,15,110,207
1727 .byte 102,72,15,110,209 2020 .byte 102,72,15,110,209
1728 .byte 0x67 2021 .byte 0x67
1729 movq %rcx,%rbp 2022 movq %rcx,%rbp
1730 .byte 102,73,15,110,218 2023 .byte 102,73,15,110,218
1731 » call» sqr8x_reduction 2024 » call» __bn_sqr8x_reduction
2025 » call» __bn_post4x_internal
1732 2026
1733 pxor %xmm0,%xmm0 2027 pxor %xmm0,%xmm0
1734 leaq 48(%rsp),%rax 2028 leaq 48(%rsp),%rax
1735 movq 40(%rsp),%rsi 2029 movq 40(%rsp),%rsi
1736 jmp .Lfrom_mont_zero 2030 jmp .Lfrom_mont_zero
1737 2031
1738 .align 32 2032 .align 32
1739 .Lfrom_mont_zero: 2033 .Lfrom_mont_zero:
1740 movdqa %xmm0,0(%rax) 2034 movdqa %xmm0,0(%rax)
1741 movdqa %xmm0,16(%rax) 2035 movdqa %xmm0,16(%rax)
(...skipping 29 matching lines...) Expand all
1771 leaq 256(%rdx),%rdx 2065 leaq 256(%rdx),%rdx
1772 subl $1,%esi 2066 subl $1,%esi
1773 jnz .Lscatter 2067 jnz .Lscatter
1774 .Lscatter_epilogue: 2068 .Lscatter_epilogue:
1775 .byte 0xf3,0xc3 2069 .byte 0xf3,0xc3
1776 .size bn_scatter5,.-bn_scatter5 2070 .size bn_scatter5,.-bn_scatter5
1777 2071
1778 .globl bn_gather5 2072 .globl bn_gather5
1779 .hidden bn_gather5 2073 .hidden bn_gather5
1780 .type bn_gather5,@function 2074 .type bn_gather5,@function
1781 .align» 16 2075 .align» 32
1782 bn_gather5: 2076 bn_gather5:
1783 » movl» %ecx,%r11d 2077 .LSEH_begin_bn_gather5:
1784 » shrl» $3,%ecx 2078
1785 » andq» $7,%r11 2079 .byte» 0x4c,0x8d,0x14,0x24
1786 » notl» %ecx 2080 .byte» 0x48,0x81,0xec,0x08,0x01,0x00,0x00
1787 » leaq» .Lmagic_masks(%rip),%rax 2081 » leaq» .Linc(%rip),%rax
1788 » andl» $3,%ecx 2082 » andq» $-16,%rsp
1789 » leaq» 128(%rdx,%r11,8),%rdx 2083
1790 » movq» 0(%rax,%rcx,8),%xmm4 2084 » movd» %ecx,%xmm5
1791 » movq» 8(%rax,%rcx,8),%xmm5 2085 » movdqa» 0(%rax),%xmm0
1792 » movq» 16(%rax,%rcx,8),%xmm6 2086 » movdqa» 16(%rax),%xmm1
1793 » movq» 24(%rax,%rcx,8),%xmm7 2087 » leaq» 128(%rdx),%r11
2088 » leaq» 128(%rsp),%rax
2089
2090 » pshufd» $0,%xmm5,%xmm5
2091 » movdqa» %xmm1,%xmm4
2092 » movdqa» %xmm1,%xmm2
2093 » paddd» %xmm0,%xmm1
2094 » pcmpeqd»%xmm5,%xmm0
2095 » movdqa» %xmm4,%xmm3
2096
2097 » paddd» %xmm1,%xmm2
2098 » pcmpeqd»%xmm5,%xmm1
2099 » movdqa» %xmm0,-128(%rax)
2100 » movdqa» %xmm4,%xmm0
2101
2102 » paddd» %xmm2,%xmm3
2103 » pcmpeqd»%xmm5,%xmm2
2104 » movdqa» %xmm1,-112(%rax)
2105 » movdqa» %xmm4,%xmm1
2106
2107 » paddd» %xmm3,%xmm0
2108 » pcmpeqd»%xmm5,%xmm3
2109 » movdqa» %xmm2,-96(%rax)
2110 » movdqa» %xmm4,%xmm2
2111 » paddd» %xmm0,%xmm1
2112 » pcmpeqd»%xmm5,%xmm0
2113 » movdqa» %xmm3,-80(%rax)
2114 » movdqa» %xmm4,%xmm3
2115
2116 » paddd» %xmm1,%xmm2
2117 » pcmpeqd»%xmm5,%xmm1
2118 » movdqa» %xmm0,-64(%rax)
2119 » movdqa» %xmm4,%xmm0
2120
2121 » paddd» %xmm2,%xmm3
2122 » pcmpeqd»%xmm5,%xmm2
2123 » movdqa» %xmm1,-48(%rax)
2124 » movdqa» %xmm4,%xmm1
2125
2126 » paddd» %xmm3,%xmm0
2127 » pcmpeqd»%xmm5,%xmm3
2128 » movdqa» %xmm2,-32(%rax)
2129 » movdqa» %xmm4,%xmm2
2130 » paddd» %xmm0,%xmm1
2131 » pcmpeqd»%xmm5,%xmm0
2132 » movdqa» %xmm3,-16(%rax)
2133 » movdqa» %xmm4,%xmm3
2134
2135 » paddd» %xmm1,%xmm2
2136 » pcmpeqd»%xmm5,%xmm1
2137 » movdqa» %xmm0,0(%rax)
2138 » movdqa» %xmm4,%xmm0
2139
2140 » paddd» %xmm2,%xmm3
2141 » pcmpeqd»%xmm5,%xmm2
2142 » movdqa» %xmm1,16(%rax)
2143 » movdqa» %xmm4,%xmm1
2144
2145 » paddd» %xmm3,%xmm0
2146 » pcmpeqd»%xmm5,%xmm3
2147 » movdqa» %xmm2,32(%rax)
2148 » movdqa» %xmm4,%xmm2
2149 » paddd» %xmm0,%xmm1
2150 » pcmpeqd»%xmm5,%xmm0
2151 » movdqa» %xmm3,48(%rax)
2152 » movdqa» %xmm4,%xmm3
2153
2154 » paddd» %xmm1,%xmm2
2155 » pcmpeqd»%xmm5,%xmm1
2156 » movdqa» %xmm0,64(%rax)
2157 » movdqa» %xmm4,%xmm0
2158
2159 » paddd» %xmm2,%xmm3
2160 » pcmpeqd»%xmm5,%xmm2
2161 » movdqa» %xmm1,80(%rax)
2162 » movdqa» %xmm4,%xmm1
2163
2164 » paddd» %xmm3,%xmm0
2165 » pcmpeqd»%xmm5,%xmm3
2166 » movdqa» %xmm2,96(%rax)
2167 » movdqa» %xmm4,%xmm2
2168 » movdqa» %xmm3,112(%rax)
1794 jmp .Lgather 2169 jmp .Lgather
1795 .align» 16 2170
2171 .align» 32
1796 .Lgather: 2172 .Lgather:
1797 » movq» -128(%rdx),%xmm0 2173 » pxor» %xmm4,%xmm4
1798 » movq» -64(%rdx),%xmm1 2174 » pxor» %xmm5,%xmm5
1799 » pand» %xmm4,%xmm0 2175 » movdqa» -128(%r11),%xmm0
1800 » movq» 0(%rdx),%xmm2 2176 » movdqa» -112(%r11),%xmm1
1801 » pand» %xmm5,%xmm1 2177 » movdqa» -96(%r11),%xmm2
1802 » movq» 64(%rdx),%xmm3 2178 » pand» -128(%rax),%xmm0
1803 » pand» %xmm6,%xmm2 2179 » movdqa» -80(%r11),%xmm3
1804 » por» %xmm1,%xmm0 2180 » pand» -112(%rax),%xmm1
1805 » pand» %xmm7,%xmm3 2181 » por» %xmm0,%xmm4
1806 .byte» 0x67,0x67 2182 » pand» -96(%rax),%xmm2
1807 » por» %xmm2,%xmm0 2183 » por» %xmm1,%xmm5
1808 » leaq» 256(%rdx),%rdx 2184 » pand» -80(%rax),%xmm3
1809 » por» %xmm3,%xmm0 2185 » por» %xmm2,%xmm4
1810 2186 » por» %xmm3,%xmm5
2187 » movdqa» -64(%r11),%xmm0
2188 » movdqa» -48(%r11),%xmm1
2189 » movdqa» -32(%r11),%xmm2
2190 » pand» -64(%rax),%xmm0
2191 » movdqa» -16(%r11),%xmm3
2192 » pand» -48(%rax),%xmm1
2193 » por» %xmm0,%xmm4
2194 » pand» -32(%rax),%xmm2
2195 » por» %xmm1,%xmm5
2196 » pand» -16(%rax),%xmm3
2197 » por» %xmm2,%xmm4
2198 » por» %xmm3,%xmm5
2199 » movdqa» 0(%r11),%xmm0
2200 » movdqa» 16(%r11),%xmm1
2201 » movdqa» 32(%r11),%xmm2
2202 » pand» 0(%rax),%xmm0
2203 » movdqa» 48(%r11),%xmm3
2204 » pand» 16(%rax),%xmm1
2205 » por» %xmm0,%xmm4
2206 » pand» 32(%rax),%xmm2
2207 » por» %xmm1,%xmm5
2208 » pand» 48(%rax),%xmm3
2209 » por» %xmm2,%xmm4
2210 » por» %xmm3,%xmm5
2211 » movdqa» 64(%r11),%xmm0
2212 » movdqa» 80(%r11),%xmm1
2213 » movdqa» 96(%r11),%xmm2
2214 » pand» 64(%rax),%xmm0
2215 » movdqa» 112(%r11),%xmm3
2216 » pand» 80(%rax),%xmm1
2217 » por» %xmm0,%xmm4
2218 » pand» 96(%rax),%xmm2
2219 » por» %xmm1,%xmm5
2220 » pand» 112(%rax),%xmm3
2221 » por» %xmm2,%xmm4
2222 » por» %xmm3,%xmm5
2223 » por» %xmm5,%xmm4
2224 » leaq» 256(%r11),%r11
2225 » pshufd» $0x4e,%xmm4,%xmm0
2226 » por» %xmm4,%xmm0
1811 movq %xmm0,(%rdi) 2227 movq %xmm0,(%rdi)
1812 leaq 8(%rdi),%rdi 2228 leaq 8(%rdi),%rdi
1813 subl $1,%esi 2229 subl $1,%esi
1814 jnz .Lgather 2230 jnz .Lgather
2231
2232 leaq (%r10),%rsp
1815 .byte 0xf3,0xc3 2233 .byte 0xf3,0xc3
1816 .LSEH_end_bn_gather5: 2234 .LSEH_end_bn_gather5:
1817 .size bn_gather5,.-bn_gather5 2235 .size bn_gather5,.-bn_gather5
1818 .align 64 2236 .align 64
1819 .Lmagic_masks: 2237 .Linc:
1820 .long» 0,0, 0,0, 0,0, -1,-1 2238 .long» 0,0, 1,1
1821 .long» 0,0, 0,0, 0,0, 0,0 2239 .long» 2,2, 2,2
1822 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105 ,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97 ,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71 ,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,1 11,114,103,62,0 2240 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105 ,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97 ,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71 ,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,1 11,114,103,62,0
1823 #endif 2241 #endif
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698