OLD | NEW |
| (Empty) |
1 #if defined(__i386__) | |
2 .file "chacha-x86.S" | |
3 .text | |
/*
 * ChaCha20_ctr32 -- XOR a buffer with ChaCha20 keystream (i386, AT&T syntax).
 *
 * Integer-only implementation, with a dispatch into ChaCha20_ssse3 when the
 * capability words in OPENSSL_ia32cap_P allow it.
 *
 * Stack arguments as they are used below (offsets are relative to %esp after
 * the four register pushes):
 *   20(%esp)  output pointer
 *   24(%esp)  input pointer
 *   28(%esp)  length in bytes; a zero length returns immediately
 *   32(%esp)  pointer to 8 x 32-bit key words
 *   36(%esp)  pointer to 4 x 32-bit counter/nonce words; word 0 is advanced
 *             by one per 64-byte block in the local copy only (nothing is
 *             written back through this pointer)
 *
 * Local frame (132 bytes; the arguments above move to 152..168(%esp)):
 *   0..60(%esp)    working state words x0..x15 (x0, x4, x8, x9, x12 and x14
 *                  live in %eax, %ebp, %ecx, %esi, %edx, %edi across the
 *                  round loop)
 *   80..108(%esp)  copy of the eight key words        (input words 4..11)
 *   112..124(%esp) counter/nonce words                 (input words 12..15)
 *   128(%esp)      spill slot for the round-loop counter
 *
 * %ebp, %ebx, %esi and %edi are saved and restored.
 * NOTE(review): the key copy and the last keystream block remain in the
 * released stack frame; nothing here clears them.
 */
4 .globl ChaCha20_ctr32 | |
5 .hidden ChaCha20_ctr32 | |
6 .type ChaCha20_ctr32,@function | |
7 .align 16 | |
8 ChaCha20_ctr32: | |
9 .L_ChaCha20_ctr32_begin: | |
10 pushl %ebp | |
11 pushl %ebx | |
12 pushl %esi | |
13 pushl %edi | |
/* Nothing to do for a zero length. */
14 xorl %eax,%eax | |
15 cmpl 28(%esp),%eax | |
16 je .L000no_data | |
/* call/pop idiom: %eax = run-time address of .Lpic_point (position-independent). */
17 call .Lpic_point | |
18 .Lpic_point: | |
19 popl %eax | |
20 leal OPENSSL_ia32cap_P-.Lpic_point(%eax),%ebp | |
/*
 * Take the SSSE3 path only if bit 24 of capability word 0 and bit 9 of
 * capability word 1 are both set (presumably FXSR and SSSE3 -- confirm
 * against the OPENSSL_ia32cap_P layout). %eax still holds the address of
 * .Lpic_point, which the code at .Lssse3_shortcut relies on.
 */
21 testl $16777216,(%ebp) | |
22 jz .L001x86 | |
23 testl $512,4(%ebp) | |
24 jz .L001x86 | |
25 jmp .Lssse3_shortcut | |
26 .L001x86: | |
/* %esi = key pointer, %edi = counter/nonce pointer; then build the frame. */
27 movl 32(%esp),%esi | |
28 movl 36(%esp),%edi | |
29 subl $132,%esp | |
/* Copy the eight key words to 80..108(%esp). */
30 movl (%esi),%eax | |
31 movl 4(%esi),%ebx | |
32 movl 8(%esi),%ecx | |
33 movl 12(%esi),%edx | |
34 movl %eax,80(%esp) | |
35 movl %ebx,84(%esp) | |
36 movl %ecx,88(%esp) | |
37 movl %edx,92(%esp) | |
38 movl 16(%esi),%eax | |
39 movl 20(%esi),%ebx | |
40 movl 24(%esi),%ecx | |
41 movl 28(%esi),%edx | |
42 movl %eax,96(%esp) | |
43 movl %ebx,100(%esp) | |
44 movl %ecx,104(%esp) | |
45 movl %edx,108(%esp) | |
/*
 * Copy the four counter/nonce words to 112..124(%esp). Word 0 is stored
 * minus one because every pass through .L002entry adds one before use.
 */
46 movl (%edi),%eax | |
47 movl 4(%edi),%ebx | |
48 movl 8(%edi),%ecx | |
49 movl 12(%edi),%edx | |
50 subl $1,%eax | |
51 movl %eax,112(%esp) | |
52 movl %ebx,116(%esp) | |
53 movl %ecx,120(%esp) | |
54 movl %edx,124(%esp) | |
55 jmp .L002entry | |
56 .align 16 | |
57 .L003outer_loop: | |
/* Back edge: save advanced input pointer, output pointer and remaining length. */
58 movl %ebx,156(%esp) | |
59 movl %eax,152(%esp) | |
60 movl %ecx,160(%esp) | |
61 .L002entry: | |
/*
 * Initialise the working state for one 64-byte block. The four constants are
 * 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 (ASCII "expand 32-byte k").
 */
62 movl $1634760805,%eax | |
63 movl $857760878,4(%esp) | |
64 movl $2036477234,8(%esp) | |
65 movl $1797285236,12(%esp) | |
66 movl 84(%esp),%ebx | |
67 movl 88(%esp),%ebp | |
68 movl 104(%esp),%ecx | |
69 movl 108(%esp),%esi | |
70 movl 116(%esp),%edx | |
71 movl 120(%esp),%edi | |
72 movl %ebx,20(%esp) | |
73 movl %ebp,24(%esp) | |
74 movl %ecx,40(%esp) | |
75 movl %esi,44(%esp) | |
76 movl %edx,52(%esp) | |
77 movl %edi,56(%esp) | |
78 movl 92(%esp),%ebx | |
79 movl 124(%esp),%edi | |
80 movl 112(%esp),%edx | |
81 movl 80(%esp),%ebp | |
82 movl 96(%esp),%ecx | |
83 movl 100(%esp),%esi | |
/* Advance the block counter (word 12) and keep the new value for the next block. */
84 addl $1,%edx | |
85 movl %ebx,28(%esp) | |
86 movl %edi,60(%esp) | |
87 movl %edx,112(%esp) | |
/* 10 iterations of the loop below, each performing eight quarter-rounds. */
88 movl $10,%ebx | |
89 jmp .L004loop | |
90 .align 16 | |
91 .L004loop: | |
/*
 * One iteration = eight interleaved quarter-rounds (add, xor, rotate by
 * 16/12/8/7): first on (x0,x4,x8,x12) (x1,x5,x9,x13) (x2,x6,x10,x14)
 * (x3,x7,x11,x15), then on (x0,x5,x10,x15) (x1,x6,x11,x12) (x2,x7,x8,x13)
 * (x3,x4,x9,x14). The loop counter in %ebx is parked at 128(%esp) because
 * %ebx is needed as a working register.
 */
92 addl %ebp,%eax | |
93 movl %ebx,128(%esp) | |
94 movl %ebp,%ebx | |
95 xorl %eax,%edx | |
96 roll $16,%edx | |
97 addl %edx,%ecx | |
98 xorl %ecx,%ebx | |
99 movl 52(%esp),%edi | |
100 roll $12,%ebx | |
101 movl 20(%esp),%ebp | |
102 addl %ebx,%eax | |
103 xorl %eax,%edx | |
104 movl %eax,(%esp) | |
105 roll $8,%edx | |
106 movl 4(%esp),%eax | |
107 addl %edx,%ecx | |
108 movl %edx,48(%esp) | |
109 xorl %ecx,%ebx | |
110 addl %ebp,%eax | |
111 roll $7,%ebx | |
112 xorl %eax,%edi | |
113 movl %ecx,32(%esp) | |
114 roll $16,%edi | |
115 movl %ebx,16(%esp) | |
116 addl %edi,%esi | |
117 movl 40(%esp),%ecx | |
118 xorl %esi,%ebp | |
119 movl 56(%esp),%edx | |
120 roll $12,%ebp | |
121 movl 24(%esp),%ebx | |
122 addl %ebp,%eax | |
123 xorl %eax,%edi | |
124 movl %eax,4(%esp) | |
125 roll $8,%edi | |
126 movl 8(%esp),%eax | |
127 addl %edi,%esi | |
128 movl %edi,52(%esp) | |
129 xorl %esi,%ebp | |
130 addl %ebx,%eax | |
131 roll $7,%ebp | |
132 xorl %eax,%edx | |
133 movl %esi,36(%esp) | |
134 roll $16,%edx | |
135 movl %ebp,20(%esp) | |
136 addl %edx,%ecx | |
137 movl 44(%esp),%esi | |
138 xorl %ecx,%ebx | |
139 movl 60(%esp),%edi | |
140 roll $12,%ebx | |
141 movl 28(%esp),%ebp | |
142 addl %ebx,%eax | |
143 xorl %eax,%edx | |
144 movl %eax,8(%esp) | |
145 roll $8,%edx | |
146 movl 12(%esp),%eax | |
147 addl %edx,%ecx | |
148 movl %edx,56(%esp) | |
149 xorl %ecx,%ebx | |
150 addl %ebp,%eax | |
151 roll $7,%ebx | |
152 xorl %eax,%edi | |
153 roll $16,%edi | |
154 movl %ebx,24(%esp) | |
155 addl %edi,%esi | |
156 xorl %esi,%ebp | |
157 roll $12,%ebp | |
158 movl 20(%esp),%ebx | |
159 addl %ebp,%eax | |
160 xorl %eax,%edi | |
161 movl %eax,12(%esp) | |
162 roll $8,%edi | |
163 movl (%esp),%eax | |
164 addl %edi,%esi | |
165 movl %edi,%edx | |
166 xorl %esi,%ebp | |
167 addl %ebx,%eax | |
168 roll $7,%ebp | |
169 xorl %eax,%edx | |
170 roll $16,%edx | |
171 movl %ebp,28(%esp) | |
172 addl %edx,%ecx | |
173 xorl %ecx,%ebx | |
174 movl 48(%esp),%edi | |
175 roll $12,%ebx | |
176 movl 24(%esp),%ebp | |
177 addl %ebx,%eax | |
178 xorl %eax,%edx | |
179 movl %eax,(%esp) | |
180 roll $8,%edx | |
181 movl 4(%esp),%eax | |
182 addl %edx,%ecx | |
183 movl %edx,60(%esp) | |
184 xorl %ecx,%ebx | |
185 addl %ebp,%eax | |
186 roll $7,%ebx | |
187 xorl %eax,%edi | |
188 movl %ecx,40(%esp) | |
189 roll $16,%edi | |
190 movl %ebx,20(%esp) | |
191 addl %edi,%esi | |
192 movl 32(%esp),%ecx | |
193 xorl %esi,%ebp | |
194 movl 52(%esp),%edx | |
195 roll $12,%ebp | |
196 movl 28(%esp),%ebx | |
197 addl %ebp,%eax | |
198 xorl %eax,%edi | |
199 movl %eax,4(%esp) | |
200 roll $8,%edi | |
201 movl 8(%esp),%eax | |
202 addl %edi,%esi | |
203 movl %edi,48(%esp) | |
204 xorl %esi,%ebp | |
205 addl %ebx,%eax | |
206 roll $7,%ebp | |
207 xorl %eax,%edx | |
208 movl %esi,44(%esp) | |
209 roll $16,%edx | |
210 movl %ebp,24(%esp) | |
211 addl %edx,%ecx | |
212 movl 36(%esp),%esi | |
213 xorl %ecx,%ebx | |
214 movl 56(%esp),%edi | |
215 roll $12,%ebx | |
216 movl 16(%esp),%ebp | |
217 addl %ebx,%eax | |
218 xorl %eax,%edx | |
219 movl %eax,8(%esp) | |
220 roll $8,%edx | |
221 movl 12(%esp),%eax | |
222 addl %edx,%ecx | |
223 movl %edx,52(%esp) | |
224 xorl %ecx,%ebx | |
225 addl %ebp,%eax | |
226 roll $7,%ebx | |
227 xorl %eax,%edi | |
228 roll $16,%edi | |
229 movl %ebx,28(%esp) | |
230 addl %edi,%esi | |
231 xorl %esi,%ebp | |
232 movl 48(%esp),%edx | |
233 roll $12,%ebp | |
234 movl 128(%esp),%ebx | |
235 addl %ebp,%eax | |
236 xorl %eax,%edi | |
237 movl %eax,12(%esp) | |
238 roll $8,%edi | |
239 movl (%esp),%eax | |
240 addl %edi,%esi | |
241 movl %edi,56(%esp) | |
242 xorl %esi,%ebp | |
243 roll $7,%ebp | |
244 decl %ebx | |
245 jnz .L004loop | |
/*
 * Rounds done. Registers hold x0 (%eax), x4 (%ebp), x8 (%ecx), x9 (%esi),
 * x12 (%edx), x14 (%edi). Add the input words back, then choose between the
 * full-block path and the partial-block tail (%ebx = remaining length).
 */
246 movl 160(%esp),%ebx | |
247 addl $1634760805,%eax | |
248 addl 80(%esp),%ebp | |
249 addl 96(%esp),%ecx | |
250 addl 100(%esp),%esi | |
251 cmpl $64,%ebx | |
252 jb .L005tail | |
/*
 * Full block: %ebx = input pointer, %eax = output pointer. Keystream words
 * are XORed with the input and stored five at a time; word 0 is parked at
 * (%esp) while %eax is reused as the output pointer and is written last.
 */
253 movl 156(%esp),%ebx | |
254 addl 112(%esp),%edx | |
255 addl 120(%esp),%edi | |
256 xorl (%ebx),%eax | |
257 xorl 16(%ebx),%ebp | |
258 movl %eax,(%esp) | |
259 movl 152(%esp),%eax | |
260 xorl 32(%ebx),%ecx | |
261 xorl 36(%ebx),%esi | |
262 xorl 48(%ebx),%edx | |
263 xorl 56(%ebx),%edi | |
264 movl %ebp,16(%eax) | |
265 movl %ecx,32(%eax) | |
266 movl %esi,36(%eax) | |
267 movl %edx,48(%eax) | |
268 movl %edi,56(%eax) | |
/* Words 1, 2, 3, 5, 6. */
269 movl 4(%esp),%ebp | |
270 movl 8(%esp),%ecx | |
271 movl 12(%esp),%esi | |
272 movl 20(%esp),%edx | |
273 movl 24(%esp),%edi | |
274 addl $857760878,%ebp | |
275 addl $2036477234,%ecx | |
276 addl $1797285236,%esi | |
277 addl 84(%esp),%edx | |
278 addl 88(%esp),%edi | |
279 xorl 4(%ebx),%ebp | |
280 xorl 8(%ebx),%ecx | |
281 xorl 12(%ebx),%esi | |
282 xorl 20(%ebx),%edx | |
283 xorl 24(%ebx),%edi | |
284 movl %ebp,4(%eax) | |
285 movl %ecx,8(%eax) | |
286 movl %esi,12(%eax) | |
287 movl %edx,20(%eax) | |
288 movl %edi,24(%eax) | |
/* Words 7, 10, 11, 13, 15. */
289 movl 28(%esp),%ebp | |
290 movl 40(%esp),%ecx | |
291 movl 44(%esp),%esi | |
292 movl 52(%esp),%edx | |
293 movl 60(%esp),%edi | |
294 addl 92(%esp),%ebp | |
295 addl 104(%esp),%ecx | |
296 addl 108(%esp),%esi | |
297 addl 116(%esp),%edx | |
298 addl 124(%esp),%edi | |
299 xorl 28(%ebx),%ebp | |
300 xorl 40(%ebx),%ecx | |
301 xorl 44(%ebx),%esi | |
302 xorl 52(%ebx),%edx | |
303 xorl 60(%ebx),%edi | |
/* Advance both pointers by 64 and the remaining length down by 64. */
304 leal 64(%ebx),%ebx | |
305 movl %ebp,28(%eax) | |
306 movl (%esp),%ebp | |
307 movl %ecx,40(%eax) | |
308 movl 160(%esp),%ecx | |
309 movl %esi,44(%eax) | |
310 movl %edx,52(%eax) | |
311 movl %edi,60(%eax) | |
312 movl %ebp,(%eax) | |
313 leal 64(%eax),%eax | |
314 subl $64,%ecx | |
315 jnz .L003outer_loop | |
316 jmp .L006done | |
317 .L005tail: | |
/*
 * Fewer than 64 bytes remain: materialise the complete keystream block in
 * 0..60(%esp), then XOR it into the output one byte at a time.
 */
318 addl 112(%esp),%edx | |
319 addl 120(%esp),%edi | |
320 movl %eax,(%esp) | |
321 movl %ebp,16(%esp) | |
322 movl %ecx,32(%esp) | |
323 movl %esi,36(%esp) | |
324 movl %edx,48(%esp) | |
325 movl %edi,56(%esp) | |
326 movl 4(%esp),%ebp | |
327 movl 8(%esp),%ecx | |
328 movl 12(%esp),%esi | |
329 movl 20(%esp),%edx | |
330 movl 24(%esp),%edi | |
331 addl $857760878,%ebp | |
332 addl $2036477234,%ecx | |
333 addl $1797285236,%esi | |
334 addl 84(%esp),%edx | |
335 addl 88(%esp),%edi | |
336 movl %ebp,4(%esp) | |
337 movl %ecx,8(%esp) | |
338 movl %esi,12(%esp) | |
339 movl %edx,20(%esp) | |
340 movl %edi,24(%esp) | |
341 movl 28(%esp),%ebp | |
342 movl 40(%esp),%ecx | |
343 movl 44(%esp),%esi | |
344 movl 52(%esp),%edx | |
345 movl 60(%esp),%edi | |
346 addl 92(%esp),%ebp | |
347 addl 104(%esp),%ecx | |
348 addl 108(%esp),%esi | |
349 addl 116(%esp),%edx | |
350 addl 124(%esp),%edi | |
351 movl %ebp,28(%esp) | |
352 movl 156(%esp),%ebp | |
353 movl %ecx,40(%esp) | |
354 movl 152(%esp),%ecx | |
355 movl %esi,44(%esp) | |
356 xorl %esi,%esi | |
357 movl %edx,52(%esp) | |
358 movl %edi,60(%esp) | |
359 xorl %eax,%eax | |
360 xorl %edx,%edx | |
361 .L007tail_loop: | |
/* %ebp = input, %ecx = output, %esi = byte index, %ebx = bytes left. */
362 movb (%esi,%ebp,1),%al | |
363 movb (%esp,%esi,1),%dl | |
364 leal 1(%esi),%esi | |
365 xorb %dl,%al | |
366 movb %al,-1(%ecx,%esi,1) | |
367 decl %ebx | |
368 jnz .L007tail_loop | |
369 .L006done: | |
370 addl $132,%esp | |
371 .L000no_data: | |
372 popl %edi | |
373 popl %esi | |
374 popl %ebx | |
375 popl %ebp | |
376 ret | |
377 .size ChaCha20_ctr32,.-.L_ChaCha20_ctr32_begin | |
/*
 * ChaCha20_ssse3 -- XOR a buffer with ChaCha20 keystream using SSSE3
 * (i386, AT&T syntax). Stack arguments match ChaCha20_ctr32; after the four
 * register pushes they are at:
 *   20(%esp) output pointer      24(%esp) input pointer
 *   28(%esp) length in bytes     32(%esp) pointer to 8 key words
 *   36(%esp) pointer to 4 counter/nonce words
 *
 * There are two ways in:
 *   - ChaCha20_ctr32 jumps to .Lssse3_shortcut with the four registers
 *     already pushed, a non-zero length, and %eax = address of .Lpic_point;
 *   - a direct call to this global symbol. The direct path establishes the
 *     same two preconditions itself: it returns at once for a zero length
 *     (otherwise the byte tail loop would count down from zero) and it
 *     computes the address of .Lpic_point into %eax, because the shared code
 *     derives the address of .Lssse3_data from that value.
 *
 * Frame: 524 bytes, with %esp rounded down to a multiple of 64.
 *   0..255(%esp)    working state of four blocks, one 16-byte vector per
 *                   state word (addressed as -128..112(%ebx))
 *   256..511(%esp)  input state of four blocks, each word broadcast across a
 *                   vector (addressed as -128..112(%ebp))
 *   512(%esp)       caller's %esp
 *   516/520(%esp)   key pointer / counter pointer
 * The single-block path reuses 0..63(%esp) for its input state and, in the
 * tail, for the final keystream block.
 *
 * %ebp, %ebx, %esi and %edi are saved and restored; %eax, %ecx, %edx and
 * %xmm0-%xmm7 are clobbered.
 */
378 .globl ChaCha20_ssse3 | |
379 .hidden ChaCha20_ssse3 | |
380 .type ChaCha20_ssse3,@function | |
381 .align 16 | |
382 ChaCha20_ssse3: | |
383 .L_ChaCha20_ssse3_begin: | |
384 pushl %ebp | |
385 pushl %ebx | |
386 pushl %esi | |
387 pushl %edi | |
/* Direct-entry only: reject a zero length, mirroring the guard in ChaCha20_ctr32. */
xorl %eax,%eax
cmpl 28(%esp),%eax
je .Lssse3_no_data
/*
 * Direct-entry only: call/pop yields the address of .Lssse3_pic_point; the
 * leal rebases it so that %eax = address of .Lpic_point, exactly what the
 * shortcut path provides. Both labels are in this section, so the
 * difference is an assembly-time constant.
 */
call .Lssse3_pic_point
.Lssse3_pic_point:
popl %eax
leal .Lpic_point-.Lssse3_pic_point(%eax),%eax
388 .Lssse3_shortcut: | |
/* %edi = out, %esi = in, %ecx = length, %edx = key, %ebx = counter/nonce. */
389 movl 20(%esp),%edi | |
390 movl 24(%esp),%esi | |
391 movl 28(%esp),%ecx | |
392 movl 32(%esp),%edx | |
393 movl 36(%esp),%ebx | |
/* Build the 64-byte-aligned frame and remember the caller's %esp in it. */
394 movl %esp,%ebp | |
395 subl $524,%esp | |
396 andl $-64,%esp | |
397 movl %ebp,512(%esp) | |
/* %eax = address of .Lssse3_data; %xmm3 = the four counter/nonce words. */
398 leal .Lssse3_data-.Lpic_point(%eax),%eax | |
399 movdqu (%ebx),%xmm3 | |
/* Fewer than 256 bytes: go straight to the one-block-at-a-time path. */
400 cmpl $256,%ecx | |
401 jb .L0081x | |
/*
 * Four-block path setup. Each input state word is broadcast to all four
 * lanes of a vector. The counter vector gets lane offsets {0,1,2,3} from
 * 48(%eax) and is pre-decremented by {4,4,4,4} from 64(%eax), because the
 * outer loop adds 4 before each use.
 */
402 movl %edx,516(%esp) | |
403 movl %ebx,520(%esp) | |
404 subl $256,%ecx | |
405 leal 384(%esp),%ebp | |
406 movdqu (%edx),%xmm7 | |
407 pshufd $0,%xmm3,%xmm0 | |
408 pshufd $85,%xmm3,%xmm1 | |
409 pshufd $170,%xmm3,%xmm2 | |
410 pshufd $255,%xmm3,%xmm3 | |
411 paddd 48(%eax),%xmm0 | |
412 pshufd $0,%xmm7,%xmm4 | |
413 pshufd $85,%xmm7,%xmm5 | |
414 psubd 64(%eax),%xmm0 | |
415 pshufd $170,%xmm7,%xmm6 | |
416 pshufd $255,%xmm7,%xmm7 | |
417 movdqa %xmm0,64(%ebp) | |
418 movdqa %xmm1,80(%ebp) | |
419 movdqa %xmm2,96(%ebp) | |
420 movdqa %xmm3,112(%ebp) | |
421 movdqu 16(%edx),%xmm3 | |
422 movdqa %xmm4,-64(%ebp) | |
423 movdqa %xmm5,-48(%ebp) | |
424 movdqa %xmm6,-32(%ebp) | |
425 movdqa %xmm7,-16(%ebp) | |
426 movdqa 32(%eax),%xmm7 | |
427 leal 128(%esp),%ebx | |
428 pshufd $0,%xmm3,%xmm0 | |
429 pshufd $85,%xmm3,%xmm1 | |
430 pshufd $170,%xmm3,%xmm2 | |
431 pshufd $255,%xmm3,%xmm3 | |
432 pshufd $0,%xmm7,%xmm4 | |
433 pshufd $85,%xmm7,%xmm5 | |
434 pshufd $170,%xmm7,%xmm6 | |
435 pshufd $255,%xmm7,%xmm7 | |
436 movdqa %xmm0,(%ebp) | |
437 movdqa %xmm1,16(%ebp) | |
438 movdqa %xmm2,32(%ebp) | |
439 movdqa %xmm3,48(%ebp) | |
440 movdqa %xmm4,-128(%ebp) | |
441 movdqa %xmm5,-112(%ebp) | |
442 movdqa %xmm6,-96(%ebp) | |
443 movdqa %xmm7,-80(%ebp) | |
/* Bias both data pointers by +128 so the loop can use offsets -128..64. */
444 leal 128(%esi),%esi | |
445 leal 128(%edi),%edi | |
446 jmp .L009outer_loop | |
447 .align 16 | |
448 .L009outer_loop: | |
/*
 * Per 256-byte batch: copy the input state (via %ebp) into the working
 * state (via %ebx), adding {4,4,4,4} to the counter vector and writing the
 * new counters back. x0, x4, x8, x9 and x12 stay in %xmm0, %xmm3, %xmm4,
 * %xmm5 and %xmm6 instead of being stored.
 */
449 movdqa -112(%ebp),%xmm1 | |
450 movdqa -96(%ebp),%xmm2 | |
451 movdqa -80(%ebp),%xmm3 | |
452 movdqa -48(%ebp),%xmm5 | |
453 movdqa -32(%ebp),%xmm6 | |
454 movdqa -16(%ebp),%xmm7 | |
455 movdqa %xmm1,-112(%ebx) | |
456 movdqa %xmm2,-96(%ebx) | |
457 movdqa %xmm3,-80(%ebx) | |
458 movdqa %xmm5,-48(%ebx) | |
459 movdqa %xmm6,-32(%ebx) | |
460 movdqa %xmm7,-16(%ebx) | |
461 movdqa 32(%ebp),%xmm2 | |
462 movdqa 48(%ebp),%xmm3 | |
463 movdqa 64(%ebp),%xmm4 | |
464 movdqa 80(%ebp),%xmm5 | |
465 movdqa 96(%ebp),%xmm6 | |
466 movdqa 112(%ebp),%xmm7 | |
467 paddd 64(%eax),%xmm4 | |
468 movdqa %xmm2,32(%ebx) | |
469 movdqa %xmm3,48(%ebx) | |
470 movdqa %xmm4,64(%ebx) | |
471 movdqa %xmm5,80(%ebx) | |
472 movdqa %xmm6,96(%ebx) | |
473 movdqa %xmm7,112(%ebx) | |
474 movdqa %xmm4,64(%ebp) | |
475 movdqa -128(%ebp),%xmm0 | |
476 movdqa %xmm4,%xmm6 | |
477 movdqa -64(%ebp),%xmm3 | |
478 movdqa (%ebp),%xmm4 | |
479 movdqa 16(%ebp),%xmm5 | |
480 movl $10,%edx | |
481 nop | |
482 .align 16 | |
483 .L010loop: | |
/*
 * 10 iterations of eight quarter-rounds on four blocks at once. Rotations
 * by 16 and 8 are byte shuffles (pshufb with the masks at (%eax) and
 * 16(%eax)); rotations by 12 and 7 are a pslld/psrld/por triple.
 */
484 paddd %xmm3,%xmm0 | |
485 movdqa %xmm3,%xmm2 | |
486 pxor %xmm0,%xmm6 | |
487 pshufb (%eax),%xmm6 | |
488 paddd %xmm6,%xmm4 | |
489 pxor %xmm4,%xmm2 | |
490 movdqa -48(%ebx),%xmm3 | |
491 movdqa %xmm2,%xmm1 | |
492 pslld $12,%xmm2 | |
493 psrld $20,%xmm1 | |
494 por %xmm1,%xmm2 | |
495 movdqa -112(%ebx),%xmm1 | |
496 paddd %xmm2,%xmm0 | |
497 movdqa 80(%ebx),%xmm7 | |
498 pxor %xmm0,%xmm6 | |
499 movdqa %xmm0,-128(%ebx) | |
500 pshufb 16(%eax),%xmm6 | |
501 paddd %xmm6,%xmm4 | |
502 movdqa %xmm6,64(%ebx) | |
503 pxor %xmm4,%xmm2 | |
504 paddd %xmm3,%xmm1 | |
505 movdqa %xmm2,%xmm0 | |
506 pslld $7,%xmm2 | |
507 psrld $25,%xmm0 | |
508 pxor %xmm1,%xmm7 | |
509 por %xmm0,%xmm2 | |
510 movdqa %xmm4,(%ebx) | |
511 pshufb (%eax),%xmm7 | |
512 movdqa %xmm2,-64(%ebx) | |
513 paddd %xmm7,%xmm5 | |
514 movdqa 32(%ebx),%xmm4 | |
515 pxor %xmm5,%xmm3 | |
516 movdqa -32(%ebx),%xmm2 | |
517 movdqa %xmm3,%xmm0 | |
518 pslld $12,%xmm3 | |
519 psrld $20,%xmm0 | |
520 por %xmm0,%xmm3 | |
521 movdqa -96(%ebx),%xmm0 | |
522 paddd %xmm3,%xmm1 | |
523 movdqa 96(%ebx),%xmm6 | |
524 pxor %xmm1,%xmm7 | |
525 movdqa %xmm1,-112(%ebx) | |
526 pshufb 16(%eax),%xmm7 | |
527 paddd %xmm7,%xmm5 | |
528 movdqa %xmm7,80(%ebx) | |
529 pxor %xmm5,%xmm3 | |
530 paddd %xmm2,%xmm0 | |
531 movdqa %xmm3,%xmm1 | |
532 pslld $7,%xmm3 | |
533 psrld $25,%xmm1 | |
534 pxor %xmm0,%xmm6 | |
535 por %xmm1,%xmm3 | |
536 movdqa %xmm5,16(%ebx) | |
537 pshufb (%eax),%xmm6 | |
538 movdqa %xmm3,-48(%ebx) | |
539 paddd %xmm6,%xmm4 | |
540 movdqa 48(%ebx),%xmm5 | |
541 pxor %xmm4,%xmm2 | |
542 movdqa -16(%ebx),%xmm3 | |
543 movdqa %xmm2,%xmm1 | |
544 pslld $12,%xmm2 | |
545 psrld $20,%xmm1 | |
546 por %xmm1,%xmm2 | |
547 movdqa -80(%ebx),%xmm1 | |
548 paddd %xmm2,%xmm0 | |
549 movdqa 112(%ebx),%xmm7 | |
550 pxor %xmm0,%xmm6 | |
551 movdqa %xmm0,-96(%ebx) | |
552 pshufb 16(%eax),%xmm6 | |
553 paddd %xmm6,%xmm4 | |
554 movdqa %xmm6,96(%ebx) | |
555 pxor %xmm4,%xmm2 | |
556 paddd %xmm3,%xmm1 | |
557 movdqa %xmm2,%xmm0 | |
558 pslld $7,%xmm2 | |
559 psrld $25,%xmm0 | |
560 pxor %xmm1,%xmm7 | |
561 por %xmm0,%xmm2 | |
562 pshufb (%eax),%xmm7 | |
563 movdqa %xmm2,-32(%ebx) | |
564 paddd %xmm7,%xmm5 | |
565 pxor %xmm5,%xmm3 | |
566 movdqa -48(%ebx),%xmm2 | |
567 movdqa %xmm3,%xmm0 | |
568 pslld $12,%xmm3 | |
569 psrld $20,%xmm0 | |
570 por %xmm0,%xmm3 | |
571 movdqa -128(%ebx),%xmm0 | |
572 paddd %xmm3,%xmm1 | |
573 pxor %xmm1,%xmm7 | |
574 movdqa %xmm1,-80(%ebx) | |
575 pshufb 16(%eax),%xmm7 | |
576 paddd %xmm7,%xmm5 | |
577 movdqa %xmm7,%xmm6 | |
578 pxor %xmm5,%xmm3 | |
579 paddd %xmm2,%xmm0 | |
580 movdqa %xmm3,%xmm1 | |
581 pslld $7,%xmm3 | |
582 psrld $25,%xmm1 | |
583 pxor %xmm0,%xmm6 | |
584 por %xmm1,%xmm3 | |
585 pshufb (%eax),%xmm6 | |
586 movdqa %xmm3,-16(%ebx) | |
587 paddd %xmm6,%xmm4 | |
588 pxor %xmm4,%xmm2 | |
589 movdqa -32(%ebx),%xmm3 | |
590 movdqa %xmm2,%xmm1 | |
591 pslld $12,%xmm2 | |
592 psrld $20,%xmm1 | |
593 por %xmm1,%xmm2 | |
594 movdqa -112(%ebx),%xmm1 | |
595 paddd %xmm2,%xmm0 | |
596 movdqa 64(%ebx),%xmm7 | |
597 pxor %xmm0,%xmm6 | |
598 movdqa %xmm0,-128(%ebx) | |
599 pshufb 16(%eax),%xmm6 | |
600 paddd %xmm6,%xmm4 | |
601 movdqa %xmm6,112(%ebx) | |
602 pxor %xmm4,%xmm2 | |
603 paddd %xmm3,%xmm1 | |
604 movdqa %xmm2,%xmm0 | |
605 pslld $7,%xmm2 | |
606 psrld $25,%xmm0 | |
607 pxor %xmm1,%xmm7 | |
608 por %xmm0,%xmm2 | |
609 movdqa %xmm4,32(%ebx) | |
610 pshufb (%eax),%xmm7 | |
611 movdqa %xmm2,-48(%ebx) | |
612 paddd %xmm7,%xmm5 | |
613 movdqa (%ebx),%xmm4 | |
614 pxor %xmm5,%xmm3 | |
615 movdqa -16(%ebx),%xmm2 | |
616 movdqa %xmm3,%xmm0 | |
617 pslld $12,%xmm3 | |
618 psrld $20,%xmm0 | |
619 por %xmm0,%xmm3 | |
620 movdqa -96(%ebx),%xmm0 | |
621 paddd %xmm3,%xmm1 | |
622 movdqa 80(%ebx),%xmm6 | |
623 pxor %xmm1,%xmm7 | |
624 movdqa %xmm1,-112(%ebx) | |
625 pshufb 16(%eax),%xmm7 | |
626 paddd %xmm7,%xmm5 | |
627 movdqa %xmm7,64(%ebx) | |
628 pxor %xmm5,%xmm3 | |
629 paddd %xmm2,%xmm0 | |
630 movdqa %xmm3,%xmm1 | |
631 pslld $7,%xmm3 | |
632 psrld $25,%xmm1 | |
633 pxor %xmm0,%xmm6 | |
634 por %xmm1,%xmm3 | |
635 movdqa %xmm5,48(%ebx) | |
636 pshufb (%eax),%xmm6 | |
637 movdqa %xmm3,-32(%ebx) | |
638 paddd %xmm6,%xmm4 | |
639 movdqa 16(%ebx),%xmm5 | |
640 pxor %xmm4,%xmm2 | |
641 movdqa -64(%ebx),%xmm3 | |
642 movdqa %xmm2,%xmm1 | |
643 pslld $12,%xmm2 | |
644 psrld $20,%xmm1 | |
645 por %xmm1,%xmm2 | |
646 movdqa -80(%ebx),%xmm1 | |
647 paddd %xmm2,%xmm0 | |
648 movdqa 96(%ebx),%xmm7 | |
649 pxor %xmm0,%xmm6 | |
650 movdqa %xmm0,-96(%ebx) | |
651 pshufb 16(%eax),%xmm6 | |
652 paddd %xmm6,%xmm4 | |
653 movdqa %xmm6,80(%ebx) | |
654 pxor %xmm4,%xmm2 | |
655 paddd %xmm3,%xmm1 | |
656 movdqa %xmm2,%xmm0 | |
657 pslld $7,%xmm2 | |
658 psrld $25,%xmm0 | |
659 pxor %xmm1,%xmm7 | |
660 por %xmm0,%xmm2 | |
661 pshufb (%eax),%xmm7 | |
662 movdqa %xmm2,-16(%ebx) | |
663 paddd %xmm7,%xmm5 | |
664 pxor %xmm5,%xmm3 | |
665 movdqa %xmm3,%xmm0 | |
666 pslld $12,%xmm3 | |
667 psrld $20,%xmm0 | |
668 por %xmm0,%xmm3 | |
669 movdqa -128(%ebx),%xmm0 | |
670 paddd %xmm3,%xmm1 | |
671 movdqa 64(%ebx),%xmm6 | |
672 pxor %xmm1,%xmm7 | |
673 movdqa %xmm1,-80(%ebx) | |
674 pshufb 16(%eax),%xmm7 | |
675 paddd %xmm7,%xmm5 | |
676 movdqa %xmm7,96(%ebx) | |
677 pxor %xmm5,%xmm3 | |
678 movdqa %xmm3,%xmm1 | |
679 pslld $7,%xmm3 | |
680 psrld $25,%xmm1 | |
681 por %xmm1,%xmm3 | |
682 decl %edx | |
683 jnz .L010loop | |
/*
 * Rounds done: flush the words still held in registers, then, four state
 * words at a time, add the input state, transpose the 4x4 dwords so each
 * vector holds 16 consecutive keystream bytes of one block, XOR with the
 * input and store. Offsets -128/-64/0/64 select the four blocks.
 */
684 movdqa %xmm3,-64(%ebx) | |
685 movdqa %xmm4,(%ebx) | |
686 movdqa %xmm5,16(%ebx) | |
687 movdqa %xmm6,64(%ebx) | |
688 movdqa %xmm7,96(%ebx) | |
689 movdqa -112(%ebx),%xmm1 | |
690 movdqa -96(%ebx),%xmm2 | |
691 movdqa -80(%ebx),%xmm3 | |
/* State words 0..3. */
692 paddd -128(%ebp),%xmm0 | |
693 paddd -112(%ebp),%xmm1 | |
694 paddd -96(%ebp),%xmm2 | |
695 paddd -80(%ebp),%xmm3 | |
696 movdqa %xmm0,%xmm6 | |
697 punpckldq %xmm1,%xmm0 | |
698 movdqa %xmm2,%xmm7 | |
699 punpckldq %xmm3,%xmm2 | |
700 punpckhdq %xmm1,%xmm6 | |
701 punpckhdq %xmm3,%xmm7 | |
702 movdqa %xmm0,%xmm1 | |
703 punpcklqdq %xmm2,%xmm0 | |
704 movdqa %xmm6,%xmm3 | |
705 punpcklqdq %xmm7,%xmm6 | |
706 punpckhqdq %xmm2,%xmm1 | |
707 punpckhqdq %xmm7,%xmm3 | |
708 movdqu -128(%esi),%xmm4 | |
709 movdqu -64(%esi),%xmm5 | |
710 movdqu (%esi),%xmm2 | |
711 movdqu 64(%esi),%xmm7 | |
712 leal 16(%esi),%esi | |
713 pxor %xmm0,%xmm4 | |
714 movdqa -64(%ebx),%xmm0 | |
715 pxor %xmm1,%xmm5 | |
716 movdqa -48(%ebx),%xmm1 | |
717 pxor %xmm2,%xmm6 | |
718 movdqa -32(%ebx),%xmm2 | |
719 pxor %xmm3,%xmm7 | |
720 movdqa -16(%ebx),%xmm3 | |
721 movdqu %xmm4,-128(%edi) | |
722 movdqu %xmm5,-64(%edi) | |
723 movdqu %xmm6,(%edi) | |
724 movdqu %xmm7,64(%edi) | |
725 leal 16(%edi),%edi | |
/* State words 4..7. */
726 paddd -64(%ebp),%xmm0 | |
727 paddd -48(%ebp),%xmm1 | |
728 paddd -32(%ebp),%xmm2 | |
729 paddd -16(%ebp),%xmm3 | |
730 movdqa %xmm0,%xmm6 | |
731 punpckldq %xmm1,%xmm0 | |
732 movdqa %xmm2,%xmm7 | |
733 punpckldq %xmm3,%xmm2 | |
734 punpckhdq %xmm1,%xmm6 | |
735 punpckhdq %xmm3,%xmm7 | |
736 movdqa %xmm0,%xmm1 | |
737 punpcklqdq %xmm2,%xmm0 | |
738 movdqa %xmm6,%xmm3 | |
739 punpcklqdq %xmm7,%xmm6 | |
740 punpckhqdq %xmm2,%xmm1 | |
741 punpckhqdq %xmm7,%xmm3 | |
742 movdqu -128(%esi),%xmm4 | |
743 movdqu -64(%esi),%xmm5 | |
744 movdqu (%esi),%xmm2 | |
745 movdqu 64(%esi),%xmm7 | |
746 leal 16(%esi),%esi | |
747 pxor %xmm0,%xmm4 | |
748 movdqa (%ebx),%xmm0 | |
749 pxor %xmm1,%xmm5 | |
750 movdqa 16(%ebx),%xmm1 | |
751 pxor %xmm2,%xmm6 | |
752 movdqa 32(%ebx),%xmm2 | |
753 pxor %xmm3,%xmm7 | |
754 movdqa 48(%ebx),%xmm3 | |
755 movdqu %xmm4,-128(%edi) | |
756 movdqu %xmm5,-64(%edi) | |
757 movdqu %xmm6,(%edi) | |
758 movdqu %xmm7,64(%edi) | |
759 leal 16(%edi),%edi | |
/* State words 8..11. */
760 paddd (%ebp),%xmm0 | |
761 paddd 16(%ebp),%xmm1 | |
762 paddd 32(%ebp),%xmm2 | |
763 paddd 48(%ebp),%xmm3 | |
764 movdqa %xmm0,%xmm6 | |
765 punpckldq %xmm1,%xmm0 | |
766 movdqa %xmm2,%xmm7 | |
767 punpckldq %xmm3,%xmm2 | |
768 punpckhdq %xmm1,%xmm6 | |
769 punpckhdq %xmm3,%xmm7 | |
770 movdqa %xmm0,%xmm1 | |
771 punpcklqdq %xmm2,%xmm0 | |
772 movdqa %xmm6,%xmm3 | |
773 punpcklqdq %xmm7,%xmm6 | |
774 punpckhqdq %xmm2,%xmm1 | |
775 punpckhqdq %xmm7,%xmm3 | |
776 movdqu -128(%esi),%xmm4 | |
777 movdqu -64(%esi),%xmm5 | |
778 movdqu (%esi),%xmm2 | |
779 movdqu 64(%esi),%xmm7 | |
780 leal 16(%esi),%esi | |
781 pxor %xmm0,%xmm4 | |
782 movdqa 64(%ebx),%xmm0 | |
783 pxor %xmm1,%xmm5 | |
784 movdqa 80(%ebx),%xmm1 | |
785 pxor %xmm2,%xmm6 | |
786 movdqa 96(%ebx),%xmm2 | |
787 pxor %xmm3,%xmm7 | |
788 movdqa 112(%ebx),%xmm3 | |
789 movdqu %xmm4,-128(%edi) | |
790 movdqu %xmm5,-64(%edi) | |
791 movdqu %xmm6,(%edi) | |
792 movdqu %xmm7,64(%edi) | |
793 leal 16(%edi),%edi | |
/* State words 12..15. */
794 paddd 64(%ebp),%xmm0 | |
795 paddd 80(%ebp),%xmm1 | |
796 paddd 96(%ebp),%xmm2 | |
797 paddd 112(%ebp),%xmm3 | |
798 movdqa %xmm0,%xmm6 | |
799 punpckldq %xmm1,%xmm0 | |
800 movdqa %xmm2,%xmm7 | |
801 punpckldq %xmm3,%xmm2 | |
802 punpckhdq %xmm1,%xmm6 | |
803 punpckhdq %xmm3,%xmm7 | |
804 movdqa %xmm0,%xmm1 | |
805 punpcklqdq %xmm2,%xmm0 | |
806 movdqa %xmm6,%xmm3 | |
807 punpcklqdq %xmm7,%xmm6 | |
808 punpckhqdq %xmm2,%xmm1 | |
809 punpckhqdq %xmm7,%xmm3 | |
810 movdqu -128(%esi),%xmm4 | |
811 movdqu -64(%esi),%xmm5 | |
812 movdqu (%esi),%xmm2 | |
813 movdqu 64(%esi),%xmm7 | |
/* 16+16+16+208 = 256: both pointers end up one batch further on. */
814 leal 208(%esi),%esi | |
815 pxor %xmm0,%xmm4 | |
816 pxor %xmm1,%xmm5 | |
817 pxor %xmm2,%xmm6 | |
818 pxor %xmm3,%xmm7 | |
819 movdqu %xmm4,-128(%edi) | |
820 movdqu %xmm5,-64(%edi) | |
821 movdqu %xmm6,(%edi) | |
822 movdqu %xmm7,64(%edi) | |
823 leal 208(%edi),%edi | |
/* Another batch while at least 256 bytes remained (no borrow). */
824 subl $256,%ecx | |
825 jnc .L009outer_loop | |
826 addl $256,%ecx | |
827 jz .L011done | |
/*
 * 1..255 bytes left: undo the +128 pointer bias, reload the key/counter
 * pointers, and rebuild the counter block for the single-block path:
 * lane 0 of the last batch's counter vector plus 4 (96(%eax)), merged with
 * the caller's words 1..3 (mask at 112(%eax) clears word 0).
 */
828 movl 520(%esp),%ebx | |
829 leal -128(%esi),%esi | |
830 movl 516(%esp),%edx | |
831 leal -128(%edi),%edi | |
832 movd 64(%ebp),%xmm2 | |
833 movdqu (%ebx),%xmm3 | |
834 paddd 96(%eax),%xmm2 | |
835 pand 112(%eax),%xmm3 | |
836 por %xmm2,%xmm3 | |
837 .L0081x: | |
/*
 * Single-block path: %xmm0 = constants, %xmm1/%xmm2 = key, %xmm3 =
 * counter/nonce, %xmm6/%xmm7 = rotate-by-16 / rotate-by-8 shuffle masks.
 * The input state is also kept at 0..63(%esp).
 */
838 movdqa 32(%eax),%xmm0 | |
839 movdqu (%edx),%xmm1 | |
840 movdqu 16(%edx),%xmm2 | |
841 movdqa (%eax),%xmm6 | |
842 movdqa 16(%eax),%xmm7 | |
/* This dword store is overwritten by the 16-byte store of %xmm3 to 48(%esp) below. */
843 movl %ebp,48(%esp) | |
844 movdqa %xmm0,(%esp) | |
845 movdqa %xmm1,16(%esp) | |
846 movdqa %xmm2,32(%esp) | |
847 movdqa %xmm3,48(%esp) | |
848 movl $10,%edx | |
849 jmp .L012loop1x | |
850 .align 16 | |
851 .L013outer1x: | |
/* Next block: reload the input state and add {1,0,0,0} (80(%eax)) to the counter row. */
852 movdqa 80(%eax),%xmm3 | |
853 movdqa (%esp),%xmm0 | |
854 movdqa 16(%esp),%xmm1 | |
855 movdqa 32(%esp),%xmm2 | |
856 paddd 48(%esp),%xmm3 | |
857 movl $10,%edx | |
858 movdqa %xmm3,48(%esp) | |
859 jmp .L012loop1x | |
860 .align 16 | |
861 .L012loop1x: | |
/*
 * 10 iterations; each does the four column quarter-rounds in parallel,
 * rotates rows 1..3 with pshufd so the diagonals line up as columns,
 * repeats, and rotates the rows back. The .byte sequences encode
 * pshufb %xmm6,%xmm3 (66 0F 38 00 DE) and pshufb %xmm7,%xmm3
 * (66 0F 38 00 DF), i.e. rotate %xmm3 lanes by 16 and by 8.
 */
862 paddd %xmm1,%xmm0 | |
863 pxor %xmm0,%xmm3 | |
864 .byte 102,15,56,0,222 | |
865 paddd %xmm3,%xmm2 | |
866 pxor %xmm2,%xmm1 | |
867 movdqa %xmm1,%xmm4 | |
868 psrld $20,%xmm1 | |
869 pslld $12,%xmm4 | |
870 por %xmm4,%xmm1 | |
871 paddd %xmm1,%xmm0 | |
872 pxor %xmm0,%xmm3 | |
873 .byte 102,15,56,0,223 | |
874 paddd %xmm3,%xmm2 | |
875 pxor %xmm2,%xmm1 | |
876 movdqa %xmm1,%xmm4 | |
877 psrld $25,%xmm1 | |
878 pslld $7,%xmm4 | |
879 por %xmm4,%xmm1 | |
880 pshufd $78,%xmm2,%xmm2 | |
881 pshufd $57,%xmm1,%xmm1 | |
882 pshufd $147,%xmm3,%xmm3 | |
883 nop | |
884 paddd %xmm1,%xmm0 | |
885 pxor %xmm0,%xmm3 | |
886 .byte 102,15,56,0,222 | |
887 paddd %xmm3,%xmm2 | |
888 pxor %xmm2,%xmm1 | |
889 movdqa %xmm1,%xmm4 | |
890 psrld $20,%xmm1 | |
891 pslld $12,%xmm4 | |
892 por %xmm4,%xmm1 | |
893 paddd %xmm1,%xmm0 | |
894 pxor %xmm0,%xmm3 | |
895 .byte 102,15,56,0,223 | |
896 paddd %xmm3,%xmm2 | |
897 pxor %xmm2,%xmm1 | |
898 movdqa %xmm1,%xmm4 | |
899 psrld $25,%xmm1 | |
900 pslld $7,%xmm4 | |
901 por %xmm4,%xmm1 | |
902 pshufd $78,%xmm2,%xmm2 | |
903 pshufd $147,%xmm1,%xmm1 | |
904 pshufd $57,%xmm3,%xmm3 | |
905 decl %edx | |
906 jnz .L012loop1x | |
/* Add the input state back to obtain the keystream block. */
907 paddd (%esp),%xmm0 | |
908 paddd 16(%esp),%xmm1 | |
909 paddd 32(%esp),%xmm2 | |
910 paddd 48(%esp),%xmm3 | |
911 cmpl $64,%ecx | |
912 jb .L014tail | |
/* Whole block: XOR 64 bytes, advance, and loop while bytes remain. */
913 movdqu (%esi),%xmm4 | |
914 movdqu 16(%esi),%xmm5 | |
915 pxor %xmm4,%xmm0 | |
916 movdqu 32(%esi),%xmm4 | |
917 pxor %xmm5,%xmm1 | |
918 movdqu 48(%esi),%xmm5 | |
919 pxor %xmm4,%xmm2 | |
920 pxor %xmm5,%xmm3 | |
921 leal 64(%esi),%esi | |
922 movdqu %xmm0,(%edi) | |
923 movdqu %xmm1,16(%edi) | |
924 movdqu %xmm2,32(%edi) | |
925 movdqu %xmm3,48(%edi) | |
926 leal 64(%edi),%edi | |
927 subl $64,%ecx | |
928 jnz .L013outer1x | |
929 jmp .L011done | |
930 .L014tail: | |
/*
 * 1..63 bytes left: spill the keystream block to 0..63(%esp) and XOR byte
 * by byte. %ebp = byte index, %ecx = bytes left (non-zero on entry).
 */
931 movdqa %xmm0,(%esp) | |
932 movdqa %xmm1,16(%esp) | |
933 movdqa %xmm2,32(%esp) | |
934 movdqa %xmm3,48(%esp) | |
935 xorl %eax,%eax | |
936 xorl %edx,%edx | |
937 xorl %ebp,%ebp | |
938 .L015tail_loop: | |
939 movb (%esp,%ebp,1),%al | |
940 movb (%esi,%ebp,1),%dl | |
941 leal 1(%ebp),%ebp | |
942 xorb %dl,%al | |
943 movb %al,-1(%edi,%ebp,1) | |
944 decl %ecx | |
945 jnz .L015tail_loop | |
946 .L011done: | |
/* Drop the aligned frame by restoring the caller's %esp saved at 512(%esp). */
947 movl 512(%esp),%esp | |
/* Zero-length direct calls land here with only the four pushes outstanding. */
.Lssse3_no_data:
948 popl %edi | |
949 popl %esi | |
950 popl %ebx | |
951 popl %ebp | |
952 ret | |
953 .size ChaCha20_ssse3,.-.L_ChaCha20_ssse3_begin | |
/*
 * Constant table for the SSSE3 code, addressed PC-relatively as
 * offset(%eax) with %eax = .Lssse3_data. Each row is 16 bytes.
 */
954 .align 64 | |
955 .Lssse3_data: | |
/* +0: pshufb mask that rotates every 32-bit lane left by 16 bits. */
956 .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 | |
/* +16: pshufb mask that rotates every 32-bit lane left by 8 bits. */
957 .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 | |
/* +32: state words 0..3: 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 ("expand 32-byte k"). */
958 .long 1634760805,857760878,2036477234,1797285236 | |
/* +48: per-lane counter offsets for the four parallel blocks. */
959 .long 0,1,2,3 | |
/* +64: counter step per four-block batch (also used for the initial pre-decrement). */
960 .long 4,4,4,4 | |
/* +80: counter step per block on the single-block path. */
961 .long 1,0,0,0 | |
/* +96: added to lane 0 of the last batch's counter when falling into the single-block path. */
962 .long 4,0,0,0 | |
/* +112: mask that clears word 0 and keeps words 1..3 of the caller's counter block. */
963 .long 0,-1,-1,-1 | |
964 .align 64 | |
/* NUL-terminated ASCII: "ChaCha20 for x86, CRYPTOGAMS by <appro@openssl.org>". */
965 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 | |
966 .byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 | |
967 .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 | |
968 .byte 114,103,62,0 | |
969 #endif | |
OLD | NEW |