#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# July 2004
#
# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
# "hand-coded assembler"] doesn't stand for the whole improvement
# coefficient. It turned out that eliminating RC4_CHAR from config
# line results in ~40% improvement (yes, even for C implementation).
# Presumably it has everything to do with AMD cache architecture and
# RAW or whatever penalties. Once again! The module *requires* config
# line *without* RC4_CHAR! As for coding "secret," I bet on partial
# register arithmetic. For example instead of 'inc %r8; and $255,%r8'
# I simply 'inc %r8b'. Even though optimization manual discourages
# operating on partial registers, it turned out to be the best bet.
# At least for AMD... How IA32E would perform remains to be seen...

# November 2004
#
# As was shown by Marc Bevand reordering of couple of load operations
# results in even higher performance gain of 3.3x:-) At least on
# Opteron... For reference, 1x in this case is RC4_CHAR C-code
# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
# Latter means that if you want to *estimate* what to expect from
# *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.

# November 2004
#
# Intel P4 EM64T core was found to run the AMD64 code really slow...
# The only way to achieve comparable performance on P4 was to keep
# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
# compose blended code, which would perform even within 30% marginal
# on both AMD and Intel platforms, I implement both cases. See
# rc4_skey.c for further details...

# April 2005
#
# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
# those with add/sub results in 50% performance improvement of folded
# loop...

# May 2005
#
# As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
# performance by >30% [unlike P4 32-bit case that is]. But this is
# provided that loads are reordered even more aggressively! Both code
# paths, AMD64 and EM64T, reorder loads in essentially same manner
# as my IA-64 implementation. On Opteron this resulted in modest 5%
# improvement [I had to test it], while final Intel P4 performance
# achieves respectable 432MBps on 2.8GHz processor now. For reference:
# if executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
# RC4_INT code-path, while if executed on Opteron, it's only 25%
# slower than the RC4_INT one [meaning that if CPU µ-arch detection
# is not implemented, then this final RC4_CHAR code-path should be
# preferred, as it provides better *all-round* performance].

# March 2007
#
# Intel Core2 was observed to perform poorly on both code paths:-( It
# apparently suffers from some kind of partial register stall, which
# occurs in 64-bit mode only [as virtually identical 32-bit loop was
# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
# cloop1 boosts its performance by 80%! This loop appears to be optimal
# fit for Core2 and therefore the code was modified to skip cloop8 on
# this CPU.

# May 2010
#
# Intel Westmere was observed to perform suboptimally. Adding yet
# another movzb to cloop1 improved performance by almost 50%! Core2
# performance is improved too, but nominally...

# May 2011
#
# The only code path that was not modified is P4-specific one. Non-P4
# Intel code path optimization is heavily based on submission by Maxim
# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used
# some of the ideas even in an attempt to optimize the original RC4_INT
# code path... Current performance in cycles per processed byte (less
# is better) and improvement coefficients relative to previous
# version of this module are:
#
# Opteron	5.3/+0%(*)
# P4		6.5
# Core2		6.2/+15%(**)
# Westmere	4.2/+60%
# Sandy Bridge	4.2/+120%
# Atom		9.3/+80%
#
# (*)	But corresponding loop has fewer instructions, which should have
#	positive effect on upcoming Bulldozer, which has one less ALU.
#	For reference, Intel code runs at 6.8 cpb rate on Opteron.
# (**)	Note that Core2 result is ~15% lower than corresponding result
#	for 32-bit code, meaning that it's possible to improve it,
#	but more than likely at the cost of the others (see rc4-586.pl
#	to get the idea)...

# ----------------------------------------------------------------------
# Command line handling and output plumbing.
#
# Usage: <this script> [flavour] [output]
#
# Sets the globals used by the rest of the script:
#   $flavour, $output  - arguments forwarded to x86_64-xlate.pl
#   $win64             - true when Win64 SEH tables must be emitted
#   $xlate             - path of the x86_64-xlate.pl translator
#   $dat/$len/$inp/$out - registers carrying arguments 1..4
# and redirects STDOUT into a pipe feeding the translator.
# ----------------------------------------------------------------------
$flavour = shift;
$output  = shift;
# A single argument containing a dot is taken to be the output file name.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# nasm/masm/mingw64 flavours, or a .asm output file, select Win64 mode.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of $0 (with trailing separator); empty when $0 has none.
# Only read $1 when the match actually succeeded.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
# Look for the translator next to this script, then in ../../perlasm/.
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe everything printed to STDOUT through the translator.  Paths and
# arguments are quoted so that names containing spaces survive the shell;
# absent arguments are omitted rather than passed as empty strings.
{
    my @xlate_args = map { "\"$_\"" }
                     grep { defined($_) && length($_) } ($flavour, $output);
    open OUT, join(" ", "| \"$^X\"", "\"$xlate\"", @xlate_args)
        or die "can't call $xlate: $!";
}
*STDOUT=*OUT;

$dat="%rdi";	    # arg1
$len="%rsi";	    # arg2
$inp="%rdx";	    # arg3
$out="%rcx";	    # arg4

# ----------------------------------------------------------------------
# Generator for the `RC4` routine (declared with four arguments held in
# $dat, $len, $inp, $out).
#
# Outline of the emitted code, as far as it can be read off the
# templates below:
#   * returns at once when $len is zero;
#   * saves %rbx/%r12/%r13 and copies len/inp/out to %r11/%r12/%r13;
#   * advances $dat by 8 and loads the two state indices from -8($dat)
#     and -4($dat);
#   * if the 32-bit word at 256($dat) equals -1 the table is handled as
#     bytes (.LRC4_CHAR, .Lcloop8, .Lcloop1), otherwise as 32-bit
#     elements;
#   * on the 32-bit path bit 30 of OPENSSL_ia32cap_P ("Intel CPU?")
#     selects .Lintel (16x unrolled, results gathered in %xmm0/%xmm1),
#     otherwise .Loop8 (8x unrolled, results gathered in %r8);
#   * leftovers are handled one byte at a time in .Lloop1/.Lcloop1;
#   * .Lexit stores the indices back and restores the saved registers.
#
# The bare block scopes the `my` register assignments below so they do
# not leak into the key-setup generator that follows.
# ----------------------------------------------------------------------
{
$code=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	RC4
.type	RC4,\@function,4
.align	16
RC4:	or	$len,$len
	jne	.Lentry
	ret
.Lentry:
	push	%rbx
	push	%r12
	push	%r13
.Lprologue:
	mov	$len,%r11
	mov	$inp,%r12
	mov	$out,%r13
___
my $len="%r11";		# reassign input arguments
my $inp="%r12";
my $out="%r13";

# Working registers: @XX index x (two copies), @TX table values at x,
# $YY index y, $TY table value at y / scratch.
my @XX=("%r10","%rsi");
my @TX=("%rax","%rbx");
my $YY="%rcx";
my $TY="%rdx";

# Common entry, CPU dispatch and the single-step warm-up that runs until
# the low three bits of the index register are clear (count = -index & 7).
# NOTE(review): presumably needed so the fixed 4*$i displacements of the
# unrolled loop never run past the table -- confirm.
$code.=<<___;
	xor	$XX[0],$XX[0]
	xor	$YY,$YY

	lea	8($dat),$dat
	mov	-8($dat),$XX[0]#b
	mov	-4($dat),$YY#b
	cmpl	\$-1,256($dat)
	je	.LRC4_CHAR
	mov	OPENSSL_ia32cap_P(%rip),%r8d
	xor	$TX[1],$TX[1]
	inc	$XX[0]#b
	sub	$XX[0],$TX[1]
	sub	$inp,$out
	movl	($dat,$XX[0],4),$TX[0]#d
	test	\$-16,$len
	jz	.Lloop1
	bt	\$30,%r8d	# Intel CPU?
	jc	.Lintel
	and	\$7,$TX[1]
	lea	1($XX[0]),$XX[1]
	jz	.Loop8
	sub	$TX[1],$len
.Loop8_warmup:
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX[0],4)
	add	$TY#b,$TX[0]#b
	inc	$XX[0]#b
	movl	($dat,$TX[0],4),$TY#d
	movl	($dat,$XX[0],4),$TX[0]#d
	xorb	($inp),$TY#b
	movb	$TY#b,($out,$inp)
	lea	1($inp),$inp
	dec	$TX[1]
	jnz	.Loop8_warmup

	lea	1($XX[0]),$XX[1]
	jmp	.Loop8
.align	16
.Loop8:
___
# 8x unrolled body: each step deposits one key-stream byte in %r8b and
# rotates %r8 by 8, so eight bytes are XORed with the input at once.
# @TX is rotated after every step so the value preloaded into $TX[1]
# becomes $TX[0] of the next step.
for ($i=0;$i<8;$i++) {
$code.=<<___ if ($i==7);
	add	\$8,$XX[1]#b
___
$code.=<<___;
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	`4*($i==7?-1:$i)`($dat,$XX[1],4),$TX[1]#d
	ror	\$8,%r8				# ror is redundant when $i=0
	movl	$TY#d,4*$i($dat,$XX[0],4)
	add	$TX[0]#b,$TY#b
	movb	($dat,$TY,4),%r8b
___
push(@TX,shift(@TX)); #push(@XX,shift(@XX));	# "rotate" registers
}
# Loop8 tail, followed by the Intel entry: single-step warm-up until the
# low four bits of the index register are clear (count = -index & 15).
$code.=<<___;
	add	\$8,$XX[0]#b
	ror	\$8,%r8
	sub	\$8,$len

	xor	($inp),%r8
	mov	%r8,($out,$inp)
	lea	8($inp),$inp

	test	\$-8,$len
	jnz	.Loop8
	cmp	\$0,$len
	jne	.Lloop1
	jmp	.Lexit

.align	16
.Lintel:
	test	\$-32,$len
	jz	.Lloop1
	and	\$15,$TX[1]
	jz	.Loop16_is_hot
	sub	$TX[1],$len
.Loop16_warmup:
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX[0],4)
	add	$TY#b,$TX[0]#b
	inc	$XX[0]#b
	movl	($dat,$TX[0],4),$TY#d
	movl	($dat,$XX[0],4),$TX[0]#d
	xorb	($inp),$TY#b
	movb	$TY#b,($out,$inp)
	lea	1($inp),$inp
	dec	$TX[1]
	jnz	.Loop16_warmup

	mov	$YY,$TX[1]
	xor	$YY,$YY
	mov	$TX[1]#b,$YY#b

.Loop16_is_hot:
	lea	($dat,$XX[0],4),$XX[1]
___
# Appends one step of the 16x unrolled loop to $code.
#
#   $i  step number 0..15, or -1 for the priming step emitted ahead of
#       the loop (it is generated like step 0 but without the
#       combine/store instructions that are guarded by $i==0).
#
# Even steps insert their byte into %xmm0, odd steps into %xmm1, at word
# position ($j>>1)&7.  Step 0 of the following iteration shifts %xmm1
# left by 8 bits, XORs both registers with the 16 input bytes loaded in
# step 15 (%xmm2) and stores the result.
sub RC4_loop {
  my $i=shift;
  my $j=$i<0?0:$i;
  my $xmm="%xmm".($j&1);

    $code.="\tadd\t\$16,$XX[0]#b\n"			if ($i==15);
    $code.="\tmovdqu\t($inp),%xmm2\n"			if ($i==15);
    $code.="\tadd\t$TX[0]#b,$YY#b\n"			if ($i<=0);
    $code.="\tmovl\t($dat,$YY,4),$TY#d\n";
    $code.="\tpxor\t%xmm0,%xmm2\n"			if ($i==0);
    $code.="\tpsllq\t\$8,%xmm1\n"			if ($i==0);
    $code.="\tpxor\t$xmm,$xmm\n"			if ($i<=1);
    $code.="\tmovl\t$TX[0]#d,($dat,$YY,4)\n";
    $code.="\tadd\t$TY#b,$TX[0]#b\n";
    $code.="\tmovl\t`4*($j+1)`($XX[1]),$TX[1]#d\n"	if ($i<15);
    $code.="\tmovz\t$TX[0]#b,$TX[0]#d\n";
    $code.="\tmovl\t$TY#d,4*$j($XX[1])\n";
    $code.="\tpxor\t%xmm1,%xmm2\n"			if ($i==0);
    $code.="\tlea\t($dat,$XX[0],4),$XX[1]\n"		if ($i==15);
    $code.="\tadd\t$TX[1]#b,$YY#b\n"			if ($i<15);
    $code.="\tpinsrw\t\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n";
    $code.="\tmovdqu\t%xmm2,($out,$inp)\n"		if ($i==0);
    $code.="\tlea\t16($inp),$inp\n"			if ($i==0);
    $code.="\tmovl\t($XX[1]),$TX[1]#d\n"		if ($i==15);
}
	RC4_loop(-1);
$code.=<<___;
	jmp	.Loop16_enter
.align	16
.Loop16:
___

# The priming step above stands in for step 0 on the first pass, so the
# loop is entered at step 1 (.Loop16_enter).
for ($i=0;$i<16;$i++) {
    $code.=".Loop16_enter:\n"		if ($i==1);
	RC4_loop($i);
	push(@TX,shift(@TX)); 		# "rotate" registers
}
# Loop16 tail (flushes the last 16 bytes), the generic one-byte loop, and
# the entry of the byte-table (RC4_CHAR) path.
$code.=<<___;
	mov	$YY,$TX[1]
	xor	$YY,$YY			# keyword to partial register
	sub	\$16,$len
	mov	$TX[1]#b,$YY#b
	test	\$-16,$len
	jnz	.Loop16

	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm2
	pxor	%xmm1,%xmm2
	movdqu	%xmm2,($out,$inp)
	lea	16($inp),$inp

	cmp	\$0,$len
	jne	.Lloop1
	jmp	.Lexit

.align	16
.Lloop1:
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX[0],4)
	add	$TY#b,$TX[0]#b
	inc	$XX[0]#b
	movl	($dat,$TX[0],4),$TY#d
	movl	($dat,$XX[0],4),$TX[0]#d
	xorb	($inp),$TY#b
	movb	$TY#b,($out,$inp)
	lea	1($inp),$inp
	dec	$len
	jnz	.Lloop1
	jmp	.Lexit

.align	16
.LRC4_CHAR:
	add	\$1,$XX[0]#b
	movzb	($dat,$XX[0]),$TX[0]#d
	test	\$-8,$len
	jz	.Lcloop1
	jmp	.Lcloop8
.align	16
.Lcloop8:
	mov	($inp),%r8d
	mov	4($inp),%r9d
___
# unroll 2x4-wise, because 64-bit rotates kill Intel P4...
# First four bytes are folded into %r8d, the next four into %r9d.  Both
# @TX and @XX are rotated after every step.
for ($i=0;$i<4;$i++) {
$code.=<<___;
	add	$TX[0]#b,$YY#b
	lea	1($XX[0]),$XX[1]
	movzb	($dat,$YY),$TY#d
	movzb	$XX[1]#b,$XX[1]#d
	movzb	($dat,$XX[1]),$TX[1]#d
	movb	$TX[0]#b,($dat,$YY)
	cmp	$XX[1],$YY
	movb	$TY#b,($dat,$XX[0])
	jne	.Lcmov$i			# Intel cmov is sloooow...
	mov	$TX[0],$TX[1]
.Lcmov$i:
	add	$TX[0]#b,$TY#b
	xor	($dat,$TY),%r8b
	ror	\$8,%r8d
___
push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
}
for ($i=4;$i<8;$i++) {
$code.=<<___;
	add	$TX[0]#b,$YY#b
	lea	1($XX[0]),$XX[1]
	movzb	($dat,$YY),$TY#d
	movzb	$XX[1]#b,$XX[1]#d
	movzb	($dat,$XX[1]),$TX[1]#d
	movb	$TX[0]#b,($dat,$YY)
	cmp	$XX[1],$YY
	movb	$TY#b,($dat,$XX[0])
	jne	.Lcmov$i			# Intel cmov is sloooow...
	mov	$TX[0],$TX[1]
.Lcmov$i:
	add	$TX[0]#b,$TY#b
	xor	($dat,$TY),%r9b
	ror	\$8,%r9d
___
push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
}
$code.=<<___;
	lea	-8($len),$len
	mov	%r8d,($out)
	lea	8($inp),$inp
	mov	%r9d,4($out)
	lea	8($out),$out

	test	\$-8,$len
	jnz	.Lcloop8
	cmp	\$0,$len
	jne	.Lcloop1
	jmp	.Lexit
___
# One-byte loop of the byte-table path, then the shared exit: the index
# register is decremented again before being stored, and the three saved
# registers are reloaded from the stack.
$code.=<<___;
.align	16
.Lcloop1:
	add	$TX[0]#b,$YY#b
	movzb	$YY#b,$YY#d
	movzb	($dat,$YY),$TY#d
	movb	$TX[0]#b,($dat,$YY)
	movb	$TY#b,($dat,$XX[0])
	add	$TX[0]#b,$TY#b
	add	\$1,$XX[0]#b
	movzb	$TY#b,$TY#d
	movzb	$XX[0]#b,$XX[0]#d
	movzb	($dat,$TY),$TY#d
	movzb	($dat,$XX[0]),$TX[0]#d
	xorb	($inp),$TY#b
	lea	1($inp),$inp
	movb	$TY#b,($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lcloop1
	jmp	.Lexit

.align	16
.Lexit:
	sub	\$1,$XX[0]#b
	movl	$XX[0]#d,-8($dat)
	movl	$YY#d,-4($dat)

	mov	(%rsp),%r13
	mov	8(%rsp),%r12
	mov	16(%rsp),%rbx
	add	\$24,%rsp
.Lepilogue:
	ret
.size	RC4,.-RC4
___
}

# ----------------------------------------------------------------------
# Generator for `private_RC4_set_key` (three arguments, in $dat, $len,
# $inp) and `RC4_options`.
#
# private_RC4_set_key, as emitted below:
#   * advances $dat by 8, points $inp past the end of the key material
#     and keeps the negated length in both $len and %rcx, so $len counts
#     up towards zero and is reloaded from %rcx when it gets there;
#   * bit 20 of OPENSSL_ia32cap_P ("RC4_CHAR?") chooses between a table
#     of bytes (.Lc1stloop/.Lc2ndloop) and a table of 32-bit elements
#     (.Lw1stloop/.Lw2ndloop);
#   * the first loop fills the table with 0..255, the second one swaps
#     entries while walking the key;
#   * the byte-table variant additionally stores -1 at 256($dat), which
#     is the marker the RC4 routine tests to pick its byte-table path;
#   * finally both 32-bit words in front of the table are cleared.
#
# RC4_options returns a pointer into .Lopts: offset 0 by default, 12 when
# bit 20 is set, 25 when bit 20 is clear and bit 30 is set.  The offsets
# equal the lengths, including the terminating NUL, of the strings that
# precede each entry.
# ----------------------------------------------------------------------
$idx="%r8";
$ido="%r9";

$code.=<<___;
.globl	private_RC4_set_key
.type	private_RC4_set_key,\@function,3
.align	16
private_RC4_set_key:
	lea	8($dat),$dat
	lea	($inp,$len),$inp
	neg	$len
	mov	$len,%rcx
	xor	%eax,%eax
	xor	$ido,$ido
	xor	%r10,%r10
	xor	%r11,%r11

	mov	OPENSSL_ia32cap_P(%rip),$idx#d
	bt	\$20,$idx#d	# RC4_CHAR?
	jc	.Lc1stloop
	jmp	.Lw1stloop

.align	16
.Lw1stloop:
	mov	%eax,($dat,%rax,4)
	add	\$1,%al
	jnc	.Lw1stloop

	xor	$ido,$ido
	xor	$idx,$idx
.align	16
.Lw2ndloop:
	mov	($dat,$ido,4),%r10d
	add	($inp,$len,1),$idx#b
	add	%r10b,$idx#b
	add	\$1,$len
	mov	($dat,$idx,4),%r11d
	cmovz	%rcx,$len
	mov	%r10d,($dat,$idx,4)
	mov	%r11d,($dat,$ido,4)
	add	\$1,$ido#b
	jnc	.Lw2ndloop
	jmp	.Lexit_key

.align	16
.Lc1stloop:
	mov	%al,($dat,%rax)
	add	\$1,%al
	jnc	.Lc1stloop

	xor	$ido,$ido
	xor	$idx,$idx
.align	16
.Lc2ndloop:
	mov	($dat,$ido),%r10b
	add	($inp,$len),$idx#b
	add	%r10b,$idx#b
	add	\$1,$len
	mov	($dat,$idx),%r11b
	jnz	.Lcnowrap
	mov	%rcx,$len
.Lcnowrap:
	mov	%r10b,($dat,$idx)
	mov	%r11b,($dat,$ido)
	add	\$1,$ido#b
	jnc	.Lc2ndloop
	movl	\$-1,256($dat)

.align	16
.Lexit_key:
	xor	%eax,%eax
	mov	%eax,-8($dat)
	mov	%eax,-4($dat)
	ret
.size	private_RC4_set_key,.-private_RC4_set_key

.globl	RC4_options
.type	RC4_options,\@abi-omnipotent
.align	16
RC4_options:
	lea	.Lopts(%rip),%rax
	mov	OPENSSL_ia32cap_P(%rip),%edx
	bt	\$20,%edx
	jc	.L8xchar
	bt	\$30,%edx
	jnc	.Ldone
	add	\$25,%rax
	ret
.L8xchar:
	add	\$12,%rax
.Ldone:
	ret
.align	64
.Lopts:
.asciz	"rc4(8x,int)"
.asciz	"rc4(8x,char)"
.asciz	"rc4(16x,int)"
.asciz	"RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
.size	RC4_options,.-RC4_options
___

# ----------------------------------------------------------------------
# Win64 structured-exception support; emitted only when $win64 is set.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# stream_se_handler (attached to RC4):
#   * context->Rip below .Lprologue: nothing has been moved yet, the
#     value pulled from context->Rax is used as the frame address;
#   * context->Rip at or beyond .Lepilogue: the pushes have already been
#     undone, context->Rsp is used as is;
#   * otherwise: Rbx/R12/R13 are recovered from the three slots pushed by
#     RC4 and the frame address is moved 24 bytes up.
#   In all cases Rdi/Rsi are then reloaded from 8 and 16 bytes above the
#   frame address.
#   NOTE(review): using Rax in the first case and the 8/16 offsets assume
#   the entry sequence x86_64-xlate.pl generates for Win64 -- verify
#   against the translator.
#
# key_se_handler (attached to private_RC4_set_key) only recovers
# Rdi/Rsi relative to context->Rsp.
#
# Both handlers share .Lcommon_seh_exit, which copies the context (154
# quadwords) to disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
# ----------------------------------------------------------------------
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	stream_se_handler,\@abi-omnipotent
.align	16
stream_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	24(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%r12
	mov	-24(%rax),%r13
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	jmp	.Lcommon_seh_exit
.size	stream_se_handler,.-stream_se_handler

.type	key_se_handler,\@abi-omnipotent
.align	16
key_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

.Lcommon_seh_exit:

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	key_se_handler,.-key_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_RC4
	.rva	.LSEH_end_RC4
	.rva	.LSEH_info_RC4

	.rva	.LSEH_begin_private_RC4_set_key
	.rva	.LSEH_end_private_RC4_set_key
	.rva	.LSEH_info_private_RC4_set_key

.section	.xdata
.align	8
.LSEH_info_RC4:
	.byte	9,0,0,0
	.rva	stream_se_handler
.LSEH_info_private_RC4_set_key:
	.byte	9,0,0,0
	.rva	key_se_handler
___
}

# Translate the "%reg#b" / "%reg#w" / "%reg#d" shorthand used in the
# templates into the name of the matching 8-, 16- or 32-bit sub-register.
#
#   $reg   register name, e.g. "%rax" or "%r10"
#   $conv  "b", "w" or "d"
#
# Returns the sub-register name.  Numbered registers simply get the
# suffix appended (%r10 -> %r10b); legacy registers are renamed
# (%rax -> %al / %ax / %eax, %rsi -> %sil / %si / %esi).  Any other
# $conv leaves a legacy register name unchanged.
sub reg_part {
    my ($reg,$conv)=@_;

    # %r8..%r15: the size suffix is part of the register name.
    return $reg.$conv if ($reg =~ /%r[0-9]+/);

    if    ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
    elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/;       }
    elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/;      }

    return $reg;
}

# Final pass over the accumulated template text:
#   1. replace every "%reg#b|w|d" with the sub-register name from reg_part;
#   2. replace every `...` span with the result of evaluating it as Perl
#      (used for the computed displacements and immediates above).
$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;

# STDOUT is a pipe to the translator: close() is where a failing child or
# a buffered write error becomes visible, so do not ignore its result.
close STDOUT or die "error closing STDOUT: $!";