OLD | NEW |
| (Empty) |
1 #!/usr/bin/env perl | |
2 # | |
3 # ==================================================================== | |
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 # project. The module is, however, dual licensed under OpenSSL and | |
6 # CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 # details see http://www.openssl.org/~appro/cryptogams/. | |
8 # ==================================================================== | |
9 # | |
10 # February 2009 | |
11 # | |
12 # Performance is 2x of gcc 3.4.6 on z10. Coding "secret" is to | |
13 # "cluster" Address Generation Interlocks, so that one pipeline stall | |
14 # resolves several dependencies. | |
15 | |
16 # November 2010. | |
17 # | |
18 # Adapt for -m31 build. If kernel supports what's called "highgprs" | |
19 # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | |
20 # instructions and achieve "64-bit" performance even in 31-bit legacy | |
21 # application context. The feature is not specific to any particular | |
22 # processor, as long as it's "z-CPU". Latter implies that the code | |
23 # remains z/Architecture specific. On z990 it was measured to perform | |
24 # 50% better than code generated by gcc 4.3. | |
25 | |
# Command line: <flavour> [other args ...] [<output-file>]
#
# The first argument selects the build flavour. A flavour containing "31" or
# "32" selects 4-byte register save slots and the plain stm/lm instructions;
# anything else selects 8-byte slots and the "g" (64-bit) forms stmg/lmg.
$flavour = shift;

if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Skip remaining arguments until one looks like a file name ("name.ext");
# that argument names the output file.  The loop leaves $output false when
# no such argument exists.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output file was actually named, and fail
# loudly if it cannot be created: otherwise the build would carry on with a
# missing or stale assembly file.  Without a file name the generated code
# goes to the inherited STDOUT.
if ($output) {
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}

$rp="%r14";	# register the generated functions return through ("br $rp")
$sp="%r15";	# base register of the save area used by stm/lm
$code=<<___;
.text

___
45 | |
# void RC4(RC4_KEY *key,size_t len,const void *inp,void *out)
#
# Appends the assembly for RC4 to $code.  The key structure is addressed as
# bytes: offset 0 holds x, offset 1 holds y, and the permutation table starts
# at offset 2.  The generated code handles len/8 blocks of eight bytes in an
# unrolled loop (.Loop8) and the remaining len%8 bytes one at a time
# (.Loop1), then writes x and y back.
{
# %r0/%r1 are scratch, %r2..%r5 are the four arguments in prototype order.
($acc,$cnt,$key,$len,$inp,$out)=map("%r$_",0..5);

# Register pairs that swap roles after every unrolled step:
#   XX = (current x, next x)      TX = (key[x], key[next x])
# YY is the y index; TY holds key[y] and then the index of the output byte.
@XX=map("%r$_",6,7);
@TX=map("%r$_",8,9);
($YY,$TY)=map("%r$_",10,11);

# Prologue: save the registers this routine uses beyond the arguments.
$code.=<<___;
.globl	RC4
.type	RC4,\@function
.align	64
RC4:
	stm${g}	%r6,%r11,6*$SIZE_T($sp)
___

if ($flavour =~ /3[12]/) {
	# In the 31/32-bit flavour, zero-extend the 32-bit len so the 64-bit
	# shift/and instructions below see a clean value.
	$code.=<<___;
	llgfr	$len,$len
___
}

# Load x and y, advance x, fetch key[x], and compute the 8-byte block count;
# a zero count goes straight to the byte-wise tail.
$code.=<<___;
	llgc	$XX[0],0($key)
	llgc	$YY,1($key)
	la	$XX[0],1($XX[0])
	nill	$XX[0],0xff
	srlg	$cnt,$len,3
	ltgr	$cnt,$cnt
	llgc	$TX[0],2($XX[0],$key)
	jz	.Lshort
	j	.Loop8

.align	64
.Loop8:
___

# Eight unrolled RC4 steps.  The output byte selected by step N is only
# fetched during step N+1, so step 0 fetches nothing, step 1 starts $acc and
# later steps shift $acc left and insert the next byte.
foreach my $step (0..7) {
	$code.=<<___;
	la	$YY,0($YY,$TX[0])	# $step
	nill	$YY,255
	la	$XX[1],1($XX[0])
	nill	$XX[1],255
___

	if ($step==1) {
		$code.=<<___;
	llgc	$acc,2($TY,$key)
___
	} elsif ($step>1) {
		$code.=<<___;
	sllg	$acc,$acc,8
	ic	$acc,2($TY,$key)
___
	}

	# Swap key[x] and key[y] and preload key[next x].  When next x equals y,
	# the byte to carry forward is the one just written there ($TX[0]).
	$code.=<<___;
	llgc	$TY,2($YY,$key)
	stc	$TX[0],2($YY,$key)
	llgc	$TX[1],2($XX[1],$key)
	stc	$TY,2($XX[0],$key)
	cr	$XX[1],$YY
	jne	.Lcmov${step}
	la	$TX[1],0($TX[0])
.Lcmov${step}:
	la	$TY,0($TY,$TX[0])
	nill	$TY,255
___

	# "rotate" registers: next x / key[next x] become the current pair
	@TX=@TX[1,0];
	@XX=@XX[1,0];
}

# Finish the 8-byte block (insert the eighth keystream byte, XOR with eight
# input bytes, store), then the byte-at-a-time tail and the epilogue.
$code.=<<___;
	lg	$TX[1],0($inp)
	sllg	$acc,$acc,8
	la	$inp,8($inp)
	ic	$acc,2($TY,$key)
	xgr	$acc,$TX[1]
	stg	$acc,0($out)
	la	$out,8($out)
	brctg	$cnt,.Loop8

.Lshort:
	lghi	$acc,7
	ngr	$len,$acc
	jz	.Lexit
	j	.Loop1

.align	16
.Loop1:
	la	$YY,0($YY,$TX[0])
	nill	$YY,255
	llgc	$TY,2($YY,$key)
	stc	$TX[0],2($YY,$key)
	stc	$TY,2($XX[0],$key)
	ar	$TY,$TX[0]
	ahi	$XX[0],1
	nill	$TY,255
	nill	$XX[0],255
	llgc	$acc,0($inp)
	la	$inp,1($inp)
	llgc	$TY,2($TY,$key)
	llgc	$TX[0],2($XX[0],$key)
	xr	$acc,$TY
	stc	$acc,0($out)
	la	$out,1($out)
	brct	$len,.Loop1

.Lexit:
	ahi	$XX[0],-1
	stc	$XX[0],0($key)
	stc	$YY,1($key)
	lm${g}	%r6,%r11,6*$SIZE_T($sp)
	br	$rp
.size	RC4,.-RC4
.string	"RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"

___
}
160 | |
# void RC4_set_key(RC4_KEY *key,unsigned int len,const void *inp)
#
# Appends the key-schedule routine to $code; the symbol actually emitted is
# private_RC4_set_key.  The generated code
#   1. zeroes the two bytes at offset 0 of the key structure (x and y) and
#      fills the 256-byte table at offset 2 with 0..255 (.L1stloop);
#   2. performs 256 swap steps (.L2ndloop), adding the table byte and the
#      next input byte into a running index each time and restarting the
#      input at offset 0 after every len bytes.
{
# %r0/%r1: loop counter and running index; %r2..%r4: the three arguments;
# %r5..%r8: temporaries (%r6..%r8 are saved and restored around the body).
($cnt,$idx,$key,$len,$inp,$acc,$dat,$ikey,$iinp)=map("%r$_",0..8);

# $ikey runs from -256 up to 0, so "2+256($ikey,$key)" walks the table from
# its first byte and the loop ends when the low eight bits of $ikey are zero.
$code.=<<___;
.globl	private_RC4_set_key
.type	private_RC4_set_key,\@function
.align	64
private_RC4_set_key:
	stm${g}	%r6,%r8,6*$SIZE_T($sp)
	lhi	$cnt,256
	la	$idx,0(%r0)
	sth	$idx,0($key)
.align	4
.L1stloop:
	stc	$idx,2($idx,$key)
	la	$idx,1($idx)
	brct	$cnt,.L1stloop

	lghi	$ikey,-256
	lr	$cnt,$len
	la	$iinp,0(%r0)
	la	$idx,0(%r0)
.align	16
.L2ndloop:
	llgc	$acc,2+256($ikey,$key)
	llgc	$dat,0($iinp,$inp)
	la	$idx,0($idx,$acc)
	la	$ikey,1($ikey)
	la	$idx,0($idx,$dat)
	nill	$idx,255
	la	$iinp,1($iinp)
	tml	$ikey,255
	llgc	$dat,2($idx,$key)
	stc	$dat,2+256-1($ikey,$key)
	stc	$acc,2($idx,$key)
	jz	.Ldone
	brct	$cnt,.L2ndloop
	lr	$cnt,$len
	la	$iinp,0(%r0)
	j	.L2ndloop
.Ldone:
	lm${g}	%r6,%r8,6*$SIZE_T($sp)
	br	$rp
.size	private_RC4_set_key,.-private_RC4_set_key

___
}
217 | |
# const char *RC4_options()
#
# Appends RC4_options to $code: it returns the address of the constant string
# "rc4(8x,char)" placed in .rodata.  The alignment directive is emitted
# before the .Loptions label so that the label always names the first byte
# of the string, never padding inserted by the alignment.
$code.=<<___;
.globl	RC4_options
.type	RC4_options,\@function
.align	16
RC4_options:
	larl	%r2,.Loptions
	br	%r14
.size	RC4_options,.-RC4_options
.section	.rodata
.align	8
.Loptions:
.string	"rc4(8x,char)"
___

# Emit everything accumulated in $code.  Closing STDOUT forces the flush;
# its result is checked because buffered write errors are only reported
# here, and an unnoticed failure would leave a truncated assembly file.
print $code;
close STDOUT or die "error closing STDOUT: $!";
OLD | NEW |