#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Even though
# loops are aggressively modulo-scheduled in respect to references to
# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
# scheduling "glitch," because uprofile(1) indicates uniform sample
# distribution, as if all instruction bundles execute in 1.5 cycles.
# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% than code generated by vendor
# compiler.

24 $cnt="v0"; # $0 | |
25 $t0="t0"; | |
26 $t1="t1"; | |
27 $t2="t2"; | |
28 $Thi0="t3"; # $4 | |
29 $Tlo0="t4"; | |
30 $Thi1="t5"; | |
31 $Tlo1="t6"; | |
32 $rem="t7"; # $8 | |
33 ################# | |
34 $Xi="a0"; # $16, input argument block | |
35 $Htbl="a1"; | |
36 $inp="a2"; | |
37 $len="a3"; | |
38 $nlo="a4"; # $20 | |
39 $nhi="a5"; | |
40 $Zhi="t8"; | |
41 $Zlo="t9"; | |
42 $Xhi="t10"; # $24 | |
43 $Xlo="t11"; | |
44 $remp="t12"; | |
45 $rem_4bit="AT"; # $28 | |
46 | |
47 { my $N; | |
48 sub loop() { | |
49 | |
50 $N++; | |
51 $code.=<<___; | |
52 .align 4 | |
53 extbl $Xlo,7,$nlo | |
54 and $nlo,0xf0,$nhi | |
55 sll $nlo,4,$nlo | |
56 and $nlo,0xf0,$nlo | |
57 | |
58 addq $nlo,$Htbl,$nlo | |
59 ldq $Zlo,8($nlo) | |
60 addq $nhi,$Htbl,$nhi | |
61 ldq $Zhi,0($nlo) | |
62 | |
63 and $Zlo,0x0f,$remp | |
64 sll $Zhi,60,$t0 | |
65 lda $cnt,6(zero) | |
66 extbl $Xlo,6,$nlo | |
67 | |
68 ldq $Tlo1,8($nhi) | |
69 s8addq $remp,$rem_4bit,$remp | |
70 ldq $Thi1,0($nhi) | |
71 srl $Zlo,4,$Zlo | |
72 | |
73 ldq $rem,0($remp) | |
74 srl $Zhi,4,$Zhi | |
75 xor $t0,$Zlo,$Zlo | |
76 and $nlo,0xf0,$nhi | |
77 | |
78 xor $Tlo1,$Zlo,$Zlo | |
79 sll $nlo,4,$nlo | |
80 xor $Thi1,$Zhi,$Zhi | |
81 and $nlo,0xf0,$nlo | |
82 | |
83 addq $nlo,$Htbl,$nlo | |
84 ldq $Tlo0,8($nlo) | |
85 addq $nhi,$Htbl,$nhi | |
86 ldq $Thi0,0($nlo) | |
87 | |
88 .Looplo$N: | |
89 and $Zlo,0x0f,$remp | |
90 sll $Zhi,60,$t0 | |
91 subq $cnt,1,$cnt | |
92 srl $Zlo,4,$Zlo | |
93 | |
94 ldq $Tlo1,8($nhi) | |
95 xor $rem,$Zhi,$Zhi | |
96 ldq $Thi1,0($nhi) | |
97 s8addq $remp,$rem_4bit,$remp | |
98 | |
99 ldq $rem,0($remp) | |
100 srl $Zhi,4,$Zhi | |
101 xor $t0,$Zlo,$Zlo | |
102 extbl $Xlo,$cnt,$nlo | |
103 | |
104 and $nlo,0xf0,$nhi | |
105 xor $Thi0,$Zhi,$Zhi | |
106 xor $Tlo0,$Zlo,$Zlo | |
107 sll $nlo,4,$nlo | |
108 | |
109 | |
110 and $Zlo,0x0f,$remp | |
111 sll $Zhi,60,$t0 | |
112 and $nlo,0xf0,$nlo | |
113 srl $Zlo,4,$Zlo | |
114 | |
115 s8addq $remp,$rem_4bit,$remp | |
116 xor $rem,$Zhi,$Zhi | |
117 addq $nlo,$Htbl,$nlo | |
118 addq $nhi,$Htbl,$nhi | |
119 | |
120 ldq $rem,0($remp) | |
121 srl $Zhi,4,$Zhi | |
122 ldq $Tlo0,8($nlo) | |
123 xor $t0,$Zlo,$Zlo | |
124 | |
125 xor $Tlo1,$Zlo,$Zlo | |
126 xor $Thi1,$Zhi,$Zhi | |
127 ldq $Thi0,0($nlo) | |
128 bne $cnt,.Looplo$N | |
129 | |
130 | |
131 and $Zlo,0x0f,$remp | |
132 sll $Zhi,60,$t0 | |
133 lda $cnt,7(zero) | |
134 srl $Zlo,4,$Zlo | |
135 | |
136 ldq $Tlo1,8($nhi) | |
137 xor $rem,$Zhi,$Zhi | |
138 ldq $Thi1,0($nhi) | |
139 s8addq $remp,$rem_4bit,$remp | |
140 | |
141 ldq $rem,0($remp) | |
142 srl $Zhi,4,$Zhi | |
143 xor $t0,$Zlo,$Zlo | |
144 extbl $Xhi,$cnt,$nlo | |
145 | |
146 and $nlo,0xf0,$nhi | |
147 xor $Thi0,$Zhi,$Zhi | |
148 xor $Tlo0,$Zlo,$Zlo | |
149 sll $nlo,4,$nlo | |
150 | |
151 and $Zlo,0x0f,$remp | |
152 sll $Zhi,60,$t0 | |
153 and $nlo,0xf0,$nlo | |
154 srl $Zlo,4,$Zlo | |
155 | |
156 s8addq $remp,$rem_4bit,$remp | |
157 xor $rem,$Zhi,$Zhi | |
158 addq $nlo,$Htbl,$nlo | |
159 addq $nhi,$Htbl,$nhi | |
160 | |
161 ldq $rem,0($remp) | |
162 srl $Zhi,4,$Zhi | |
163 ldq $Tlo0,8($nlo) | |
164 xor $t0,$Zlo,$Zlo | |
165 | |
166 xor $Tlo1,$Zlo,$Zlo | |
167 xor $Thi1,$Zhi,$Zhi | |
168 ldq $Thi0,0($nlo) | |
169 unop | |
170 | |
171 | |
172 .Loophi$N: | |
173 and $Zlo,0x0f,$remp | |
174 sll $Zhi,60,$t0 | |
175 subq $cnt,1,$cnt | |
176 srl $Zlo,4,$Zlo | |
177 | |
178 ldq $Tlo1,8($nhi) | |
179 xor $rem,$Zhi,$Zhi | |
180 ldq $Thi1,0($nhi) | |
181 s8addq $remp,$rem_4bit,$remp | |
182 | |
183 ldq $rem,0($remp) | |
184 srl $Zhi,4,$Zhi | |
185 xor $t0,$Zlo,$Zlo | |
186 extbl $Xhi,$cnt,$nlo | |
187 | |
188 and $nlo,0xf0,$nhi | |
189 xor $Thi0,$Zhi,$Zhi | |
190 xor $Tlo0,$Zlo,$Zlo | |
191 sll $nlo,4,$nlo | |
192 | |
193 | |
194 and $Zlo,0x0f,$remp | |
195 sll $Zhi,60,$t0 | |
196 and $nlo,0xf0,$nlo | |
197 srl $Zlo,4,$Zlo | |
198 | |
199 s8addq $remp,$rem_4bit,$remp | |
200 xor $rem,$Zhi,$Zhi | |
201 addq $nlo,$Htbl,$nlo | |
202 addq $nhi,$Htbl,$nhi | |
203 | |
204 ldq $rem,0($remp) | |
205 srl $Zhi,4,$Zhi | |
206 ldq $Tlo0,8($nlo) | |
207 xor $t0,$Zlo,$Zlo | |
208 | |
209 xor $Tlo1,$Zlo,$Zlo | |
210 xor $Thi1,$Zhi,$Zhi | |
211 ldq $Thi0,0($nlo) | |
212 bne $cnt,.Loophi$N | |
213 | |
214 | |
215 and $Zlo,0x0f,$remp | |
216 sll $Zhi,60,$t0 | |
217 srl $Zlo,4,$Zlo | |
218 | |
219 ldq $Tlo1,8($nhi) | |
220 xor $rem,$Zhi,$Zhi | |
221 ldq $Thi1,0($nhi) | |
222 s8addq $remp,$rem_4bit,$remp | |
223 | |
224 ldq $rem,0($remp) | |
225 srl $Zhi,4,$Zhi | |
226 xor $t0,$Zlo,$Zlo | |
227 | |
228 xor $Tlo0,$Zlo,$Zlo | |
229 xor $Thi0,$Zhi,$Zhi | |
230 | |
231 and $Zlo,0x0f,$remp | |
232 sll $Zhi,60,$t0 | |
233 srl $Zlo,4,$Zlo | |
234 | |
235 s8addq $remp,$rem_4bit,$remp | |
236 xor $rem,$Zhi,$Zhi | |
237 | |
238 ldq $rem,0($remp) | |
239 srl $Zhi,4,$Zhi | |
240 xor $Tlo1,$Zlo,$Zlo | |
241 xor $Thi1,$Zhi,$Zhi | |
242 xor $t0,$Zlo,$Zlo | |
243 xor $rem,$Zhi,$Zhi | |
244 ___ | |
245 }} | |
246 | |
247 $code=<<___; | |
248 #ifdef __linux__ | |
249 #include <asm/regdef.h> | |
250 #else | |
251 #include <asm.h> | |
252 #include <regdef.h> | |
253 #endif | |
254 | |
255 .text | |
256 | |
257 .set noat | |
258 .set noreorder | |
259 .globl gcm_gmult_4bit | |
260 .align 4 | |
261 .ent gcm_gmult_4bit | |
262 gcm_gmult_4bit: | |
263 .frame sp,0,ra | |
264 .prologue 0 | |
265 | |
266 ldq $Xlo,8($Xi) | |
267 ldq $Xhi,0($Xi) | |
268 | |
269 br $rem_4bit,.Lpic1 | |
270 .Lpic1: lda $rem_4bit,rem_4bit-.Lpic1($rem_4bit) | |
271 ___ | |
272 | |
273 &loop(); | |
274 | |
275 $code.=<<___; | |
276 srl $Zlo,24,$t0 # byte swap | |
277 srl $Zlo,8,$t1 | |
278 | |
279 sll $Zlo,8,$t2 | |
280 sll $Zlo,24,$Zlo | |
281 zapnot $t0,0x11,$t0 | |
282 zapnot $t1,0x22,$t1 | |
283 | |
284 zapnot $Zlo,0x88,$Zlo | |
285 or $t0,$t1,$t0 | |
286 zapnot $t2,0x44,$t2 | |
287 | |
288 or $Zlo,$t0,$Zlo | |
289 srl $Zhi,24,$t0 | |
290 srl $Zhi,8,$t1 | |
291 | |
292 or $Zlo,$t2,$Zlo | |
293 sll $Zhi,8,$t2 | |
294 sll $Zhi,24,$Zhi | |
295 | |
296 srl $Zlo,32,$Xlo | |
297 sll $Zlo,32,$Zlo | |
298 | |
299 zapnot $t0,0x11,$t0 | |
300 zapnot $t1,0x22,$t1 | |
301 or $Zlo,$Xlo,$Xlo | |
302 | |
303 zapnot $Zhi,0x88,$Zhi | |
304 or $t0,$t1,$t0 | |
305 zapnot $t2,0x44,$t2 | |
306 | |
307 or $Zhi,$t0,$Zhi | |
308 or $Zhi,$t2,$Zhi | |
309 | |
310 srl $Zhi,32,$Xhi | |
311 sll $Zhi,32,$Zhi | |
312 | |
313 or $Zhi,$Xhi,$Xhi | |
314 stq $Xlo,8($Xi) | |
315 stq $Xhi,0($Xi) | |
316 | |
317 ret (ra) | |
318 .end gcm_gmult_4bit | |
319 ___ | |
320 | |
321 $inhi="s0"; | |
322 $inlo="s1"; | |
323 | |
324 $code.=<<___; | |
325 .globl gcm_ghash_4bit | |
326 .align 4 | |
327 .ent gcm_ghash_4bit | |
328 gcm_ghash_4bit: | |
329 lda sp,-32(sp) | |
330 stq ra,0(sp) | |
331 stq s0,8(sp) | |
332 stq s1,16(sp) | |
333 .mask 0x04000600,-32 | |
334 .frame sp,32,ra | |
335 .prologue 0 | |
336 | |
337 ldq_u $inhi,0($inp) | |
338 ldq_u $Thi0,7($inp) | |
339 ldq_u $inlo,8($inp) | |
340 ldq_u $Tlo0,15($inp) | |
341 ldq $Xhi,0($Xi) | |
342 ldq $Xlo,8($Xi) | |
343 | |
344 br $rem_4bit,.Lpic2 | |
345 .Lpic2: lda $rem_4bit,rem_4bit-.Lpic2($rem_4bit) | |
346 | |
347 .Louter: | |
348 extql $inhi,$inp,$inhi | |
349 extqh $Thi0,$inp,$Thi0 | |
350 or $inhi,$Thi0,$inhi | |
351 lda $inp,16($inp) | |
352 | |
353 extql $inlo,$inp,$inlo | |
354 extqh $Tlo0,$inp,$Tlo0 | |
355 or $inlo,$Tlo0,$inlo | |
356 subq $len,16,$len | |
357 | |
358 xor $Xlo,$inlo,$Xlo | |
359 xor $Xhi,$inhi,$Xhi | |
360 ___ | |
361 | |
362 &loop(); | |
363 | |
364 $code.=<<___; | |
365 srl $Zlo,24,$t0 # byte swap | |
366 srl $Zlo,8,$t1 | |
367 | |
368 sll $Zlo,8,$t2 | |
369 sll $Zlo,24,$Zlo | |
370 zapnot $t0,0x11,$t0 | |
371 zapnot $t1,0x22,$t1 | |
372 | |
373 zapnot $Zlo,0x88,$Zlo | |
374 or $t0,$t1,$t0 | |
375 zapnot $t2,0x44,$t2 | |
376 | |
377 or $Zlo,$t0,$Zlo | |
378 srl $Zhi,24,$t0 | |
379 srl $Zhi,8,$t1 | |
380 | |
381 or $Zlo,$t2,$Zlo | |
382 sll $Zhi,8,$t2 | |
383 sll $Zhi,24,$Zhi | |
384 | |
385 srl $Zlo,32,$Xlo | |
386 sll $Zlo,32,$Zlo | |
387 beq $len,.Ldone | |
388 | |
389 zapnot $t0,0x11,$t0 | |
390 zapnot $t1,0x22,$t1 | |
391 or $Zlo,$Xlo,$Xlo | |
392 ldq_u $inhi,0($inp) | |
393 | |
394 zapnot $Zhi,0x88,$Zhi | |
395 or $t0,$t1,$t0 | |
396 zapnot $t2,0x44,$t2 | |
397 ldq_u $Thi0,7($inp) | |
398 | |
399 or $Zhi,$t0,$Zhi | |
400 or $Zhi,$t2,$Zhi | |
401 ldq_u $inlo,8($inp) | |
402 ldq_u $Tlo0,15($inp) | |
403 | |
404 srl $Zhi,32,$Xhi | |
405 sll $Zhi,32,$Zhi | |
406 | |
407 or $Zhi,$Xhi,$Xhi | |
408 br zero,.Louter | |
409 | |
410 .Ldone: | |
411 zapnot $t0,0x11,$t0 | |
412 zapnot $t1,0x22,$t1 | |
413 or $Zlo,$Xlo,$Xlo | |
414 | |
415 zapnot $Zhi,0x88,$Zhi | |
416 or $t0,$t1,$t0 | |
417 zapnot $t2,0x44,$t2 | |
418 | |
419 or $Zhi,$t0,$Zhi | |
420 or $Zhi,$t2,$Zhi | |
421 | |
422 srl $Zhi,32,$Xhi | |
423 sll $Zhi,32,$Zhi | |
424 | |
425 or $Zhi,$Xhi,$Xhi | |
426 | |
427 stq $Xlo,8($Xi) | |
428 stq $Xhi,0($Xi) | |
429 | |
430 .set noreorder | |
431 /*ldq ra,0(sp)*/ | |
432 ldq s0,8(sp) | |
433 ldq s1,16(sp) | |
434 lda sp,32(sp) | |
435 ret (ra) | |
436 .end gcm_ghash_4bit | |
437 | |
438 .align 4 | |
439 rem_4bit: | |
440 .quad 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 | |
441 .quad 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 | |
442 .quad 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 | |
443 .quad 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 | |
444 .ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>" | |
445 .align 4 | |
446 | |
447 ___ | |
448 $output=shift and open STDOUT,">$output"; | |
449 print $code; | |
450 close STDOUT; | |