Chromium Code Reviews

Side by Side Diff: openssl/crypto/modes/asm/ghash-alpha.pl

Issue 2072073002: Delete bundled copy of OpenSSL and replace with README. (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/openssl@master
Patch Set: Delete bundled copy of OpenSSL and replace with README. Created 4 years, 6 months ago
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # March 2010
11 #
12 # The module implements the "4-bit" GCM GHASH function and the
13 # underlying single multiplication operation in GF(2^128). "4-bit"
14 # means it uses a 256-byte per-key table [+128 bytes of shared table].
15 # Even though the loops are aggressively modulo-scheduled, with respect
16 # to references to Htbl and Z.hi updates, for 8 cycles per byte, the
17 # measured performance is ~12 cycles per processed byte on a 21264
18 # CPU. This seems to be a dynamic-scheduling "glitch," because
19 # uprofile(1) indicates a uniform sample distribution, as if all
20 # instruction bundles execute in 1.5 cycles, meaning it could have
21 # been even faster. Still, 12 cycles is ~60% better than gcc-generated
22 # code and ~80% better than code generated by the vendor compiler.
23
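(For reference: the 4-bit scheme this file implements matches the generic C code in OpenSSL's crypto/modes/gcm128.c. Below is a minimal C model of the per-block multiply, not the shipped source; Htable is assumed to hold the 16 premultiplied hi/lo pairs derived from the hash key H, and rem_4bit the reduction constants defined near the bottom of this file.)

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* Multiply the 16-byte, big-endian block Xi by H in GF(2^128),
 * consuming 4 bits of Xi per step via the precomputed Htable. */
static void gmult_4bit_model(uint8_t Xi[16], const u128 Htable[16],
                             const uint64_t rem_4bit[16])
{
    u128 Z;
    int cnt = 15;
    unsigned rem, nlo = Xi[15], nhi = nlo >> 4;
    nlo &= 0x0f;

    Z = Htable[nlo];
    for (;;) {
        rem  = (unsigned)Z.lo & 0x0f;       /* bits shifted out... */
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = Z.hi >> 4;
        Z.hi ^= rem_4bit[rem];              /* ...folded back in */
        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;
        nlo = Xi[cnt]; nhi = nlo >> 4; nlo &= 0x0f;

        rem  = (unsigned)Z.lo & 0x0f;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = Z.hi >> 4;
        Z.hi ^= rem_4bit[rem];
        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    }
    for (int i = 7; i >= 0; --i) {          /* store back big-endian */
        Xi[8 + i] = (uint8_t)Z.lo; Z.lo >>= 8;
        Xi[i]     = (uint8_t)Z.hi; Z.hi >>= 8;
    }
}

(The assembly below is this loop, unrolled and software-pipelined: .Looplo walks the low half of Xi and .Loophi the high half, one byte, i.e. two nibbles, per iteration.)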
24 $cnt="v0"; # $0
25 $t0="t0";
26 $t1="t1";
27 $t2="t2";
28 $Thi0="t3"; # $4
29 $Tlo0="t4";
30 $Thi1="t5";
31 $Tlo1="t6";
32 $rem="t7"; # $8
33 #################
34 $Xi="a0"; # $16, input argument block
35 $Htbl="a1";
36 $inp="a2";
37 $len="a3";
38 $nlo="a4"; # $20
39 $nhi="a5";
40 $Zhi="t8";
41 $Zlo="t9";
42 $Xhi="t10"; # $24
43 $Xlo="t11";
44 $remp="t12";
45 $rem_4bit="AT"; # $28
46
47 { my $N;
48 sub loop() {
49
50 $N++;
51 $code.=<<___;
52 .align 4
53 extbl $Xlo,7,$nlo
54 and $nlo,0xf0,$nhi
55 sll $nlo,4,$nlo
56 and $nlo,0xf0,$nlo
57
58 addq $nlo,$Htbl,$nlo
59 ldq $Zlo,8($nlo)
60 addq $nhi,$Htbl,$nhi
61 ldq $Zhi,0($nlo)
62
63 and $Zlo,0x0f,$remp
64 sll $Zhi,60,$t0
65 lda $cnt,6(zero)
66 extbl $Xlo,6,$nlo
67
68 ldq $Tlo1,8($nhi)
69 s8addq $remp,$rem_4bit,$remp
70 ldq $Thi1,0($nhi)
71 srl $Zlo,4,$Zlo
72
73 ldq $rem,0($remp)
74 srl $Zhi,4,$Zhi
75 xor $t0,$Zlo,$Zlo
76 and $nlo,0xf0,$nhi
77
78 xor $Tlo1,$Zlo,$Zlo
79 sll $nlo,4,$nlo
80 xor $Thi1,$Zhi,$Zhi
81 and $nlo,0xf0,$nlo
82
83 addq $nlo,$Htbl,$nlo
84 ldq $Tlo0,8($nlo)
85 addq $nhi,$Htbl,$nhi
86 ldq $Thi0,0($nlo)
87
88 .Looplo$N:
89 and $Zlo,0x0f,$remp
90 sll $Zhi,60,$t0
91 subq $cnt,1,$cnt
92 srl $Zlo,4,$Zlo
93
94 ldq $Tlo1,8($nhi)
95 xor $rem,$Zhi,$Zhi
96 ldq $Thi1,0($nhi)
97 s8addq $remp,$rem_4bit,$remp
98
99 ldq $rem,0($remp)
100 srl $Zhi,4,$Zhi
101 xor $t0,$Zlo,$Zlo
102 extbl $Xlo,$cnt,$nlo
103
104 and $nlo,0xf0,$nhi
105 xor $Thi0,$Zhi,$Zhi
106 xor $Tlo0,$Zlo,$Zlo
107 sll $nlo,4,$nlo
108
109
110 and $Zlo,0x0f,$remp
111 sll $Zhi,60,$t0
112 and $nlo,0xf0,$nlo
113 srl $Zlo,4,$Zlo
114
115 s8addq $remp,$rem_4bit,$remp
116 xor $rem,$Zhi,$Zhi
117 addq $nlo,$Htbl,$nlo
118 addq $nhi,$Htbl,$nhi
119
120 ldq $rem,0($remp)
121 srl $Zhi,4,$Zhi
122 ldq $Tlo0,8($nlo)
123 xor $t0,$Zlo,$Zlo
124
125 xor $Tlo1,$Zlo,$Zlo
126 xor $Thi1,$Zhi,$Zhi
127 ldq $Thi0,0($nlo)
128 bne $cnt,.Looplo$N
129
130
131 and $Zlo,0x0f,$remp
132 sll $Zhi,60,$t0
133 lda $cnt,7(zero)
134 srl $Zlo,4,$Zlo
135
136 ldq $Tlo1,8($nhi)
137 xor $rem,$Zhi,$Zhi
138 ldq $Thi1,0($nhi)
139 s8addq $remp,$rem_4bit,$remp
140
141 ldq $rem,0($remp)
142 srl $Zhi,4,$Zhi
143 xor $t0,$Zlo,$Zlo
144 extbl $Xhi,$cnt,$nlo
145
146 and $nlo,0xf0,$nhi
147 xor $Thi0,$Zhi,$Zhi
148 xor $Tlo0,$Zlo,$Zlo
149 sll $nlo,4,$nlo
150
151 and $Zlo,0x0f,$remp
152 sll $Zhi,60,$t0
153 and $nlo,0xf0,$nlo
154 srl $Zlo,4,$Zlo
155
156 s8addq $remp,$rem_4bit,$remp
157 xor $rem,$Zhi,$Zhi
158 addq $nlo,$Htbl,$nlo
159 addq $nhi,$Htbl,$nhi
160
161 ldq $rem,0($remp)
162 srl $Zhi,4,$Zhi
163 ldq $Tlo0,8($nlo)
164 xor $t0,$Zlo,$Zlo
165
166 xor $Tlo1,$Zlo,$Zlo
167 xor $Thi1,$Zhi,$Zhi
168 ldq $Thi0,0($nlo)
169 unop
170
171
172 .Loophi$N:
173 and $Zlo,0x0f,$remp
174 sll $Zhi,60,$t0
175 subq $cnt,1,$cnt
176 srl $Zlo,4,$Zlo
177
178 ldq $Tlo1,8($nhi)
179 xor $rem,$Zhi,$Zhi
180 ldq $Thi1,0($nhi)
181 s8addq $remp,$rem_4bit,$remp
182
183 ldq $rem,0($remp)
184 srl $Zhi,4,$Zhi
185 xor $t0,$Zlo,$Zlo
186 extbl $Xhi,$cnt,$nlo
187
188 and $nlo,0xf0,$nhi
189 xor $Thi0,$Zhi,$Zhi
190 xor $Tlo0,$Zlo,$Zlo
191 sll $nlo,4,$nlo
192
193
194 and $Zlo,0x0f,$remp
195 sll $Zhi,60,$t0
196 and $nlo,0xf0,$nlo
197 srl $Zlo,4,$Zlo
198
199 s8addq $remp,$rem_4bit,$remp
200 xor $rem,$Zhi,$Zhi
201 addq $nlo,$Htbl,$nlo
202 addq $nhi,$Htbl,$nhi
203
204 ldq $rem,0($remp)
205 srl $Zhi,4,$Zhi
206 ldq $Tlo0,8($nlo)
207 xor $t0,$Zlo,$Zlo
208
209 xor $Tlo1,$Zlo,$Zlo
210 xor $Thi1,$Zhi,$Zhi
211 ldq $Thi0,0($nlo)
212 bne $cnt,.Loophi$N
213
214
215 and $Zlo,0x0f,$remp
216 sll $Zhi,60,$t0
217 srl $Zlo,4,$Zlo
218
219 ldq $Tlo1,8($nhi)
220 xor $rem,$Zhi,$Zhi
221 ldq $Thi1,0($nhi)
222 s8addq $remp,$rem_4bit,$remp
223
224 ldq $rem,0($remp)
225 srl $Zhi,4,$Zhi
226 xor $t0,$Zlo,$Zlo
227
228 xor $Tlo0,$Zlo,$Zlo
229 xor $Thi0,$Zhi,$Zhi
230
231 and $Zlo,0x0f,$remp
232 sll $Zhi,60,$t0
233 srl $Zlo,4,$Zlo
234
235 s8addq $remp,$rem_4bit,$remp
236 xor $rem,$Zhi,$Zhi
237
238 ldq $rem,0($remp)
239 srl $Zhi,4,$Zhi
240 xor $Tlo1,$Zlo,$Zlo
241 xor $Thi1,$Zhi,$Zhi
242 xor $t0,$Zlo,$Zlo
243 xor $rem,$Zhi,$Zhi
244 ___
245 }}
246
247 $code=<<___;
248 #ifdef __linux__
249 #include <asm/regdef.h>
250 #else
251 #include <asm.h>
252 #include <regdef.h>
253 #endif
254
255 .text
256
257 .set noat
258 .set noreorder
259 .globl gcm_gmult_4bit
260 .align 4
261 .ent gcm_gmult_4bit
262 gcm_gmult_4bit:
263 .frame sp,0,ra
264 .prologue 0
265
266 ldq $Xlo,8($Xi)
267 ldq $Xhi,0($Xi)
268
269 br $rem_4bit,.Lpic1 # br writes the next PC into $rem_4bit
270 .Lpic1: lda $rem_4bit,rem_4bit-.Lpic1($rem_4bit) # PC-relative address of rem_4bit
271 ___
272
273 &loop();
274
275 $code.=<<___;
276 srl $Zlo,24,$t0 # byte swap
277 srl $Zlo,8,$t1
278
279 sll $Zlo,8,$t2
280 sll $Zlo,24,$Zlo
281 zapnot $t0,0x11,$t0
282 zapnot $t1,0x22,$t1
283
284 zapnot $Zlo,0x88,$Zlo
285 or $t0,$t1,$t0
286 zapnot $t2,0x44,$t2
287
288 or $Zlo,$t0,$Zlo
289 srl $Zhi,24,$t0
290 srl $Zhi,8,$t1
291
292 or $Zlo,$t2,$Zlo
293 sll $Zhi,8,$t2
294 sll $Zhi,24,$Zhi
295
296 srl $Zlo,32,$Xlo
297 sll $Zlo,32,$Zlo
298
299 zapnot $t0,0x11,$t0
300 zapnot $t1,0x22,$t1
301 or $Zlo,$Xlo,$Xlo
302
303 zapnot $Zhi,0x88,$Zhi
304 or $t0,$t1,$t0
305 zapnot $t2,0x44,$t2
306
307 or $Zhi,$t0,$Zhi
308 or $Zhi,$t2,$Zhi
309
310 srl $Zhi,32,$Xhi
311 sll $Zhi,32,$Zhi
312
313 or $Zhi,$Xhi,$Xhi
314 stq $Xlo,8($Xi)
315 stq $Xhi,0($Xi)
316
317 ret (ra)
318 .end gcm_gmult_4bit
319 ___
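(The srl/sll/zapnot block above, between "# byte swap" and the stores, is a 64-bit byte swap; Alpha has no bswap instruction, and Xi lives big-endian in memory while Z is computed in native little-endian registers. Each 32-bit half is byte-reversed with shift-and-mask steps, then the two halves are exchanged. A C model of what each of Zlo and Zhi goes through; the zapnot byte-select masks appear as the corresponding AND masks:)

#include <stdint.h>

static uint64_t bswap64_model(uint64_t z)
{
    uint64_t t0 = (z >> 24) & 0x000000FF000000FFULL; /* zapnot 0x11 */
    uint64_t t1 = (z >>  8) & 0x0000FF000000FF00ULL; /* zapnot 0x22 */
    uint64_t t2 = (z <<  8) & 0x00FF000000FF0000ULL; /* zapnot 0x44 */
    z           = (z << 24) & 0xFF000000FF000000ULL; /* zapnot 0x88 */
    z |= t0 | t1 | t2;            /* both 32-bit halves byte-reversed */
    return (z >> 32) | (z << 32); /* exchange the halves */
}

(The same sequence recurs in gcm_ghash_4bit below, interleaved with the loads of the next input block.)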
320
321 $inhi="s0";
322 $inlo="s1";
323
324 $code.=<<___;
325 .globl gcm_ghash_4bit
326 .align 4
327 .ent gcm_ghash_4bit
328 gcm_ghash_4bit:
329 lda sp,-32(sp)
330 stq ra,0(sp)
331 stq s0,8(sp)
332 stq s1,16(sp)
333 .mask 0x04000600,-32 # saved regs: ra ($26), s0 ($9), s1 ($10)
334 .frame sp,32,ra
335 .prologue 0
336
337 ldq_u $inhi,0($inp)
338 ldq_u $Thi0,7($inp)
339 ldq_u $inlo,8($inp)
340 ldq_u $Tlo0,15($inp)
341 ldq $Xhi,0($Xi)
342 ldq $Xlo,8($Xi)
343
344 br $rem_4bit,.Lpic2
345 .Lpic2: lda $rem_4bit,rem_4bit-.Lpic2($rem_4bit)
346
347 .Louter:
348 extql $inhi,$inp,$inhi
349 extqh $Thi0,$inp,$Thi0
350 or $inhi,$Thi0,$inhi
351 lda $inp,16($inp)
352
353 extql $inlo,$inp,$inlo
354 extqh $Tlo0,$inp,$Tlo0
355 or $inlo,$Tlo0,$inlo
356 subq $len,16,$len
357
358 xor $Xlo,$inlo,$Xlo
359 xor $Xhi,$inhi,$Xhi
360 ___
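(The ldq_u/extql/extqh/or groups in the prologue and in .Louter are Alpha's standard idiom for reading 8 bytes from a possibly unaligned address: ldq_u ignores the low three address bits, so the loads at offsets 0 and 7 fetch the two aligned quadwords bracketing the data, and the extract instructions shift the wanted bytes into place. A hypothetical C model of the idiom, assuming little-endian byte order as on Alpha:)

#include <stdint.h>

static uint64_t ldq_unaligned_model(const uint8_t *p)
{
    const uint64_t *a = (const uint64_t *)((uintptr_t)p & ~(uintptr_t)7);
    unsigned sh = (unsigned)((uintptr_t)p & 7) * 8;
    uint64_t lo = a[0];                    /* ldq_u 0(p) */
    if (sh == 0)
        return lo;                         /* aligned: done */
    uint64_t hi = a[1];                    /* ldq_u 7(p) */
    return (lo >> sh) | (hi << (64 - sh)); /* extql/extqh + or */
}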
361
362 &loop();
363
364 $code.=<<___;
365 srl $Zlo,24,$t0 # byte swap
366 srl $Zlo,8,$t1
367
368 sll $Zlo,8,$t2
369 sll $Zlo,24,$Zlo
370 zapnot $t0,0x11,$t0
371 zapnot $t1,0x22,$t1
372
373 zapnot $Zlo,0x88,$Zlo
374 or $t0,$t1,$t0
375 zapnot $t2,0x44,$t2
376
377 or $Zlo,$t0,$Zlo
378 srl $Zhi,24,$t0
379 srl $Zhi,8,$t1
380
381 or $Zlo,$t2,$Zlo
382 sll $Zhi,8,$t2
383 sll $Zhi,24,$Zhi
384
385 srl $Zlo,32,$Xlo
386 sll $Zlo,32,$Zlo
387 beq $len,.Ldone
388
389 zapnot $t0,0x11,$t0
390 zapnot $t1,0x22,$t1
391 or $Zlo,$Xlo,$Xlo
392 ldq_u $inhi,0($inp)
393
394 zapnot $Zhi,0x88,$Zhi
395 or $t0,$t1,$t0
396 zapnot $t2,0x44,$t2
397 ldq_u $Thi0,7($inp)
398
399 or $Zhi,$t0,$Zhi
400 or $Zhi,$t2,$Zhi
401 ldq_u $inlo,8($inp)
402 ldq_u $Tlo0,15($inp)
403
404 srl $Zhi,32,$Xhi
405 sll $Zhi,32,$Zhi
406
407 or $Zhi,$Xhi,$Xhi
408 br zero,.Louter
409
410 .Ldone:
411 zapnot $t0,0x11,$t0
412 zapnot $t1,0x22,$t1
413 or $Zlo,$Xlo,$Xlo
414
415 zapnot $Zhi,0x88,$Zhi
416 or $t0,$t1,$t0
417 zapnot $t2,0x44,$t2
418
419 or $Zhi,$t0,$Zhi
420 or $Zhi,$t2,$Zhi
421
422 srl $Zhi,32,$Xhi
423 sll $Zhi,32,$Zhi
424
425 or $Zhi,$Xhi,$Xhi
426
427 stq $Xlo,8($Xi)
428 stq $Xhi,0($Xi)
429
430 .set noreorder
431 /*ldq ra,0(sp)*/
432 ldq s0,8(sp)
433 ldq s1,16(sp)
434 lda sp,32(sp)
435 ret (ra)
436 .end gcm_ghash_4bit
437
438 .align 4
439 rem_4bit:
440 .quad 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
441 .quad 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
442 .quad 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
443 .quad 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
444 .ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
445 .align 4
446
447 ___
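(The rem_4bit table just above is the standard 4-bit GHASH reduction table: entry i holds, in the top 16 bits of a quadword, the correction XORed into Z.hi when the four bits i fall off the bottom of Z during a right shift. The values derive from the bit-reflected GHASH polynomial, whose top byte is 0xE1, and the table is GF(2)-linear in i, so it can be regenerated with a short program such as this sketch:)

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    for (int i = 0; i < 16; i++) {
        uint64_t r = 0;
        for (int k = 0; k < 4; k++)       /* XOR one basis value per set bit */
            if (i & (1 << k))
                r ^= (uint64_t)(0xE100 >> (3 - k));
        printf("0x%04X<<48%s", (unsigned)r, (i % 4 == 3) ? "\n" : ", ");
    }
    return 0;
}

(Its output reproduces the four .quad lines above: 0x0000, 0x1C20, 0x3840, 0x2460, ...)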
448 $output=shift and open STDOUT,">$output";
449 print $code;
450 close STDOUT;
451
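(As the closing lines show, the script writes the generated assembly to the file named by its first argument, falling back to stdout when none is given, e.g. "perl ghash-alpha.pl ghash-alpha.s".)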