Chromium Code Reviews

Side by Side Diff: openssl/crypto/sha/asm/sha512-armv4.pl

Issue 2072073002: Delete bundled copy of OpenSSL and replace with README. (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/openssl@master
Patch Set: Delete bundled copy of OpenSSL and replace with README. Created 4 years, 6 months ago
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # SHA512 block procedure for ARMv4. September 2007.
11
12 # This code is ~4.5 (four and a half) times faster than code generated
13 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
14 # Xscale PXA250 core].
15 #
16 # July 2010.
17 #
18 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
19 # Cortex A8 core and ~40 cycles per processed byte.
20
21 # February 2011.
22 #
23 # Profiler-assisted and platform-specific optimization resulted in 7%
24 # improvement on Cortex A8 core and ~38 cycles per byte.
25
26 # March 2011.
27 #
28 # Add NEON implementation. On Cortex A8 it was measured to process
29 # one byte in 25.5 cycles or 47% faster than integer-only code.
30
31 # Byte order [in]dependence. =========================================
32 #
33 # Originally caller was expected to maintain specific *dword* order in
34 # h[0-7], namely with most significant dword at *lower* address, which
35 # was reflected in the two parameters below as 0 and 4. Now the caller is
36 # expected to maintain native byte order for whole 64-bit values.
37 $hi="HI";
38 $lo="LO";
39 # ====================================================================
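# In concrete terms: each 64-bit h[i] occupies two 32-bit words in memory,
# and $hi/$lo are the byte offsets of the high and low word within such a
# pair.  They expand to the HI/LO macros emitted further down (4/0 when
# __ARMEL__ is defined, 0/4 otherwise), so an operand like [$ctx,#$Eoff+$lo]
# addresses the low 32 bits of e regardless of byte order.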
40
41 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
42 open STDOUT,">$output";
43
44 $ctx="r0"; # parameter block
45 $inp="r1";
46 $len="r2";
47
48 $Tlo="r3";
49 $Thi="r4";
50 $Alo="r5";
51 $Ahi="r6";
52 $Elo="r7";
53 $Ehi="r8";
54 $t0="r9";
55 $t1="r10";
56 $t2="r11";
57 $t3="r12";
58 ############ r13 is stack pointer
59 $Ktbl="r14";
60 ############ r15 is program counter
61
62 $Aoff=8*0;
63 $Boff=8*1;
64 $Coff=8*2;
65 $Doff=8*3;
66 $Eoff=8*4;
67 $Foff=8*5;
68 $Goff=8*6;
69 $Hoff=8*7;
70 $Xoff=8*8;
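# Roughly, the frame layout used by the integer code below: the *off
# constants above index a scratch copy of the state words a..h at
# sp+0..sp+56, and the expanded message words start at sp+$Xoff.  Each
# round stores its X value at [sp,#$Xoff] and then does "sub sp,sp,#8",
# so the fixed offsets $Xoff+8*(16-1), +8*(16-14), +8*(16-9) and +8*16
# used in the 16..79 loop land on X[i-15], X[i-2], X[i-7] and X[i-16] of
# the sliding window; the 80*8 = 640 bytes are released again with
# "add sp,sp,#640" at the end of each block.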
71
72 sub BODY_00_15() {
73 my $magic = shift;
74 $code.=<<___;
75 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
76 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
77 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
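@ (The two lines above are the 64-bit rotates rewritten for a 32-bit
@ register pair hi:lo: for ROTR by n < 32 the low result word is
@ lo>>n ^ hi<<(32-n) and the high one hi>>n ^ lo<<(32-n); for n > 32 the
@ halves swap roles and the amounts become n-32 and 64-n.  With n = 14,
@ 18 and 41 this yields exactly the terms listed, which the eor chain
@ below accumulates into $t0 (low half) and $t1 (high half).)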
78 mov $t0,$Elo,lsr#14
79 str $Tlo,[sp,#$Xoff+0]
80 mov $t1,$Ehi,lsr#14
81 str $Thi,[sp,#$Xoff+4]
82 eor $t0,$t0,$Ehi,lsl#18
83 ldr $t2,[sp,#$Hoff+0] @ h.lo
84 eor $t1,$t1,$Elo,lsl#18
85 ldr $t3,[sp,#$Hoff+4] @ h.hi
86 eor $t0,$t0,$Elo,lsr#18
87 eor $t1,$t1,$Ehi,lsr#18
88 eor $t0,$t0,$Ehi,lsl#14
89 eor $t1,$t1,$Elo,lsl#14
90 eor $t0,$t0,$Ehi,lsr#9
91 eor $t1,$t1,$Elo,lsr#9
92 eor $t0,$t0,$Elo,lsl#23
93 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
94 adds $Tlo,$Tlo,$t0
95 ldr $t0,[sp,#$Foff+0] @ f.lo
96 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
97 ldr $t1,[sp,#$Foff+4] @ f.hi
98 adds $Tlo,$Tlo,$t2
99 ldr $t2,[sp,#$Goff+0] @ g.lo
100 adc $Thi,$Thi,$t3 @ T += h
101 ldr $t3,[sp,#$Goff+4] @ g.hi
102
103 eor $t0,$t0,$t2
104 str $Elo,[sp,#$Eoff+0]
105 eor $t1,$t1,$t3
106 str $Ehi,[sp,#$Eoff+4]
107 and $t0,$t0,$Elo
108 str $Alo,[sp,#$Aoff+0]
109 and $t1,$t1,$Ehi
110 str $Ahi,[sp,#$Aoff+4]
111 eor $t0,$t0,$t2
112 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
113 eor $t1,$t1,$t3 @ Ch(e,f,g)
114 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
115
116 adds $Tlo,$Tlo,$t0
117 ldr $Elo,[sp,#$Doff+0] @ d.lo
118 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
119 ldr $Ehi,[sp,#$Doff+4] @ d.hi
120 adds $Tlo,$Tlo,$t2
121 and $t0,$t2,#0xff
122 adc $Thi,$Thi,$t3 @ T += K[i]
123 adds $Elo,$Elo,$Tlo
124 ldr $t2,[sp,#$Boff+0] @ b.lo
125 adc $Ehi,$Ehi,$Thi @ d += T
126 teq $t0,#$magic
127
128 ldr $t3,[sp,#$Coff+0] @ c.lo
129 orreq $Ktbl,$Ktbl,#1
130 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
131 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
132 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
133 mov $t0,$Alo,lsr#28
134 mov $t1,$Ahi,lsr#28
135 eor $t0,$t0,$Ahi,lsl#4
136 eor $t1,$t1,$Alo,lsl#4
137 eor $t0,$t0,$Ahi,lsr#2
138 eor $t1,$t1,$Alo,lsr#2
139 eor $t0,$t0,$Alo,lsl#30
140 eor $t1,$t1,$Ahi,lsl#30
141 eor $t0,$t0,$Ahi,lsr#7
142 eor $t1,$t1,$Alo,lsr#7
143 eor $t0,$t0,$Alo,lsl#25
144 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
145 adds $Tlo,$Tlo,$t0
146 and $t0,$Alo,$t2
147 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
148
149 ldr $t1,[sp,#$Boff+4] @ b.hi
150 orr $Alo,$Alo,$t2
151 ldr $t2,[sp,#$Coff+4] @ c.hi
152 and $Alo,$Alo,$t3
153 and $t3,$Ahi,$t1
154 orr $Ahi,$Ahi,$t1
155 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
156 and $Ahi,$Ahi,$t2
157 adds $Alo,$Alo,$Tlo
158 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
159 sub sp,sp,#8
160 adc $Ahi,$Ahi,$Thi @ h += T
161 tst $Ktbl,#1
162 add $Ktbl,$Ktbl,#8
163 ___
164 }
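# Loop control inside BODY_00_15 avoids a round counter: "and $t0,$t2,#0xff"
# takes the low byte of the K[i].lo constant just loaded and "teq $t0,#$magic"
# compares it against 0x94 (low byte of K[15].lo, 0x..692694) in the 0..15
# loop and 0x17 (low byte of K[79].lo, 0x..475817) in the 16..79 loop; on a
# match "orreq" sets bit 0 of $Ktbl, and the "tst $Ktbl,#1" at the end of the
# round lets the beq after the macro fall out of the loop.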
165 $code=<<___;
166 #include "arm_arch.h"
167 #ifdef __ARMEL__
168 # define LO 0
169 # define HI 4
170 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
171 #else
172 # define HI 0
173 # define LO 4
174 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
175 #endif
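@ WORD64 orders the two halves of each constant so that the word at offset
@ LO is always the numerically-low 32 bits: lo first on little-endian
@ builds, hi first on big-endian ones, matching the K[i].lo / K[i].hi
@ loads done with the LO/HI offsets in the round function.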
176
177 .text
178 .code 32
179 .type K512,%object
180 .align 5
181 K512:
182 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
183 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
184 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
185 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
186 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
187 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
188 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
189 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
190 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
191 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
192 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
193 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
194 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
195 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
196 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
197 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
198 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
199 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
200 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
201 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
202 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
203 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
204 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
205 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
206 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
207 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
208 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
209 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
210 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
211 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
212 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
213 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
214 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
215 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
216 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
217 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
218 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
219 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
220 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
221 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
222 .size K512,.-K512
223 .LOPENSSL_armcap:
224 .word OPENSSL_armcap_P-sha512_block_data_order
225 .skip 32-4
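@ The word above holds the distance from sha512_block_data_order to
@ OPENSSL_armcap_P, resolved at link time.  The function recomputes its own
@ address at run time ("sub r3,pc,#8" below), adds this offset and loads the
@ capability flags; bit 0 is the NEON flag tested before branching to .LNEON.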
226
227 .global sha512_block_data_order
228 .type sha512_block_data_order,%function
229 sha512_block_data_order:
230 sub r3,pc,#8 @ sha512_block_data_order
231 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
232 #if __ARM_ARCH__>=7
233 ldr r12,.LOPENSSL_armcap
234 ldr r12,[r3,r12] @ OPENSSL_armcap_P
235 tst r12,#1
236 bne .LNEON
237 #endif
238 stmdb sp!,{r4-r12,lr}
239 sub $Ktbl,r3,#672 @ K512
240 sub sp,sp,#9*8
241
242 ldr $Elo,[$ctx,#$Eoff+$lo]
243 ldr $Ehi,[$ctx,#$Eoff+$hi]
244 ldr $t0, [$ctx,#$Goff+$lo]
245 ldr $t1, [$ctx,#$Goff+$hi]
246 ldr $t2, [$ctx,#$Hoff+$lo]
247 ldr $t3, [$ctx,#$Hoff+$hi]
248 .Loop:
249 str $t0, [sp,#$Goff+0]
250 str $t1, [sp,#$Goff+4]
251 str $t2, [sp,#$Hoff+0]
252 str $t3, [sp,#$Hoff+4]
253 ldr $Alo,[$ctx,#$Aoff+$lo]
254 ldr $Ahi,[$ctx,#$Aoff+$hi]
255 ldr $Tlo,[$ctx,#$Boff+$lo]
256 ldr $Thi,[$ctx,#$Boff+$hi]
257 ldr $t0, [$ctx,#$Coff+$lo]
258 ldr $t1, [$ctx,#$Coff+$hi]
259 ldr $t2, [$ctx,#$Doff+$lo]
260 ldr $t3, [$ctx,#$Doff+$hi]
261 str $Tlo,[sp,#$Boff+0]
262 str $Thi,[sp,#$Boff+4]
263 str $t0, [sp,#$Coff+0]
264 str $t1, [sp,#$Coff+4]
265 str $t2, [sp,#$Doff+0]
266 str $t3, [sp,#$Doff+4]
267 ldr $Tlo,[$ctx,#$Foff+$lo]
268 ldr $Thi,[$ctx,#$Foff+$hi]
269 str $Tlo,[sp,#$Foff+0]
270 str $Thi,[sp,#$Foff+4]
271
272 .L00_15:
273 #if __ARM_ARCH__<7
274 ldrb $Tlo,[$inp,#7]
275 ldrb $t0, [$inp,#6]
276 ldrb $t1, [$inp,#5]
277 ldrb $t2, [$inp,#4]
278 ldrb $Thi,[$inp,#3]
279 ldrb $t3, [$inp,#2]
280 orr $Tlo,$Tlo,$t0,lsl#8
281 ldrb $t0, [$inp,#1]
282 orr $Tlo,$Tlo,$t1,lsl#16
283 ldrb $t1, [$inp],#8
284 orr $Tlo,$Tlo,$t2,lsl#24
285 orr $Thi,$Thi,$t3,lsl#8
286 orr $Thi,$Thi,$t0,lsl#16
287 orr $Thi,$Thi,$t1,lsl#24
288 #else
289 ldr $Tlo,[$inp,#4]
290 ldr $Thi,[$inp],#8
291 #ifdef __ARMEL__
292 rev $Tlo,$Tlo
293 rev $Thi,$Thi
294 #endif
295 #endif
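@ The ldrb path above assembles each big-endian 64-bit message word one byte
@ at a time, which also tolerates unaligned input on pre-ARMv7 cores; the
@ ARMv7 path uses two word loads plus rev (byte swap) on little-endian
@ builds and plain loads on big-endian ones.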
296 ___
297 &BODY_00_15(0x94);
298 $code.=<<___;
299 tst $Ktbl,#1
300 beq .L00_15
301 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
302 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
303 bic $Ktbl,$Ktbl,#1
304 .L16_79:
305 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
306 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
307 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
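@ (Unlike Sigma1 above, the last term of sigma0 is a plain right shift, not
@ a rotate, so the high half gets only hi>>7 with no matching lo<<25 term.)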
308 mov $Tlo,$t0,lsr#1
309 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
310 mov $Thi,$t1,lsr#1
311 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
312 eor $Tlo,$Tlo,$t1,lsl#31
313 eor $Thi,$Thi,$t0,lsl#31
314 eor $Tlo,$Tlo,$t0,lsr#8
315 eor $Thi,$Thi,$t1,lsr#8
316 eor $Tlo,$Tlo,$t1,lsl#24
317 eor $Thi,$Thi,$t0,lsl#24
318 eor $Tlo,$Tlo,$t0,lsr#7
319 eor $Thi,$Thi,$t1,lsr#7
320 eor $Tlo,$Tlo,$t1,lsl#25
321
322 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
323 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
324 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
325 mov $t0,$t2,lsr#19
326 mov $t1,$t3,lsr#19
327 eor $t0,$t0,$t3,lsl#13
328 eor $t1,$t1,$t2,lsl#13
329 eor $t0,$t0,$t3,lsr#29
330 eor $t1,$t1,$t2,lsr#29
331 eor $t0,$t0,$t2,lsl#3
332 eor $t1,$t1,$t3,lsl#3
333 eor $t0,$t0,$t2,lsr#6
334 eor $t1,$t1,$t3,lsr#6
335 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
336 eor $t0,$t0,$t3,lsl#26
337
338 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
339 adds $Tlo,$Tlo,$t0
340 ldr $t0,[sp,#`$Xoff+8*16`+0]
341 adc $Thi,$Thi,$t1
342
343 ldr $t1,[sp,#`$Xoff+8*16`+4]
344 adds $Tlo,$Tlo,$t2
345 adc $Thi,$Thi,$t3
346 adds $Tlo,$Tlo,$t0
347 adc $Thi,$Thi,$t1
348 ___
349 &BODY_00_15(0x17);
350 $code.=<<___;
351 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
352 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
353 beq .L16_79
354 bic $Ktbl,$Ktbl,#1
355
356 ldr $Tlo,[sp,#$Boff+0]
357 ldr $Thi,[sp,#$Boff+4]
358 ldr $t0, [$ctx,#$Aoff+$lo]
359 ldr $t1, [$ctx,#$Aoff+$hi]
360 ldr $t2, [$ctx,#$Boff+$lo]
361 ldr $t3, [$ctx,#$Boff+$hi]
362 adds $t0,$Alo,$t0
363 str $t0, [$ctx,#$Aoff+$lo]
364 adc $t1,$Ahi,$t1
365 str $t1, [$ctx,#$Aoff+$hi]
366 adds $t2,$Tlo,$t2
367 str $t2, [$ctx,#$Boff+$lo]
368 adc $t3,$Thi,$t3
369 str $t3, [$ctx,#$Boff+$hi]
370
371 ldr $Alo,[sp,#$Coff+0]
372 ldr $Ahi,[sp,#$Coff+4]
373 ldr $Tlo,[sp,#$Doff+0]
374 ldr $Thi,[sp,#$Doff+4]
375 ldr $t0, [$ctx,#$Coff+$lo]
376 ldr $t1, [$ctx,#$Coff+$hi]
377 ldr $t2, [$ctx,#$Doff+$lo]
378 ldr $t3, [$ctx,#$Doff+$hi]
379 adds $t0,$Alo,$t0
380 str $t0, [$ctx,#$Coff+$lo]
381 adc $t1,$Ahi,$t1
382 str $t1, [$ctx,#$Coff+$hi]
383 adds $t2,$Tlo,$t2
384 str $t2, [$ctx,#$Doff+$lo]
385 adc $t3,$Thi,$t3
386 str $t3, [$ctx,#$Doff+$hi]
387
388 ldr $Tlo,[sp,#$Foff+0]
389 ldr $Thi,[sp,#$Foff+4]
390 ldr $t0, [$ctx,#$Eoff+$lo]
391 ldr $t1, [$ctx,#$Eoff+$hi]
392 ldr $t2, [$ctx,#$Foff+$lo]
393 ldr $t3, [$ctx,#$Foff+$hi]
394 adds $Elo,$Elo,$t0
395 str $Elo,[$ctx,#$Eoff+$lo]
396 adc $Ehi,$Ehi,$t1
397 str $Ehi,[$ctx,#$Eoff+$hi]
398 adds $t2,$Tlo,$t2
399 str $t2, [$ctx,#$Foff+$lo]
400 adc $t3,$Thi,$t3
401 str $t3, [$ctx,#$Foff+$hi]
402
403 ldr $Alo,[sp,#$Goff+0]
404 ldr $Ahi,[sp,#$Goff+4]
405 ldr $Tlo,[sp,#$Hoff+0]
406 ldr $Thi,[sp,#$Hoff+4]
407 ldr $t0, [$ctx,#$Goff+$lo]
408 ldr $t1, [$ctx,#$Goff+$hi]
409 ldr $t2, [$ctx,#$Hoff+$lo]
410 ldr $t3, [$ctx,#$Hoff+$hi]
411 adds $t0,$Alo,$t0
412 str $t0, [$ctx,#$Goff+$lo]
413 adc $t1,$Ahi,$t1
414 str $t1, [$ctx,#$Goff+$hi]
415 adds $t2,$Tlo,$t2
416 str $t2, [$ctx,#$Hoff+$lo]
417 adc $t3,$Thi,$t3
418 str $t3, [$ctx,#$Hoff+$hi]
419
420 add sp,sp,#640
421 sub $Ktbl,$Ktbl,#640
422
423 teq $inp,$len
424 bne .Loop
425
426 add sp,sp,#8*9 @ destroy frame
427 #if __ARM_ARCH__>=5
428 ldmia sp!,{r4-r12,pc}
429 #else
430 ldmia sp!,{r4-r12,lr}
431 tst lr,#1
432 moveq pc,lr @ be binary compatible with V4, yet
433 bx lr @ interoperable with Thumb ISA:-)
434 #endif
435 ___
436
437 {
438 my @Sigma0=(28,34,39);
439 my @Sigma1=(14,18,41);
440 my @sigma0=(1, 8, 7);
441 my @sigma1=(19,61,6);
442
443 my $Ktbl="r3";
444 my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
445
446 my @X=map("d$_",(0..15));
447 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
448
449 sub NEON_00_15() {
450 my $i=shift;
451 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
452 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
453
454 $code.=<<___ if ($i<16 || $i&1);
455 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
456 #if $i<16
457 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
458 #endif
459 vshr.u64 $t1,$e,#@Sigma1[1]
460 vshr.u64 $t2,$e,#@Sigma1[2]
461 ___
462 $code.=<<___;
463 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
464 vsli.64 $t0,$e,#`64-@Sigma1[0]`
465 vsli.64 $t1,$e,#`64-@Sigma1[1]`
466 vsli.64 $t2,$e,#`64-@Sigma1[2]`
467 #if $i<16 && defined(__ARMEL__)
468 vrev64.8 @X[$i],@X[$i]
469 #endif
470 vadd.i64 $T1,$K,$h
471 veor $Ch,$f,$g
472 veor $t0,$t1
473 vand $Ch,$e
474 veor $t0,$t2 @ Sigma1(e)
475 veor $Ch,$g @ Ch(e,f,g)
476 vadd.i64 $T1,$t0
477 vshr.u64 $t0,$a,#@Sigma0[0]
478 vadd.i64 $T1,$Ch
479 vshr.u64 $t1,$a,#@Sigma0[1]
480 vshr.u64 $t2,$a,#@Sigma0[2]
481 vsli.64 $t0,$a,#`64-@Sigma0[0]`
482 vsli.64 $t1,$a,#`64-@Sigma0[1]`
483 vsli.64 $t2,$a,#`64-@Sigma0[2]`
484 vadd.i64 $T1,@X[$i%16]
485 vorr $Maj,$a,$c
486 vand $Ch,$a,$c
487 veor $h,$t0,$t1
488 vand $Maj,$b
489 veor $h,$t2 @ Sigma0(a)
490 vorr $Maj,$Ch @ Maj(a,b,c)
491 vadd.i64 $h,$T1
492 vadd.i64 $d,$T1
493 vadd.i64 $h,$Maj
494 ___
495 }
496
497 sub NEON_16_79() {
498 my $i=shift;
499
500 if ($i&1) { &NEON_00_15($i,@_); return; }
501
502 # 2x-vectorized, therefore runs every 2nd round
503 my @X=map("q$_",(0..7)); # view @X as 128-bit vector
504 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
505 my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
506 my $e=@_[4]; # $e from NEON_00_15
507 $i /= 2;
508 $code.=<<___;
509 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
510 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
511 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
512 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
513 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
514 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
515 veor $s1,$t0
516 vshr.u64 $t0,$s0,#@sigma0[0]
517 veor $s1,$t1 @ sigma1(X[i+14])
518 vshr.u64 $t1,$s0,#@sigma0[1]
519 vadd.i64 @X[$i%8],$s1
520 vshr.u64 $s1,$s0,#@sigma0[2]
521 vsli.64 $t0,$s0,#`64-@sigma0[0]`
522 vsli.64 $t1,$s0,#`64-@sigma0[1]`
523 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
524 veor $s1,$t0
525 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
526 vadd.i64 @X[$i%8],$s0
527 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
528 veor $s1,$t1 @ sigma0(X[i+1])
529 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
530 vadd.i64 @X[$i%8],$s1
531 ___
532 &NEON_00_15(2*$i,@_);
533 }
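# Taken together: NEON_16_79 re-views the sixteen message dwords d0-d15 as
# the eight quad registers q0-q7, so a single pass of vector sigma0/sigma1
# arithmetic updates two schedule entries at once.  It therefore does that
# work only on even rounds and simply forwards odd rounds to the per-dword
# round function NEON_00_15.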
534
535 $code.=<<___;
536 #if __ARM_ARCH__>=7
537 .fpu neon
538
539 .align 4
540 .LNEON:
541 dmb @ errata #451034 on early Cortex A8
542 vstmdb sp!,{d8-d15} @ ABI specification says so
543 sub $Ktbl,r3,#672 @ K512
544 vldmia $ctx,{$A-$H} @ load context
545 .Loop_neon:
546 ___
547 for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
548 $code.=<<___;
549 mov $cnt,#4
550 .L16_79_neon:
551 subs $cnt,#1
552 ___
553 for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
554 $code.=<<___;
555 bne .L16_79_neon
556
557 vldmia $ctx,{d24-d31} @ load context to temp
558 vadd.i64 q8,q12 @ vectorized accumulate
559 vadd.i64 q9,q13
560 vadd.i64 q10,q14
561 vadd.i64 q11,q15
562 vstmia $ctx,{$A-$H} @ save context
563 teq $inp,$len
564 sub $Ktbl,#640 @ rewind K512
565 bne .Loop_neon
566
567 vldmia sp!,{d8-d15} @ epilogue
568 bx lr
569 #endif
570 ___
571 }
572 $code.=<<___;
573 .size sha512_block_data_order,.-sha512_block_data_order
574 .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
575 .align 2
576 .comm OPENSSL_armcap_P,4,4
577 ___
578
579 $code =~ s/\`([^\`]*)\`/eval $1/gem;
580 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
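# 0xe12fff1e is the machine encoding of "bx lr"; emitting it as a raw .word
# keeps the file assemblable with -march=armv4, where the assembler would
# reject the BX mnemonic, without changing the generated code.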
581 print $code;
582 close STDOUT; # enforce flush