OLD | NEW |
| (Empty) |
1 #if defined(__arm__) | |
2 #include <openssl/arm_arch.h> | |
3 | |
4 .text | |
5 .fpu neon | |
6 .code 32 | |
7 .globl gcm_init_v8 | |
8 .hidden gcm_init_v8 | |
9 .type gcm_init_v8,%function | |
10 .align 4 | |
11 gcm_init_v8: | |
12 vld1.64 {q9},[r1] @ load input H | |
13 vmov.i8 q11,#0xe1 | |
14 vshl.i64 q11,q11,#57 @ 0xc2.0 | |
15 vext.8 q3,q9,q9,#8 | |
16 vshr.u64 q10,q11,#63 | |
17 vdup.32 q9,d18[1] | |
18 vext.8 q8,q10,q11,#8 @ t0=0xc2....01 | |
19 vshr.u64 q10,q3,#63 | |
20 vshr.s32 q9,q9,#31 @ broadcast carry bit | |
21 vand q10,q10,q8 | |
22 vshl.i64 q3,q3,#1 | |
23 vext.8 q10,q10,q10,#8 | |
24 vand q8,q8,q9 | |
25 vorr q3,q3,q10 @ H<<<=1 | |
26 veor q12,q3,q8 @ twisted H | |
27 vst1.64 {q12},[r0]! @ store Htable[0] | |
28 | |
29 @ calculate H^2 | |
30 vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing | |
31 .byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12 | |
32 veor q8,q8,q12 | |
33 .byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12 | |
34 .byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8 | |
35 | |
36 vext.8 q9,q0,q2,#8 @ Karatsuba post-processing | |
37 veor q10,q0,q2 | |
38 veor q1,q1,q9 | |
39 veor q1,q1,q10 | |
40 .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase | |
41 | |
42 vmov d4,d3 @ Xh|Xm - 256-bit result | |
43 vmov d3,d0 @ Xm is rotated Xl | |
44 veor q0,q1,q10 | |
45 | |
46 vext.8 q10,q0,q0,#8 @ 2nd phase | |
47 .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 | |
48 veor q10,q10,q2 | |
49 veor q14,q0,q10 | |
50 | |
51 vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing | |
52 veor q9,q9,q14 | |
53 vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed | |
54 vst1.64 {q13,q14},[r0] @ store Htable[1..2] | |
55 | |
56 bx lr | |
57 .size gcm_init_v8,.-gcm_init_v8 | |
58 .globl gcm_gmult_v8 | |
59 .hidden gcm_gmult_v8 | |
60 .type gcm_gmult_v8,%function | |
61 .align 4 | |
62 gcm_gmult_v8: | |
63 vld1.64 {q9},[r0] @ load Xi | |
64 vmov.i8 q11,#0xe1 | |
65 vld1.64 {q12,q13},[r1] @ load twisted H, ... | |
66 vshl.u64 q11,q11,#57 | |
67 #ifndef __ARMEB__ | |
68 vrev64.8 q9,q9 | |
69 #endif | |
70 vext.8 q3,q9,q9,#8 | |
71 | |
72 .byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo | |
73 veor q9,q9,q3 @ Karatsuba pre-processing | |
74 .byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi | |
75 .byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) | |
76 | |
77 vext.8 q9,q0,q2,#8 @ Karatsuba post-processing | |
78 veor q10,q0,q2 | |
79 veor q1,q1,q9 | |
80 veor q1,q1,q10 | |
81 .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction | |
82 | |
83 vmov d4,d3 @ Xh|Xm - 256-bit result | |
84 vmov d3,d0 @ Xm is rotated Xl | |
85 veor q0,q1,q10 | |
86 | |
87 vext.8 q10,q0,q0,#8 @ 2nd phase of reduction | |
88 .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 | |
89 veor q10,q10,q2 | |
90 veor q0,q0,q10 | |
91 | |
92 #ifndef __ARMEB__ | |
93 vrev64.8 q0,q0 | |
94 #endif | |
95 vext.8 q0,q0,q0,#8 | |
96 vst1.64 {q0},[r0] @ write out Xi | |
97 | |
98 bx lr | |
99 .size gcm_gmult_v8,.-gcm_gmult_v8 | |
100 .globl gcm_ghash_v8 | |
101 .hidden gcm_ghash_v8 | |
102 .type gcm_ghash_v8,%function | |
103 .align 4 | |
104 gcm_ghash_v8: | |
105 vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so | |
106 vld1.64 {q0},[r0] @ load [rotated] Xi | |
107 @ "[rotated]" means that | |
108 @ loaded value would have | |
109 @ to be rotated in order to | |
110 @ make it appear as in | |
111 @ algorithm specification | |
112 subs r3,r3,#32 @ see if r3 is 32 or larger | |
113 mov r12,#16 @ r12 is used as post- | |
114 @ increment for input pointer; | |
115 @ as loop is modulo-scheduled | |
116 @ r12 is zeroed just in time | |
117 @ to preclude overstepping | |
118 @ inp[len], which means that | |
119 @ last block[s] are actually | |
120 @ loaded twice, but last | |
121 @ copy is not processed | |
122 vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2 | |
123 vmov.i8 q11,#0xe1 | |
124 vld1.64 {q14},[r1] | |
125 moveq r12,#0 @ is it time to zero r12? | |
126 vext.8 q0,q0,q0,#8 @ rotate Xi | |
127 vld1.64 {q8},[r2]! @ load [rotated] I[0] | |
128 vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant | |
129 #ifndef __ARMEB__ | |
130 vrev64.8 q8,q8 | |
131 vrev64.8 q0,q0 | |
132 #endif | |
133 vext.8 q3,q8,q8,#8 @ rotate I[0] | |
134 blo .Lodd_tail_v8 @ r3 was less than 32 | |
135 vld1.64 {q9},[r2],r12 @ load [rotated] I[1] | |
136 #ifndef __ARMEB__ | |
137 vrev64.8 q9,q9 | |
138 #endif | |
139 vext.8 q7,q9,q9,#8 | |
140 veor q3,q3,q0 @ I[i]^=Xi | |
141 .byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 | |
142 veor q9,q9,q7 @ Karatsuba pre-processing | |
143 .byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 | |
144 b .Loop_mod2x_v8 | |
145 | |
146 .align 4 | |
147 .Loop_mod2x_v8: | |
148 vext.8 q10,q3,q3,#8 | |
149 subs r3,r3,#32 @ is there more data? | |
150 .byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo | |
151 movlo r12,#0 @ is it time to zero r12? | |
152 | |
153 .byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9 | |
154 veor q10,q10,q3 @ Karatsuba pre-processing | |
155 .byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi | |
156 veor q0,q0,q4 @ accumulate | |
157 .byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) | |
158 vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2] | |
159 | |
160 veor q2,q2,q6 | |
161 moveq r12,#0 @ is it time to zero r12? | |
162 veor q1,q1,q5 | |
163 | |
164 vext.8 q9,q0,q2,#8 @ Karatsuba post-processing | |
165 veor q10,q0,q2 | |
166 veor q1,q1,q9 | |
167 vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3] | |
168 #ifndef __ARMEB__ | |
169 vrev64.8 q8,q8 | |
170 #endif | |
171 veor q1,q1,q10 | |
172 .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction | |
173 | |
174 #ifndef __ARMEB__ | |
175 vrev64.8 q9,q9 | |
176 #endif | |
177 vmov d4,d3 @ Xh|Xm - 256-bit result | |
178 vmov d3,d0 @ Xm is rotated Xl | |
179 vext.8 q7,q9,q9,#8 | |
180 vext.8 q3,q8,q8,#8 | |
181 veor q0,q1,q10 | |
182 .byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 | |
183 veor q3,q3,q2 @ accumulate q3 early | |
184 | |
185 vext.8 q10,q0,q0,#8 @ 2nd phase of reduction | |
186 .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 | |
187 veor q3,q3,q10 | |
188 veor q9,q9,q7 @ Karatsuba pre-processing | |
189 veor q3,q3,q0 | |
190 .byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 | |
191 bhs .Loop_mod2x_v8 @ there were at least 32 more bytes | |
192 | |
193 veor q2,q2,q10 | |
194 vext.8 q3,q8,q8,#8 @ re-construct q3 | |
195 adds r3,r3,#32 @ re-construct r3 | |
196 veor q0,q0,q2 @ re-construct q0 | |
197 beq .Ldone_v8 @ is r3 zero? | |
198 .Lodd_tail_v8: | |
199 vext.8 q10,q0,q0,#8 | |
200 veor q3,q3,q0 @ inp^=Xi | |
201 veor q9,q8,q10 @ q9 is rotated inp^Xi | |
202 | |
203 .byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo | |
204 veor q9,q9,q3 @ Karatsuba pre-processing | |
205 .byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi | |
206 .byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) | |
207 | |
208 vext.8 q9,q0,q2,#8 @ Karatsuba post-processing | |
209 veor q10,q0,q2 | |
210 veor q1,q1,q9 | |
211 veor q1,q1,q10 | |
212 .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction | |
213 | |
214 vmov d4,d3 @ Xh|Xm - 256-bit result | |
215 vmov d3,d0 @ Xm is rotated Xl | |
216 veor q0,q1,q10 | |
217 | |
218 vext.8 q10,q0,q0,#8 @ 2nd phase of reduction | |
219 .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 | |
220 veor q10,q10,q2 | |
221 veor q0,q0,q10 | |
222 | |
223 .Ldone_v8: | |
224 #ifndef __ARMEB__ | |
225 vrev64.8 q0,q0 | |
226 #endif | |
227 vext.8 q0,q0,q0,#8 | |
228 vst1.64 {q0},[r0] @ write out Xi | |
229 | |
230 vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so | |
231 bx lr | |
232 .size gcm_ghash_v8,.-gcm_ghash_v8 | |
233 .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 | |
234 .align 2 | |
235 .align 2 | |
236 #endif | |
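The three exported symbols above (gcm_init_v8, gcm_gmult_v8, gcm_ghash_v8) are the 32-bit ARM build of the ARMv8 Crypto Extension GHASH routines and are driven from C glue code. The sketch below spells out the C-side view suggested by the register usage (r0 through r3); the exact prototypes and the u128 element type are assumptions modelled on common OpenSSL/BoringSSL conventions rather than anything stated in this file, and note that gcm_init_v8 only writes Htable[0..2] (twisted H, the packed Karatsuba value, and H^2).

    #include <stddef.h>
    #include <stdint.h>

    /* Assumed layout of one Htable entry (not defined in this file). */
    typedef struct { uint64_t hi, lo; } u128;

    /* Assumed prototypes, inferred from the r0-r3 usage above. */
    void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);    /* r0 = Htable, r1 = H        */
    void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);  /* Xi <- Xi * H               */
    void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16],
                      const uint8_t *inp, size_t len);         /* len is a multiple of 16    */

    /* Illustrative call order: set up the table once per key, then fold
     * full 16-byte blocks into the running hash Xi. */
    static void ghash_blocks(uint64_t Xi[2], const uint64_t H[2],
                             const uint8_t *inp, size_t len) {
        u128 Htable[16];
        gcm_init_v8(Htable, H);
        if (len != 0) {
            /* The bulk loop consumes two blocks per iteration and falls
             * back to .Lodd_tail_v8 for a trailing single block. */
            gcm_ghash_v8(Xi, Htable, inp, len);
        }
    }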
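Mathematically, all three routines implement multiplication in GF(2^128) under GHASH's bit-reflected convention: gcm_gmult_v8 computes Xi <- Xi * H, and gcm_ghash_v8 has the effect of Xi <- (Xi ^ block) * H per input block, using H and H^2 to fold two blocks per loop iteration. The assembly reaches that result with PMULL, a Karatsuba split, and the two-phase reduction against the 0xc2 constant composed at the top of each routine; the portable sketch below is the specification's bit-serial equivalent (reduction constant 0xe1 followed by 120 zero bits), offered as a readable reference rather than a description of this file's method. All names in it are illustrative.

    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } be128;  /* hi = first 8 bytes of the block, big-endian */

    static be128 load_be128(const uint8_t b[16]) {
        be128 r = {0, 0};
        for (int i = 0; i < 8; i++) {
            r.hi = (r.hi << 8) | b[i];
            r.lo = (r.lo << 8) | b[8 + i];
        }
        return r;
    }

    /* x <- x * y in GF(2^128), GHASH bit order (GCM spec shift-and-reduce). */
    static void gf128_mul(be128 *x, const be128 *y) {
        be128 z = {0, 0};
        be128 v = *y;
        for (int i = 0; i < 128; i++) {
            /* bit i of x, counting from the leftmost bit of the block */
            uint64_t xi = (i < 64) ? (x->hi >> (63 - i)) & 1
                                   : (x->lo >> (127 - i)) & 1;
            if (xi) {
                z.hi ^= v.hi;                    /* accumulate V into Z */
                z.lo ^= v.lo;
            }
            uint64_t lsb = v.lo & 1;             /* bit shifted out of the block */
            v.lo = (v.lo >> 1) | (v.hi << 63);
            v.hi >>= 1;
            if (lsb) {
                v.hi ^= 0xe100000000000000ULL;   /* reduce by x^128 + x^7 + x^2 + x + 1 */
            }
        }
        *x = z;
    }

    /* One GHASH step: Xi <- (Xi ^ block) * H. */
    static void ghash_block(be128 *Xi, const be128 *H, const uint8_t block[16]) {
        be128 b = load_be128(block);
        Xi->hi ^= b.hi;
        Xi->lo ^= b.lo;
        gf128_mul(Xi, H);
    }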