Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(101)

Side by Side Diff: third_party/boringssl/linux-aarch64/crypto/modes/ghashv8-armx64.S

Issue 2354623003: Pull boringssl generated source from boringssl_gen (Closed)
Patch Set: . Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
// Everything up to the matching #endif is assembled only when the
// compiler defines __aarch64__.
1 #if defined(__aarch64__)
2 #include <openssl/arm_arch.h>
3
4 .text
// Ask the assembler for ARMv8-A with the crypto extension. The directive
// is skipped when building with clang (reason not visible in this file --
// presumably clang gets the feature from its command line; confirm).
5 #if !defined(__clang__)
6 .arch armv8-a+crypto
7 #endif
// gcm_init_v8 -- build the key table used by the GHASH routines.
//   x1: address of the 16-byte hash key H (read only)
//   x0: address of the table to fill; 48 bytes are written:
//         Htable[0] = "twisted" H
//         Htable[1] = packed Karatsuba pre-processed halves
//                     (low 64 bits from H, high 64 bits from H^2)
//         Htable[2] = H^2
// Writes v0-v3 and v16-v22 and post-increments x0 by 16. No stack and no
// general-purpose scratch registers are used.
8 .globl gcm_init_v8
9 .hidden gcm_init_v8
10 .type gcm_init_v8,%function
11 .align 4
12 gcm_init_v8:
13 ld1 {v17.2d},[x1] //load input H
14 movi v19.16b,#0xe1 //every byte = 0xe1
15 shl v19.2d,v19.2d,#57 //0xc2.0 in each 64-bit lane
16 ext v3.16b,v17.16b,v17.16b,#8 //swap the 64-bit halves of H
17 ushr v18.2d,v19.2d,#63 //top bit of the constant: 1 in each lane
18 dup v17.4s,v17.s[1] //replicate 32-bit lane 1 of H
19 ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
20 ushr v18.2d,v3.2d,#63 //bit shifted out of each half by the shl below
21 sshr v17.4s,v17.4s,#31 //broadcast carry bit
22 and v18.16b,v18.16b,v16.16b
23 shl v3.2d,v3.2d,#1
24 ext v18.16b,v18.16b,v18.16b,#8 //move the saved bits to the other half
25 and v16.16b,v16.16b,v17.16b //keep t0 only if the carry bit was set
26 orr v3.16b,v3.16b,v18.16b //H<<<=1
27 eor v20.16b,v3.16b,v16.16b //twisted H
28 st1 {v20.2d},[x0],#16 //store Htable[0]
29
30 //calculate H^2
31 ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
32 pmull v0.1q,v20.1d,v20.1d //H.lo·H.lo
33 eor v16.16b,v16.16b,v20.16b //H.lo^H.hi in both halves
34 pmull2 v2.1q,v20.2d,v20.2d //H.hi·H.hi
35 pmull v1.1q,v16.1d,v16.1d //(H.lo+H.hi)·(H.lo+H.hi)
36
37 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
38 eor v18.16b,v0.16b,v2.16b
39 eor v1.16b,v1.16b,v17.16b
40 eor v1.16b,v1.16b,v18.16b
41 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
42
43 ins v2.d[0],v1.d[1]
44 ins v1.d[1],v0.d[0]
45 eor v0.16b,v1.16b,v18.16b
46
47 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
48 pmull v0.1q,v0.1d,v19.1d
49 eor v18.16b,v18.16b,v2.16b
50 eor v22.16b,v0.16b,v18.16b //v22 = H^2
51
52 ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
53 eor v17.16b,v17.16b,v22.16b //H^2.lo^H^2.hi in both halves
54 ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
55 st1 {v21.2d,v22.2d},[x0] //store Htable[1..2]
56
57 ret
58 .size gcm_init_v8,.-gcm_init_v8
// gcm_gmult_v8 -- multiply the 16-byte value Xi by H and reduce, in place.
//   x0: address of Xi (16 bytes are read, then overwritten with the result)
//   x1: address of the key table; 32 bytes are loaded (twisted H and the
//       packed Karatsuba entry -- presumably the table written by
//       gcm_init_v8; confirm against callers)
// Unless __ARMEB__ is defined, the bytes of each 64-bit half are reversed
// after loading and again before storing.
// Writes v0-v3 and v17-v21. No stack and no general-purpose scratch
// registers are used.
59 .globl gcm_gmult_v8
60 .hidden gcm_gmult_v8
61 .type gcm_gmult_v8,%function
62 .align 4
63 gcm_gmult_v8:
64 ld1 {v17.2d},[x0] //load Xi
65 movi v19.16b,#0xe1
66 ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
67 shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
68 #ifndef __ARMEB__
69 rev64 v17.16b,v17.16b
70 #endif
71 ext v3.16b,v17.16b,v17.16b,#8 //swap the 64-bit halves of Xi
72
73 pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
74 eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
75 pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
76 pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
77
78 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
79 eor v18.16b,v0.16b,v2.16b
80 eor v1.16b,v1.16b,v17.16b
81 eor v1.16b,v1.16b,v18.16b
82 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
83
84 ins v2.d[0],v1.d[1]
85 ins v1.d[1],v0.d[0]
86 eor v0.16b,v1.16b,v18.16b
87
88 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
89 pmull v0.1q,v0.1d,v19.1d
90 eor v18.16b,v18.16b,v2.16b
91 eor v0.16b,v0.16b,v18.16b
92
93 #ifndef __ARMEB__
94 rev64 v0.16b,v0.16b
95 #endif
96 ext v0.16b,v0.16b,v0.16b,#8 //swap halves back to the stored layout
97 st1 {v0.2d},[x0] //write out Xi
98
99 ret
100 .size gcm_gmult_v8,.-gcm_gmult_v8
// gcm_ghash_v8 -- fold a run of 16-byte input blocks into Xi.
//   x0: address of Xi (16 bytes read at entry, result stored at exit)
//   x1: address of the key table; 48 bytes are loaded (twisted H, the
//       packed Karatsuba entry, H^2); x1 is advanced by 32
//   x2: input pointer, advanced as blocks are loaded
//   x3: input length in bytes, consumed as a counter
// The first block is loaded before any length test, so this presumably
// expects a non-zero multiple of 16 bytes -- confirm against callers.
// The main loop handles two blocks per iteration using H^2 and H; a
// single left-over block is handled at .Lodd_tail_v8.
// Writes v0-v7, v16-v22, x12 and the condition flags. No stack is used.
101 .globl gcm_ghash_v8
102 .hidden gcm_ghash_v8
103 .type gcm_ghash_v8,%function
104 .align 4
105 gcm_ghash_v8:
106 ld1 {v0.2d},[x0] //load [rotated] Xi
107 //"[rotated]" means that
108 //loaded value would have
109 //to be rotated in order to
110 //make it appear as in
111 //algorithm specification
112 subs x3,x3,#32 //see if x3 is 32 or larger
113 mov x12,#16 //x12 is used as post-
114 //increment for input pointer;
115 //as loop is modulo-scheduled
116 //x12 is zeroed just in time
117 //to preclude overstepping
118 //inp[len], which means that
119 //last block[s] are actually
120 //loaded twice, but last
121 //copy is not processed
122 ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
123 movi v19.16b,#0xe1
124 ld1 {v22.2d},[x1]
125 csel x12,xzr,x12,eq //is it time to zero x12? (flags from the subs above)
126 ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
127 ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
128 shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
129 #ifndef __ARMEB__
130 rev64 v16.16b,v16.16b
131 rev64 v0.16b,v0.16b
132 #endif
133 ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
134 b.lo .Lodd_tail_v8 //x3 was less than 32
135 ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
136 #ifndef __ARMEB__
137 rev64 v17.16b,v17.16b
138 #endif
139 ext v7.16b,v17.16b,v17.16b,#8
140 eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
141 pmull v4.1q,v20.1d,v7.1d //H·Ii+1
142 eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
143 pmull2 v6.1q,v20.2d,v7.2d
144 b .Loop_mod2x_v8
145
146 .align 4
147 .Loop_mod2x_v8:
// Loop entry state: v3 = rotated I[i]^Xi, v17 = Karatsuba-folded I[i+1],
// v4/v6 = low/high products of H with I[i+1].
148 ext v18.16b,v3.16b,v3.16b,#8
149 subs x3,x3,#32 //is there more data?
150 pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
151 csel x12,xzr,x12,lo //is it time to zero x12?
152
153 pmull v5.1q,v21.1d,v17.1d //middle product for I[i+1] (low half of Htable[1])
154 eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
155 pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
156 eor v0.16b,v0.16b,v4.16b //accumulate
157 pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
158 ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
159
160 eor v2.16b,v2.16b,v6.16b
161 csel x12,xzr,x12,eq //is it time to zero x12?
162 eor v1.16b,v1.16b,v5.16b
163
164 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
165 eor v18.16b,v0.16b,v2.16b
166 eor v1.16b,v1.16b,v17.16b
167 ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
168 #ifndef __ARMEB__
169 rev64 v16.16b,v16.16b
170 #endif
171 eor v1.16b,v1.16b,v18.16b
172 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
173
174 #ifndef __ARMEB__
175 rev64 v17.16b,v17.16b
176 #endif
177 ins v2.d[0],v1.d[1]
178 ins v1.d[1],v0.d[0]
179 ext v7.16b,v17.16b,v17.16b,#8
180 ext v3.16b,v16.16b,v16.16b,#8
181 eor v0.16b,v1.16b,v18.16b
182 pmull v4.1q,v20.1d,v7.1d //H·Ii+1
183 eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
184
185 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
186 pmull v0.1q,v0.1d,v19.1d
187 eor v3.16b,v3.16b,v18.16b
188 eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
189 eor v3.16b,v3.16b,v0.16b
190 pmull2 v6.1q,v20.2d,v7.2d
191 b.hs .Loop_mod2x_v8 //there was at least 32 more bytes (flags from the subs at loop top)
192
// Loop exit: the next iteration's work was started speculatively above,
// so rebuild the plain Xi (v0), block (v3) and byte count (x3).
193 eor v2.16b,v2.16b,v18.16b
194 ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
195 adds x3,x3,#32 //re-construct x3
196 eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
197 b.eq .Ldone_v8 //is x3 zero?
198 .Lodd_tail_v8:
// One remaining block: Xi = (Xi ^ block) · H, then reduce.
199 ext v18.16b,v0.16b,v0.16b,#8
200 eor v3.16b,v3.16b,v0.16b //inp^=Xi
201 eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
202
203 pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
204 eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
205 pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
206 pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
207
208 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
209 eor v18.16b,v0.16b,v2.16b
210 eor v1.16b,v1.16b,v17.16b
211 eor v1.16b,v1.16b,v18.16b
212 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
213
214 ins v2.d[0],v1.d[1]
215 ins v1.d[1],v0.d[0]
216 eor v0.16b,v1.16b,v18.16b
217
218 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
219 pmull v0.1q,v0.1d,v19.1d
220 eor v18.16b,v18.16b,v2.16b
221 eor v0.16b,v0.16b,v18.16b
222
223 .Ldone_v8:
224 #ifndef __ARMEB__
225 rev64 v0.16b,v0.16b
226 #endif
227 ext v0.16b,v0.16b,v0.16b,#8 //undo the rotation applied at entry
228 st1 {v0.2d},[x0] //write out Xi
229
230 ret
231 .size gcm_ghash_v8,.-gcm_ghash_v8
// NUL-terminated ASCII identification string:
// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
// The operand list must be a clean comma-separated run of byte values;
// stray spaces inside it (e.g. "4 6" where 46, '.', is meant) break it.
232 .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Re-align after the odd-length string; the repeated directive is
// redundant but harmless and is kept as found.
233 .align 2
234 .align 2
235 #endif
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698