OLD | NEW |
| (Empty) |
1 #include "arm_arch.h" | |
2 | |
3 .text | |
4 .code 32 | |
5 | |
6 #if __ARM_ARCH__>=7 | |
7 .fpu neon | |
8 | |
9 .type mul_1x1_neon,%function | |
10 .align 5 | |
@ mul_1x1_neon: 64-bit carry-less (GF(2)[x]) product of two 32-bit
@ polynomials using the 8x8-bit polynomial multiplier VMULL.P8.
@ In:   d16 = a (operand in low 32 bits); d17 = "bb", two 32-bit copies
@       of b (loaded by the caller, see bn_GF2m_mul_2x2).
@ Out:  d0 = a·b  (callers use d0 as the 64-bit product).
@ Clobbers q0-q3. Four byte-wise partial products a·bb, (a<<8)·bb,
@ (a<<16)·bb, (a<<24)·bb are formed, then each halved result is
@ re-aligned ("slided") with paired vshr/vshl and folded into d0 with
@ veor (XOR = addition in GF(2)).
11 mul_1x1_neon: | |
12 vshl.u64 d2,d16,#8 @ q1-q3 are slided | |
13 vmull.p8 q0,d16,d17 @ a·bb | |
14 vshl.u64 d4,d16,#16 | |
15 vmull.p8 q1,d2,d17 @ a<<8·bb | |
16 vshl.u64 d6,d16,#24 | |
17 vmull.p8 q2,d4,d17 @ a<<16·bb | |
18 vshr.u64 d2,#8 | |
19 vmull.p8 q3,d6,d17 @ a<<24·bb | |
20 vshl.u64 d3,#24 | |
21 veor d0,d2 | |
22 vshr.u64 d4,#16 | |
23 veor d0,d3 | |
24 vshl.u64 d5,#16 | |
25 veor d0,d4 | |
26 vshr.u64 d6,#24 | |
27 veor d0,d5 | |
28 vshl.u64 d7,#8 | |
29 veor d0,d6 | |
30 veor d0,d7 | |
@ 0xe12fff1e is the ARM encoding of "bx lr"; emitted as .word so the
@ file still assembles with toolchains that reject bx at this arch level.
31 .word 0xe12fff1e | |
32 .size mul_1x1_neon,.-mul_1x1_neon | |
33 #endif | |
34 .type mul_1x1_ialu,%function | |
35 .align 5 | |
@ mul_1x1_ialu: 64-bit carry-less (GF(2)[x]) product of two 32-bit
@ polynomials -- scalar (integer ALU) fallback path.
@ In:   r1 = a, r0 = b, r12 = 7<<2 (window index mask, set by caller),
@       sp -> 32-byte scratch buffer used as word table tab[8].
@ Out:  r5 = product low word, r4 = product high word (callers store
@       r5 then r4 as consecutive words). Clobbers r6-r9; returns via lr.
@ Method: build tab[i] = (i&1 ? a1:0) ^ (i&2 ? a2:0) ^ (i&4 ? a4:0)
@ from the low 30 bits of a (top two bits are masked off by the bic
@ below so shifted table entries cannot overflow 32 bits), then scan b
@ three bits at a time, XOR-accumulating shifted table words into r5/r4.
@ The interleaving of and/ldr/eor lines is deliberate load-use
@ scheduling -- do not reorder.
36 mul_1x1_ialu: | |
37 mov r4,#0 | |
38 bic r5,r1,#3<<30 @ a1=a&0x3fffffff | |
39 str r4,[sp,#0] @ tab[0]=0 | |
40 add r6,r5,r5 @ a2=a1<<1 | |
41 str r5,[sp,#4] @ tab[1]=a1 | |
42 eor r7,r5,r6 @ a1^a2 | |
43 str r6,[sp,#8] @ tab[2]=a2 | |
44 mov r8,r5,lsl#2 @ a4=a1<<2 | |
45 str r7,[sp,#12] @ tab[3]=a1^a2 | |
46 eor r9,r5,r8 @ a1^a4 | |
47 str r8,[sp,#16] @ tab[4]=a4 | |
48 eor r4,r6,r8 @ a2^a4 | |
49 str r9,[sp,#20] @ tab[5]=a1^a4 | |
50 eor r7,r7,r8 @ a1^a2^a4 | |
51 str r4,[sp,#24] @ tab[6]=a2^a4 | |
52 and r8,r12,r0,lsl#2 | |
53 str r7,[sp,#28] @ tab[7]=a1^a2^a4 | |
54 | |
@ main scan: r8/r9 alternate as scaled window indices ((b>>k & 7)<<2),
@ each tab word is XORed in at bit offset k (lsl into r5, spill via lsr
@ into r4).
55 and r9,r12,r0,lsr#1 | |
56 ldr r5,[sp,r8] @ tab[b & 0x7] | |
57 and r8,r12,r0,lsr#4 | |
58 ldr r7,[sp,r9] @ tab[b >> 3 & 0x7] | |
59 and r9,r12,r0,lsr#7 | |
60 ldr r6,[sp,r8] @ tab[b >> 6 & 0x7] | |
61 eor r5,r5,r7,lsl#3 @ stall | |
62 mov r4,r7,lsr#29 | |
63 ldr r7,[sp,r9] @ tab[b >> 9 & 0x7] | |
64 | |
65 and r8,r12,r0,lsr#10 | |
66 eor r5,r5,r6,lsl#6 | |
67 eor r4,r4,r6,lsr#26 | |
68 ldr r6,[sp,r8] @ tab[b >> 12 & 0x7] | |
69 | |
70 and r9,r12,r0,lsr#13 | |
71 eor r5,r5,r7,lsl#9 | |
72 eor r4,r4,r7,lsr#23 | |
73 ldr r7,[sp,r9] @ tab[b >> 15 & 0x7] | |
74 | |
75 and r8,r12,r0,lsr#16 | |
76 eor r5,r5,r6,lsl#12 | |
77 eor r4,r4,r6,lsr#20 | |
78 ldr r6,[sp,r8] @ tab[b >> 18 & 0x7] | |
79 | |
80 and r9,r12,r0,lsr#19 | |
81 eor r5,r5,r7,lsl#15 | |
82 eor r4,r4,r7,lsr#17 | |
83 ldr r7,[sp,r9] @ tab[b >> 21 & 0x7] | |
84 | |
85 and r8,r12,r0,lsr#22 | |
86 eor r5,r5,r6,lsl#18 | |
87 eor r4,r4,r6,lsr#14 | |
88 ldr r6,[sp,r8] @ tab[b >> 24 & 0x7] | |
89 | |
90 and r9,r12,r0,lsr#25 | |
91 eor r5,r5,r7,lsl#21 | |
92 eor r4,r4,r7,lsr#11 | |
93 ldr r7,[sp,r9] @ tab[b >> 27 & 0x7] | |
94 | |
@ bits 30 and 31 of a were masked out of the table; their contributions
@ (b<<30)/(b>>2) and (b<<31)/(b>>1) are folded in conditionally via the
@ tst / eorne pairs below.
95 tst r1,#1<<30 | |
96 and r8,r12,r0,lsr#28 | |
97 eor r5,r5,r6,lsl#24 | |
98 eor r4,r4,r6,lsr#8 | |
99 ldr r6,[sp,r8] @ tab[b >> 30 ] | |
100 | |
101 eorne r5,r5,r0,lsl#30 | |
102 eorne r4,r4,r0,lsr#2 | |
103 tst r1,#1<<31 | |
104 eor r5,r5,r7,lsl#27 | |
105 eor r4,r4,r7,lsr#5 | |
106 eorne r5,r5,r0,lsl#31 | |
107 eorne r4,r4,r0,lsr#1 | |
108 eor r5,r5,r6,lsl#30 | |
109 eor r4,r4,r6,lsr#2 | |
110 | |
111 mov pc,lr | |
112 .size mul_1x1_ialu,.-mul_1x1_ialu | |
113 .global bn_GF2m_mul_2x2 | |
114 .type bn_GF2m_mul_2x2,%function | |
115 .align 5 | |
@ bn_GF2m_mul_2x2 -- 128-bit carry-less product (a1:a0)·(b1:b0) written
@ to r[0..3], least-significant word first.
@ In:  r0 = result pointer r[4], r1 = a1, r2 = a0, r3 = b1, [sp] = b0
@      (presumably C prototype
@      void bn_GF2m_mul_2x2(u32 r[4], u32 a1, u32 a0, u32 b1, u32 b0)
@      -- TODO confirm against the C caller).
@ Uses Karatsuba over GF(2): three 32x32 multiplications a1·b1, a0·b0
@ and (a1^a0)·(b1^b0) ("+" is XOR), middle terms folded into r[1]/r[2].
@ On ARMv7, OPENSSL_armcap_P bit 0 selects the NEON helper at runtime;
@ otherwise the integer-ALU path is taken.
116 bn_GF2m_mul_2x2: | |
117 #if __ARM_ARCH__>=7 | |
@ runtime dispatch: PC-relative load of OPENSSL_armcap_P, test bit 0
118 ldr r12,.LOPENSSL_armcap | |
119 .Lpic: ldr r12,[pc,r12] | |
120 tst r12,#1 | |
121 beq .Lialu | |
122 | |
@ NEON path: d18/d19 = (a1, b1 pair), d20/d21 = (a0, b0 pair); d17 must
@ hold two copies of b for mul_1x1_neon (see that routine's contract)
123 veor d18,d18 | |
124 vmov.32 d19,r3,r3 @ two copies of b1 | |
125 vmov.32 d18[0],r1 @ a1 | |
126 | |
127 veor d20,d20 | |
128 vld1.32 d21[],[sp,:32] @ two copies of b0 | |
129 vmov.32 d20[0],r2 @ a0 | |
130 mov r12,lr @ preserve return address across the bl calls | |
131 | |
132 vmov d16,d18 | |
133 vmov d17,d19 | |
134 bl mul_1x1_neon @ a1·b1 | |
135 vmov d22,d0 | |
136 | |
137 vmov d16,d20 | |
138 vmov d17,d21 | |
139 bl mul_1x1_neon @ a0·b0 | |
140 vmov d23,d0 | |
141 | |
142 veor d16,d20,d18 | |
143 veor d17,d21,d19 | |
144 veor d20,d23,d22 | |
145 bl mul_1x1_neon @ (a0+a1)·(b0+b1) | |
146 | |
@ fold the 64-bit middle term into the top half of the low product
@ (d23) and the bottom half of the high product (d22), then store
@ r[0..3] in ascending order
147 veor d0,d20 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1 | |
148 vshl.u64 d1,d0,#32 | |
149 vshr.u64 d0,d0,#32 | |
150 veor d23,d1 | |
151 veor d22,d0 | |
152 vst1.32 {d23[0]},[r0,:32]! | |
153 vst1.32 {d23[1]},[r0,:32]! | |
154 vst1.32 {d22[0]},[r0,:32]! | |
155 vst1.32 {d22[1]},[r0,:32] | |
156 bx r12 | |
157 .align 4 | |
158 .Lialu: | |
159 #endif | |
@ scalar path: push 8 words (32 bytes), so caller's b0 is now [sp,#32]
160 stmdb sp!,{r4-r10,lr} | |
161 mov r10,r0 @ reassign 1st argument | |
162 mov r0,r3 @ r0=b1 | |
163 ldr r3,[sp,#32] @ load b0 | |
164 mov r12,#7<<2 @ window mask expected by mul_1x1_ialu | |
165 sub sp,sp,#32 @ allocate tab[8] | |
166 | |
167 bl mul_1x1_ialu @ a1·b1 | |
168 str r5,[r10,#8] | |
169 str r4,[r10,#12] | |
170 | |
@ three-eor swap of (r0,r3) and (r1,r2) so the next call multiplies
@ a0·b0 without extra scratch registers
171 eor r0,r0,r3 @ flip b0 and b1 | |
172 eor r1,r1,r2 @ flip a0 and a1 | |
173 eor r3,r3,r0 | |
174 eor r2,r2,r1 | |
175 eor r0,r0,r3 | |
176 eor r1,r1,r2 | |
177 bl mul_1x1_ialu @ a0·b0 | |
178 str r5,[r10] | |
179 str r4,[r10,#4] | |
180 | |
@ r1 = a1^a0, r0 = b1^b0 for the middle Karatsuba term
181 eor r1,r1,r2 | |
182 eor r0,r0,r3 | |
183 bl mul_1x1_ialu @ (a1+a0)·(b1+b0) | |
@ combine: r6,r7 = a0·b0 (lo,hi) and r8,r9 = a1·b1 (lo,hi) reloaded
@ from r[]; r5,r4 = middle product (lo,hi). The corrected middle term
@ mid ^ a0·b0 ^ a1·b1 is XORed into r[1] and r[2].
184 ldmia r10,{r6-r9} | |
185 eor r5,r5,r4 | |
186 eor r4,r4,r7 | |
187 eor r5,r5,r6 | |
188 eor r4,r4,r8 | |
189 eor r5,r5,r9 | |
190 eor r4,r4,r9 | |
191 str r4,[r10,#8] | |
192 eor r5,r5,r4 | |
193 add sp,sp,#32 @ destroy tab[8] | |
194 str r5,[r10,#4] | |
195 | |
196 #if __ARM_ARCH__>=5 | |
@ ARMv5+: pop directly into pc (interworking pop is architectural here)
197 ldmia sp!,{r4-r10,pc} | |
198 #else | |
@ ARMv4: test Thumb bit of lr; plain mov for ARM callers, bx (encoded
@ as .word 0xe12fff1e for old assemblers) for Thumb callers
199 ldmia sp!,{r4-r10,lr} | |
200 tst lr,#1 | |
201 moveq pc,lr @ be binary compatible with V4, yet | |
202 .word 0xe12fff1e @ interoperable with Thumb ISA:-
) | |
203 #endif | |
204 .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 | |
205 #if __ARM_ARCH__>=7 | |
206 .align 5 | |
@ PC-relative displacement to OPENSSL_armcap_P. The load at .Lpic uses
@ [pc,r12]; in ARM state pc reads as .Lpic+8, hence the +8 bias.
207 .LOPENSSL_armcap: | |
208 .word OPENSSL_armcap_P-(.Lpic+8) | |
209 #endif | |
@ NOTE(review): the .asciz below is one source line that was wrapped by
@ the diff/extraction tool; rejoin it when restoring the real file.
210 .asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org
>" | |
211 .align 5 | |
212 | |
@ capability word shared with OPENSSL_cpuid code; zero-initialized common
213 .comm OPENSSL_armcap_P,4,4 | |
OLD | NEW |