OLD | NEW |
(Empty) | |
| 1 #include "arm_arch.h" |
| 2 |
| 3 #if __ARM_MAX_ARCH__>=7 |
| 4 .text |
| 5 .arch armv8-a+crypto |
| 6 .align 5 |
| 7 rcon: /* constant table read by aes_v8_set_encrypt_key through adr x3,rcon */ |
| 8 .long 0x01,0x01,0x01,0x01 /* starting round constant, one copy per 32-bit lane; the key-expansion loops double it with shl */ |
| 9 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat |
| 10 .long 0x1b,0x1b,0x1b,0x1b /* round constant reloaded once the eight-pass 128-bit loop has finished */ |
| 11 |
| 12 .globl aes_v8_set_encrypt_key /* Expands a 128-, 192- or 256-bit AES key into the encryption round-key schedule. */ |
| 13 .type aes_v8_set_encrypt_key,%function /* In: x0 = key bytes (read), w1 = key length in bits, x2 = schedule buffer (written). */ |
| 14 .align 5 /* Out: x0 = 0 on success, -1 if x0 or x2 is zero, -2 if w1 is not 128, 192 or 256. */ |
| 15 aes_v8_set_encrypt_key: /* On success the round count (10, 12 or 14) is stored as a 32-bit word at schedule offset 240. */ |
| 16 .Lenc_key: /* local alias of the entry point; aes_v8_set_decrypt_key calls it with bl */ |
| 17 stp x29,x30,[sp,#-16]! /* push a 16-byte frame record */ |
| 18 add x29,sp,#0 |
| 19 mov x3,#-1 /* x3 carries the return code: -1 = null pointer */ |
| 20 cmp x0,#0 |
| 21 b.eq .Lenc_key_abort |
| 22 cmp x2,#0 |
| 23 b.eq .Lenc_key_abort |
| 24 mov x3,#-2 /* -2 = unsupported key length */ |
| 25 cmp w1,#128 |
| 26 b.lt .Lenc_key_abort |
| 27 cmp w1,#256 |
| 28 b.gt .Lenc_key_abort |
| 29 tst w1,#0x3f /* the length must also be a multiple of 64 */ |
| 30 b.ne .Lenc_key_abort |
| 31 |
| 32 adr x3,rcon /* x3 now addresses the constant table */ |
| 33 cmp w1,#192 /* decides the three-way branch below; the instructions in between leave the flags alone */ |
| 34 |
| 35 eor v0.16b,v0.16b,v0.16b /* v0 = 0 for the rest of the routine */ |
| 36 ld1 {v3.16b},[x0],#16 /* v3 = first 16 key bytes */ |
| 37 mov w1,#8 // reuse w1 as the pass counter |
| 38 ld1 {v1.4s,v2.4s},[x3],#32 /* v1 = round constant, v2 = tbl index mask; x3 moves on to the 0x1b row */ |
| 39 |
| 40 b.lt .Loop128 /* fewer than 192 bits */ |
| 41 b.eq .L192 /* exactly 192 bits */ |
| 42 b .L256 /* otherwise 256 bits */ |
| 43 |
| 44 .align 4 |
| 45 .Loop128: |
| 46 tbl v6.16b,{v3.16b},v2.16b /* v6 = last word of v3, byte-rotated and copied to every lane */ |
| 47 ext v5.16b,v0.16b,v3.16b,#12 /* v5 = v3 moved up one 32-bit lane, zero filled */ |
| 48 st1 {v3.4s},[x2],#16 /* emit the current round key */ |
| 49 aese v6.16b,v0.16b /* AESE with the all-zero v0 as round key */ |
| 50 subs w1,w1,#1 /* pass counter; only vector instructions follow, so the flags reach b.ne intact */ |
| 51 |
| 52 eor v3.16b,v3.16b,v5.16b /* the three ext/eor steps XOR each word into every word above it */ |
| 53 ext v5.16b,v0.16b,v5.16b,#12 |
| 54 eor v3.16b,v3.16b,v5.16b |
| 55 ext v5.16b,v0.16b,v5.16b,#12 |
| 56 eor v6.16b,v6.16b,v1.16b /* mix in the round constant */ |
| 57 eor v3.16b,v3.16b,v5.16b |
| 58 shl v1.16b,v1.16b,#1 /* double the round constant for the next pass */ |
| 59 eor v3.16b,v3.16b,v6.16b /* v3 = next round key */ |
| 60 b.ne .Loop128 |
| 61 |
| 62 ld1 {v1.4s},[x3] /* continue with round constant 0x1b */ |
| 63 |
| 64 tbl v6.16b,{v3.16b},v2.16b |
| 65 ext v5.16b,v0.16b,v3.16b,#12 |
| 66 st1 {v3.4s},[x2],#16 |
| 67 aese v6.16b,v0.16b |
| 68 |
| 69 eor v3.16b,v3.16b,v5.16b |
| 70 ext v5.16b,v0.16b,v5.16b,#12 |
| 71 eor v3.16b,v3.16b,v5.16b |
| 72 ext v5.16b,v0.16b,v5.16b,#12 |
| 73 eor v6.16b,v6.16b,v1.16b |
| 74 eor v3.16b,v3.16b,v5.16b |
| 75 shl v1.16b,v1.16b,#1 /* 0x1b becomes 0x36 */ |
| 76 eor v3.16b,v3.16b,v6.16b |
| 77 |
| 78 tbl v6.16b,{v3.16b},v2.16b |
| 79 ext v5.16b,v0.16b,v3.16b,#12 |
| 80 st1 {v3.4s},[x2],#16 |
| 81 aese v6.16b,v0.16b |
| 82 |
| 83 eor v3.16b,v3.16b,v5.16b |
| 84 ext v5.16b,v0.16b,v5.16b,#12 |
| 85 eor v3.16b,v3.16b,v5.16b |
| 86 ext v5.16b,v0.16b,v5.16b,#12 |
| 87 eor v6.16b,v6.16b,v1.16b |
| 88 eor v3.16b,v3.16b,v5.16b |
| 89 eor v3.16b,v3.16b,v6.16b |
| 90 st1 {v3.4s},[x2] /* last round key, stored without post-increment */ |
| 91 add x2,x2,#0x50 /* x2 is 160 bytes into the schedule here; move it to offset 240 */ |
| 92 |
| 93 mov w12,#10 /* round count for a 128-bit key */ |
| 94 b .Ldone |
| 95 |
| 96 .align 4 |
| 97 .L192: |
| 98 ld1 {v4.8b},[x0],#8 /* v4 = the remaining 8 key bytes */ |
| 99 movi v6.16b,#8 // borrow v6.16b |
| 100 st1 {v3.4s},[x2],#16 /* emit the first 16 bytes of the schedule */ |
| 101 sub v2.16b,v2.16b,v6.16b // adjust the mask |
| 102 |
| 103 .Loop192: |
| 104 tbl v6.16b,{v4.16b},v2.16b |
| 105 ext v5.16b,v0.16b,v3.16b,#12 |
| 106 st1 {v4.8b},[x2],#8 /* emit 8 more schedule bytes from v4 */ |
| 107 aese v6.16b,v0.16b |
| 108 subs w1,w1,#1 /* pass counter; flags reach b.ne untouched */ |
| 109 |
| 110 eor v3.16b,v3.16b,v5.16b |
| 111 ext v5.16b,v0.16b,v5.16b,#12 |
| 112 eor v3.16b,v3.16b,v5.16b |
| 113 ext v5.16b,v0.16b,v5.16b,#12 |
| 114 eor v3.16b,v3.16b,v5.16b |
| 115 |
| 116 dup v5.4s,v3.s[3] /* copy the top word of v3 to every lane */ |
| 117 eor v5.16b,v5.16b,v4.16b |
| 118 eor v6.16b,v6.16b,v1.16b |
| 119 ext v4.16b,v0.16b,v4.16b,#12 |
| 120 shl v1.16b,v1.16b,#1 |
| 121 eor v4.16b,v4.16b,v5.16b |
| 122 eor v3.16b,v3.16b,v6.16b |
| 123 eor v4.16b,v4.16b,v6.16b |
| 124 st1 {v3.4s},[x2],#16 /* emit 16 schedule bytes from v3 */ |
| 125 b.ne .Loop192 |
| 126 |
| 127 mov w12,#12 /* round count for a 192-bit key */ |
| 128 add x2,x2,#0x20 /* x2 is 208 bytes into the schedule here; move it to offset 240 */ |
| 129 b .Ldone |
| 130 |
| 131 .align 4 |
| 132 .L256: |
| 133 ld1 {v4.16b},[x0] /* v4 = second 16 key bytes */ |
| 134 mov w1,#7 /* seven passes instead of the eight preset earlier */ |
| 135 mov w12,#14 /* round count for a 256-bit key */ |
| 136 st1 {v3.4s},[x2],#16 |
| 137 |
| 138 .Loop256: |
| 139 tbl v6.16b,{v4.16b},v2.16b |
| 140 ext v5.16b,v0.16b,v3.16b,#12 |
| 141 st1 {v4.4s},[x2],#16 |
| 142 aese v6.16b,v0.16b |
| 143 subs w1,w1,#1 /* pass counter; tested by b.eq .Ldone below */ |
| 144 |
| 145 eor v3.16b,v3.16b,v5.16b |
| 146 ext v5.16b,v0.16b,v5.16b,#12 |
| 147 eor v3.16b,v3.16b,v5.16b |
| 148 ext v5.16b,v0.16b,v5.16b,#12 |
| 149 eor v6.16b,v6.16b,v1.16b |
| 150 eor v3.16b,v3.16b,v5.16b |
| 151 shl v1.16b,v1.16b,#1 |
| 152 eor v3.16b,v3.16b,v6.16b |
| 153 st1 {v3.4s},[x2],#16 |
| 154 b.eq .Ldone /* after the seventh pass x2 already sits at offset 240 */ |
| 155 |
| 156 dup v6.4s,v3.s[3] // just splat |
| 157 ext v5.16b,v0.16b,v4.16b,#12 |
| 158 aese v6.16b,v0.16b /* no tbl rotation and no round constant on this half-step */ |
| 159 |
| 160 eor v4.16b,v4.16b,v5.16b |
| 161 ext v5.16b,v0.16b,v5.16b,#12 |
| 162 eor v4.16b,v4.16b,v5.16b |
| 163 ext v5.16b,v0.16b,v5.16b,#12 |
| 164 eor v4.16b,v4.16b,v5.16b |
| 165 |
| 166 eor v4.16b,v4.16b,v6.16b |
| 167 b .Loop256 |
| 168 |
| 169 .Ldone: |
| 170 str w12,[x2] /* record the round count right after the round keys */ |
| 171 mov x3,#0 /* success */ |
| 172 |
| 173 .Lenc_key_abort: |
| 174 mov x0,x3 // return value |
| 175 ldr x29,[sp],#16 /* pop the frame; x30 is never modified in this routine, so only x29 is reloaded */ |
| 176 ret |
| 177 .size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key |
| 178 |
| 179 .globl aes_v8_set_decrypt_key /* Builds the decryption schedule: runs the encryption key expansion, then reverses the round-key order in place. */ |
| 180 .type aes_v8_set_decrypt_key,%function /* Every round key except the first and the last is additionally passed through AESIMC. */ |
| 181 .align 5 /* In: x0, w1, x2 exactly as for .Lenc_key. Out: x0 = 0, or the non-zero status returned by .Lenc_key. */ |
| 182 aes_v8_set_decrypt_key: |
| 183 stp x29,x30,[sp,#-16]! /* x30 must be saved because bl overwrites it */ |
| 184 add x29,sp,#0 |
| 185 bl .Lenc_key /* on success leaves x2 = schedule + 240 and x12 = round count */ |
| 186 |
| 187 cmp x0,#0 |
| 188 b.ne .Ldec_key_abort /* hand the error code back unchanged */ |
| 189 |
| 190 sub x2,x2,#240 // restore original x2 |
| 191 mov x4,#-16 /* post-index step that walks x0 downwards */ |
| 192 add x0,x2,x12,lsl#4 // end of key schedule |
| 193 |
| 194 ld1 {v0.4s},[x2] /* swap the first and last round keys without transforming them */ |
| 195 ld1 {v1.4s},[x0] |
| 196 st1 {v0.4s},[x0],x4 |
| 197 st1 {v1.4s},[x2],#16 |
| 198 |
| 199 .Loop_imc: |
| 200 ld1 {v0.4s},[x2] /* swap one pair of inner keys, applying AESIMC to both */ |
| 201 ld1 {v1.4s},[x0] |
| 202 aesimc v0.16b,v0.16b |
| 203 aesimc v1.16b,v1.16b |
| 204 st1 {v0.4s},[x0],x4 |
| 205 st1 {v1.4s},[x2],#16 |
| 206 cmp x0,x2 |
| 207 b.hi .Loop_imc /* continue while the upper pointer is still above the lower one */ |
| 208 |
| 209 ld1 {v0.4s},[x2] /* the middle key is transformed where it stands */ |
| 210 aesimc v0.16b,v0.16b |
| 211 st1 {v0.4s},[x0] |
| 212 |
| 213 eor x0,x0,x0 // return value |
| 214 .Ldec_key_abort: |
| 215 ldp x29,x30,[sp],#16 |
| 216 ret |
| 217 .size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key |
| 218 .globl aes_v8_encrypt /* Encrypts a single 16-byte block. */ |
| 219 .type aes_v8_encrypt,%function /* In: x0 = input block, x1 = output block, x2 = key schedule with the round count at offset 240. */ |
| 220 .align 5 |
| 221 aes_v8_encrypt: |
| 222 ld1 {v2.16b},[x0] /* v2 = state */ |
| 223 ldr w3,[x2,#240] /* w3 = round count; read before x2 starts moving */ |
| 224 ld1 {v0.4s},[x2],#16 /* v0 = even-numbered round key */ |
| 225 ld1 {v1.4s},[x2],#16 /* v1 = odd-numbered round key */ |
| 226 sub w3,w3,#2 |
| 227 |
| 228 .Lenc_round_pair: /* two rounds per pass, fetching the next two keys as it goes */ |
| 229 aese v2.16b,v0.16b |
| 230 aesmc v2.16b,v2.16b |
| 231 ld1 {v0.4s},[x2],#16 |
| 232 aese v2.16b,v1.16b |
| 233 aesmc v2.16b,v2.16b |
| 234 ld1 {v1.4s},[x2],#16 |
| 235 subs w3,w3,#2 |
| 236 b.gt .Lenc_round_pair |
| 237 |
| 238 aese v2.16b,v0.16b |
| 239 aesmc v2.16b,v2.16b |
| 240 aese v2.16b,v1.16b /* final round has no aesmc */ |
| 241 ld1 {v0.4s},[x2] /* last round key */ |
| 242 eor v2.16b,v2.16b,v0.16b |
| 243 |
| 244 st1 {v2.16b},[x1] |
| 245 ret |
| 246 .size aes_v8_encrypt,.-aes_v8_encrypt |
| 247 .globl aes_v8_decrypt /* Decrypts a single 16-byte block. */ |
| 248 .type aes_v8_decrypt,%function /* In: x0 = input block, x1 = output block, x2 = key schedule with the round count at offset 240. */ |
| 249 .align 5 |
| 250 aes_v8_decrypt: |
| 251 ld1 {v2.16b},[x0] /* v2 = state */ |
| 252 ldr w3,[x2,#240] /* w3 = round count; read before x2 starts moving */ |
| 253 ld1 {v0.4s},[x2],#16 /* v0 = even-numbered round key */ |
| 254 ld1 {v1.4s},[x2],#16 /* v1 = odd-numbered round key */ |
| 255 sub w3,w3,#2 |
| 256 |
| 257 .Ldec_round_pair: /* two rounds per pass, fetching the next two keys as it goes */ |
| 258 aesd v2.16b,v0.16b |
| 259 aesimc v2.16b,v2.16b |
| 260 ld1 {v0.4s},[x2],#16 |
| 261 aesd v2.16b,v1.16b |
| 262 aesimc v2.16b,v2.16b |
| 263 ld1 {v1.4s},[x2],#16 |
| 264 subs w3,w3,#2 |
| 265 b.gt .Ldec_round_pair |
| 266 |
| 267 aesd v2.16b,v0.16b |
| 268 aesimc v2.16b,v2.16b |
| 269 aesd v2.16b,v1.16b /* final round has no aesimc */ |
| 270 ld1 {v0.4s},[x2] /* last round key */ |
| 271 eor v2.16b,v2.16b,v0.16b |
| 272 |
| 273 st1 {v2.16b},[x1] |
| 274 ret |
| 275 .size aes_v8_decrypt,.-aes_v8_decrypt |
| 276 .globl aes_v8_cbc_encrypt /* CBC-mode encryption or decryption over whole 16-byte blocks. */ |
| 277 .type aes_v8_cbc_encrypt,%function /* In: x0 = input, x1 = output, x2 = length in bytes, x3 = key schedule, x4 = 16-byte IV (read, then overwritten), w5 = 0 to decrypt, non-zero to encrypt. */ |
| 278 .align 5 /* A length below 16 returns at once without touching output or IV; bytes beyond the last whole block are ignored. */ |
| 279 aes_v8_cbc_encrypt: |
| 280 stp x29,x30,[sp,#-16]! |
| 281 add x29,sp,#0 |
| 282 subs x2,x2,#16 /* x2 = length - 16 */ |
| 283 mov x8,#16 /* x8 = post-increment step for input loads */ |
| 284 b.lo .Lcbc_abort /* fewer than 16 bytes */ |
| 285 csel x8,xzr,x8,eq /* exactly one block: step becomes 0, so x0 stays on that block */ |
| 286 |
| 287 cmp w5,#0 // en- or decrypting? (flags are consumed by b.eq .Lcbc_dec further down) |
| 288 ldr w5,[x3,#240] /* w5 is reused from here on: round count from the key schedule */ |
| 289 and x2,x2,#-16 /* drop any partial trailing block */ |
| 290 ld1 {v6.16b},[x4] /* v6 = IV */ |
| 291 ld1 {v0.16b},[x0],x8 /* v0 = first input block */ |
| 292 |
| 293 ld1 {v16.4s-v17.4s},[x3] // load key schedule... |
| 294 sub w5,w5,#6 |
| 295 add x7,x3,x5,lsl#4 // pointer to last 7 round keys |
| 296 sub w5,w5,#2 /* w5 = round count - 8 */ |
| 297 ld1 {v18.4s-v19.4s},[x7],#32 |
| 298 ld1 {v20.4s-v21.4s},[x7],#32 |
| 299 ld1 {v22.4s-v23.4s},[x7],#32 |
| 300 ld1 {v7.4s},[x7] /* v7 = last round key */ |
| 301 |
| 302 add x7,x3,#32 /* x7 = address of the third round key */ |
| 303 mov w6,w5 /* w6 = inner-loop counter */ |
| 304 b.eq .Lcbc_dec /* taken when the w5 argument was 0 */ |
| 305 |
| 306 cmp w5,#2 /* round count - 8 == 2, i.e. a 10-round schedule */ |
| 307 eor v0.16b,v0.16b,v6.16b /* first block ^= IV */ |
| 308 eor v5.16b,v16.16b,v7.16b /* v5 = first round key ^ last round key */ |
| 309 b.eq .Lcbc_enc128 |
| 310 |
| 311 .Loop_cbc_enc: |
| 312 aese v0.16b,v16.16b |
| 313 ld1 {v16.4s},[x7],#16 |
| 314 aesmc v0.16b,v0.16b |
| 315 subs w6,w6,#2 |
| 316 aese v0.16b,v17.16b |
| 317 ld1 {v17.4s},[x7],#16 |
| 318 aesmc v0.16b,v0.16b |
| 319 b.gt .Loop_cbc_enc |
| 320 |
| 321 aese v0.16b,v16.16b |
| 322 aesmc v0.16b,v0.16b |
| 323 subs x2,x2,#16 /* one block consumed; these flags feed the csel and the closing b.hs */ |
| 324 aese v0.16b,v17.16b |
| 325 aesmc v0.16b,v0.16b |
| 326 csel x8,xzr,x8,eq /* the block loaded below is the last one: stop advancing x0 */ |
| 327 aese v0.16b,v18.16b |
| 328 aesmc v0.16b,v0.16b |
| 329 add x7,x3,#16 /* x7 = address of the second round key */ |
| 330 aese v0.16b,v19.16b |
| 331 aesmc v0.16b,v0.16b |
| 332 ld1 {v16.16b},[x0],x8 /* v16 = next input block */ |
| 333 aese v0.16b,v20.16b |
| 334 aesmc v0.16b,v0.16b |
| 335 eor v16.16b,v16.16b,v5.16b /* v16 = next input ^ first key ^ last key; it serves as the first-round key operand of the next pass */ |
| 336 aese v0.16b,v21.16b |
| 337 aesmc v0.16b,v0.16b |
| 338 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] |
| 339 aese v0.16b,v22.16b |
| 340 aesmc v0.16b,v0.16b |
| 341 aese v0.16b,v23.16b |
| 342 |
| 343 mov w6,w5 |
| 344 eor v6.16b,v0.16b,v7.16b /* v6 = output block; v0 itself stays without the last key, which the v5 term in v16 supplies */ |
| 345 st1 {v6.16b},[x1],#16 |
| 346 b.hs .Loop_cbc_enc /* repeat while the subs x2 above did not borrow */ |
| 347 |
| 348 b .Lcbc_done |
| 349 |
| 350 .align 5 |
| 351 .Lcbc_enc128: |
| 352 ld1 {v2.4s-v3.4s},[x7] /* v2, v3 = third and fourth round keys (x7 = x3 + 32) */ |
| 353 aese v0.16b,v16.16b |
| 354 aesmc v0.16b,v0.16b |
| 355 b .Lenter_cbc_enc128 |
| 356 .Loop_cbc_enc128: |
| 357 aese v0.16b,v16.16b |
| 358 aesmc v0.16b,v0.16b |
| 359 st1 {v6.16b},[x1],#16 /* store the block completed on the previous pass */ |
| 360 .Lenter_cbc_enc128: |
| 361 aese v0.16b,v17.16b |
| 362 aesmc v0.16b,v0.16b |
| 363 subs x2,x2,#16 /* flags feed the csel and the closing b.hs */ |
| 364 aese v0.16b,v2.16b |
| 365 aesmc v0.16b,v0.16b |
| 366 csel x8,xzr,x8,eq |
| 367 aese v0.16b,v3.16b |
| 368 aesmc v0.16b,v0.16b |
| 369 aese v0.16b,v18.16b |
| 370 aesmc v0.16b,v0.16b |
| 371 aese v0.16b,v19.16b |
| 372 aesmc v0.16b,v0.16b |
| 373 ld1 {v16.16b},[x0],x8 /* v16 = next input block */ |
| 374 aese v0.16b,v20.16b |
| 375 aesmc v0.16b,v0.16b |
| 376 aese v0.16b,v21.16b |
| 377 aesmc v0.16b,v0.16b |
| 378 aese v0.16b,v22.16b |
| 379 aesmc v0.16b,v0.16b |
| 380 eor v16.16b,v16.16b,v5.16b /* next input ^ first key ^ last key */ |
| 381 aese v0.16b,v23.16b |
| 382 eor v6.16b,v0.16b,v7.16b /* v6 = output block */ |
| 383 b.hs .Loop_cbc_enc128 |
| 384 |
| 385 st1 {v6.16b},[x1],#16 /* final block */ |
| 386 b .Lcbc_done |
| 387 .align 5 |
| 388 .Lcbc_dec: |
| 389 ld1 {v18.16b},[x0],#16 /* next input block (the first one again when x8 was 0) */ |
| 390 subs x2,x2,#32 // bias |
| 391 add w6,w5,#2 /* one more double round in the loop than on the encrypt path: v18/v19 hold data here, not keys */ |
| 392 orr v3.16b,v0.16b,v0.16b /* orr vA,vB,vB is a plain register copy */ |
| 393 orr v1.16b,v0.16b,v0.16b |
| 394 orr v19.16b,v18.16b,v18.16b |
| 395 b.lo .Lcbc_dec_tail /* fewer than three blocks in total */ |
| 396 |
| 397 orr v1.16b,v18.16b,v18.16b |
| 398 ld1 {v18.16b},[x0],#16 |
| 399 orr v2.16b,v0.16b,v0.16b /* v2, v3, v19 keep the untouched ciphertext of the three blocks in flight */ |
| 400 orr v3.16b,v1.16b,v1.16b |
| 401 orr v19.16b,v18.16b,v18.16b |
| 402 |
| 403 .Loop3x_cbc_dec: |
| 404 aesd v0.16b,v16.16b |
| 405 aesd v1.16b,v16.16b |
| 406 aesd v18.16b,v16.16b |
| 407 ld1 {v16.4s},[x7],#16 |
| 408 aesimc v0.16b,v0.16b |
| 409 aesimc v1.16b,v1.16b |
| 410 aesimc v18.16b,v18.16b |
| 411 subs w6,w6,#2 |
| 412 aesd v0.16b,v17.16b |
| 413 aesd v1.16b,v17.16b |
| 414 aesd v18.16b,v17.16b |
| 415 ld1 {v17.4s},[x7],#16 |
| 416 aesimc v0.16b,v0.16b |
| 417 aesimc v1.16b,v1.16b |
| 418 aesimc v18.16b,v18.16b |
| 419 b.gt .Loop3x_cbc_dec |
| 420 |
| 421 aesd v0.16b,v16.16b |
| 422 aesd v1.16b,v16.16b |
| 423 aesd v18.16b,v16.16b |
| 424 eor v4.16b,v6.16b,v7.16b /* chaining value for block 0 (IV or previous ciphertext) ^ last key */ |
| 425 aesimc v0.16b,v0.16b |
| 426 aesimc v1.16b,v1.16b |
| 427 aesimc v18.16b,v18.16b |
| 428 eor v5.16b,v2.16b,v7.16b /* ciphertext 0 ^ last key, chaining value for block 1 */ |
| 429 aesd v0.16b,v17.16b |
| 430 aesd v1.16b,v17.16b |
| 431 aesd v18.16b,v17.16b |
| 432 eor v17.16b,v3.16b,v7.16b /* ciphertext 1 ^ last key, chaining value for block 2 */ |
| 433 subs x2,x2,#0x30 /* three blocks consumed; flags feed the csel and the closing b.hs */ |
| 434 aesimc v0.16b,v0.16b |
| 435 aesimc v1.16b,v1.16b |
| 436 aesimc v18.16b,v18.16b |
| 437 orr v6.16b,v19.16b,v19.16b /* v6 = ciphertext 2, the chaining value for the next batch */ |
| 438 csel x6,x2,x6,lo // x6, w6, is zero at this point |
| 439 aesd v0.16b,v20.16b |
| 440 aesd v1.16b,v20.16b |
| 441 aesd v18.16b,v20.16b |
| 442 add x0,x0,x6 // x0 is adjusted in such way that |
| 443 // at exit from the loop v1.16b-v18.16b |
| 444 // are loaded with last "words" |
| 445 aesimc v0.16b,v0.16b |
| 446 aesimc v1.16b,v1.16b |
| 447 aesimc v18.16b,v18.16b |
| 448 mov x7,x3 |
| 449 aesd v0.16b,v21.16b |
| 450 aesd v1.16b,v21.16b |
| 451 aesd v18.16b,v21.16b |
| 452 ld1 {v2.16b},[x0],#16 |
| 453 aesimc v0.16b,v0.16b |
| 454 aesimc v1.16b,v1.16b |
| 455 aesimc v18.16b,v18.16b |
| 456 ld1 {v3.16b},[x0],#16 |
| 457 aesd v0.16b,v22.16b |
| 458 aesd v1.16b,v22.16b |
| 459 aesd v18.16b,v22.16b |
| 460 ld1 {v19.16b},[x0],#16 |
| 461 aesimc v0.16b,v0.16b |
| 462 aesimc v1.16b,v1.16b |
| 463 aesimc v18.16b,v18.16b |
| 464 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] |
| 465 aesd v0.16b,v23.16b |
| 466 aesd v1.16b,v23.16b |
| 467 aesd v18.16b,v23.16b |
| 468 |
| 469 add w6,w5,#2 /* rearm the inner-loop counter */ |
| 470 eor v4.16b,v4.16b,v0.16b |
| 471 eor v5.16b,v5.16b,v1.16b |
| 472 eor v18.16b,v18.16b,v17.16b |
| 473 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] |
| 474 orr v0.16b,v2.16b,v2.16b |
| 475 st1 {v4.16b},[x1],#16 |
| 476 orr v1.16b,v3.16b,v3.16b |
| 477 st1 {v5.16b},[x1],#16 |
| 478 st1 {v18.16b},[x1],#16 |
| 479 orr v18.16b,v19.16b,v19.16b |
| 480 b.hs .Loop3x_cbc_dec /* repeat while the subs x2 above did not borrow */ |
| 481 |
| 482 cmn x2,#0x30 /* x2 == -48 means no blocks are left */ |
| 483 b.eq .Lcbc_done |
| 484 nop |
| 485 |
| 486 .Lcbc_dec_tail: /* one or two blocks left, carried in v1 and v18 */ |
| 487 aesd v1.16b,v16.16b |
| 488 aesd v18.16b,v16.16b |
| 489 ld1 {v16.4s},[x7],#16 |
| 490 aesimc v1.16b,v1.16b |
| 491 aesimc v18.16b,v18.16b |
| 492 subs w6,w6,#2 |
| 493 aesd v1.16b,v17.16b |
| 494 aesd v18.16b,v17.16b |
| 495 ld1 {v17.4s},[x7],#16 |
| 496 aesimc v1.16b,v1.16b |
| 497 aesimc v18.16b,v18.16b |
| 498 b.gt .Lcbc_dec_tail |
| 499 |
| 500 aesd v1.16b,v16.16b |
| 501 aesd v18.16b,v16.16b |
| 502 aesimc v1.16b,v1.16b |
| 503 aesimc v18.16b,v18.16b |
| 504 aesd v1.16b,v17.16b |
| 505 aesd v18.16b,v17.16b |
| 506 aesimc v1.16b,v1.16b |
| 507 aesimc v18.16b,v18.16b |
| 508 aesd v1.16b,v20.16b |
| 509 aesd v18.16b,v20.16b |
| 510 aesimc v1.16b,v1.16b |
| 511 aesimc v18.16b,v18.16b |
| 512 cmn x2,#0x20 /* x2 == -32 means exactly one block is left; tested by b.eq .Lcbc_dec_one */ |
| 513 aesd v1.16b,v21.16b |
| 514 aesd v18.16b,v21.16b |
| 515 aesimc v1.16b,v1.16b |
| 516 aesimc v18.16b,v18.16b |
| 517 eor v5.16b,v6.16b,v7.16b /* chaining value ^ last key */ |
| 518 aesd v1.16b,v22.16b |
| 519 aesd v18.16b,v22.16b |
| 520 aesimc v1.16b,v1.16b |
| 521 aesimc v18.16b,v18.16b |
| 522 eor v17.16b,v3.16b,v7.16b |
| 523 aesd v1.16b,v23.16b |
| 524 aesd v18.16b,v23.16b |
| 525 b.eq .Lcbc_dec_one |
| 526 eor v5.16b,v5.16b,v1.16b |
| 527 eor v17.16b,v17.16b,v18.16b |
| 528 orr v6.16b,v19.16b,v19.16b /* v6 = last ciphertext block, the IV to write back */ |
| 529 st1 {v5.16b},[x1],#16 |
| 530 st1 {v17.16b},[x1],#16 |
| 531 b .Lcbc_done |
| 532 |
| 533 .Lcbc_dec_one: |
| 534 eor v5.16b,v5.16b,v18.16b /* only the v18 lane carries a real block here */ |
| 535 orr v6.16b,v19.16b,v19.16b |
| 536 st1 {v5.16b},[x1],#16 |
| 537 |
| 538 .Lcbc_done: |
| 539 st1 {v6.16b},[x4] /* write the chaining value back over the caller's IV */ |
| 540 .Lcbc_abort: |
| 541 ldr x29,[sp],#16 /* pop the frame; x30 is never modified in this routine */ |
| 542 ret |
| 543 .size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt |
| 544 .globl aes_v8_ctr32_encrypt_blocks /* CTR mode with a 32-bit counter held in the last word of the counter block. */ |
| 545 .type aes_v8_ctr32_encrypt_blocks,%function /* In: x0 = input, x1 = output, x2 = number of 16-byte blocks, x3 = key schedule, x4 = 16-byte counter block (read only). */ |
| 546 .align 5 /* A block count of 0 returns immediately without reading or writing any data. */ |
| 547 aes_v8_ctr32_encrypt_blocks: |
| 548 stp x29,x30,[sp,#-16]! |
| 549 add x29,sp,#0 |
| 550 ldr w5,[x3,#240] /* w5 = round count from the key schedule */ |
| 551 cbz x2,.Lctr32_done /* zero blocks: without this guard the tail path below would still store two blocks to [x1] */ |
| 552 ldr w8, [x4, #12] /* w8 = last 32-bit word of the counter block */ |
| 553 ld1 {v0.4s},[x4] /* v0 = whole counter block */ |
| 554 |
| 555 ld1 {v16.4s-v17.4s},[x3] // load key schedule... |
| 556 sub w5,w5,#4 |
| 557 mov x12,#16 /* x12 = input step used by the tail path */ |
| 558 cmp x2,#2 /* flags feed csel x12 and b.ls .Lctr32_tail; nothing in between alters them */ |
| 559 add x7,x3,x5,lsl#4 // pointer to last 5 round keys |
| 560 sub w5,w5,#2 /* w5 = round count - 6 */ |
| 561 ld1 {v20.4s-v21.4s},[x7],#32 |
| 562 ld1 {v22.4s-v23.4s},[x7],#32 |
| 563 ld1 {v7.4s},[x7] /* v7 = last round key */ |
| 564 add x7,x3,#32 /* x7 = address of the third round key */ |
| 565 mov w6,w5 /* w6 = inner-loop counter */ |
| 566 csel x12,xzr,x12,lo /* a single block: step 0, so the tail's second load stays on the same block */ |
| 567 #ifndef __ARMEB__ |
| 568 rev w8, w8 |
| 569 #endif |
| 570 orr v1.16b,v0.16b,v0.16b /* orr vA,vB,vB is a plain register copy */ |
| 571 add w10, w8, #1 |
| 572 orr v18.16b,v0.16b,v0.16b |
| 573 add w8, w8, #2 |
| 574 orr v6.16b,v0.16b,v0.16b /* v6 = template from which later counter blocks are rebuilt */ |
| 575 rev w10, w10 |
| 576 mov v1.s[3],w10 /* v1 = counter block + 1 */ |
| 577 b.ls .Lctr32_tail /* one or two blocks */ |
| 578 rev w12, w8 /* w12 is plain scratch on this path */ |
| 579 sub x2,x2,#3 // bias |
| 580 mov v18.s[3],w12 /* v18 = counter block + 2 */ |
| 581 b .Loop3x_ctr32 |
| 582 |
| 583 .align 4 |
| 584 .Loop3x_ctr32: |
| 585 aese v0.16b,v16.16b |
| 586 aese v1.16b,v16.16b |
| 587 aese v18.16b,v16.16b |
| 588 ld1 {v16.4s},[x7],#16 |
| 589 aesmc v0.16b,v0.16b |
| 590 aesmc v1.16b,v1.16b |
| 591 aesmc v18.16b,v18.16b |
| 592 subs w6,w6,#2 |
| 593 aese v0.16b,v17.16b |
| 594 aese v1.16b,v17.16b |
| 595 aese v18.16b,v17.16b |
| 596 ld1 {v17.4s},[x7],#16 |
| 597 aesmc v0.16b,v0.16b |
| 598 aesmc v1.16b,v1.16b |
| 599 aesmc v18.16b,v18.16b |
| 600 b.gt .Loop3x_ctr32 |
| 601 |
| 602 aese v0.16b,v16.16b |
| 603 aese v1.16b,v16.16b |
| 604 aese v18.16b,v16.16b |
| 605 mov x7,x3 |
| 606 aesmc v4.16b,v0.16b /* the three states move to v4/v5/v17 so v0/v1/v18 can take the next counters */ |
| 607 ld1 {v2.16b},[x0],#16 |
| 608 aesmc v5.16b,v1.16b |
| 609 aesmc v18.16b,v18.16b |
| 610 orr v0.16b,v6.16b,v6.16b |
| 611 aese v4.16b,v17.16b |
| 612 ld1 {v3.16b},[x0],#16 |
| 613 aese v5.16b,v17.16b |
| 614 aese v18.16b,v17.16b |
| 615 orr v1.16b,v6.16b,v6.16b |
| 616 aesmc v4.16b,v4.16b |
| 617 ld1 {v19.16b},[x0],#16 |
| 618 aesmc v5.16b,v5.16b |
| 619 aesmc v17.16b,v18.16b |
| 620 orr v18.16b,v6.16b,v6.16b |
| 621 add w9,w8,#1 |
| 622 aese v4.16b,v20.16b |
| 623 aese v5.16b,v20.16b |
| 624 aese v17.16b,v20.16b |
| 625 eor v2.16b,v2.16b,v7.16b /* fold the last round key into the input blocks */ |
| 626 add w10,w8,#2 |
| 627 aesmc v4.16b,v4.16b |
| 628 aesmc v5.16b,v5.16b |
| 629 aesmc v17.16b,v17.16b |
| 630 eor v3.16b,v3.16b,v7.16b |
| 631 add w8,w8,#3 |
| 632 aese v4.16b,v21.16b |
| 633 aese v5.16b,v21.16b |
| 634 aese v17.16b,v21.16b |
| 635 eor v19.16b,v19.16b,v7.16b |
| 636 rev w9,w9 |
| 637 aesmc v4.16b,v4.16b |
| 638 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] |
| 639 aesmc v5.16b,v5.16b |
| 640 aesmc v17.16b,v17.16b |
| 641 mov v0.s[3], w9 |
| 642 rev w10,w10 |
| 643 aese v4.16b,v22.16b |
| 644 aese v5.16b,v22.16b |
| 645 aese v17.16b,v22.16b |
| 646 mov v1.s[3], w10 |
| 647 rev w12,w8 |
| 648 aesmc v4.16b,v4.16b |
| 649 aesmc v5.16b,v5.16b |
| 650 aesmc v17.16b,v17.16b |
| 651 mov v18.s[3], w12 |
| 652 subs x2,x2,#3 /* three blocks consumed; flags reach the closing b.hs untouched */ |
| 653 aese v4.16b,v23.16b |
| 654 aese v5.16b,v23.16b |
| 655 aese v17.16b,v23.16b |
| 656 |
| 657 mov w6,w5 /* rearm the inner-loop counter */ |
| 658 eor v2.16b,v2.16b,v4.16b |
| 659 eor v3.16b,v3.16b,v5.16b |
| 660 eor v19.16b,v19.16b,v17.16b |
| 661 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] |
| 662 st1 {v2.16b},[x1],#16 |
| 663 st1 {v3.16b},[x1],#16 |
| 664 st1 {v19.16b},[x1],#16 |
| 665 b.hs .Loop3x_ctr32 |
| 666 |
| 667 adds x2,x2,#3 /* remove the bias: x2 = blocks still to do (0, 1 or 2) */ |
| 668 b.eq .Lctr32_done |
| 669 cmp x2,#1 |
| 670 mov x12,#16 |
| 671 csel x12,xzr,x12,eq /* one block left: step 0 for the tail loads */ |
| 672 |
| 673 .Lctr32_tail: /* one or two blocks, using the counters already placed in v0 and v1 */ |
| 674 aese v0.16b,v16.16b |
| 675 aese v1.16b,v16.16b |
| 676 ld1 {v16.4s},[x7],#16 |
| 677 aesmc v0.16b,v0.16b |
| 678 aesmc v1.16b,v1.16b |
| 679 subs w6,w6,#2 |
| 680 aese v0.16b,v17.16b |
| 681 aese v1.16b,v17.16b |
| 682 ld1 {v17.4s},[x7],#16 |
| 683 aesmc v0.16b,v0.16b |
| 684 aesmc v1.16b,v1.16b |
| 685 b.gt .Lctr32_tail |
| 686 |
| 687 aese v0.16b,v16.16b |
| 688 aese v1.16b,v16.16b |
| 689 aesmc v0.16b,v0.16b |
| 690 aesmc v1.16b,v1.16b |
| 691 aese v0.16b,v17.16b |
| 692 aese v1.16b,v17.16b |
| 693 aesmc v0.16b,v0.16b |
| 694 aesmc v1.16b,v1.16b |
| 695 ld1 {v2.16b},[x0],x12 /* first remaining input block; x0 advances only when a second one exists */ |
| 696 aese v0.16b,v20.16b |
| 697 aese v1.16b,v20.16b |
| 698 ld1 {v3.16b},[x0] |
| 699 aesmc v0.16b,v0.16b |
| 700 aesmc v1.16b,v1.16b |
| 701 aese v0.16b,v21.16b |
| 702 aese v1.16b,v21.16b |
| 703 aesmc v0.16b,v0.16b |
| 704 aesmc v1.16b,v1.16b |
| 705 aese v0.16b,v22.16b |
| 706 aese v1.16b,v22.16b |
| 707 eor v2.16b,v2.16b,v7.16b |
| 708 aesmc v0.16b,v0.16b |
| 709 aesmc v1.16b,v1.16b |
| 710 eor v3.16b,v3.16b,v7.16b |
| 711 aese v0.16b,v23.16b |
| 712 aese v1.16b,v23.16b |
| 713 |
| 714 cmp x2,#1 |
| 715 eor v2.16b,v2.16b,v0.16b |
| 716 eor v3.16b,v3.16b,v1.16b |
| 717 st1 {v2.16b},[x1],#16 |
| 718 b.eq .Lctr32_done /* only one block was requested */ |
| 719 st1 {v3.16b},[x1] |
| 720 |
| 721 .Lctr32_done: |
| 722 ldr x29,[sp],#16 /* pop the frame; x30 is never modified in this routine */ |
| 723 ret |
| 724 .size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks |
| 725 #endif |
OLD | NEW |