OLD | NEW |
(Empty) | |
// AArch64 AES via the ARMv8 Crypto Extensions (aese/aesd/aesmc/aesimc).
// The whole file compiles to nothing unless targeting __aarch64__ with
// __ARM_MAX_ARCH__ >= 7.
| 1 #if defined(__aarch64__) |
| 2 #include "arm_arch.h" |
| 3 |
| 4 #if __ARM_MAX_ARCH__>=7 |
| 5 .text |
// clang's integrated assembler accepts crypto instructions without an
// explicit .arch; GNU as needs armv8-a+crypto enabled.
| 6 #if !defined(__clang__) |
| 7 .arch armv8-a+crypto |
| 8 #endif |
| 9 .align 5 |
// Key-expansion constants, loaded as three consecutive 16-byte vectors:
//   +0  : round constant 0x01 splatted across all four lanes
//   +16 : tbl byte-permutation mask implementing RotWord-and-splat
//   +32 : round constant 0x1b, used after the shl-by-1 doubling wraps
| 10 .Lrcon: |
| 11 .long 0x01,0x01,0x01,0x01 |
| 12 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat |
| 13 .long 0x1b,0x1b,0x1b,0x1b |
| 14 |
// int aes_v8_set_encrypt_key(const unsigned char *userKey /* x0 */,
//                            int bits /* w1 */, AES_KEY *key /* x2 */)
// Expands userKey into the round-key schedule at x2 and stores the round
// count (10/12/14, staged in w12) at byte offset 240 past the last write.
// Returns (in x0): 0 on success, -1 if userKey or key is NULL, -2 if bits
// is outside 128..256 or not a multiple of 64 (i.e. not 128/192/256).
// Uses the SubWord trick: aese with an all-zero round key performs
// ShiftRows+SubBytes only, and the tbl mask pre-rotates/splats the word
// so the result is SubWord(RotWord(w)) splatted across the vector.
| 15 .globl aes_v8_set_encrypt_key |
| 16 .type aes_v8_set_encrypt_key,%function |
| 17 .align 5 |
| 18 aes_v8_set_encrypt_key: |
// .Lenc_key is also reached via bl from aes_v8_set_decrypt_key.
| 19 .Lenc_key: |
| 20 stp x29,x30,[sp,#-16]! |
| 21 add x29,sp,#0 |
// Argument validation; x3 holds the prospective error code.
| 22 mov x3,#-1 |
| 23 cmp x0,#0 |
| 24 b.eq .Lenc_key_abort |
| 25 cmp x2,#0 |
| 26 b.eq .Lenc_key_abort |
| 27 mov x3,#-2 |
| 28 cmp w1,#128 |
| 29 b.lt .Lenc_key_abort |
| 30 cmp w1,#256 |
| 31 b.gt .Lenc_key_abort |
| 32 tst w1,#0x3f |
| 33 b.ne .Lenc_key_abort |
| 34 |
| 35 adr x3,.Lrcon |
// Flags from this compare select the 128/192/256 path below (b.lt/b.eq).
| 36 cmp w1,#192 |
| 37 |
// v0 = 0 (zero round key for the aese SubWord trick),
// v3 = first 16 key bytes, v1 = rcon, v2 = rotate-n-splat mask.
| 38 eor v0.16b,v0.16b,v0.16b |
| 39 ld1 {v3.16b},[x0],#16 |
| 40 mov w1,#8 // reuse w1 |
| 41 ld1 {v1.4s,v2.4s},[x3],#32 |
| 42 |
| 43 b.lt .Loop128 |
| 44 b.eq .L192 |
| 45 b .L256 |
| 46 |
| 47 .align 4 |
// 128-bit path: 8 loop iterations plus two unrolled tails produce
// rk[1..10]; rcon doubles via shl each step, reloading 0x1b after wrap.
| 48 .Loop128: |
| 49 tbl v6.16b,{v3.16b},v2.16b |
| 50 ext v5.16b,v0.16b,v3.16b,#12 |
| 51 st1 {v3.4s},[x2],#16 |
| 52 aese v6.16b,v0.16b |
| 53 subs w1,w1,#1 |
| 54 |
// Three ext/eor pairs implement the cumulative-XOR of the previous
// round key's words (w[i] ^= w[i-1] chain done in parallel).
| 55 eor v3.16b,v3.16b,v5.16b |
| 56 ext v5.16b,v0.16b,v5.16b,#12 |
| 57 eor v3.16b,v3.16b,v5.16b |
| 58 ext v5.16b,v0.16b,v5.16b,#12 |
| 59 eor v6.16b,v6.16b,v1.16b |
| 60 eor v3.16b,v3.16b,v5.16b |
| 61 shl v1.16b,v1.16b,#1 |
| 62 eor v3.16b,v3.16b,v6.16b |
| 63 b.ne .Loop128 |
| 64 |
// Reload rcon = 0x1b for rounds 9-10 (0x01 << 8 would have wrapped).
| 65 ld1 {v1.4s},[x3] |
| 66 |
| 67 tbl v6.16b,{v3.16b},v2.16b |
| 68 ext v5.16b,v0.16b,v3.16b,#12 |
| 69 st1 {v3.4s},[x2],#16 |
| 70 aese v6.16b,v0.16b |
| 71 |
| 72 eor v3.16b,v3.16b,v5.16b |
| 73 ext v5.16b,v0.16b,v5.16b,#12 |
| 74 eor v3.16b,v3.16b,v5.16b |
| 75 ext v5.16b,v0.16b,v5.16b,#12 |
| 76 eor v6.16b,v6.16b,v1.16b |
| 77 eor v3.16b,v3.16b,v5.16b |
| 78 shl v1.16b,v1.16b,#1 |
| 79 eor v3.16b,v3.16b,v6.16b |
| 80 |
| 81 tbl v6.16b,{v3.16b},v2.16b |
| 82 ext v5.16b,v0.16b,v3.16b,#12 |
| 83 st1 {v3.4s},[x2],#16 |
| 84 aese v6.16b,v0.16b |
| 85 |
| 86 eor v3.16b,v3.16b,v5.16b |
| 87 ext v5.16b,v0.16b,v5.16b,#12 |
| 88 eor v3.16b,v3.16b,v5.16b |
| 89 ext v5.16b,v0.16b,v5.16b,#12 |
| 90 eor v6.16b,v6.16b,v1.16b |
| 91 eor v3.16b,v3.16b,v5.16b |
| 92 eor v3.16b,v3.16b,v6.16b |
| 93 st1 {v3.4s},[x2] |
// Advance x2 to the "rounds" slot (offset 240 from schedule start).
| 94 add x2,x2,#0x50 |
| 95 |
| 96 mov w12,#10 |
| 97 b .Ldone |
| 98 |
| 99 .align 4 |
// 192-bit path: key schedule advances 24 bytes per iteration, so the
// tbl mask is shifted down by 8 to rotate the half-width word in v4.
| 100 .L192: |
| 101 ld1 {v4.8b},[x0],#8 |
| 102 movi v6.16b,#8 // borrow v6.16b |
| 103 st1 {v3.4s},[x2],#16 |
| 104 sub v2.16b,v2.16b,v6.16b // adjust the mask |
| 105 |
| 106 .Loop192: |
| 107 tbl v6.16b,{v4.16b},v2.16b |
| 108 ext v5.16b,v0.16b,v3.16b,#12 |
| 109 st1 {v4.8b},[x2],#8 |
| 110 aese v6.16b,v0.16b |
| 111 subs w1,w1,#1 |
| 112 |
| 113 eor v3.16b,v3.16b,v5.16b |
| 114 ext v5.16b,v0.16b,v5.16b,#12 |
| 115 eor v3.16b,v3.16b,v5.16b |
| 116 ext v5.16b,v0.16b,v5.16b,#12 |
| 117 eor v3.16b,v3.16b,v5.16b |
| 118 |
| 119 dup v5.4s,v3.s[3] |
| 120 eor v5.16b,v5.16b,v4.16b |
| 121 eor v6.16b,v6.16b,v1.16b |
| 122 ext v4.16b,v0.16b,v4.16b,#12 |
| 123 shl v1.16b,v1.16b,#1 |
| 124 eor v4.16b,v4.16b,v5.16b |
| 125 eor v3.16b,v3.16b,v6.16b |
| 126 eor v4.16b,v4.16b,v6.16b |
| 127 st1 {v3.4s},[x2],#16 |
| 128 b.ne .Loop192 |
| 129 |
| 130 mov w12,#12 |
| 131 add x2,x2,#0x20 |
| 132 b .Ldone |
| 133 |
| 134 .align 4 |
// 256-bit path: alternates the rcon step (on v3) with a plain
// SubWord step (on v4, no rotate — dup of lane 3 is "just splat").
| 135 .L256: |
| 136 ld1 {v4.16b},[x0] |
| 137 mov w1,#7 |
| 138 mov w12,#14 |
| 139 st1 {v3.4s},[x2],#16 |
| 140 |
| 141 .Loop256: |
| 142 tbl v6.16b,{v4.16b},v2.16b |
| 143 ext v5.16b,v0.16b,v3.16b,#12 |
| 144 st1 {v4.4s},[x2],#16 |
| 145 aese v6.16b,v0.16b |
| 146 subs w1,w1,#1 |
| 147 |
| 148 eor v3.16b,v3.16b,v5.16b |
| 149 ext v5.16b,v0.16b,v5.16b,#12 |
| 150 eor v3.16b,v3.16b,v5.16b |
| 151 ext v5.16b,v0.16b,v5.16b,#12 |
| 152 eor v6.16b,v6.16b,v1.16b |
| 153 eor v3.16b,v3.16b,v5.16b |
| 154 shl v1.16b,v1.16b,#1 |
| 155 eor v3.16b,v3.16b,v6.16b |
| 156 st1 {v3.4s},[x2],#16 |
| 157 b.eq .Ldone |
| 158 |
| 159 dup v6.4s,v3.s[3] // just splat |
| 160 ext v5.16b,v0.16b,v4.16b,#12 |
| 161 aese v6.16b,v0.16b |
| 162 |
| 163 eor v4.16b,v4.16b,v5.16b |
| 164 ext v5.16b,v0.16b,v5.16b,#12 |
| 165 eor v4.16b,v4.16b,v5.16b |
| 166 ext v5.16b,v0.16b,v5.16b,#12 |
| 167 eor v4.16b,v4.16b,v5.16b |
| 168 |
| 169 eor v4.16b,v4.16b,v6.16b |
| 170 b .Loop256 |
| 171 |
// Store round count at the end of the schedule; x3=0 signals success.
| 172 .Ldone: |
| 173 str w12,[x2] |
| 174 mov x3,#0 |
| 175 |
| 176 .Lenc_key_abort: |
| 177 mov x0,x3 // return value |
| 178 ldr x29,[sp],#16 |
| 179 ret |
| 180 .size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key |
| 181 |
| 181 |
// int aes_v8_set_decrypt_key(const unsigned char *userKey /* x0 */,
//                            int bits /* w1 */, AES_KEY *key /* x2 */)
// Builds the encryption schedule via .Lenc_key, then converts it in
// place for the equivalent-inverse-cipher: the schedule is reversed by
// swapping round keys from both ends (x2 walks up, x0 walks down by
// x4 = -16), applying aesimc (InvMixColumns) to every key except the
// outermost pair.  Returns 0 on success, else the .Lenc_key error code.
| 182 .globl aes_v8_set_decrypt_key |
| 183 .type aes_v8_set_decrypt_key,%function |
| 184 .align 5 |
| 185 aes_v8_set_decrypt_key: |
| 186 stp x29,x30,[sp,#-16]! |
| 187 add x29,sp,#0 |
| 188 bl .Lenc_key |
| 189 |
| 190 cmp x0,#0 |
| 191 b.ne .Ldec_key_abort |
| 192 |
// .Lenc_key left x2 at the rounds slot; step back to the schedule base.
// w12 still holds the round count, so x2 + (rounds<<4) is the last key.
| 193 sub x2,x2,#240 // restore original x2 |
| 194 mov x4,#-16 |
| 195 add x0,x2,x12,lsl#4 // end of key schedule |
| 196 |
// Swap first and last round keys without aesimc.
| 197 ld1 {v0.4s},[x2] |
| 198 ld1 {v1.4s},[x0] |
| 199 st1 {v0.4s},[x0],x4 |
| 200 st1 {v1.4s},[x2],#16 |
| 201 |
// Swap-and-InvMixColumns the interior keys, converging from both ends.
| 202 .Loop_imc: |
| 203 ld1 {v0.4s},[x2] |
| 204 ld1 {v1.4s},[x0] |
| 205 aesimc v0.16b,v0.16b |
| 206 aesimc v1.16b,v1.16b |
| 207 st1 {v0.4s},[x0],x4 |
| 208 st1 {v1.4s},[x2],#16 |
| 209 cmp x0,x2 |
| 210 b.hi .Loop_imc |
| 211 |
// Odd middle key: InvMixColumns in place.
| 212 ld1 {v0.4s},[x2] |
| 213 aesimc v0.16b,v0.16b |
| 214 st1 {v0.4s},[x0] |
| 215 |
| 216 eor x0,x0,x0 // return value |
| 217 .Ldec_key_abort: |
| 218 ldp x29,x30,[sp],#16 |
| 219 ret |
| 220 .size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key |
// void aes_v8_encrypt(const unsigned char *in /* x0 */,
//                     unsigned char *out /* x1 */, const AES_KEY *key /* x2 */)
// Encrypts one 16-byte block.  w3 = rounds, read from key+240; the loop
// consumes two round keys per iteration (w3 pre-biased by -2), and the
// final round (aese without aesmc) is folded into the trailing eor with
// the last round key.
| 221 .globl aes_v8_encrypt |
| 222 .type aes_v8_encrypt,%function |
| 223 .align 5 |
| 224 aes_v8_encrypt: |
| 225 ldr w3,[x2,#240] |
| 226 ld1 {v0.4s},[x2],#16 |
| 227 ld1 {v2.16b},[x0] |
| 228 sub w3,w3,#2 |
| 229 ld1 {v1.4s},[x2],#16 |
| 230 |
// Two rounds per iteration; next round keys are loaded in the shadow
// of the aese/aesmc pair.
| 231 .Loop_enc: |
| 232 aese v2.16b,v0.16b |
| 233 aesmc v2.16b,v2.16b |
| 234 ld1 {v0.4s},[x2],#16 |
| 235 subs w3,w3,#2 |
| 236 aese v2.16b,v1.16b |
| 237 aesmc v2.16b,v2.16b |
| 238 ld1 {v1.4s},[x2],#16 |
| 239 b.gt .Loop_enc |
| 240 |
// Penultimate round, then final round (no MixColumns) + AddRoundKey.
| 241 aese v2.16b,v0.16b |
| 242 aesmc v2.16b,v2.16b |
| 243 ld1 {v0.4s},[x2] |
| 244 aese v2.16b,v1.16b |
| 245 eor v2.16b,v2.16b,v0.16b |
| 246 |
| 247 st1 {v2.16b},[x1] |
| 248 ret |
| 249 .size aes_v8_encrypt,.-aes_v8_encrypt |
// void aes_v8_decrypt(const unsigned char *in /* x0 */,
//                     unsigned char *out /* x1 */, const AES_KEY *key /* x2 */)
// Decrypts one 16-byte block; mirror of aes_v8_encrypt using
// aesd/aesimc against a schedule prepared by aes_v8_set_decrypt_key.
// w3 = rounds from key+240, two rounds per loop iteration.
| 250 .globl aes_v8_decrypt |
| 251 .type aes_v8_decrypt,%function |
| 252 .align 5 |
| 253 aes_v8_decrypt: |
| 254 ldr w3,[x2,#240] |
| 255 ld1 {v0.4s},[x2],#16 |
| 256 ld1 {v2.16b},[x0] |
| 257 sub w3,w3,#2 |
| 258 ld1 {v1.4s},[x2],#16 |
| 259 |
| 260 .Loop_dec: |
| 261 aesd v2.16b,v0.16b |
| 262 aesimc v2.16b,v2.16b |
| 263 ld1 {v0.4s},[x2],#16 |
| 264 subs w3,w3,#2 |
| 265 aesd v2.16b,v1.16b |
| 266 aesimc v2.16b,v2.16b |
| 267 ld1 {v1.4s},[x2],#16 |
| 268 b.gt .Loop_dec |
| 269 |
// Final round: aesd without aesimc, then XOR with the last round key.
| 270 aesd v2.16b,v0.16b |
| 271 aesimc v2.16b,v2.16b |
| 272 ld1 {v0.4s},[x2] |
| 273 aesd v2.16b,v1.16b |
| 274 eor v2.16b,v2.16b,v0.16b |
| 275 |
| 276 st1 {v2.16b},[x1] |
| 277 ret |
| 278 .size aes_v8_decrypt,.-aes_v8_decrypt |
// void aes_v8_cbc_encrypt(const unsigned char *in /* x0 */,
//                         unsigned char *out /* x1 */, size_t len /* x2 */,
//                         const AES_KEY *key /* x3 */,
//                         unsigned char *ivec /* x4 */, int enc /* w5 */)
// CBC mode: encrypts when w5 != 0, decrypts otherwise.  len is rounded
// down to a multiple of 16; len < 16 aborts without touching the IV.
// The updated IV is written back to [x4] on exit.  Encryption is
// serial (one block per iteration, chained through v6); decryption is
// software-pipelined three blocks at a time with a 1-2 block tail.
// Register roles: v6 = running IV/chain value, v7 = last round key,
// v16-v23 = pre-loaded round keys, x8/x12-style csel tricks stop input
// loads from running past the end of the buffer on the last block.
| 279 .globl aes_v8_cbc_encrypt |
| 280 .type aes_v8_cbc_encrypt,%function |
| 281 .align 5 |
| 282 aes_v8_cbc_encrypt: |
| 283 stp x29,x30,[sp,#-16]! |
| 284 add x29,sp,#0 |
| 285 subs x2,x2,#16 |
| 286 mov x8,#16 |
| 287 b.lo .Lcbc_abort |
// x8 becomes 0 when exactly one block remains, freezing the input ptr.
| 288 csel x8,xzr,x8,eq |
| 289 |
| 290 cmp w5,#0 // en- or decrypting? |
| 291 ldr w5,[x3,#240] |
| 292 and x2,x2,#-16 |
| 293 ld1 {v6.16b},[x4] |
| 294 ld1 {v0.16b},[x0],x8 |
| 295 |
| 296 ld1 {v16.4s,v17.4s},[x3] // load key schedule... |
| 297 sub w5,w5,#6 |
| 298 add x7,x3,x5,lsl#4 // pointer to last 7 round keys |
| 299 sub w5,w5,#2 |
| 300 ld1 {v18.4s,v19.4s},[x7],#32 |
| 301 ld1 {v20.4s,v21.4s},[x7],#32 |
| 302 ld1 {v22.4s,v23.4s},[x7],#32 |
| 303 ld1 {v7.4s},[x7] |
| 304 |
| 305 add x7,x3,#32 |
| 306 mov w6,w5 |
| 307 b.eq .Lcbc_dec |
| 308 |
// --- encrypt path.  v5 = rndkey[0]^rndkey[last] lets the next input
// block be pre-whitened while the current one is still in flight.
| 309 cmp w5,#2 |
| 310 eor v0.16b,v0.16b,v6.16b |
| 311 eor v5.16b,v16.16b,v7.16b |
| 312 b.eq .Lcbc_enc128 |
| 313 |
// 192/256-bit keys: stash pointers to rounds 2..7 so mid-loop reloads
// need no address arithmetic.
| 314 ld1 {v2.4s,v3.4s},[x7] |
| 315 add x7,x3,#16 |
| 316 add x6,x3,#16*4 |
| 317 add x12,x3,#16*5 |
| 318 aese v0.16b,v16.16b |
| 319 aesmc v0.16b,v0.16b |
| 320 add x14,x3,#16*6 |
| 321 add x3,x3,#16*7 |
| 322 b .Lenter_cbc_enc |
| 323 |
| 324 .align 4 |
| 325 .Loop_cbc_enc: |
| 326 aese v0.16b,v16.16b |
| 327 aesmc v0.16b,v0.16b |
// Previous ciphertext (v6) is stored one iteration late to overlap
// the store with this block's rounds.
| 328 st1 {v6.16b},[x1],#16 |
| 329 .Lenter_cbc_enc: |
| 330 aese v0.16b,v17.16b |
| 331 aesmc v0.16b,v0.16b |
| 332 aese v0.16b,v2.16b |
| 333 aesmc v0.16b,v0.16b |
| 334 ld1 {v16.4s},[x6] |
| 335 cmp w5,#4 |
| 336 aese v0.16b,v3.16b |
| 337 aesmc v0.16b,v0.16b |
| 338 ld1 {v17.4s},[x12] |
| 339 b.eq .Lcbc_enc192 |
| 340 |
// 256-bit only: two extra rounds.
| 341 aese v0.16b,v16.16b |
| 342 aesmc v0.16b,v0.16b |
| 343 ld1 {v16.4s},[x14] |
| 344 aese v0.16b,v17.16b |
| 345 aesmc v0.16b,v0.16b |
| 346 ld1 {v17.4s},[x3] |
| 347 nop |
| 348 |
| 349 .Lcbc_enc192: |
| 350 aese v0.16b,v16.16b |
| 351 aesmc v0.16b,v0.16b |
| 352 subs x2,x2,#16 |
| 353 aese v0.16b,v17.16b |
| 354 aesmc v0.16b,v0.16b |
| 355 csel x8,xzr,x8,eq |
| 356 aese v0.16b,v18.16b |
| 357 aesmc v0.16b,v0.16b |
| 358 aese v0.16b,v19.16b |
| 359 aesmc v0.16b,v0.16b |
// Load and pre-whiten the next plaintext block mid-rounds.
| 360 ld1 {v16.16b},[x0],x8 |
| 361 aese v0.16b,v20.16b |
| 362 aesmc v0.16b,v0.16b |
| 363 eor v16.16b,v16.16b,v5.16b |
| 364 aese v0.16b,v21.16b |
| 365 aesmc v0.16b,v0.16b |
| 366 ld1 {v17.4s},[x7] // re-pre-load rndkey[1] |
| 367 aese v0.16b,v22.16b |
| 368 aesmc v0.16b,v0.16b |
| 369 aese v0.16b,v23.16b |
| 370 eor v6.16b,v0.16b,v7.16b |
| 371 b.hs .Loop_cbc_enc |
| 372 |
| 373 st1 {v6.16b},[x1],#16 |
| 374 b .Lcbc_done |
| 375 |
| 376 .align 5 |
// Dedicated 128-bit (10-round) encrypt loop with all keys in registers.
| 377 .Lcbc_enc128: |
| 378 ld1 {v2.4s,v3.4s},[x7] |
| 379 aese v0.16b,v16.16b |
| 380 aesmc v0.16b,v0.16b |
| 381 b .Lenter_cbc_enc128 |
| 382 .Loop_cbc_enc128: |
| 383 aese v0.16b,v16.16b |
| 384 aesmc v0.16b,v0.16b |
| 385 st1 {v6.16b},[x1],#16 |
| 386 .Lenter_cbc_enc128: |
| 387 aese v0.16b,v17.16b |
| 388 aesmc v0.16b,v0.16b |
| 389 subs x2,x2,#16 |
| 390 aese v0.16b,v2.16b |
| 391 aesmc v0.16b,v0.16b |
| 392 csel x8,xzr,x8,eq |
| 393 aese v0.16b,v3.16b |
| 394 aesmc v0.16b,v0.16b |
| 395 aese v0.16b,v18.16b |
| 396 aesmc v0.16b,v0.16b |
| 397 aese v0.16b,v19.16b |
| 398 aesmc v0.16b,v0.16b |
| 399 ld1 {v16.16b},[x0],x8 |
| 400 aese v0.16b,v20.16b |
| 401 aesmc v0.16b,v0.16b |
| 402 aese v0.16b,v21.16b |
| 403 aesmc v0.16b,v0.16b |
| 404 aese v0.16b,v22.16b |
| 405 aesmc v0.16b,v0.16b |
| 406 eor v16.16b,v16.16b,v5.16b |
| 407 aese v0.16b,v23.16b |
| 408 eor v6.16b,v0.16b,v7.16b |
| 409 b.hs .Loop_cbc_enc128 |
| 410 |
| 411 st1 {v6.16b},[x1],#16 |
| 412 b .Lcbc_done |
| 413 .align 5 |
// --- decrypt path: 3-block pipeline.  v0/v1/v18 are the blocks in
// flight, v2/v3/v19 keep the raw ciphertext (next chain values).
| 414 .Lcbc_dec: |
| 415 ld1 {v18.16b},[x0],#16 |
| 416 subs x2,x2,#32 // bias |
| 417 add w6,w5,#2 |
| 418 orr v3.16b,v0.16b,v0.16b |
| 419 orr v1.16b,v0.16b,v0.16b |
| 420 orr v19.16b,v18.16b,v18.16b |
| 421 b.lo .Lcbc_dec_tail |
| 422 |
| 423 orr v1.16b,v18.16b,v18.16b |
| 424 ld1 {v18.16b},[x0],#16 |
| 425 orr v2.16b,v0.16b,v0.16b |
| 426 orr v3.16b,v1.16b,v1.16b |
| 427 orr v19.16b,v18.16b,v18.16b |
| 428 |
// Inner rounds, two at a time, for all three blocks in parallel.
| 429 .Loop3x_cbc_dec: |
| 430 aesd v0.16b,v16.16b |
| 431 aesimc v0.16b,v0.16b |
| 432 aesd v1.16b,v16.16b |
| 433 aesimc v1.16b,v1.16b |
| 434 aesd v18.16b,v16.16b |
| 435 aesimc v18.16b,v18.16b |
| 436 ld1 {v16.4s},[x7],#16 |
| 437 subs w6,w6,#2 |
| 438 aesd v0.16b,v17.16b |
| 439 aesimc v0.16b,v0.16b |
| 440 aesd v1.16b,v17.16b |
| 441 aesimc v1.16b,v1.16b |
| 442 aesd v18.16b,v17.16b |
| 443 aesimc v18.16b,v18.16b |
| 444 ld1 {v17.4s},[x7],#16 |
| 445 b.gt .Loop3x_cbc_dec |
| 446 |
// Last three rounds interleaved with preparing the CBC chain XORs
// (v4/v5/v17 = previous ciphertexts ^ last round key) and loading the
// next three ciphertext blocks.
| 447 aesd v0.16b,v16.16b |
| 448 aesimc v0.16b,v0.16b |
| 449 aesd v1.16b,v16.16b |
| 450 aesimc v1.16b,v1.16b |
| 451 aesd v18.16b,v16.16b |
| 452 aesimc v18.16b,v18.16b |
| 453 eor v4.16b,v6.16b,v7.16b |
| 454 subs x2,x2,#0x30 |
| 455 eor v5.16b,v2.16b,v7.16b |
| 456 csel x6,x2,x6,lo // x6, w6, is zero at this point |
| 457 aesd v0.16b,v17.16b |
| 458 aesimc v0.16b,v0.16b |
| 459 aesd v1.16b,v17.16b |
| 460 aesimc v1.16b,v1.16b |
| 461 aesd v18.16b,v17.16b |
| 462 aesimc v18.16b,v18.16b |
| 463 eor v17.16b,v3.16b,v7.16b |
| 464 add x0,x0,x6 // x0 is adjusted in such way that |
| 465 // at exit from the loop v1.16b-v18.16b |
| 466 // are loaded with last "words" |
| 467 orr v6.16b,v19.16b,v19.16b |
| 468 mov x7,x3 |
| 469 aesd v0.16b,v20.16b |
| 470 aesimc v0.16b,v0.16b |
| 471 aesd v1.16b,v20.16b |
| 472 aesimc v1.16b,v1.16b |
| 473 aesd v18.16b,v20.16b |
| 474 aesimc v18.16b,v18.16b |
| 475 ld1 {v2.16b},[x0],#16 |
| 476 aesd v0.16b,v21.16b |
| 477 aesimc v0.16b,v0.16b |
| 478 aesd v1.16b,v21.16b |
| 479 aesimc v1.16b,v1.16b |
| 480 aesd v18.16b,v21.16b |
| 481 aesimc v18.16b,v18.16b |
| 482 ld1 {v3.16b},[x0],#16 |
| 483 aesd v0.16b,v22.16b |
| 484 aesimc v0.16b,v0.16b |
| 485 aesd v1.16b,v22.16b |
| 486 aesimc v1.16b,v1.16b |
| 487 aesd v18.16b,v22.16b |
| 488 aesimc v18.16b,v18.16b |
| 489 ld1 {v19.16b},[x0],#16 |
| 490 aesd v0.16b,v23.16b |
| 491 aesd v1.16b,v23.16b |
| 492 aesd v18.16b,v23.16b |
| 493 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] |
| 494 add w6,w5,#2 |
// Final AddRoundKey+chain XOR, store plaintext, rotate pipeline.
| 495 eor v4.16b,v4.16b,v0.16b |
| 496 eor v5.16b,v5.16b,v1.16b |
| 497 eor v18.16b,v18.16b,v17.16b |
| 498 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] |
| 499 st1 {v4.16b},[x1],#16 |
| 500 orr v0.16b,v2.16b,v2.16b |
| 501 st1 {v5.16b},[x1],#16 |
| 502 orr v1.16b,v3.16b,v3.16b |
| 503 st1 {v18.16b},[x1],#16 |
| 504 orr v18.16b,v19.16b,v19.16b |
| 505 b.hs .Loop3x_cbc_dec |
| 506 |
// cmn undoes the -0x30 bias: equal means no leftover blocks.
| 507 cmn x2,#0x30 |
| 508 b.eq .Lcbc_done |
| 509 nop |
| 510 |
// Tail: one or two blocks remain in v1/v18 (v0 lane idle).
| 511 .Lcbc_dec_tail: |
| 512 aesd v1.16b,v16.16b |
| 513 aesimc v1.16b,v1.16b |
| 514 aesd v18.16b,v16.16b |
| 515 aesimc v18.16b,v18.16b |
| 516 ld1 {v16.4s},[x7],#16 |
| 517 subs w6,w6,#2 |
| 518 aesd v1.16b,v17.16b |
| 519 aesimc v1.16b,v1.16b |
| 520 aesd v18.16b,v17.16b |
| 521 aesimc v18.16b,v18.16b |
| 522 ld1 {v17.4s},[x7],#16 |
| 523 b.gt .Lcbc_dec_tail |
| 524 |
| 525 aesd v1.16b,v16.16b |
| 526 aesimc v1.16b,v1.16b |
| 527 aesd v18.16b,v16.16b |
| 528 aesimc v18.16b,v18.16b |
| 529 aesd v1.16b,v17.16b |
| 530 aesimc v1.16b,v1.16b |
| 531 aesd v18.16b,v17.16b |
| 532 aesimc v18.16b,v18.16b |
| 533 aesd v1.16b,v20.16b |
| 534 aesimc v1.16b,v1.16b |
| 535 aesd v18.16b,v20.16b |
| 536 aesimc v18.16b,v18.16b |
| 537 cmn x2,#0x20 |
| 538 aesd v1.16b,v21.16b |
| 539 aesimc v1.16b,v1.16b |
| 540 aesd v18.16b,v21.16b |
| 541 aesimc v18.16b,v18.16b |
| 542 eor v5.16b,v6.16b,v7.16b |
| 543 aesd v1.16b,v22.16b |
| 544 aesimc v1.16b,v1.16b |
| 545 aesd v18.16b,v22.16b |
| 546 aesimc v18.16b,v18.16b |
| 547 eor v17.16b,v3.16b,v7.16b |
| 548 aesd v1.16b,v23.16b |
| 549 aesd v18.16b,v23.16b |
| 550 b.eq .Lcbc_dec_one |
| 551 eor v5.16b,v5.16b,v1.16b |
| 552 eor v17.16b,v17.16b,v18.16b |
| 553 orr v6.16b,v19.16b,v19.16b |
| 554 st1 {v5.16b},[x1],#16 |
| 555 st1 {v17.16b},[x1],#16 |
| 556 b .Lcbc_done |
| 557 |
| 558 .Lcbc_dec_one: |
| 559 eor v5.16b,v5.16b,v18.16b |
| 560 orr v6.16b,v19.16b,v19.16b |
| 561 st1 {v5.16b},[x1],#16 |
| 562 |
// Write updated IV back for the caller before returning.
| 563 .Lcbc_done: |
| 564 st1 {v6.16b},[x4] |
| 565 .Lcbc_abort: |
| 566 ldr x29,[sp],#16 |
| 567 ret |
| 568 .size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt |
// void aes_v8_ctr32_encrypt_blocks(const unsigned char *in /* x0 */,
//                                  unsigned char *out /* x1 */,
//                                  size_t blocks /* x2 */,
//                                  const AES_KEY *key /* x3 */,
//                                  const unsigned char ivec[16] /* x4 */)
// CTR mode with a 32-bit big-endian counter in the last word of ivec
// (byte-reversed via rev unless building __ARMEB__).  Encrypts x2
// whole blocks, three per iteration of the pipelined main loop, with a
// 1-2 block tail.  v0/v1/v18 carry the counter blocks in flight, v6 the
// counter template, v7 the last round key; w8 tracks the counter value.
// NOTE(review): counter arithmetic here is 32-bit (w8/w9/w10) — wrap
// behavior beyond 2^32 blocks follows from that, as in the C fallback.
| 569 .globl aes_v8_ctr32_encrypt_blocks |
| 570 .type aes_v8_ctr32_encrypt_blocks,%function |
| 571 .align 5 |
| 572 aes_v8_ctr32_encrypt_blocks: |
| 573 stp x29,x30,[sp,#-16]! |
| 574 add x29,sp,#0 |
| 575 ldr w5,[x3,#240] |
| 576 |
| 577 ldr w8, [x4, #12] |
| 578 ld1 {v0.4s},[x4] |
| 579 |
| 580 ld1 {v16.4s,v17.4s},[x3] // load key schedule... |
| 581 sub w5,w5,#4 |
| 582 mov x12,#16 |
| 583 cmp x2,#2 |
| 584 add x7,x3,x5,lsl#4 // pointer to last 5 round keys |
| 585 sub w5,w5,#2 |
| 586 ld1 {v20.4s,v21.4s},[x7],#32 |
| 587 ld1 {v22.4s,v23.4s},[x7],#32 |
| 588 ld1 {v7.4s},[x7] |
| 589 add x7,x3,#32 |
| 590 mov w6,w5 |
// x12 = input stride for the tail's first load; 0 if only one block.
| 591 csel x12,xzr,x12,lo |
| 592 #ifndef __ARMEB__ |
| 593 rev w8, w8 |
| 594 #endif |
// Materialize counter+1 and counter+2 in v1/v18 for the 3-wide loop.
| 595 orr v1.16b,v0.16b,v0.16b |
| 596 add w10, w8, #1 |
| 597 orr v18.16b,v0.16b,v0.16b |
| 598 add w8, w8, #2 |
| 599 orr v6.16b,v0.16b,v0.16b |
| 600 rev w10, w10 |
| 601 mov v1.s[3],w10 |
| 602 b.ls .Lctr32_tail |
| 603 rev w12, w8 |
| 604 sub x2,x2,#3 // bias |
| 605 mov v18.s[3],w12 |
| 606 b .Loop3x_ctr32 |
| 607 |
| 608 .align 4 |
// Inner rounds, two at a time, three counter blocks in parallel.
| 609 .Loop3x_ctr32: |
| 610 aese v0.16b,v16.16b |
| 611 aesmc v0.16b,v0.16b |
| 612 aese v1.16b,v16.16b |
| 613 aesmc v1.16b,v1.16b |
| 614 aese v18.16b,v16.16b |
| 615 aesmc v18.16b,v18.16b |
| 616 ld1 {v16.4s},[x7],#16 |
| 617 subs w6,w6,#2 |
| 618 aese v0.16b,v17.16b |
| 619 aesmc v0.16b,v0.16b |
| 620 aese v1.16b,v17.16b |
| 621 aesmc v1.16b,v1.16b |
| 622 aese v18.16b,v17.16b |
| 623 aesmc v18.16b,v18.16b |
| 624 ld1 {v17.4s},[x7],#16 |
| 625 b.gt .Loop3x_ctr32 |
| 626 |
// Final rounds: keystream moves to v4/v5/v17 so v0/v1/v18 can be
// refilled with the next three counter values while input is loaded
// and pre-XORed with the last round key (v7).
| 627 aese v0.16b,v16.16b |
| 628 aesmc v4.16b,v0.16b |
| 629 aese v1.16b,v16.16b |
| 630 aesmc v5.16b,v1.16b |
| 631 ld1 {v2.16b},[x0],#16 |
| 632 orr v0.16b,v6.16b,v6.16b |
| 633 aese v18.16b,v16.16b |
| 634 aesmc v18.16b,v18.16b |
| 635 ld1 {v3.16b},[x0],#16 |
| 636 orr v1.16b,v6.16b,v6.16b |
| 637 aese v4.16b,v17.16b |
| 638 aesmc v4.16b,v4.16b |
| 639 aese v5.16b,v17.16b |
| 640 aesmc v5.16b,v5.16b |
| 641 ld1 {v19.16b},[x0],#16 |
| 642 mov x7,x3 |
| 643 aese v18.16b,v17.16b |
| 644 aesmc v17.16b,v18.16b |
| 645 orr v18.16b,v6.16b,v6.16b |
| 646 add w9,w8,#1 |
| 647 aese v4.16b,v20.16b |
| 648 aesmc v4.16b,v4.16b |
| 649 aese v5.16b,v20.16b |
| 650 aesmc v5.16b,v5.16b |
| 651 eor v2.16b,v2.16b,v7.16b |
| 652 add w10,w8,#2 |
| 653 aese v17.16b,v20.16b |
| 654 aesmc v17.16b,v17.16b |
| 655 eor v3.16b,v3.16b,v7.16b |
| 656 add w8,w8,#3 |
| 657 aese v4.16b,v21.16b |
| 658 aesmc v4.16b,v4.16b |
| 659 aese v5.16b,v21.16b |
| 660 aesmc v5.16b,v5.16b |
| 661 eor v19.16b,v19.16b,v7.16b |
| 662 rev w9,w9 |
| 663 aese v17.16b,v21.16b |
| 664 aesmc v17.16b,v17.16b |
| 665 mov v0.s[3], w9 |
| 666 rev w10,w10 |
| 667 aese v4.16b,v22.16b |
| 668 aesmc v4.16b,v4.16b |
| 669 aese v5.16b,v22.16b |
| 670 aesmc v5.16b,v5.16b |
| 671 mov v1.s[3], w10 |
| 672 rev w12,w8 |
| 673 aese v17.16b,v22.16b |
| 674 aesmc v17.16b,v17.16b |
| 675 mov v18.s[3], w12 |
| 676 subs x2,x2,#3 |
| 677 aese v4.16b,v23.16b |
| 678 aese v5.16b,v23.16b |
| 679 aese v17.16b,v23.16b |
| 680 |
// XOR keystream into the (pre-whitened) input and store.
| 681 eor v2.16b,v2.16b,v4.16b |
| 682 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] |
| 683 st1 {v2.16b},[x1],#16 |
| 684 eor v3.16b,v3.16b,v5.16b |
| 685 mov w6,w5 |
| 686 st1 {v3.16b},[x1],#16 |
| 687 eor v19.16b,v19.16b,v17.16b |
| 688 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] |
| 689 st1 {v19.16b},[x1],#16 |
| 690 b.hs .Loop3x_ctr32 |
| 691 |
// Undo the -3 bias; 0 means done, otherwise 1-2 blocks for the tail.
| 692 adds x2,x2,#3 |
| 693 b.eq .Lctr32_done |
| 694 cmp x2,#1 |
| 695 mov x12,#16 |
| 696 csel x12,xzr,x12,eq |
| 697 |
// Tail: run v0 (and v1 if two blocks) through the remaining rounds.
| 698 .Lctr32_tail: |
| 699 aese v0.16b,v16.16b |
| 700 aesmc v0.16b,v0.16b |
| 701 aese v1.16b,v16.16b |
| 702 aesmc v1.16b,v1.16b |
| 703 ld1 {v16.4s},[x7],#16 |
| 704 subs w6,w6,#2 |
| 705 aese v0.16b,v17.16b |
| 706 aesmc v0.16b,v0.16b |
| 707 aese v1.16b,v17.16b |
| 708 aesmc v1.16b,v1.16b |
| 709 ld1 {v17.4s},[x7],#16 |
| 710 b.gt .Lctr32_tail |
| 711 |
| 712 aese v0.16b,v16.16b |
| 713 aesmc v0.16b,v0.16b |
| 714 aese v1.16b,v16.16b |
| 715 aesmc v1.16b,v1.16b |
| 716 aese v0.16b,v17.16b |
| 717 aesmc v0.16b,v0.16b |
| 718 aese v1.16b,v17.16b |
| 719 aesmc v1.16b,v1.16b |
// x12 stride is 0 for a single block so both loads read the same data;
// only the first result is stored in that case.
| 720 ld1 {v2.16b},[x0],x12 |
| 721 aese v0.16b,v20.16b |
| 722 aesmc v0.16b,v0.16b |
| 723 aese v1.16b,v20.16b |
| 724 aesmc v1.16b,v1.16b |
| 725 ld1 {v3.16b},[x0] |
| 726 aese v0.16b,v21.16b |
| 727 aesmc v0.16b,v0.16b |
| 728 aese v1.16b,v21.16b |
| 729 aesmc v1.16b,v1.16b |
| 730 eor v2.16b,v2.16b,v7.16b |
| 731 aese v0.16b,v22.16b |
| 732 aesmc v0.16b,v0.16b |
| 733 aese v1.16b,v22.16b |
| 734 aesmc v1.16b,v1.16b |
| 735 eor v3.16b,v3.16b,v7.16b |
| 736 aese v0.16b,v23.16b |
| 737 aese v1.16b,v23.16b |
| 738 |
| 739 cmp x2,#1 |
| 740 eor v2.16b,v2.16b,v0.16b |
| 741 eor v3.16b,v3.16b,v1.16b |
| 742 st1 {v2.16b},[x1],#16 |
| 743 b.eq .Lctr32_done |
| 744 st1 {v3.16b},[x1] |
| 745 |
| 746 .Lctr32_done: |
| 747 ldr x29,[sp],#16 |
| 748 ret |
| 749 .size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks |
| 750 #endif |
| 751 #endif |
OLD | NEW |