#!/usr/bin/env perl

######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

######################################################################
# September 2011.
#
# Interface to OpenSSL as "almost" drop-in replacement for
# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original nor does it make assumption
# about its alignment...
#
# Performance summary. aes-x86_64.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86_64.pl column -
# [also large-block CBC] encrypt/decrypt.
#
#               aes-x86_64.pl           vpaes-x86_64.pl
#
# Core 2(**)    30.5/43.7/14.3          21.8/25.7(***)
# Nehalem       30.5/42.2/14.6           9.8/11.8
# Atom          63.9/79.0/32.1          64.0/84.8(***)
#
# (*)   "Hyper-threading" in the context refers rather to cache shared
#       among multiple cores, than to specifically Intel HTT. As vast
#       majority of contemporary cores share cache, slower code path
#       is common place. In other words "with-hyper-threading-off"
#       results are presented mostly for reference purposes.
#
# (**)  "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) Less impressive improvement on Core 2 and Atom is due to slow
#       pshufb, yet it's respectable +40%/78% improvement on Core 2
#       (as implied, over "hyper-threading-safe" code path).
#
#                                               <appro@openssl.org>
# Command-line interface shared by all perlasm modules: first argument is
# the assembler "flavour" (elf, macosx, mingw64, nasm, ...), second the
# output file. A single argument containing a dot is treated as the output
# file with no flavour.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Windows ABI needs a different prologue/epilogue (non-volatile xmm6-15).
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator next to this script or in the
# perlasm directory two levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe all generated code through the translator; fail loudly if the
# pipe cannot be started instead of silently emitting nothing.
open OUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
*STDOUT=*OUT;

$PREFIX="vpaes";
# Emit _vpaes_encrypt_core: the SSSE3 AES encryption core. Operates on
# %xmm0 in place, expects the constants preloaded in %xmm9-%xmm15 (see
# _vpaes_preheat) and the key schedule pointer in %rdx.
$code.=<<___;
.text

##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm9-%xmm15 as in _vpaes_preheat
##    (%rdx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
##  Preserves %xmm6 - %xmm8 so you get some local vectors
##
##
.type   _vpaes_encrypt_core,\@abi-omnipotent
.align 16
_vpaes_encrypt_core:
        mov     %rdx,   %r9
        mov     \$16,   %r11
        mov     240(%rdx),%eax
        movdqa  %xmm9,  %xmm1
        movdqa  .Lk_ipt(%rip), %xmm2    # iptlo
        pandn   %xmm0,  %xmm1
        movdqu  (%r9),  %xmm5           # round0 key
        psrld   \$4,    %xmm1
        pand    %xmm9,  %xmm0
        pshufb  %xmm0,  %xmm2
        movdqa  .Lk_ipt+16(%rip), %xmm0 # ipthi
        pshufb  %xmm1,  %xmm0
        pxor    %xmm5,  %xmm2
        pxor    %xmm2,  %xmm0
        add     \$16,   %r9
        lea     .Lk_mc_backward(%rip),%r10
        jmp     .Lenc_entry

.align 16
.Lenc_loop:
        # middle of middle round
        movdqa  %xmm13, %xmm4   # 4 : sb1u
        pshufb  %xmm2,  %xmm4   # 4 = sb1u
        pxor    %xmm5,  %xmm4   # 4 = sb1u + k
        movdqa  %xmm12, %xmm0   # 0 : sb1t
        pshufb  %xmm3,  %xmm0   # 0 = sb1t
        pxor    %xmm4,  %xmm0   # 0 = A
        movdqa  %xmm15, %xmm5   # 4 : sb2u
        pshufb  %xmm2,  %xmm5   # 4 = sb2u
        movdqa  -0x40(%r11,%r10), %xmm1         # .Lk_mc_forward[]
        movdqa  %xmm14, %xmm2   # 2 : sb2t
        pshufb  %xmm3,  %xmm2   # 2 = sb2t
        pxor    %xmm5,  %xmm2   # 2 = 2A
        movdqa  (%r11,%r10), %xmm4              # .Lk_mc_backward[]
        movdqa  %xmm0,  %xmm3   # 3 = A
        pshufb  %xmm1,  %xmm0   # 0 = B
        add     \$16,   %r9     # next key
        pxor    %xmm2,  %xmm0   # 0 = 2A+B
        pshufb  %xmm4,  %xmm3   # 3 = D
        add     \$16,   %r11    # next mc
        pxor    %xmm0,  %xmm3   # 3 = 2A+B+D
        pshufb  %xmm1,  %xmm0   # 0 = 2B+C
        and     \$0x30, %r11    # ... mod 4
        pxor    %xmm3,  %xmm0   # 0 = 2A+3B+C+D
        sub     \$1,%rax        # nr--

.Lenc_entry:
        # top of round
        movdqa  %xmm9,  %xmm1   # 1 : i
        pandn   %xmm0,  %xmm1   # 1 = i<<4
        psrld   \$4,    %xmm1   # 1 = i
        pand    %xmm9,  %xmm0   # 0 = k
        movdqa  %xmm11, %xmm5   # 2 : a/k
        pshufb  %xmm0,  %xmm5   # 2 = a/k
        pxor    %xmm1,  %xmm0   # 0 = j
        movdqa  %xmm10, %xmm3   # 3 : 1/i
        pshufb  %xmm1,  %xmm3   # 3 = 1/i
        pxor    %xmm5,  %xmm3   # 3 = iak = 1/i + a/k
        movdqa  %xmm10, %xmm4   # 4 : 1/j
        pshufb  %xmm0,  %xmm4   # 4 = 1/j
        pxor    %xmm5,  %xmm4   # 4 = jak = 1/j + a/k
        movdqa  %xmm10, %xmm2   # 2 : 1/iak
        pshufb  %xmm3,  %xmm2   # 2 = 1/iak
        pxor    %xmm0,  %xmm2   # 2 = io
        movdqa  %xmm10, %xmm3   # 3 : 1/jak
        movdqu  (%r9),  %xmm5
        pshufb  %xmm4,  %xmm3   # 3 = 1/jak
        pxor    %xmm1,  %xmm3   # 3 = jo
        jnz     .Lenc_loop

        # middle of last round
        movdqa  -0x60(%r10), %xmm4      # 3 : sbou      .Lk_sbo
        movdqa  -0x50(%r10), %xmm0      # 0 : sbot      .Lk_sbo+16
        pshufb  %xmm2,  %xmm4   # 4 = sbou
        pxor    %xmm5,  %xmm4   # 4 = sb1u + k
        pshufb  %xmm3,  %xmm0   # 0 = sb1t
        movdqa  0x40(%r11,%r10), %xmm1          # .Lk_sr[]
        pxor    %xmm4,  %xmm0   # 0 = A
        pshufb  %xmm1,  %xmm0
        ret
.size   _vpaes_encrypt_core,.-_vpaes_encrypt_core
___
# Emit _vpaes_decrypt_core: AES decryption core, same register contract
# as _vpaes_encrypt_core (data in %xmm0, constants in %xmm9-%xmm15,
# scheduled keys at (%rdx)).
$code.=<<___;

##
##  Decryption core
##
##  Same API as encryption core.
##
.type   _vpaes_decrypt_core,\@abi-omnipotent
.align  16
_vpaes_decrypt_core:
        mov     %rdx,   %r9             # load key
        mov     240(%rdx),%eax
        movdqa  %xmm9,  %xmm1
        movdqa  .Lk_dipt(%rip), %xmm2   # iptlo
        pandn   %xmm0,  %xmm1
        mov     %rax,   %r11
        psrld   \$4,    %xmm1
        movdqu  (%r9),  %xmm5           # round0 key
        shl     \$4,    %r11
        pand    %xmm9,  %xmm0
        pshufb  %xmm0,  %xmm2
        movdqa  .Lk_dipt+16(%rip), %xmm0 # ipthi
        xor     \$0x30, %r11
        lea     .Lk_dsbd(%rip),%r10
        pshufb  %xmm1,  %xmm0
        and     \$0x30, %r11
        pxor    %xmm5,  %xmm2
        movdqa  .Lk_mc_forward+48(%rip), %xmm5
        pxor    %xmm2,  %xmm0
        add     \$16,   %r9
        add     %r10,   %r11
        jmp     .Ldec_entry

.align 16
.Ldec_loop:
##
##  Inverse mix columns
##
        movdqa  -0x20(%r10),%xmm4       # 4 : sb9u
        pshufb  %xmm2,  %xmm4           # 4 = sb9u
        pxor    %xmm0,  %xmm4
        movdqa  -0x10(%r10),%xmm0       # 0 : sb9t
        pshufb  %xmm3,  %xmm0           # 0 = sb9t
        pxor    %xmm4,  %xmm0           # 0 = ch
        add     \$16, %r9               # next round key

        pshufb  %xmm5,  %xmm0           # MC ch
        movdqa  0x00(%r10),%xmm4        # 4 : sbdu
        pshufb  %xmm2,  %xmm4           # 4 = sbdu
        pxor    %xmm0,  %xmm4           # 4 = ch
        movdqa  0x10(%r10),%xmm0        # 0 : sbdt
        pshufb  %xmm3,  %xmm0           # 0 = sbdt
        pxor    %xmm4,  %xmm0           # 0 = ch
        sub     \$1,%rax                # nr--

        pshufb  %xmm5,  %xmm0           # MC ch
        movdqa  0x20(%r10),%xmm4        # 4 : sbbu
        pshufb  %xmm2,  %xmm4           # 4 = sbbu
        pxor    %xmm0,  %xmm4           # 4 = ch
        movdqa  0x30(%r10),%xmm0        # 0 : sbbt
        pshufb  %xmm3,  %xmm0           # 0 = sbbt
        pxor    %xmm4,  %xmm0           # 0 = ch

        pshufb  %xmm5,  %xmm0           # MC ch
        movdqa  0x40(%r10),%xmm4        # 4 : sbeu
        pshufb  %xmm2,  %xmm4           # 4 = sbeu
        pxor    %xmm0,  %xmm4           # 4 = ch
        movdqa  0x50(%r10),%xmm0        # 0 : sbet
        pshufb  %xmm3,  %xmm0           # 0 = sbet
        pxor    %xmm4,  %xmm0           # 0 = ch

        palignr \$12,   %xmm5,  %xmm5

.Ldec_entry:
        # top of round
        movdqa  %xmm9,  %xmm1   # 1 : i
        pandn   %xmm0,  %xmm1   # 1 = i<<4
        psrld   \$4,    %xmm1   # 1 = i
        pand    %xmm9,  %xmm0   # 0 = k
        movdqa  %xmm11, %xmm2   # 2 : a/k
        pshufb  %xmm0,  %xmm2   # 2 = a/k
        pxor    %xmm1,  %xmm0   # 0 = j
        movdqa  %xmm10, %xmm3   # 3 : 1/i
        pshufb  %xmm1,  %xmm3   # 3 = 1/i
        pxor    %xmm2,  %xmm3   # 3 = iak = 1/i + a/k
        movdqa  %xmm10, %xmm4   # 4 : 1/j
        pshufb  %xmm0,  %xmm4   # 4 = 1/j
        pxor    %xmm2,  %xmm4   # 4 = jak = 1/j + a/k
        movdqa  %xmm10, %xmm2   # 2 : 1/iak
        pshufb  %xmm3,  %xmm2   # 2 = 1/iak
        pxor    %xmm0,  %xmm2   # 2 = io
        movdqa  %xmm10, %xmm3   # 3 : 1/jak
        pshufb  %xmm4,  %xmm3   # 3 = 1/jak
        pxor    %xmm1,  %xmm3   # 3 = jo
        movdqu  (%r9),  %xmm0
        jnz     .Ldec_loop

        # middle of last round
        movdqa  0x60(%r10), %xmm4       # 3 : sbou
        pshufb  %xmm2,  %xmm4   # 4 = sbou
        pxor    %xmm0,  %xmm4   # 4 = sb1u + k
        movdqa  0x70(%r10), %xmm0       # 0 : sbot
        movdqa  -0x160(%r11), %xmm2     # .Lk_sr-.Lk_dsbd=-0x160
        pshufb  %xmm3,  %xmm0   # 0 = sb1t
        pxor    %xmm4,  %xmm0   # 0 = A
        pshufb  %xmm2,  %xmm0
        ret
.size   _vpaes_decrypt_core,.-_vpaes_decrypt_core
___
# Emit _vpaes_schedule_core: key-schedule driver. Dispatches on key size
# (128/192/256 bits in %rsi) and direction (%rcx: 0=encrypt, 1=decrypt),
# writing round keys to (%rdx) via the _vpaes_schedule_* helpers.
$code.=<<___;

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
.type   _vpaes_schedule_core,\@abi-omnipotent
.align  16
_vpaes_schedule_core:
        # rdi = key
        # rsi = size in bits
        # rdx = buffer
        # rcx = direction.  0=encrypt, 1=decrypt

        call    _vpaes_preheat          # load the tables
        movdqa  .Lk_rcon(%rip), %xmm8   # load rcon
        movdqu  (%rdi), %xmm0           # load key (unaligned)

        # input transform
        movdqa  %xmm0,  %xmm3
        lea     .Lk_ipt(%rip), %r11
        call    _vpaes_schedule_transform
        movdqa  %xmm0,  %xmm7

        lea     .Lk_sr(%rip),%r10
        test    %rcx,   %rcx
        jnz     .Lschedule_am_decrypting

        # encrypting, output zeroth round key after transform
        movdqu  %xmm0,  (%rdx)
        jmp     .Lschedule_go

.Lschedule_am_decrypting:
        # decrypting, output zeroth round key after shiftrows
        movdqa  (%r8,%r10),%xmm1
        pshufb  %xmm1,  %xmm3
        movdqu  %xmm3,  (%rdx)
        xor     \$0x30, %r8

.Lschedule_go:
        cmp     \$192,  %esi
        ja      .Lschedule_256
        je      .Lschedule_192
        # 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
.Lschedule_128:
        mov     \$10, %esi

.Loop_schedule_128:
        call    _vpaes_schedule_round
        dec     %rsi
        jz      .Lschedule_mangle_last
        call    _vpaes_schedule_mangle  # write output
        jmp     .Loop_schedule_128

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
.align  16
.Lschedule_192:
        movdqu  8(%rdi),%xmm0           # load key part 2 (very unaligned)
        call    _vpaes_schedule_transform       # input transform
        movdqa  %xmm0,  %xmm6           # save short part
        pxor    %xmm4,  %xmm4           # clear 4
        movhlps %xmm4,  %xmm6           # clobber low side with zeros
        mov     \$4,    %esi

.Loop_schedule_192:
        call    _vpaes_schedule_round
        palignr \$8,%xmm6,%xmm0
        call    _vpaes_schedule_mangle  # save key n
        call    _vpaes_schedule_192_smear
        call    _vpaes_schedule_mangle  # save key n+1
        call    _vpaes_schedule_round
        dec     %rsi
        jz      .Lschedule_mangle_last
        call    _vpaes_schedule_mangle  # save key n+2
        call    _vpaes_schedule_192_smear
        jmp     .Loop_schedule_192

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
.align  16
.Lschedule_256:
        movdqu  16(%rdi),%xmm0          # load key part 2 (unaligned)
        call    _vpaes_schedule_transform       # input transform
        mov     \$7, %esi

.Loop_schedule_256:
        call    _vpaes_schedule_mangle  # output low result
        movdqa  %xmm0,  %xmm6           # save cur_lo in xmm6

        # high round
        call    _vpaes_schedule_round
        dec     %rsi
        jz      .Lschedule_mangle_last
        call    _vpaes_schedule_mangle

        # low round. swap xmm7 and xmm6
        pshufd  \$0xFF, %xmm0,  %xmm0
        movdqa  %xmm7,  %xmm5
        movdqa  %xmm6,  %xmm7
        call    _vpaes_schedule_low_round
        movdqa  %xmm5,  %xmm7

        jmp     .Loop_schedule_256


##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
.align  16
.Lschedule_mangle_last:
        # schedule last round key from xmm0
        lea     .Lk_deskew(%rip),%r11   # prepare to deskew
        test    %rcx,   %rcx
        jnz     .Lschedule_mangle_last_dec

        # encrypting
        movdqa  (%r8,%r10),%xmm1
        pshufb  %xmm1,  %xmm0           # output permute
        lea     .Lk_opt(%rip),  %r11    # prepare to output transform
        add     \$32,   %rdx

.Lschedule_mangle_last_dec:
        add     \$-16,  %rdx
        pxor    .Lk_s63(%rip),  %xmm0
        call    _vpaes_schedule_transform # output transform
        movdqu  %xmm0,  (%rdx)          # save last key

        # cleanup
        pxor    %xmm0,  %xmm0
        pxor    %xmm1,  %xmm1
        pxor    %xmm2,  %xmm2
        pxor    %xmm3,  %xmm3
        pxor    %xmm4,  %xmm4
        pxor    %xmm5,  %xmm5
        pxor    %xmm6,  %xmm6
        pxor    %xmm7,  %xmm7
        ret
.size   _vpaes_schedule_core,.-_vpaes_schedule_core
___
# Emit _vpaes_schedule_192_smear: smears the short low side (%xmm6) in
# the 192-bit schedule; result also copied to %xmm0.
$code.=<<___;

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6:  low side, d  c  0  0
##    %xmm13: 0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
.type   _vpaes_schedule_192_smear,\@abi-omnipotent
.align  16
_vpaes_schedule_192_smear:
        pshufd  \$0x80, %xmm6,  %xmm0   # d c 0 0 -> c 0 0 0
        pxor    %xmm0,  %xmm6           # -> c+d c 0 0
        pshufd  \$0xFE, %xmm7,  %xmm0   # b a _ _ -> b b b a
        pxor    %xmm0,  %xmm6           # -> b+c+d b+c b a
        movdqa  %xmm6,  %xmm0
        pxor    %xmm1,  %xmm1
        movhlps %xmm1,  %xmm6           # clobber low side with zeros
        ret
.size   _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
___
# Emit _vpaes_schedule_round (and its _vpaes_schedule_low_round entry,
# which skips rcon addition and rotation): one main key-schedule round
# on %xmm0/%xmm7.
$code.=<<___;

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm4, %r11.
##
.type   _vpaes_schedule_round,\@abi-omnipotent
.align  16
_vpaes_schedule_round:
        # extract rcon from xmm8
        pxor    %xmm1,  %xmm1
        palignr \$15,   %xmm8,  %xmm1
        palignr \$15,   %xmm8,  %xmm8
        pxor    %xmm1,  %xmm7

        # rotate
        pshufd  \$0xFF, %xmm0,  %xmm0
        palignr \$1,    %xmm0,  %xmm0

        # fall through...

        # low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
        # smear xmm7
        movdqa  %xmm7,  %xmm1
        pslldq  \$4,    %xmm7
        pxor    %xmm1,  %xmm7
        movdqa  %xmm7,  %xmm1
        pslldq  \$8,    %xmm7
        pxor    %xmm1,  %xmm7
        pxor    .Lk_s63(%rip), %xmm7

        # subbytes
        movdqa  %xmm9,  %xmm1
        pandn   %xmm0,  %xmm1
        psrld   \$4,    %xmm1           # 1 = i
        pand    %xmm9,  %xmm0           # 0 = k
        movdqa  %xmm11, %xmm2           # 2 : a/k
        pshufb  %xmm0,  %xmm2           # 2 = a/k
        pxor    %xmm1,  %xmm0           # 0 = j
        movdqa  %xmm10, %xmm3           # 3 : 1/i
        pshufb  %xmm1,  %xmm3           # 3 = 1/i
        pxor    %xmm2,  %xmm3           # 3 = iak = 1/i + a/k
        movdqa  %xmm10, %xmm4           # 4 : 1/j
        pshufb  %xmm0,  %xmm4           # 4 = 1/j
        pxor    %xmm2,  %xmm4           # 4 = jak = 1/j + a/k
        movdqa  %xmm10, %xmm2           # 2 : 1/iak
        pshufb  %xmm3,  %xmm2           # 2 = 1/iak
        pxor    %xmm0,  %xmm2           # 2 = io
        movdqa  %xmm10, %xmm3           # 3 : 1/jak
        pshufb  %xmm4,  %xmm3           # 3 = 1/jak
        pxor    %xmm1,  %xmm3           # 3 = jo
        movdqa  %xmm13, %xmm4           # 4 : sbou
        pshufb  %xmm2,  %xmm4           # 4 = sbou
        movdqa  %xmm12, %xmm0           # 0 : sbot
        pshufb  %xmm3,  %xmm0           # 0 = sb1t
        pxor    %xmm4,  %xmm0           # 0 = sbox output

        # add in smeared stuff
        pxor    %xmm7,  %xmm0
        movdqa  %xmm0,  %xmm7
        ret
.size   _vpaes_schedule_round,.-_vpaes_schedule_round
___
# Emit _vpaes_schedule_transform: linear-transform %xmm0 via the lo/hi
# nibble lookup tables at (%r11) / 16(%r11).
$code.=<<___;

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%r11)
##
##  Requires that %xmm9 = 0x0F0F... as in preheat
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
.type   _vpaes_schedule_transform,\@abi-omnipotent
.align  16
_vpaes_schedule_transform:
        movdqa  %xmm9,  %xmm1
        pandn   %xmm0,  %xmm1
        psrld   \$4,    %xmm1
        pand    %xmm9,  %xmm0
        movdqa  (%r11), %xmm2   # lo
        pshufb  %xmm0,  %xmm2
        movdqa  16(%r11), %xmm0 # hi
        pshufb  %xmm1,  %xmm0
        pxor    %xmm2,  %xmm0
        ret
.size   _vpaes_schedule_transform,.-_vpaes_schedule_transform
___
# Emit _vpaes_schedule_mangle: converts a standard round key in %xmm0 to
# the vpaes in-memory form and writes it to (%rdx), tracking the round
# number mod 4 in %r8.
$code.=<<___;

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%rdx), and increments or decrements it
##  Keeps track of round number mod 4 in %r8
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##
.type   _vpaes_schedule_mangle,\@abi-omnipotent
.align  16
_vpaes_schedule_mangle:
        movdqa  %xmm0,  %xmm4   # save xmm0 for later
        movdqa  .Lk_mc_forward(%rip),%xmm5
        test    %rcx,   %rcx
        jnz     .Lschedule_mangle_dec

        # encrypting
        add     \$16,   %rdx
        pxor    .Lk_s63(%rip),%xmm4
        pshufb  %xmm5,  %xmm4
        movdqa  %xmm4,  %xmm3
        pshufb  %xmm5,  %xmm4
        pxor    %xmm4,  %xmm3
        pshufb  %xmm5,  %xmm4
        pxor    %xmm4,  %xmm3

        jmp     .Lschedule_mangle_both
.align  16
.Lschedule_mangle_dec:
        # inverse mix columns
        lea     .Lk_dksd(%rip),%r11
        movdqa  %xmm9,  %xmm1
        pandn   %xmm4,  %xmm1
        psrld   \$4,    %xmm1   # 1 = hi
        pand    %xmm9,  %xmm4   # 4 = lo

        movdqa  0x00(%r11), %xmm2
        pshufb  %xmm4,  %xmm2
        movdqa  0x10(%r11), %xmm3
        pshufb  %xmm1,  %xmm3
        pxor    %xmm2,  %xmm3
        pshufb  %xmm5,  %xmm3

        movdqa  0x20(%r11), %xmm2
        pshufb  %xmm4,  %xmm2
        pxor    %xmm3,  %xmm2
        movdqa  0x30(%r11), %xmm3
        pshufb  %xmm1,  %xmm3
        pxor    %xmm2,  %xmm3
        pshufb  %xmm5,  %xmm3

        movdqa  0x40(%r11), %xmm2
        pshufb  %xmm4,  %xmm2
        pxor    %xmm3,  %xmm2
        movdqa  0x50(%r11), %xmm3
        pshufb  %xmm1,  %xmm3
        pxor    %xmm2,  %xmm3
        pshufb  %xmm5,  %xmm3

        movdqa  0x60(%r11), %xmm2
        pshufb  %xmm4,  %xmm2
        pxor    %xmm3,  %xmm2
        movdqa  0x70(%r11), %xmm3
        pshufb  %xmm1,  %xmm3
        pxor    %xmm2,  %xmm3

        add     \$-16,  %rdx

.Lschedule_mangle_both:
        movdqa  (%r8,%r10),%xmm1
        pshufb  %xmm1,%xmm3
        add     \$-16,  %r8
        and     \$0x30, %r8
        movdqu  %xmm3,  (%rdx)
        ret
.size   _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
___
# Emit the public OpenSSL entry points: ${PREFIX}_set_encrypt_key,
# ${PREFIX}_set_decrypt_key and the start of ${PREFIX}_encrypt. On win64
# the non-volatile %xmm6-%xmm15 registers are spilled/restored around the
# call into the schedule core.
$code.=<<___;

#
# Interface to OpenSSL
#
.globl  ${PREFIX}_set_encrypt_key
.type   ${PREFIX}_set_encrypt_key,\@function,3
.align  16
${PREFIX}_set_encrypt_key:
___
$code.=<<___ if ($win64);
        lea     -0xb8(%rsp),%rsp
        movaps  %xmm6,0x10(%rsp)
        movaps  %xmm7,0x20(%rsp)
        movaps  %xmm8,0x30(%rsp)
        movaps  %xmm9,0x40(%rsp)
        movaps  %xmm10,0x50(%rsp)
        movaps  %xmm11,0x60(%rsp)
        movaps  %xmm12,0x70(%rsp)
        movaps  %xmm13,0x80(%rsp)
        movaps  %xmm14,0x90(%rsp)
        movaps  %xmm15,0xa0(%rsp)
.Lenc_key_body:
___
$code.=<<___;
        mov     %esi,%eax
        shr     \$5,%eax
        add     \$5,%eax
        mov     %eax,240(%rdx)  # AES_KEY->rounds = nbits/32+5;

        mov     \$0,%ecx
        mov     \$0x30,%r8d
        call    _vpaes_schedule_core
___
$code.=<<___ if ($win64);
        movaps  0x10(%rsp),%xmm6
        movaps  0x20(%rsp),%xmm7
        movaps  0x30(%rsp),%xmm8
        movaps  0x40(%rsp),%xmm9
        movaps  0x50(%rsp),%xmm10
        movaps  0x60(%rsp),%xmm11
        movaps  0x70(%rsp),%xmm12
        movaps  0x80(%rsp),%xmm13
        movaps  0x90(%rsp),%xmm14
        movaps  0xa0(%rsp),%xmm15
        lea     0xb8(%rsp),%rsp
.Lenc_key_epilogue:
___
$code.=<<___;
        xor     %eax,%eax
        ret
.size   ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key

.globl  ${PREFIX}_set_decrypt_key
.type   ${PREFIX}_set_decrypt_key,\@function,3
.align  16
${PREFIX}_set_decrypt_key:
___
$code.=<<___ if ($win64);
        lea     -0xb8(%rsp),%rsp
        movaps  %xmm6,0x10(%rsp)
        movaps  %xmm7,0x20(%rsp)
        movaps  %xmm8,0x30(%rsp)
        movaps  %xmm9,0x40(%rsp)
        movaps  %xmm10,0x50(%rsp)
        movaps  %xmm11,0x60(%rsp)
        movaps  %xmm12,0x70(%rsp)
        movaps  %xmm13,0x80(%rsp)
        movaps  %xmm14,0x90(%rsp)
        movaps  %xmm15,0xa0(%rsp)
.Ldec_key_body:
___
$code.=<<___;
        mov     %esi,%eax
        shr     \$5,%eax
        add     \$5,%eax
        mov     %eax,240(%rdx)  # AES_KEY->rounds = nbits/32+5;
        shl     \$4,%eax
        lea     16(%rdx,%rax),%rdx

        mov     \$1,%ecx
        mov     %esi,%r8d
        shr     \$1,%r8d
        and     \$32,%r8d
        xor     \$32,%r8d       # nbits==192?0:32
        call    _vpaes_schedule_core
___
$code.=<<___ if ($win64);
        movaps  0x10(%rsp),%xmm6
        movaps  0x20(%rsp),%xmm7
        movaps  0x30(%rsp),%xmm8
        movaps  0x40(%rsp),%xmm9
        movaps  0x50(%rsp),%xmm10
        movaps  0x60(%rsp),%xmm11
        movaps  0x70(%rsp),%xmm12
        movaps  0x80(%rsp),%xmm13
        movaps  0x90(%rsp),%xmm14
        movaps  0xa0(%rsp),%xmm15
        lea     0xb8(%rsp),%rsp
.Ldec_key_epilogue:
___
$code.=<<___;
        xor     %eax,%eax
        ret
.size   ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key

.globl  ${PREFIX}_encrypt
.type   ${PREFIX}_encrypt,\@function,3
.align  16
${PREFIX}_encrypt:
___
| 778 $code.=<<___ if ($win64); |  | 
| 779         lea     -0xb8(%rsp),%rsp |  | 
| 780         movaps  %xmm6,0x10(%rsp) |  | 
| 781         movaps  %xmm7,0x20(%rsp) |  | 
| 782         movaps  %xmm8,0x30(%rsp) |  | 
| 783         movaps  %xmm9,0x40(%rsp) |  | 
| 784         movaps  %xmm10,0x50(%rsp) |  | 
| 785         movaps  %xmm11,0x60(%rsp) |  | 
| 786         movaps  %xmm12,0x70(%rsp) |  | 
| 787         movaps  %xmm13,0x80(%rsp) |  | 
| 788         movaps  %xmm14,0x90(%rsp) |  | 
| 789         movaps  %xmm15,0xa0(%rsp) |  | 
| 790 .Lenc_body: |  | 
| 791 ___ |  | 
| 792 $code.=<<___; |  | 
| 793         movdqu  (%rdi),%xmm0 |  | 
| 794         call    _vpaes_preheat |  | 
| 795         call    _vpaes_encrypt_core |  | 
| 796         movdqu  %xmm0,(%rsi) |  | 
| 797 ___ |  | 
| 798 $code.=<<___ if ($win64); |  | 
| 799         movaps  0x10(%rsp),%xmm6 |  | 
| 800         movaps  0x20(%rsp),%xmm7 |  | 
| 801         movaps  0x30(%rsp),%xmm8 |  | 
| 802         movaps  0x40(%rsp),%xmm9 |  | 
| 803         movaps  0x50(%rsp),%xmm10 |  | 
| 804         movaps  0x60(%rsp),%xmm11 |  | 
| 805         movaps  0x70(%rsp),%xmm12 |  | 
| 806         movaps  0x80(%rsp),%xmm13 |  | 
| 807         movaps  0x90(%rsp),%xmm14 |  | 
| 808         movaps  0xa0(%rsp),%xmm15 |  | 
| 809         lea     0xb8(%rsp),%rsp |  | 
| 810 .Lenc_epilogue: |  | 
| 811 ___ |  | 
| 812 $code.=<<___; |  | 
| 813         ret |  | 
| 814 .size   ${PREFIX}_encrypt,.-${PREFIX}_encrypt |  | 
| 815 |  | 
| 816 .globl  ${PREFIX}_decrypt |  | 
| 817 .type   ${PREFIX}_decrypt,\@function,3 |  | 
| 818 .align  16 |  | 
| 819 ${PREFIX}_decrypt: |  | 
| 820 ___ |  | 
| 821 $code.=<<___ if ($win64); |  | 
| 822         lea     -0xb8(%rsp),%rsp |  | 
| 823         movaps  %xmm6,0x10(%rsp) |  | 
| 824         movaps  %xmm7,0x20(%rsp) |  | 
| 825         movaps  %xmm8,0x30(%rsp) |  | 
| 826         movaps  %xmm9,0x40(%rsp) |  | 
| 827         movaps  %xmm10,0x50(%rsp) |  | 
| 828         movaps  %xmm11,0x60(%rsp) |  | 
| 829         movaps  %xmm12,0x70(%rsp) |  | 
| 830         movaps  %xmm13,0x80(%rsp) |  | 
| 831         movaps  %xmm14,0x90(%rsp) |  | 
| 832         movaps  %xmm15,0xa0(%rsp) |  | 
| 833 .Ldec_body: |  | 
| 834 ___ |  | 
| 835 $code.=<<___; |  | 
| 836         movdqu  (%rdi),%xmm0 |  | 
| 837         call    _vpaes_preheat |  | 
| 838         call    _vpaes_decrypt_core |  | 
| 839         movdqu  %xmm0,(%rsi) |  | 
| 840 ___ |  | 
| 841 $code.=<<___ if ($win64); |  | 
| 842         movaps  0x10(%rsp),%xmm6 |  | 
| 843         movaps  0x20(%rsp),%xmm7 |  | 
| 844         movaps  0x30(%rsp),%xmm8 |  | 
| 845         movaps  0x40(%rsp),%xmm9 |  | 
| 846         movaps  0x50(%rsp),%xmm10 |  | 
| 847         movaps  0x60(%rsp),%xmm11 |  | 
| 848         movaps  0x70(%rsp),%xmm12 |  | 
| 849         movaps  0x80(%rsp),%xmm13 |  | 
| 850         movaps  0x90(%rsp),%xmm14 |  | 
| 851         movaps  0xa0(%rsp),%xmm15 |  | 
| 852         lea     0xb8(%rsp),%rsp |  | 
| 853 .Ldec_epilogue: |  | 
| 854 ___ |  | 
| 855 $code.=<<___; |  | 
| 856         ret |  | 
| 857 .size   ${PREFIX}_decrypt,.-${PREFIX}_decrypt |  | 
| 858 ___ |  | 
| 859 { |  | 
| 860 my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); |  | 
| 861 # void AES_cbc_encrypt (const void char *inp, unsigned char *out, |  | 
| 862 #                       size_t length, const AES_KEY *key, |  | 
| 863 #                       unsigned char *ivp,const int enc); |  | 
| 864 $code.=<<___; |  | 
| 865 .globl  ${PREFIX}_cbc_encrypt |  | 
| 866 .type   ${PREFIX}_cbc_encrypt,\@function,6 |  | 
| 867 .align  16 |  | 
| 868 ${PREFIX}_cbc_encrypt: |  | 
| 869         xchg    $key,$len |  | 
| 870 ___ |  | 
| 871 ($len,$key)=($key,$len); |  | 
| 872 $code.=<<___; |  | 
| 873         sub     \$16,$len |  | 
| 874         jc      .Lcbc_abort |  | 
| 875 ___ |  | 
| 876 $code.=<<___ if ($win64); |  | 
| 877         lea     -0xb8(%rsp),%rsp |  | 
| 878         movaps  %xmm6,0x10(%rsp) |  | 
| 879         movaps  %xmm7,0x20(%rsp) |  | 
| 880         movaps  %xmm8,0x30(%rsp) |  | 
| 881         movaps  %xmm9,0x40(%rsp) |  | 
| 882         movaps  %xmm10,0x50(%rsp) |  | 
| 883         movaps  %xmm11,0x60(%rsp) |  | 
| 884         movaps  %xmm12,0x70(%rsp) |  | 
| 885         movaps  %xmm13,0x80(%rsp) |  | 
| 886         movaps  %xmm14,0x90(%rsp) |  | 
| 887         movaps  %xmm15,0xa0(%rsp) |  | 
| 888 .Lcbc_body: |  | 
| 889 ___ |  | 
| 890 $code.=<<___; |  | 
| 891         movdqu  ($ivp),%xmm6            # load IV |  | 
| 892         sub     $inp,$out |  | 
| 893         call    _vpaes_preheat |  | 
| 894         cmp     \$0,${enc}d |  | 
| 895         je      .Lcbc_dec_loop |  | 
| 896         jmp     .Lcbc_enc_loop |  | 
| 897 .align  16 |  | 
| 898 .Lcbc_enc_loop: |  | 
| 899         movdqu  ($inp),%xmm0 |  | 
| 900         pxor    %xmm6,%xmm0 |  | 
| 901         call    _vpaes_encrypt_core |  | 
| 902         movdqa  %xmm0,%xmm6 |  | 
| 903         movdqu  %xmm0,($out,$inp) |  | 
| 904         lea     16($inp),$inp |  | 
| 905         sub     \$16,$len |  | 
| 906         jnc     .Lcbc_enc_loop |  | 
| 907         jmp     .Lcbc_done |  | 
| 908 .align  16 |  | 
| 909 .Lcbc_dec_loop: |  | 
| 910         movdqu  ($inp),%xmm0 |  | 
| 911         movdqa  %xmm0,%xmm7 |  | 
| 912         call    _vpaes_decrypt_core |  | 
| 913         pxor    %xmm6,%xmm0 |  | 
| 914         movdqa  %xmm7,%xmm6 |  | 
| 915         movdqu  %xmm0,($out,$inp) |  | 
| 916         lea     16($inp),$inp |  | 
| 917         sub     \$16,$len |  | 
| 918         jnc     .Lcbc_dec_loop |  | 
| 919 .Lcbc_done: |  | 
| 920         movdqu  %xmm6,($ivp)            # save IV |  | 
| 921 ___ |  | 
| 922 $code.=<<___ if ($win64); |  | 
| 923         movaps  0x10(%rsp),%xmm6 |  | 
| 924         movaps  0x20(%rsp),%xmm7 |  | 
| 925         movaps  0x30(%rsp),%xmm8 |  | 
| 926         movaps  0x40(%rsp),%xmm9 |  | 
| 927         movaps  0x50(%rsp),%xmm10 |  | 
| 928         movaps  0x60(%rsp),%xmm11 |  | 
| 929         movaps  0x70(%rsp),%xmm12 |  | 
| 930         movaps  0x80(%rsp),%xmm13 |  | 
| 931         movaps  0x90(%rsp),%xmm14 |  | 
| 932         movaps  0xa0(%rsp),%xmm15 |  | 
| 933         lea     0xb8(%rsp),%rsp |  | 
| 934 .Lcbc_epilogue: |  | 
| 935 ___ |  | 
| 936 $code.=<<___; |  | 
| 937 .Lcbc_abort: |  | 
| 938         ret |  | 
| 939 .size   ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt |  | 
| 940 ___ |  | 
| 941 } |  | 
| 942 $code.=<<___; |  | 
| 943 ## |  | 
| 944 ##  _aes_preheat |  | 
| 945 ## |  | 
| 946 ##  Fills register %r10 -> .aes_consts (so you can -fPIC) |  | 
| 947 ##  and %xmm9-%xmm15 as specified below. |  | 
| 948 ## |  | 
| 949 .type   _vpaes_preheat,\@abi-omnipotent |  | 
| 950 .align  16 |  | 
| 951 _vpaes_preheat: |  | 
| 952         lea     .Lk_s0F(%rip), %r10 |  | 
| 953         movdqa  -0x20(%r10), %xmm10     # .Lk_inv |  | 
| 954         movdqa  -0x10(%r10), %xmm11     # .Lk_inv+16 |  | 
| 955         movdqa  0x00(%r10), %xmm9       # .Lk_s0F |  | 
| 956         movdqa  0x30(%r10), %xmm13      # .Lk_sb1 |  | 
| 957         movdqa  0x40(%r10), %xmm12      # .Lk_sb1+16 |  | 
| 958         movdqa  0x50(%r10), %xmm15      # .Lk_sb2 |  | 
| 959         movdqa  0x60(%r10), %xmm14      # .Lk_sb2+16 |  | 
| 960         ret |  | 
| 961 .size   _vpaes_preheat,.-_vpaes_preheat |  | 
| 962 ######################################################## |  | 
| 963 ##                                                    ## |  | 
| 964 ##                     Constants                      ## |  | 
| 965 ##                                                    ## |  | 
| 966 ######################################################## |  | 
| 967 .type   _vpaes_consts,\@object |  | 
| 968 .align  64 |  | 
| 969 _vpaes_consts: |  | 
| 970 .Lk_inv:        # inv, inva |  | 
| 971         .quad   0x0E05060F0D080180, 0x040703090A0B0C02 |  | 
| 972         .quad   0x01040A060F0B0780, 0x030D0E0C02050809 |  | 
| 973 |  | 
| 974 .Lk_s0F:        # s0F |  | 
| 975         .quad   0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F |  | 
| 976 |  | 
| 977 .Lk_ipt:        # input transform (lo, hi) |  | 
| 978         .quad   0xC2B2E8985A2A7000, 0xCABAE09052227808 |  | 
| 979         .quad   0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 |  | 
| 980 |  | 
| 981 .Lk_sb1:        # sb1u, sb1t |  | 
| 982         .quad   0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 |  | 
| 983         .quad   0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF |  | 
| 984 .Lk_sb2:        # sb2u, sb2t |  | 
| 985         .quad   0xE27A93C60B712400, 0x5EB7E955BC982FCD |  | 
| 986         .quad   0x69EB88400AE12900, 0xC2A163C8AB82234A |  | 
| 987 .Lk_sbo:        # sbou, sbot |  | 
| 988         .quad   0xD0D26D176FBDC700, 0x15AABF7AC502A878 |  | 
| 989         .quad   0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA |  | 
| 990 |  | 
| 991 .Lk_mc_forward: # mc_forward |  | 
| 992         .quad   0x0407060500030201, 0x0C0F0E0D080B0A09 |  | 
| 993         .quad   0x080B0A0904070605, 0x000302010C0F0E0D |  | 
| 994         .quad   0x0C0F0E0D080B0A09, 0x0407060500030201 |  | 
| 995         .quad   0x000302010C0F0E0D, 0x080B0A0904070605 |  | 
| 996 |  | 
| 997 .Lk_mc_backward:# mc_backward |  | 
| 998         .quad   0x0605040702010003, 0x0E0D0C0F0A09080B |  | 
| 999         .quad   0x020100030E0D0C0F, 0x0A09080B06050407 |  | 
| 1000         .quad   0x0E0D0C0F0A09080B, 0x0605040702010003 |  | 
| 1001         .quad   0x0A09080B06050407, 0x020100030E0D0C0F |  | 
| 1002 |  | 
| 1003 .Lk_sr:         # sr |  | 
| 1004         .quad   0x0706050403020100, 0x0F0E0D0C0B0A0908 |  | 
| 1005         .quad   0x030E09040F0A0500, 0x0B06010C07020D08 |  | 
| 1006         .quad   0x0F060D040B020900, 0x070E050C030A0108 |  | 
| 1007         .quad   0x0B0E0104070A0D00, 0x0306090C0F020508 |  | 
| 1008 |  | 
| 1009 .Lk_rcon:       # rcon |  | 
| 1010         .quad   0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 |  | 
| 1011 |  | 
| 1012 .Lk_s63:        # s63: all equal to 0x63 transformed |  | 
| 1013         .quad   0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B |  | 
| 1014 |  | 
| 1015 .Lk_opt:        # output transform |  | 
| 1016         .quad   0xFF9F4929D6B66000, 0xF7974121DEBE6808 |  | 
| 1017         .quad   0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 |  | 
| 1018 |  | 
| 1019 .Lk_deskew:     # deskew tables: inverts the sbox's "skew" |  | 
| 1020         .quad   0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A |  | 
| 1021         .quad   0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 |  | 
| 1022 |  | 
| 1023 ## |  | 
| 1024 ##  Decryption stuff |  | 
| 1025 ##  Key schedule constants |  | 
| 1026 ## |  | 
| 1027 .Lk_dksd:       # decryption key schedule: invskew x*D |  | 
| 1028         .quad   0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 |  | 
| 1029         .quad   0x41C277F4B5368300, 0x5FDC69EAAB289D1E |  | 
| 1030 .Lk_dksb:       # decryption key schedule: invskew x*B |  | 
| 1031         .quad   0x9A4FCA1F8550D500, 0x03D653861CC94C99 |  | 
| 1032         .quad   0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 |  | 
| 1033 .Lk_dkse:       # decryption key schedule: invskew x*E + 0x63 |  | 
| 1034         .quad   0xD5031CCA1FC9D600, 0x53859A4C994F5086 |  | 
| 1035         .quad   0xA23196054FDC7BE8, 0xCD5EF96A20B31487 |  | 
| 1036 .Lk_dks9:       # decryption key schedule: invskew x*9 |  | 
| 1037         .quad   0xB6116FC87ED9A700, 0x4AED933482255BFC |  | 
| 1038         .quad   0x4576516227143300, 0x8BB89FACE9DAFDCE |  | 
| 1039 |  | 
| 1040 ## |  | 
| 1041 ##  Decryption stuff |  | 
| 1042 ##  Round function constants |  | 
| 1043 ## |  | 
| 1044 .Lk_dipt:       # decryption input transform |  | 
| 1045         .quad   0x0F505B040B545F00, 0x154A411E114E451A |  | 
| 1046         .quad   0x86E383E660056500, 0x12771772F491F194 |  | 
| 1047 |  | 
| 1048 .Lk_dsb9:       # decryption sbox output *9*u, *9*t |  | 
| 1049         .quad   0x851C03539A86D600, 0xCAD51F504F994CC9 |  | 
| 1050         .quad   0xC03B1789ECD74900, 0x725E2C9EB2FBA565 |  | 
| 1051 .Lk_dsbd:       # decryption sbox output *D*u, *D*t |  | 
| 1052         .quad   0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 |  | 
| 1053         .quad   0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 |  | 
| 1054 .Lk_dsbb:       # decryption sbox output *B*u, *B*t |  | 
| 1055         .quad   0xD022649296B44200, 0x602646F6B0F2D404 |  | 
| 1056         .quad   0xC19498A6CD596700, 0xF3FF0C3E3255AA6B |  | 
| 1057 .Lk_dsbe:       # decryption sbox output *E*u, *E*t |  | 
| 1058         .quad   0x46F2929626D4D000, 0x2242600464B4F6B0 |  | 
| 1059         .quad   0x0C55A6CDFFAAC100, 0x9467F36B98593E32 |  | 
| 1060 .Lk_dsbo:       # decryption sbox final output |  | 
| 1061         .quad   0x1387EA537EF94000, 0xC7AA6DB9D4943E2D |  | 
| 1062         .quad   0x12D7560F93441D00, 0xCA4B8159D8C58E9C |  | 
| 1063 .asciz  "Vector Permutaion AES for x86_64/SSSE3, Mike Hamburg (Stanford Universi
      ty)" |  | 
| 1064 .align  64 |  | 
| 1065 .size   _vpaes_consts,.-_vpaes_consts |  | 
| 1066 ___ |  | 
| 1067 |  | 
| 1068 if ($win64) { |  | 
| 1069 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, |  | 
| 1070 #               CONTEXT *context,DISPATCHER_CONTEXT *disp) |  | 
| 1071 $rec="%rcx"; |  | 
| 1072 $frame="%rdx"; |  | 
| 1073 $context="%r8"; |  | 
| 1074 $disp="%r9"; |  | 
| 1075 |  | 
| 1076 $code.=<<___; |  | 
| 1077 .extern __imp_RtlVirtualUnwind |  | 
| 1078 .type   se_handler,\@abi-omnipotent |  | 
| 1079 .align  16 |  | 
| 1080 se_handler: |  | 
| 1081         push    %rsi |  | 
| 1082         push    %rdi |  | 
| 1083         push    %rbx |  | 
| 1084         push    %rbp |  | 
| 1085         push    %r12 |  | 
| 1086         push    %r13 |  | 
| 1087         push    %r14 |  | 
| 1088         push    %r15 |  | 
| 1089         pushfq |  | 
| 1090         sub     \$64,%rsp |  | 
| 1091 |  | 
| 1092         mov     120($context),%rax      # pull context->Rax |  | 
| 1093         mov     248($context),%rbx      # pull context->Rip |  | 
| 1094 |  | 
| 1095         mov     8($disp),%rsi           # disp->ImageBase |  | 
| 1096         mov     56($disp),%r11          # disp->HandlerData |  | 
| 1097 |  | 
| 1098         mov     0(%r11),%r10d           # HandlerData[0] |  | 
| 1099         lea     (%rsi,%r10),%r10        # prologue label |  | 
| 1100         cmp     %r10,%rbx               # context->Rip<prologue label |  | 
| 1101         jb      .Lin_prologue |  | 
| 1102 |  | 
| 1103         mov     152($context),%rax      # pull context->Rsp |  | 
| 1104 |  | 
| 1105         mov     4(%r11),%r10d           # HandlerData[1] |  | 
| 1106         lea     (%rsi,%r10),%r10        # epilogue label |  | 
| 1107         cmp     %r10,%rbx               # context->Rip>=epilogue label |  | 
| 1108         jae     .Lin_prologue |  | 
| 1109 |  | 
| 1110         lea     16(%rax),%rsi           # %xmm save area |  | 
| 1111         lea     512($context),%rdi      # &context.Xmm6 |  | 
| 1112         mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax) |  | 
| 1113         .long   0xa548f3fc              # cld; rep movsq |  | 
| 1114         lea     0xb8(%rax),%rax         # adjust stack pointer |  | 
| 1115 |  | 
| 1116 .Lin_prologue: |  | 
| 1117         mov     8(%rax),%rdi |  | 
| 1118         mov     16(%rax),%rsi |  | 
| 1119         mov     %rax,152($context)      # restore context->Rsp |  | 
| 1120         mov     %rsi,168($context)      # restore context->Rsi |  | 
| 1121         mov     %rdi,176($context)      # restore context->Rdi |  | 
| 1122 |  | 
| 1123         mov     40($disp),%rdi          # disp->ContextRecord |  | 
| 1124         mov     $context,%rsi           # context |  | 
| 1125         mov     \$`1232/8`,%ecx         # sizeof(CONTEXT) |  | 
| 1126         .long   0xa548f3fc              # cld; rep movsq |  | 
| 1127 |  | 
| 1128         mov     $disp,%rsi |  | 
| 1129         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER |  | 
| 1130         mov     8(%rsi),%rdx            # arg2, disp->ImageBase |  | 
| 1131         mov     0(%rsi),%r8             # arg3, disp->ControlPc |  | 
| 1132         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry |  | 
| 1133         mov     40(%rsi),%r10           # disp->ContextRecord |  | 
| 1134         lea     56(%rsi),%r11           # &disp->HandlerData |  | 
| 1135         lea     24(%rsi),%r12           # &disp->EstablisherFrame |  | 
| 1136         mov     %r10,32(%rsp)           # arg5 |  | 
| 1137         mov     %r11,40(%rsp)           # arg6 |  | 
| 1138         mov     %r12,48(%rsp)           # arg7 |  | 
| 1139         mov     %rcx,56(%rsp)           # arg8, (NULL) |  | 
| 1140         call    *__imp_RtlVirtualUnwind(%rip) |  | 
| 1141 |  | 
| 1142         mov     \$1,%eax                # ExceptionContinueSearch |  | 
| 1143         add     \$64,%rsp |  | 
| 1144         popfq |  | 
| 1145         pop     %r15 |  | 
| 1146         pop     %r14 |  | 
| 1147         pop     %r13 |  | 
| 1148         pop     %r12 |  | 
| 1149         pop     %rbp |  | 
| 1150         pop     %rbx |  | 
| 1151         pop     %rdi |  | 
| 1152         pop     %rsi |  | 
| 1153         ret |  | 
| 1154 .size   se_handler,.-se_handler |  | 
| 1155 |  | 
| 1156 .section        .pdata |  | 
| 1157 .align  4 |  | 
| 1158         .rva    .LSEH_begin_${PREFIX}_set_encrypt_key |  | 
| 1159         .rva    .LSEH_end_${PREFIX}_set_encrypt_key |  | 
| 1160         .rva    .LSEH_info_${PREFIX}_set_encrypt_key |  | 
| 1161 |  | 
| 1162         .rva    .LSEH_begin_${PREFIX}_set_decrypt_key |  | 
| 1163         .rva    .LSEH_end_${PREFIX}_set_decrypt_key |  | 
| 1164         .rva    .LSEH_info_${PREFIX}_set_decrypt_key |  | 
| 1165 |  | 
| 1166         .rva    .LSEH_begin_${PREFIX}_encrypt |  | 
| 1167         .rva    .LSEH_end_${PREFIX}_encrypt |  | 
| 1168         .rva    .LSEH_info_${PREFIX}_encrypt |  | 
| 1169 |  | 
| 1170         .rva    .LSEH_begin_${PREFIX}_decrypt |  | 
| 1171         .rva    .LSEH_end_${PREFIX}_decrypt |  | 
| 1172         .rva    .LSEH_info_${PREFIX}_decrypt |  | 
| 1173 |  | 
| 1174         .rva    .LSEH_begin_${PREFIX}_cbc_encrypt |  | 
| 1175         .rva    .LSEH_end_${PREFIX}_cbc_encrypt |  | 
| 1176         .rva    .LSEH_info_${PREFIX}_cbc_encrypt |  | 
| 1177 |  | 
| 1178 .section        .xdata |  | 
| 1179 .align  8 |  | 
| 1180 .LSEH_info_${PREFIX}_set_encrypt_key: |  | 
| 1181         .byte   9,0,0,0 |  | 
| 1182         .rva    se_handler |  | 
| 1183         .rva    .Lenc_key_body,.Lenc_key_epilogue       # HandlerData[] |  | 
| 1184 .LSEH_info_${PREFIX}_set_decrypt_key: |  | 
| 1185         .byte   9,0,0,0 |  | 
| 1186         .rva    se_handler |  | 
| 1187         .rva    .Ldec_key_body,.Ldec_key_epilogue       # HandlerData[] |  | 
| 1188 .LSEH_info_${PREFIX}_encrypt: |  | 
| 1189         .byte   9,0,0,0 |  | 
| 1190         .rva    se_handler |  | 
| 1191         .rva    .Lenc_body,.Lenc_epilogue               # HandlerData[] |  | 
| 1192 .LSEH_info_${PREFIX}_decrypt: |  | 
| 1193         .byte   9,0,0,0 |  | 
| 1194         .rva    se_handler |  | 
| 1195         .rva    .Ldec_body,.Ldec_epilogue               # HandlerData[] |  | 
| 1196 .LSEH_info_${PREFIX}_cbc_encrypt: |  | 
| 1197         .byte   9,0,0,0 |  | 
| 1198         .rva    se_handler |  | 
| 1199         .rva    .Lcbc_body,.Lcbc_epilogue               # HandlerData[] |  | 
| 1200 ___ |  | 
| 1201 } |  | 
| 1202 |  | 
| 1203 $code =~ s/\`([^\`]*)\`/eval($1)/gem; |  | 
| 1204 |  | 
| 1205 print $code; |  | 
| 1206 |  | 
| 1207 close STDOUT; |  | 
| OLD | NEW | 
|---|