OLD | NEW |
1 #!/usr/bin/env perl | 1 #!/usr/bin/env perl |
2 # | 2 # |
3 # ==================================================================== | 3 # ==================================================================== |
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | 4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
5 # project. Rights for redistribution and usage in source and binary | 5 # project. The module is, however, dual licensed under OpenSSL and |
6 # forms are granted according to the OpenSSL license. | 6 # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 7 # details see http://www.openssl.org/~appro/cryptogams/. |
7 # ==================================================================== | 8 # ==================================================================== |
8 # | 9 # |
9 # Version 3.6. | 10 # Version 4.3. |
10 # | 11 # |
11 # You might fail to appreciate this module performance from the first | 12 # You might fail to appreciate this module performance from the first |
12 # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered | 13 # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered |
13 # to be *the* best Intel C compiler without -KPIC, performance appears | 14 # to be *the* best Intel C compiler without -KPIC, performance appears |
14 # to be virtually identical... But try to re-configure with shared | 15 # to be virtually identical... But try to re-configure with shared |
15 # library support... Aha! Intel compiler "suddenly" lags behind by 30% | 16 # library support... Aha! Intel compiler "suddenly" lags behind by 30% |
16 # [on P4, more on others]:-) And if compared to position-independent | 17 # [on P4, more on others]:-) And if compared to position-independent |
17 # code generated by GNU C, this code performs *more* than *twice* as | 18 # code generated by GNU C, this code performs *more* than *twice* as |
18 # fast! Yes, all this buzz about PIC means that unlike other hand- | 19 # fast! Yes, all this buzz about PIC means that unlike other hand- |
19 # coded implementations, this one was explicitly designed to be safe | 20 # coded implementations, this one was explicitly designed to be safe |
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
74 # Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB. | 75 # Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB. |
75 # | 76 # |
76 # Current ECB performance numbers for 128-bit key in CPU cycles per | 77 # Current ECB performance numbers for 128-bit key in CPU cycles per |
77 # processed byte [measure commonly used by AES benchmarkers] are: | 78 # processed byte [measure commonly used by AES benchmarkers] are: |
78 # | 79 # |
79 # small footprint fully unrolled | 80 # small footprint fully unrolled |
80 # P4 24 22 | 81 # P4 24 22 |
81 # AMD K8 20 19 | 82 # AMD K8 20 19 |
82 # PIII 25 23 | 83 # PIII 25 23 |
83 # Pentium 81 78 | 84 # Pentium 81 78 |
| 85 # |
| 86 # Version 3.7 reimplements outer rounds as "compact." Meaning that |
| 87 # first and last rounds reference compact 256 bytes S-box. This means |
| 88 # that first round consumes a lot more CPU cycles and that encrypt |
| 89 # and decrypt performance becomes asymmetric. Encrypt performance |
| 90 # drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is |
| 91 # aggressively pre-fetched. |
| 92 # |
| 93 # Version 4.0 effectively rolls back to 3.6 and instead implements |
| 94 # additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact, |
| 95 # which use exclusively 256 byte S-box. These functions are to be |
| 96 # called in modes not concealing plain text, such as ECB, or when |
| 97 # we're asked to process smaller amount of data [or unconditionally |
| 98 # on hyper-threading CPU]. Currently it's called unconditionally from |
| 99 # AES_[en|de]crypt, which affects all modes, but CBC. CBC routine |
| 100 # still needs to be modified to switch between slower and faster |
| 101 # mode when appropriate... But in either case benchmark landscape |
| 102 # changes dramatically and below numbers are CPU cycles per processed |
| 103 # byte for 128-bit key. |
| 104 # |
| 105 # ECB encrypt ECB decrypt CBC large chunk |
| 106 # P4 56[60] 84[100] 23 |
| 107 # AMD K8 48[44] 70[79] 18 |
| 108 # PIII 41[50] 61[91] 24 |
| 109 # Core 2 32[38] 45[70] 18.5 |
| 110 # Pentium 120 160 77 |
| 111 # |
| 112 # Version 4.1 switches to compact S-box even in key schedule setup. |
| 113 # |
| 114 # Version 4.2 prefetches compact S-box in every SSE round or in other |
| 115 # words every cache-line is *guaranteed* to be accessed within ~50 |
| 116 # cycles window. Why just SSE? Because it's needed on hyper-threading |
| 117 # CPU! Which is also why it's prefetched with 64 byte stride. Best |
| 118 # part is that it has no negative effect on performance:-) |
| 119 # |
| 120 # Version 4.3 implements switch between compact and non-compact block |
| 121 # functions in AES_cbc_encrypt depending on how much data was asked |
| 122 # to be processed in one stroke. |
| 123 # |
| 124 ###################################################################### |
| 125 # Timing attacks are classified in two classes: synchronous when |
| 126 # attacker consciously initiates cryptographic operation and collects |
| 127 # timing data of various character afterwards, and asynchronous when |
| 128 # malicious code is executed on same CPU simultaneously with AES, |
| 129 # instruments itself and performs statistical analysis of this data. |
| 130 # |
| 131 # As far as synchronous attacks go the root to the AES timing |
| 132 # vulnerability is twofold. Firstly, of 256 S-box elements at most 160 |
| 133 # are referred to in single 128-bit block operation. Well, in C |
| 134 # implementation with 4 distinct tables it's actually as little as 40 |
| 135 # references per 256 elements table, but anyway... Secondly, even |
| 136 # though S-box elements are clustered into smaller amount of cache- |
| 137 # lines, smaller than 160 and even 40, it turned out that for certain |
| 138 # plain-text pattern[s] or simply put chosen plain-text and given key |
| 139 # few cache-lines remain unaccessed during block operation. Now, if |
 | 140 # attacker can figure out this access pattern, he can deduce the key |
| 141 # [or at least part of it]. The natural way to mitigate this kind of |
| 142 # attacks is to minimize the amount of cache-lines in S-box and/or |
| 143 # prefetch them to ensure that every one is accessed for more uniform |
| 144 # timing. But note that *if* plain-text was concealed in such way that |
| 145 # input to block function is distributed *uniformly*, then attack |
| 146 # wouldn't apply. Now note that some encryption modes, most notably |
| 147 # CBC, do mask the plain-text in this exact way [secure cipher output |
| 148 # is distributed uniformly]. Yes, one still might find input that |
| 149 # would reveal the information about given key, but if amount of |
| 150 # candidate inputs to be tried is larger than amount of possible key |
| 151 # combinations then attack becomes infeasible. This is why revised |
| 152 # AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk |
| 153 # of data is to be processed in one stroke. The current size limit of |
 | 154 # 512 bytes is chosen to provide same [diminishingly low] probability |
| 155 # for cache-line to remain untouched in large chunk operation with |
| 156 # large S-box as for single block operation with compact S-box and |
| 157 # surely needs more careful consideration... |
| 158 # |
| 159 # As for asynchronous attacks. There are two flavours: attacker code |
| 160 # being interleaved with AES on hyper-threading CPU at *instruction* |
| 161 # level, and two processes time sharing single core. As for latter. |
| 162 # Two vectors. 1. Given that attacker process has higher priority, |
| 163 # yield execution to process performing AES just before timer fires |
| 164 # off the scheduler, immediately regain control of CPU and analyze the |
| 165 # cache state. For this attack to be efficient attacker would have to |
 | 166 # effectively slow down the operation by several *orders* of magnitude, |
 | 167 # by ratio of time slice to duration of handful of AES rounds, which |
 | 168 # is unlikely to remain unnoticed. Not to mention that this also means |
 | 169 # that he would spend correspondingly more time to collect enough |
 | 170 # statistical data to mount the attack. It's probably appropriate to |
 | 171 # say that if adversary reckons that this attack is beneficial and |
| 172 # risks to be noticed, you probably have larger problems having him |
| 173 # mere opportunity. In other words suggested code design expects you |
| 174 # to preclude/mitigate this attack by overall system security design. |
| 175 # 2. Attacker manages to make his code interrupt driven. In order for |
| 176 # this kind of attack to be feasible, interrupt rate has to be high |
| 177 # enough, again comparable to duration of handful of AES rounds. But |
| 178 # is there interrupt source of such rate? Hardly, not even 1Gbps NIC |
| 179 # generates interrupts at such raging rate... |
| 180 # |
| 181 # And now back to the former, hyper-threading CPU or more specifically |
| 182 # Intel P4. Recall that asynchronous attack implies that malicious |
| 183 # code instruments itself. And naturally instrumentation granularity |
 | 184 # has to be noticeably lower than duration of codepath accessing S-box. |
| 185 # Given that all cache-lines are accessed during that time that is. |
| 186 # Current implementation accesses *all* cache-lines within ~50 cycles |
| 187 # window, which is actually *less* than RDTSC latency on Intel P4! |
84 | 188 |
85 push(@INC,"perlasm","../../perlasm"); | 189 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 190 push(@INC,"${dir}","${dir}../../perlasm"); |
86 require "x86asm.pl"; | 191 require "x86asm.pl"; |
87 | 192 |
88 &asm_init($ARGV[0],"aes-586.pl",$ARGV[$#ARGV] eq "386"); | 193 &asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386"); |
| 194 &static_label("AES_Te"); |
| 195 &static_label("AES_Td"); |
89 | 196 |
90 $s0="eax"; | 197 $s0="eax"; |
91 $s1="ebx"; | 198 $s1="ebx"; |
92 $s2="ecx"; | 199 $s2="ecx"; |
93 $s3="edx"; | 200 $s3="edx"; |
94 $key="edi"; | 201 $key="edi"; |
95 $acc="esi"; | 202 $acc="esi"; |
| 203 $tbl="ebp"; |
96 | 204 |
97 $compromise=0;» » # $compromise=128 abstains from copying key | 205 # stack frame layout in _[x86|sse]_AES_* routines, frame is allocated |
98 » » » # schedule to stack when encrypting inputs | 206 # by caller |
99 » » » # shorter than 128 bytes at the cost of | 207 $__ra=&DWP(0,"esp");» # return address |
100 » » » # risksing aliasing with S-boxes. In return | 208 $__s0=&DWP(4,"esp");» # s0 backing store |
101 » » » # you get way better, up to +70%, small block | 209 $__s1=&DWP(8,"esp");» # s1 backing store |
102 » » » # performance. | 210 $__s2=&DWP(12,"esp");» # s2 backing store |
| 211 $__s3=&DWP(16,"esp");» # s3 backing store |
| 212 $__key=&DWP(20,"esp");» # pointer to key schedule |
| 213 $__end=&DWP(24,"esp");» # pointer to end of key schedule |
| 214 $__tbl=&DWP(28,"esp");» # %ebp backing store |
| 215 |
 | 216 # stack frame layout in AES_[en|de]crypt routines, which differs from |
| 217 # above by 4 and overlaps by %ebp backing store |
| 218 $_tbl=&DWP(24,"esp"); |
| 219 $_esp=&DWP(28,"esp"); |
| 220 |
# Emit each argument as a pair of identical data words, i.e. calls
# &data_word(x,x) for every x in the argument list. Processing stops at
# the end of the list or at the first undefined argument.
sub _data_word()
{
	for (my $n = 0; $n < @_ && defined($_[$n]); $n++) {
		my $word = $_[$n];		# copy, as the original passed a copy
		&data_word($word,$word);
	}
}
| 222 |
| 223 $speed_limit=512;» # chunks smaller than $speed_limit are |
| 224 » » » # processed with compact routine in CBC mode |
103 $small_footprint=1; # $small_footprint=1 code is ~5% slower [on | 225 $small_footprint=1; # $small_footprint=1 code is ~5% slower [on |
104 # recent µ-archs], but ~5 times smaller! | 226 # recent µ-archs], but ~5 times smaller! |
105 # I favor compact code to minimize cache | 227 # I favor compact code to minimize cache |
106 # contention and in hope to "collect" 5% back | 228 # contention and in hope to "collect" 5% back |
107 # in real-life applications... | 229 # in real-life applications... |
| 230 |
108 $vertical_spin=0; # shift "verticaly" defaults to 0, because of | 231 $vertical_spin=0; # shift "verticaly" defaults to 0, because of |
109 # its proof-of-concept status... | 232 # its proof-of-concept status... |
110 | |
111 # Note that there is no decvert(), as well as last encryption round is | 233 # Note that there is no decvert(), as well as last encryption round is |
112 # performed with "horizontal" shifts. This is because this "vertical" | 234 # performed with "horizontal" shifts. This is because this "vertical" |
113 # implementation [one which groups shifts on a given $s[i] to form a | 235 # implementation [one which groups shifts on a given $s[i] to form a |
114 # "column," unlike "horizontal" one, which groups shifts on different | 236 # "column," unlike "horizontal" one, which groups shifts on different |
115 # $s[i] to form a "row"] is work in progress. It was observed to run | 237 # $s[i] to form a "row"] is work in progress. It was observed to run |
116 # few percents faster on Intel cores, but not AMD. On AMD K8 core it's | 238 # few percents faster on Intel cores, but not AMD. On AMD K8 core it's |
117 # whole 12% slower:-( So we face a trade-off... Shall it be resolved | 239 # whole 12% slower:-( So we face a trade-off... Shall it be resolved |
118 # some day? Till then the code is considered experimental and by | 240 # some day? Till then the code is considered experimental and by |
119 # default remains dormant... | 241 # default remains dormant... |
120 | 242 |
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
163 | 285 |
164 &mov ($v1,$v0); | 286 &mov ($v1,$v0); |
165 &and ($v0,0xFF); | 287 &and ($v0,0xFF); |
166 &xor ($s[1],&DWP(0,$te,$v0,8)); # s1>>0 | 288 &xor ($s[1],&DWP(0,$te,$v0,8)); # s1>>0 |
167 &movz ($v0,&HB($v1)); | 289 &movz ($v0,&HB($v1)); |
168 &shr ($v1,16); | 290 &shr ($v1,16); |
169 &xor ($s[0],&DWP(3,$te,$v0,8)); # s1>>8 | 291 &xor ($s[0],&DWP(3,$te,$v0,8)); # s1>>8 |
170 &movz ($v0,&HB($v1)); | 292 &movz ($v0,&HB($v1)); |
171 &and ($v1,0xFF); | 293 &and ($v1,0xFF); |
172 &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16 | 294 &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16 |
173 » &mov» ($key,&DWP(12,"esp"));» » » # reincarnate v1 as key | 295 » &mov» ($key,$__key);» » » » # reincarnate v1 as key |
174 &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24 | 296 &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24 |
175 } | 297 } |
176 | 298 |
| 299 # Another experimental routine, which features "horizontal spin," but |
| 300 # eliminates one reference to stack. Strangely enough runs slower... |
# Experimental "horizontal spin" encryption round. Takes no arguments.
#
# Works on the state held in $s0..$s3, borrows $key and $acc as scratch
# registers, parks t[0] in the $__s0 stack slot while the remaining words
# are collected, and reloads $key from $__key before it is done.
# Trailing comments list the byte indexes currently held in the register
# being touched; '*' marks the byte that is being consumed.
#
# NOTE(review): $te is neither declared nor assigned in this routine, so
# it presumably has to be a file-level variable naming the table base
# register - confirm before wiring this routine in.
sub enchoriz()
{ my ($v0,$v1) = ($key,$acc);	# list form: a bare "my $v0=..., $v1=..."
				# would leave $v1 a package global

	&movz	($v0,&LB($s0));			#  3, 2, 1, 0*
	&rotr	($s2,8);			#  8,11,10, 9
	&mov	($v1,&DWP(0,$te,$v0,8));	#  0
	&movz	($v0,&HB($s1));			#  7, 6, 5*, 4
	&rotr	($s3,16);			# 13,12,15,14
	&xor	($v1,&DWP(3,$te,$v0,8));	#  5
	&movz	($v0,&HB($s2));			#  8,11,10*, 9
	&rotr	($s0,16);			#  1, 0, 3, 2
	&xor	($v1,&DWP(2,$te,$v0,8));	# 10
	&movz	($v0,&HB($s3));			# 13,12,15*,14
	&xor	($v1,&DWP(1,$te,$v0,8));	# 15, t[0] collected
	&mov	($__s0,$v1);			# t[0] saved

	&movz	($v0,&LB($s1));			#  7, 6, 5, 4*
	&shr	($s1,16);			#  -, -, 7, 6
	&mov	($v1,&DWP(0,$te,$v0,8));	#  4
	&movz	($v0,&LB($s3));			# 13,12,15,14*
	&xor	($v1,&DWP(2,$te,$v0,8));	# 14
	&movz	($v0,&HB($s0));			#  1, 0, 3*, 2
	&and	($s3,0xffff0000);		# 13,12, -, -
	&xor	($v1,&DWP(1,$te,$v0,8));	#  3
	&movz	($v0,&LB($s2));			#  8,11,10, 9*
	&or	($s3,$s1);			# 13,12, 7, 6
	&xor	($v1,&DWP(3,$te,$v0,8));	#  9, t[1] collected
	&mov	($s1,$v1);			#  s[1]=t[1]

	&movz	($v0,&LB($s0));			#  1, 0, 3, 2*
	&shr	($s2,16);			#  -, -, 8,11
	&mov	($v1,&DWP(2,$te,$v0,8));	#  2
	&movz	($v0,&HB($s3));			# 13,12, 7*, 6
	&xor	($v1,&DWP(1,$te,$v0,8));	#  7
	&movz	($v0,&HB($s2));			#  -, -, 8*,11
	&xor	($v1,&DWP(0,$te,$v0,8));	#  8
	&mov	($v0,$s3);
	&shr	($v0,24);			# 13
	&xor	($v1,&DWP(3,$te,$v0,8));	# 13, t[2] collected

	&movz	($v0,&LB($s2));			#  -, -, 8,11*
	&shr	($s0,24);			#  1*
	&mov	($s2,&DWP(1,$te,$v0,8));	# 11
	&xor	($s2,&DWP(3,$te,$s0,8));	#  1
	&mov	($s0,$__s0);			# s[0]=t[0]
	&movz	($v0,&LB($s3));			# 13,12, 7, 6*
	&shr	($s3,16);			#   ,  ,13,12
	&xor	($s2,&DWP(2,$te,$v0,8));	#  6
	&mov	($key,$__key);			# reincarnate v0 as key
	&and	($s3,0xff);			#   ,  ,13,12*
	&mov	($s3,&DWP(0,$te,$s3,8));	# 12
	&xor	($s3,$s2);			# s[3]=t[3] collected
	&mov	($s2,$v1);			# s[2]=t[2]
}
| 355 |
| 356 # More experimental code... SSE one... Even though this one eliminates |
| 357 # *all* references to stack, it's not faster... |
# Experimental SSE flavour of a table-driven encryption round. Takes no
# arguments and makes no reference to the stack.
#
# Table words are fetched relative to $tbl with an 8-byte index stride;
# round-key material is read from 16..28($key). Results are gathered as
# t[0,1] in mm0 and t[2,3] in mm4. eax/ebx/ecx/edx and $acc are used as
# scratch. Trailing comments name the state byte index being looked up.
#
# NOTE(review): the routine starts by reading eax and ebx, so it
# presumably expects the caller to have loaded them from the MMX state
# beforehand - confirm against the call site.
sub sse_encbody()
{
	&movz	($acc,&LB("eax"));		#  0
	&mov	("ecx",&DWP(0,$tbl,$acc,8));	#  0
	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
	&movz	("edx",&HB("eax"));		#  1
	&mov	("edx",&DWP(3,$tbl,"edx",8));	#  1
	&shr	("eax",16);			#  5, 4

	&movz	($acc,&LB("ebx"));		# 10
	&xor	("ecx",&DWP(2,$tbl,$acc,8));	# 10
	&pshufw	("mm6","mm4",0x08);		# 13,12, 9, 8
	&movz	($acc,&HB("ebx"));		# 11
	&xor	("edx",&DWP(1,$tbl,$acc,8));	# 11
	&shr	("ebx",16);			# 15,14

	&movz	($acc,&HB("eax"));		#  5
	&xor	("ecx",&DWP(3,$tbl,$acc,8));	#  5
	&movq	("mm3",&QWP(16,$key));		# round key for t[0,1]; called
						# with '&' like every other emitter
	&movz	($acc,&HB("ebx"));		# 15
	&xor	("ecx",&DWP(1,$tbl,$acc,8));	# 15
	&movd	("mm0","ecx");			# t[0] collected

	&movz	($acc,&LB("eax"));		#  4
	&mov	("ecx",&DWP(0,$tbl,$acc,8));	#  4
	&movd	("eax","mm2");			#  7, 6, 3, 2
	&movz	($acc,&LB("ebx"));		# 14
	&xor	("ecx",&DWP(2,$tbl,$acc,8));	# 14
	&movd	("ebx","mm6");			# 13,12, 9, 8

	&movz	($acc,&HB("eax"));		#  3
	&xor	("ecx",&DWP(1,$tbl,$acc,8));	#  3
	&movz	($acc,&HB("ebx"));		#  9
	&xor	("ecx",&DWP(3,$tbl,$acc,8));	#  9
	&movd	("mm1","ecx");			# t[1] collected

	&movz	($acc,&LB("eax"));		#  2
	&mov	("ecx",&DWP(2,$tbl,$acc,8));	#  2
	&shr	("eax",16);			#  7, 6
	&punpckldq	("mm0","mm1");		# t[0,1] collected
	&movz	($acc,&LB("ebx"));		#  8
	&xor	("ecx",&DWP(0,$tbl,$acc,8));	#  8
	&shr	("ebx",16);			# 13,12

	&movz	($acc,&HB("eax"));		#  7
	&xor	("ecx",&DWP(1,$tbl,$acc,8));	#  7
	&pxor	("mm0","mm3");
	&movz	("eax",&LB("eax"));		#  6
	&xor	("edx",&DWP(2,$tbl,"eax",8));	#  6
	&pshufw	("mm1","mm0",0x08);		#  5, 4, 1, 0
	&movz	($acc,&HB("ebx"));		# 13
	&xor	("ecx",&DWP(3,$tbl,$acc,8));	# 13
	&xor	("ecx",&DWP(24,$key));		# t[2]
	&movd	("mm4","ecx");			# t[2] collected
	&movz	("ebx",&LB("ebx"));		# 12
	&xor	("edx",&DWP(0,$tbl,"ebx",8));	# 12
	&shr	("ecx",16);
	&movd	("eax","mm1");			#  5, 4, 1, 0
	&mov	("ebx",&DWP(28,$key));		# t[3]
	&xor	("ebx","edx");
	&movd	("mm5","ebx");			# t[3] collected
	&and	("ebx",0xffff0000);
	&or	("ebx","ecx");

	&punpckldq	("mm4","mm5");		# t[2,3] collected
}
| 424 |
| 425 ###################################################################### |
| 426 # "Compact" block function |
| 427 ###################################################################### |
| 428 |
# Emit the code that produces output word $i of one "compact" round.
#
# Arguments: ($i, $te, @s [, $extra])
#   $i     - index of the output word, 0..3;
#   $te    - register holding the S-box base; bytes are fetched from
#            -128($te,index,1);
#   @s     - the four state registers, rotated by the caller so that the
#            word is assembled from the low byte of $s[0], byte 1 of
#            $s[1], byte 2 of $s[2] and byte 3 of $s[3];
#   $extra - any 7th argument suppresses the reload of $key from $__key
#            that is otherwise emitted for $i==3.
#
# Where the result goes: for $i<2 it is stored at 4+4*$i(%esp); for
# $i==2 it is left in $acc; for $i==3 it is built in $s[0], after which
# $s[1] and $s[2] are reloaded from $__s0/$__s1 and $s[3] receives $acc.
# $key is used as scratch ($tmp) for $i<3.
sub enccompact()
{ my $Fn = \&mov;	# real code reference instead of a bareword
			# (symbolic) sub name
	while ($#_>5) { pop(@_); $Fn=sub{}; }
	my ($i,$te,@s)=@_;
	my $tmp = $key;
	my $out = $i==3?$s[0]:$acc;

	# $Fn is used in first compact round and its purpose is to
	# void restoration of some values from stack, so that after
	# 4xenccompact with extra argument $key value is left there...
	# lines marked with #%e?x[i] denote "reordered" instructions...
	if ($i==3) {	&$Fn	($key,$__key);			}##%edx
	else {		&mov	($out,$s[0]);			}
			&and	($out,0xFF);
	if ($i==1) {	&shr	($s[0],16);			}#%ebx[1]
	if ($i==2) {	&shr	($s[0],24);			}#%ecx[2]
			&movz	($out,&BP(-128,$te,$out,1));

	if ($i==3) {	$tmp=$s[1];				}##%eax
			&movz	($tmp,&HB($s[1]));
			&movz	($tmp,&BP(-128,$te,$tmp,1));
			&shl	($tmp,8);
			&xor	($out,$tmp);

	if ($i==3) {	$tmp=$s[2]; &mov ($s[1],$__s0);		}##%ebx
	else {		&mov	($tmp,$s[2]);
			&shr	($tmp,16);			}
	if ($i==2) {	&and	($s[1],0xFF);			}#%edx[2]
			&and	($tmp,0xFF);
			&movz	($tmp,&BP(-128,$te,$tmp,1));
			&shl	($tmp,16);
			&xor	($out,$tmp);

	if ($i==3) {	$tmp=$s[3]; &mov ($s[2],$__s1);		}##%ecx
	elsif($i==2){	&movz	($tmp,&HB($s[3]));		}#%ebx[2]
	else {		&mov	($tmp,$s[3]);
			&shr	($tmp,24);			}
			&movz	($tmp,&BP(-128,$te,$tmp,1));
			&shl	($tmp,24);
			&xor	($out,$tmp);
	if ($i<2) {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
	if ($i==3) {	&mov	($s[3],$acc);			}
	&comment();
}
| 472 |
# Emit the column-mixing step of a compact encryption round for one
# state word.
#
# Argument: $i - index (0..3) selecting $s0..$s3; that register is
# transformed in place. $acc, $tbl and $key are clobbered as scratch,
# so the caller has to restore $tbl and $key afterwards.
#
# With r0 the incoming word and r2 its byte-wise doubling, the result is
#	ROTATE(r2^r0,24) ^ r2 ^ rotr(r0,16) ^ rotr(r0,24)
# r2 is computed without branches: bytes with the top bit set yield
# 0x80-0x01=0x7f after the subtraction, which masked with 0x1b gives the
# reduction constant, XORed into the left-shifted (0xfe-masked) bytes.
sub enctransform()
{ my @s = ($s0,$s1,$s2,$s3);
  my $i = shift;
  my $tmp = $tbl;
  my $r2  = $key ;

	&mov	($acc,$s[$i]);
	&and	($acc,0x80808080);		# top bit of every byte
	&mov	($tmp,$acc);
	&shr	($tmp,7);
	&lea	($r2,&DWP(0,$s[$i],$s[$i]));	# r0+r0, i.e. r0<<1
	&sub	($acc,$tmp);			# 0x7f where top bit was set
	&and	($r2,0xfefefefe);		# drop bits carried across bytes
	&and	($acc,0x1b1b1b1b);		# reduction constant per byte
	&mov	($tmp,$s[$i]);
	&xor	($acc,$r2);			# r2

	&xor	($s[$i],$acc);			# r0 ^ r2
	&rotl	($s[$i],24);
	# The terminating ';' below used to be missing, which made Perl
	# parse the following &rotr(...) call as the right-hand operand of
	# a bitwise '&' instead of as a statement of its own.
	&xor	($s[$i],$acc);			# ROTATE(r2^r0,24) ^ r2
	&rotr	($tmp,16);
	&xor	($s[$i],$tmp);
	&rotr	($tmp,8);
	&xor	($s[$i],$tmp);
}
| 498 |
# Generator for _x86_AES_encrypt_compact: integer-only block encryption
# that uses nothing but the 256-byte S-box addressed relative to $tbl.
# The state lives in $s0..$s3, $key points at the key schedule.
# Note that caller is expected to allocate stack frame for me!
{ my @state = ($s0,$s1,$s2,$s3);

	&function_begin_B("_x86_AES_encrypt_compact");
	&mov	($__key,$key);			# save key

	# initial whitening: state ^= rd_key[0..3]
	foreach my $n (0..3) {
		&xor	($state[$n],&DWP(4*$n,$key));
	}

	# $__end = $key + 8*(2*rounds-2) = $key + 16*(rounds-1)
	&mov	($acc,&DWP(240,$key));		# load key->rounds
	&lea	($acc,&DWP(-2,$acc,$acc));
	&lea	($acc,&DWP(0,$key,$acc,8));
	&mov	($__end,$acc);			# end of key schedule

	# prefetch Te4: one load every 32 bytes across the 256-byte table,
	# alternating between $key and $acc as throw-away destinations
	foreach my $line (0..7) {
		&mov	(($line%2 ? $acc : $key),&DWP(32*$line-128,$tbl));
	}

	&set_label("loop",16);

	# inner rounds; the trailing extra argument tells enccompact not to
	# reload $key after the fourth word
	foreach my $n (0..3) {
		&enccompact($n,$tbl,@state[map { ($n+$_)%4 } (0..3)],1);
	}
	foreach my $n (2,3,0,1) {
		&enctransform($n);
	}
	&mov	($key,$__key);			# enctransform clobbered both
	&mov	($tbl,$__tbl);
	&add	($key,16);			# advance rd_key
	foreach my $n (0..3) {
		&xor	($state[$n],&DWP(4*$n,$key));
	}

	&cmp	($key,$__end);
	&mov	($__key,$key);
	&jb	(&label("loop"));

	# last round: substitution only, no column mixing
	foreach my $n (0..3) {
		&enccompact($n,$tbl,@state[map { ($n+$_)%4 } (0..3)]);
	}

	foreach my $n (0..3) {
		&xor	($state[$n],&DWP(16+4*$n,$key));
	}

	&ret	();
	&function_end_B("_x86_AES_encrypt_compact");
}
| 557 |
| 558 ###################################################################### |
| 559 # "Compact" SSE block function. |
| 560 ###################################################################### |
| 561 # |
| 562 # Performance is not actually extraordinary in comparison to pure |
| 563 # x86 code. In particular encrypt performance is virtually the same. |
| 564 # Decrypt performance on the other hand is 15-20% better on newer |
| 565 # µ-archs [but we're thankful for *any* improvement here], and ~50% |
| 566 # better on PIII:-) And additionally on the pros side this code |
| 567 # eliminates redundant references to stack and thus relieves/ |
| 568 # minimizes the pressure on the memory bus. |
| 569 # |
| 570 # MMX register layout lsb |
| 571 # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ |
| 572 # | mm4 | mm0 | |
| 573 # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ |
| 574 # | s3 | s2 | s1 | s0 | |
| 575 # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ |
| 576 # |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0| |
| 577 # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ |
| 578 # |
| 579 # Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8. |
| 580 # In this terms encryption and decryption "compact" permutation |
| 581 # matrices can be depicted as following: |
| 582 # |
| 583 # encryption lsb # decryption lsb |
| 584 # +----++----+----+----+----+ # +----++----+----+----+----+ |
| 585 # | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 | |
| 586 # +----++----+----+----+----+ # +----++----+----+----+----+ |
| 587 # | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 | |
| 588 # +----++----+----+----+----+ # +----++----+----+----+----+ |
| 589 # | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 | |
| 590 # +----++----+----+----+----+ # +----++----+----+----+----+ |
| 591 # | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 | |
| 592 # +----++----+----+----+----+ # +----++----+----+----+----+ |
| 593 # |
| 594 ###################################################################### |
| 595 # Why not xmm registers? Short answer. It was actually tested and |
| 596 # was not any faster, but *contrary*, most notably on Intel CPUs. |
| 597 # Longer answer. Main advantage of using mm registers is that movd |
| 598 # latency is lower, especially on Intel P4. While arithmetic |
| 599 # instructions are twice as many, they can be scheduled every cycle |
| 600 # and not every second one when they are operating on xmm register, |
| 601 # so that "arithmetic throughput" remains virtually the same. And |
| 602 # finally the code can be executed even on elder SSE-only CPUs:-) |
| 603 |
# Emit one "compact" substitution/permutation pass over the state kept
# in mm0 (bytes 0..7) and mm4 (bytes 8..15). Takes no arguments.
#
# Every state byte is replaced through the 256-byte table addressed as
# -128($tbl,index,1) and the results are regrouped into t[0,1] in mm0
# and t[2,3] in mm4. eax/ebx carry the bytes being consumed, ecx/edx
# accumulate output words, $acc is scratch; mm1, mm2, mm5 and mm6 are
# used for staging. Trailing numbers are the state byte indexes handled.
sub sse_enccompact()
{
	# $dst = table[$idx], zero-extended
	my $sbox = sub { my ($dst,$idx) = @_;
		&movz	($dst,&BP(-128,$tbl,$idx,1));
	};
	# $dst |= table[$byte] << $shift, going through $acc
	my $mix = sub { my ($dst,$byte,$shift) = @_;
		&movz	($acc,$byte);
		&$sbox	($acc,$acc);
		&shl	($acc,$shift);
		&or	($dst,$acc);
	};

	&pshufw	("mm1","mm0",0x08);		#  5, 4, 1, 0
	&pshufw	("mm5","mm4",0x0d);		# 15,14,11,10
	&movd	("eax","mm1");			#  5, 4, 1, 0
	&movd	("ebx","mm5");			# 15,14,11,10

	&movz	($acc,&LB("eax"));		#  0
	&$sbox	("ecx",$acc);			#  0 opens t[0] in ecx
	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
	&movz	("edx",&HB("eax"));		#  1
	&$sbox	("edx","edx");			#  1 opens t[3] in edx
	&shl	("edx",8);			#  1
	&shr	("eax",16);			#  5, 4

	&$mix	("ecx",&LB("ebx"),16);		# 10
	&pshufw	("mm6","mm4",0x08);		# 13,12, 9, 8
	&$mix	("edx",&HB("ebx"),24);		# 11
	&shr	("ebx",16);			# 15,14

	&$mix	("ecx",&HB("eax"),8);		#  5
	&$mix	("ecx",&HB("ebx"),24);		# 15
	&movd	("mm0","ecx");			# t[0] collected

	&movz	($acc,&LB("eax"));		#  4
	&$sbox	("ecx",$acc);			#  4 opens t[1]
	&movd	("eax","mm2");			#  7, 6, 3, 2
	&$mix	("ecx",&LB("ebx"),16);		# 14

	&movd	("ebx","mm6");			# 13,12, 9, 8
	&$mix	("ecx",&HB("eax"),24);		#  3
	&$mix	("ecx",&HB("ebx"),8);		#  9
	&movd	("mm1","ecx");			# t[1] collected

	&movz	($acc,&LB("ebx"));		#  8
	&$sbox	("ecx",$acc);			#  8 opens t[2]
	&shr	("ebx",16);			# 13,12
	&$mix	("ecx",&LB("eax"),16);		#  2
	&shr	("eax",16);			#  7, 6

	&punpckldq	("mm0","mm1");		# t[0,1] collected

	&$mix	("ecx",&HB("eax"),24);		#  7
	&and	("eax",0xff);			#  6
	&$sbox	("eax","eax");			#  6
	&shl	("eax",16);			#  6
	&or	("edx","eax");			#  6
	&$mix	("ecx",&HB("ebx"),8);		# 13
	&movd	("mm4","ecx");			# t[2] collected
	&and	("ebx",0xff);			# 12
	&$sbox	("ebx","ebx");			# 12
	&or	("edx","ebx");			# 12
	&movd	("mm5","edx");			# t[3] collected

	&punpckldq	("mm4","mm5");		# t[2,3] collected
}
| 690 |
# Generator for _sse_AES_encrypt_compact, emitted only when the target is
# not restricted to plain 386 code ($x86only false).
#
# The state is kept in mm0 (low 8 bytes) and mm4 (high 8 bytes), $key
# points at the key schedule and $tbl at the compact S-box (addressed
# with a -128 bias). Each loop iteration runs sse_enccompact(), advances
# $key by 16 and, unless $key has moved past $__end, applies the column
# mixing to both halves in parallel and adds the next round key.
# Note that caller is expected to allocate stack frame for me!
if (!$x86only) {
	&function_begin_B("_sse_AES_encrypt_compact");
	&pxor	("mm0",&QWP(0,$key));	#  7, 6, 5, 4, 3, 2, 1, 0
	&pxor	("mm4",&QWP(8,$key));	# 15,14,13,12,11,10, 9, 8

	# $__end = $key + 8*(2*rounds-2) = $key + 16*(rounds-1)
	&mov	($acc,&DWP(240,$key));		# load key->rounds
	&lea	($acc,&DWP(-2,$acc,$acc));
	&lea	($acc,&DWP(0,$key,$acc,8));
	&mov	($__end,$acc);			# end of key schedule

	# 8 bytes of 0x1b at 8(%esp), later loaded as one quad word
	&mov	($s0,0x1b1b1b1b);		# magic constant
	&mov	(&DWP(8,"esp"),$s0);
	&mov	(&DWP(12,"esp"),$s0);

	# prefetch Te4
	&mov	($s0,&DWP(0-128,$tbl));
	&mov	($s1,&DWP(32-128,$tbl));
	&mov	($s2,&DWP(64-128,$tbl));
	&mov	($s3,&DWP(96-128,$tbl));
	&mov	($s0,&DWP(128-128,$tbl));
	&mov	($s1,&DWP(160-128,$tbl));
	&mov	($s2,&DWP(192-128,$tbl));
	&mov	($s3,&DWP(224-128,$tbl));

	&set_label("loop",16);
		&sse_enccompact();
		&add	($key,16);
		&cmp	($key,$__end);
		&ja	(&label("out"));	# last round skips the mixing

		# left column works on mm0, right column on mm4
		&movq	("mm2",&QWP(8,"esp"));	# 0x1b in every byte
		&pxor	("mm3","mm3");		&pxor	("mm7","mm7");
		&movq	("mm1","mm0");		&movq	("mm5","mm4");	# r0
		&pcmpgtb("mm3","mm0");		&pcmpgtb("mm7","mm4");	# 0xff where top bit set
		&pand	("mm3","mm2");		&pand	("mm7","mm2");
		&pshufw	("mm2","mm0",0xb1);	&pshufw	("mm6","mm4",0xb1);# ROTATE(r0,16)
		&paddb	("mm0","mm0");		&paddb	("mm4","mm4");
		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# = r2
		&pshufw	("mm3","mm2",0xb1);	&pshufw	("mm7","mm6",0xb1);# r0
		&pxor	("mm1","mm0");		&pxor	("mm5","mm4");	# r0^r2
		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= ROTATE(r0,16)

		&movq	("mm2","mm3");		&movq	("mm6","mm7");
		&pslld	("mm3",8);		&pslld	("mm7",8);
		&psrld	("mm2",24);		&psrld	("mm6",24);
		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= r0<<8
		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= r0>>24

		&movq	("mm3","mm1");		&movq	("mm7","mm5");
		&movq	("mm2",&QWP(0,$key));	&movq	("mm6",&QWP(8,$key));
		&psrld	("mm1",8);		&psrld	("mm5",8);
		# the four integer loads touch the S-box at a 64-byte stride
		&mov	($s0,&DWP(0-128,$tbl));
		&pslld	("mm3",24);		&pslld	("mm7",24);
		&mov	($s1,&DWP(64-128,$tbl));
		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= (r2^r0)>>8
		&mov	($s2,&DWP(128-128,$tbl));
		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= (r2^r0)<<24
		&mov	($s3,&DWP(192-128,$tbl));

		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# add round key
		&jmp	(&label("loop"));

	&set_label("out",16);
	&pxor	("mm0",&QWP(0,$key));
	&pxor	("mm4",&QWP(8,$key));

	&ret	();
	&function_end_B("_sse_AES_encrypt_compact");
}
| 761 |
| 762 ###################################################################### |
| 763 # Vanilla block function. |
| 764 ###################################################################### |
| 765 |
177 sub encstep() | 766 sub encstep() |
178 { my ($i,$te,@s) = @_; | 767 { my ($i,$te,@s) = @_; |
179 my $tmp = $key; | 768 my $tmp = $key; |
180 my $out = $i==3?$s[0]:$acc; | 769 my $out = $i==3?$s[0]:$acc; |
181 | 770 |
182 # lines marked with #%e?x[i] denote "reordered" instructions... | 771 # lines marked with #%e?x[i] denote "reordered" instructions... |
183 » if ($i==3) {» &mov» ($key,&DWP(12,"esp"));» » }##%edx | 772 » if ($i==3) {» &mov» ($key,$__key);» » » }##%edx |
184 else { &mov ($out,$s[0]); | 773 else { &mov ($out,$s[0]); |
185 &and ($out,0xFF); } | 774 &and ($out,0xFF); } |
186 if ($i==1) { &shr ($s[0],16); }#%ebx[1] | 775 if ($i==1) { &shr ($s[0],16); }#%ebx[1] |
187 if ($i==2) { &shr ($s[0],24); }#%ecx[2] | 776 if ($i==2) { &shr ($s[0],24); }#%ecx[2] |
188 &mov ($out,&DWP(0,$te,$out,8)); | 777 &mov ($out,&DWP(0,$te,$out,8)); |
189 | 778 |
190 if ($i==3) { $tmp=$s[1]; }##%eax | 779 if ($i==3) { $tmp=$s[1]; }##%eax |
191 &movz ($tmp,&HB($s[1])); | 780 &movz ($tmp,&HB($s[1])); |
192 &xor ($out,&DWP(3,$te,$tmp,8)); | 781 &xor ($out,&DWP(3,$te,$tmp,8)); |
193 | 782 |
194 » if ($i==3) {» $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp"));»}##%ebx | 783 » if ($i==3) {» $tmp=$s[2]; &mov ($s[1],$__s0);»» }##%ebx |
195 else { &mov ($tmp,$s[2]); | 784 else { &mov ($tmp,$s[2]); |
196 &shr ($tmp,16); } | 785 &shr ($tmp,16); } |
197 if ($i==2) { &and ($s[1],0xFF); }#%edx[2] | 786 if ($i==2) { &and ($s[1],0xFF); }#%edx[2] |
198 &and ($tmp,0xFF); | 787 &and ($tmp,0xFF); |
199 &xor ($out,&DWP(2,$te,$tmp,8)); | 788 &xor ($out,&DWP(2,$te,$tmp,8)); |
200 | 789 |
201 » if ($i==3) {» $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp"));»}##%ecx | 790 » if ($i==3) {» $tmp=$s[3]; &mov ($s[2],$__s1);»» }##%ecx |
202 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] | 791 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] |
203 else { &mov ($tmp,$s[3]); | 792 else { &mov ($tmp,$s[3]); |
204 &shr ($tmp,24) } | 793 &shr ($tmp,24) } |
205 &xor ($out,&DWP(1,$te,$tmp,8)); | 794 &xor ($out,&DWP(1,$te,$tmp,8)); |
206 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } | 795 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } |
207 if ($i==3) { &mov ($s[3],$acc); } | 796 if ($i==3) { &mov ($s[3],$acc); } |
208 &comment(); | 797 &comment(); |
209 } | 798 } |
210 | 799 |
211 sub enclast() | 800 sub enclast() |
212 { my ($i,$te,@s)=@_; | 801 { my ($i,$te,@s)=@_; |
213 my $tmp = $key; | 802 my $tmp = $key; |
214 my $out = $i==3?$s[0]:$acc; | 803 my $out = $i==3?$s[0]:$acc; |
215 | 804 |
216 » if ($i==3) {» &mov» ($key,&DWP(12,"esp"));» » }##%edx | 805 » if ($i==3) {» &mov» ($key,$__key);» » » }##%edx |
217 else { &mov ($out,$s[0]); } | 806 else { &mov ($out,$s[0]); } |
218 &and ($out,0xFF); | 807 &and ($out,0xFF); |
219 if ($i==1) { &shr ($s[0],16); }#%ebx[1] | 808 if ($i==1) { &shr ($s[0],16); }#%ebx[1] |
220 if ($i==2) { &shr ($s[0],24); }#%ecx[2] | 809 if ($i==2) { &shr ($s[0],24); }#%ecx[2] |
221 &mov ($out,&DWP(2,$te,$out,8)); | 810 &mov ($out,&DWP(2,$te,$out,8)); |
222 &and ($out,0x000000ff); | 811 &and ($out,0x000000ff); |
223 | 812 |
224 if ($i==3) { $tmp=$s[1]; }##%eax | 813 if ($i==3) { $tmp=$s[1]; }##%eax |
225 &movz ($tmp,&HB($s[1])); | 814 &movz ($tmp,&HB($s[1])); |
226 &mov ($tmp,&DWP(0,$te,$tmp,8)); | 815 &mov ($tmp,&DWP(0,$te,$tmp,8)); |
227 &and ($tmp,0x0000ff00); | 816 &and ($tmp,0x0000ff00); |
228 &xor ($out,$tmp); | 817 &xor ($out,$tmp); |
229 | 818 |
230 » if ($i==3) {» $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp"));»}##%ebx | 819 » if ($i==3) {» $tmp=$s[2]; &mov ($s[1],$__s0);»» }##%ebx |
231 » else {» mov» ($tmp,$s[2]); | 820 » else {» &mov» ($tmp,$s[2]); |
232 &shr ($tmp,16); } | 821 &shr ($tmp,16); } |
233 if ($i==2) { &and ($s[1],0xFF); }#%edx[2] | 822 if ($i==2) { &and ($s[1],0xFF); }#%edx[2] |
234 &and ($tmp,0xFF); | 823 &and ($tmp,0xFF); |
235 &mov ($tmp,&DWP(0,$te,$tmp,8)); | 824 &mov ($tmp,&DWP(0,$te,$tmp,8)); |
236 &and ($tmp,0x00ff0000); | 825 &and ($tmp,0x00ff0000); |
237 &xor ($out,$tmp); | 826 &xor ($out,$tmp); |
238 | 827 |
239 » if ($i==3) {» $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp"));»}##%ecx | 828 » if ($i==3) {» $tmp=$s[3]; &mov ($s[2],$__s1);»» }##%ecx |
240 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] | 829 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] |
241 else { &mov ($tmp,$s[3]); | 830 else { &mov ($tmp,$s[3]); |
242 &shr ($tmp,24); } | 831 &shr ($tmp,24); } |
243 &mov ($tmp,&DWP(2,$te,$tmp,8)); | 832 &mov ($tmp,&DWP(2,$te,$tmp,8)); |
244 &and ($tmp,0xff000000); | 833 &and ($tmp,0xff000000); |
245 &xor ($out,$tmp); | 834 &xor ($out,$tmp); |
246 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } | 835 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } |
247 if ($i==3) { &mov ($s[3],$acc); } | 836 if ($i==3) { &mov ($s[3],$acc); } |
248 } | 837 } |
249 | 838 |
250 sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |
251 | |
252 &public_label("AES_Te"); | |
253 &function_begin_B("_x86_AES_encrypt"); | 839 &function_begin_B("_x86_AES_encrypt"); |
254 if ($vertical_spin) { | 840 if ($vertical_spin) { |
255 # I need high parts of volatile registers to be accessible... | 841 # I need high parts of volatile registers to be accessible... |
256 &exch ($s1="edi",$key="ebx"); | 842 &exch ($s1="edi",$key="ebx"); |
257 &mov ($s2="esi",$acc="ecx"); | 843 &mov ($s2="esi",$acc="ecx"); |
258 } | 844 } |
259 | 845 |
260 # note that caller is expected to allocate stack frame for me! | 846 # note that caller is expected to allocate stack frame for me! |
261 » &mov» (&DWP(12,"esp"),$key);» » # save key | 847 » &mov» ($__key,$key);» » » # save key |
262 | 848 |
263 &xor ($s0,&DWP(0,$key)); # xor with key | 849 &xor ($s0,&DWP(0,$key)); # xor with key |
264 &xor ($s1,&DWP(4,$key)); | 850 &xor ($s1,&DWP(4,$key)); |
265 &xor ($s2,&DWP(8,$key)); | 851 &xor ($s2,&DWP(8,$key)); |
266 &xor ($s3,&DWP(12,$key)); | 852 &xor ($s3,&DWP(12,$key)); |
267 | 853 |
268 &mov ($acc,&DWP(240,$key)); # load key->rounds | 854 &mov ($acc,&DWP(240,$key)); # load key->rounds |
269 | 855 |
270 if ($small_footprint) { | 856 if ($small_footprint) { |
271 &lea ($acc,&DWP(-2,$acc,$acc)); | 857 &lea ($acc,&DWP(-2,$acc,$acc)); |
272 &lea ($acc,&DWP(0,$key,$acc,8)); | 858 &lea ($acc,&DWP(0,$key,$acc,8)); |
273 » &mov» (&DWP(16,"esp"),$acc);» # end of key schedule | 859 » &mov» ($__end,$acc);» » # end of key schedule |
274 » &align» (4); | 860 |
275 » &set_label("loop"); | 861 » &set_label("loop",16); |
276 if ($vertical_spin) { | 862 if ($vertical_spin) { |
277 » » &encvert("ebp",$s0,$s1,$s2,$s3); | 863 » » &encvert($tbl,$s0,$s1,$s2,$s3); |
278 } else { | 864 } else { |
279 » » &encstep(0,"ebp",$s0,$s1,$s2,$s3); | 865 » » &encstep(0,$tbl,$s0,$s1,$s2,$s3); |
280 » » &encstep(1,"ebp",$s1,$s2,$s3,$s0); | 866 » » &encstep(1,$tbl,$s1,$s2,$s3,$s0); |
281 » » &encstep(2,"ebp",$s2,$s3,$s0,$s1); | 867 » » &encstep(2,$tbl,$s2,$s3,$s0,$s1); |
282 » » &encstep(3,"ebp",$s3,$s0,$s1,$s2); | 868 » » &encstep(3,$tbl,$s3,$s0,$s1,$s2); |
283 } | 869 } |
284 &add ($key,16); # advance rd_key | 870 &add ($key,16); # advance rd_key |
285 &xor ($s0,&DWP(0,$key)); | 871 &xor ($s0,&DWP(0,$key)); |
286 &xor ($s1,&DWP(4,$key)); | 872 &xor ($s1,&DWP(4,$key)); |
287 &xor ($s2,&DWP(8,$key)); | 873 &xor ($s2,&DWP(8,$key)); |
288 &xor ($s3,&DWP(12,$key)); | 874 &xor ($s3,&DWP(12,$key)); |
289 » &cmp» ($key,&DWP(16,"esp")); | 875 » &cmp» ($key,$__end); |
290 » &mov» (&DWP(12,"esp"),$key); | 876 » &mov» ($__key,$key); |
291 &jb (&label("loop")); | 877 &jb (&label("loop")); |
292 } | 878 } |
293 else { | 879 else { |
294 &cmp ($acc,10); | 880 &cmp ($acc,10); |
295 &jle (&label("10rounds")); | 881 &jle (&label("10rounds")); |
296 &cmp ($acc,12); | 882 &cmp ($acc,12); |
297 &jle (&label("12rounds")); | 883 &jle (&label("12rounds")); |
298 | 884 |
299 » &set_label("14rounds"); | 885 » &set_label("14rounds",4); |
300 for ($i=1;$i<3;$i++) { | 886 for ($i=1;$i<3;$i++) { |
301 if ($vertical_spin) { | 887 if ($vertical_spin) { |
302 » » &encvert("ebp",$s0,$s1,$s2,$s3); | 888 » » &encvert($tbl,$s0,$s1,$s2,$s3); |
303 } else { | 889 } else { |
304 » » &encstep(0,"ebp",$s0,$s1,$s2,$s3); | 890 » » &encstep(0,$tbl,$s0,$s1,$s2,$s3); |
305 » » &encstep(1,"ebp",$s1,$s2,$s3,$s0); | 891 » » &encstep(1,$tbl,$s1,$s2,$s3,$s0); |
306 » » &encstep(2,"ebp",$s2,$s3,$s0,$s1); | 892 » » &encstep(2,$tbl,$s2,$s3,$s0,$s1); |
307 » » &encstep(3,"ebp",$s3,$s0,$s1,$s2); | 893 » » &encstep(3,$tbl,$s3,$s0,$s1,$s2); |
308 } | 894 } |
309 &xor ($s0,&DWP(16*$i+0,$key)); | 895 &xor ($s0,&DWP(16*$i+0,$key)); |
310 &xor ($s1,&DWP(16*$i+4,$key)); | 896 &xor ($s1,&DWP(16*$i+4,$key)); |
311 &xor ($s2,&DWP(16*$i+8,$key)); | 897 &xor ($s2,&DWP(16*$i+8,$key)); |
312 &xor ($s3,&DWP(16*$i+12,$key)); | 898 &xor ($s3,&DWP(16*$i+12,$key)); |
313 } | 899 } |
314 &add ($key,32); | 900 &add ($key,32); |
315 » &mov» (&DWP(12,"esp"),$key);» # advance rd_key | 901 » &mov» ($__key,$key);» » # advance rd_key |
316 » &set_label("12rounds"); | 902 » &set_label("12rounds",4); |
317 for ($i=1;$i<3;$i++) { | 903 for ($i=1;$i<3;$i++) { |
318 if ($vertical_spin) { | 904 if ($vertical_spin) { |
319 » » &encvert("ebp",$s0,$s1,$s2,$s3); | 905 » » &encvert($tbl,$s0,$s1,$s2,$s3); |
320 } else { | 906 } else { |
321 » » &encstep(0,"ebp",$s0,$s1,$s2,$s3); | 907 » » &encstep(0,$tbl,$s0,$s1,$s2,$s3); |
322 » » &encstep(1,"ebp",$s1,$s2,$s3,$s0); | 908 » » &encstep(1,$tbl,$s1,$s2,$s3,$s0); |
323 » » &encstep(2,"ebp",$s2,$s3,$s0,$s1); | 909 » » &encstep(2,$tbl,$s2,$s3,$s0,$s1); |
324 » » &encstep(3,"ebp",$s3,$s0,$s1,$s2); | 910 » » &encstep(3,$tbl,$s3,$s0,$s1,$s2); |
325 } | 911 } |
326 &xor ($s0,&DWP(16*$i+0,$key)); | 912 &xor ($s0,&DWP(16*$i+0,$key)); |
327 &xor ($s1,&DWP(16*$i+4,$key)); | 913 &xor ($s1,&DWP(16*$i+4,$key)); |
328 &xor ($s2,&DWP(16*$i+8,$key)); | 914 &xor ($s2,&DWP(16*$i+8,$key)); |
329 &xor ($s3,&DWP(16*$i+12,$key)); | 915 &xor ($s3,&DWP(16*$i+12,$key)); |
330 } | 916 } |
331 &add ($key,32); | 917 &add ($key,32); |
332 » &mov» (&DWP(12,"esp"),$key);» # advance rd_key | 918 » &mov» ($__key,$key);» » # advance rd_key |
333 » &set_label("10rounds"); | 919 » &set_label("10rounds",4); |
334 for ($i=1;$i<10;$i++) { | 920 for ($i=1;$i<10;$i++) { |
335 if ($vertical_spin) { | 921 if ($vertical_spin) { |
336 » » &encvert("ebp",$s0,$s1,$s2,$s3); | 922 » » &encvert($tbl,$s0,$s1,$s2,$s3); |
337 } else { | 923 } else { |
338 » » &encstep(0,"ebp",$s0,$s1,$s2,$s3); | 924 » » &encstep(0,$tbl,$s0,$s1,$s2,$s3); |
339 » » &encstep(1,"ebp",$s1,$s2,$s3,$s0); | 925 » » &encstep(1,$tbl,$s1,$s2,$s3,$s0); |
340 » » &encstep(2,"ebp",$s2,$s3,$s0,$s1); | 926 » » &encstep(2,$tbl,$s2,$s3,$s0,$s1); |
341 » » &encstep(3,"ebp",$s3,$s0,$s1,$s2); | 927 » » &encstep(3,$tbl,$s3,$s0,$s1,$s2); |
342 } | 928 } |
343 &xor ($s0,&DWP(16*$i+0,$key)); | 929 &xor ($s0,&DWP(16*$i+0,$key)); |
344 &xor ($s1,&DWP(16*$i+4,$key)); | 930 &xor ($s1,&DWP(16*$i+4,$key)); |
345 &xor ($s2,&DWP(16*$i+8,$key)); | 931 &xor ($s2,&DWP(16*$i+8,$key)); |
346 &xor ($s3,&DWP(16*$i+12,$key)); | 932 &xor ($s3,&DWP(16*$i+12,$key)); |
347 } | 933 } |
348 } | 934 } |
349 | 935 |
350 if ($vertical_spin) { | 936 if ($vertical_spin) { |
351 # "reincarnate" some registers for "horizontal" spin... | 937 # "reincarnate" some registers for "horizontal" spin... |
352 &mov ($s1="ebx",$key="edi"); | 938 &mov ($s1="ebx",$key="edi"); |
353 &mov ($s2="ecx",$acc="esi"); | 939 &mov ($s2="ecx",$acc="esi"); |
354 } | 940 } |
355 » &enclast(0,"ebp",$s0,$s1,$s2,$s3); | 941 » &enclast(0,$tbl,$s0,$s1,$s2,$s3); |
356 » &enclast(1,"ebp",$s1,$s2,$s3,$s0); | 942 » &enclast(1,$tbl,$s1,$s2,$s3,$s0); |
357 » &enclast(2,"ebp",$s2,$s3,$s0,$s1); | 943 » &enclast(2,$tbl,$s2,$s3,$s0,$s1); |
358 » &enclast(3,"ebp",$s3,$s0,$s1,$s2); | 944 » &enclast(3,$tbl,$s3,$s0,$s1,$s2); |
359 | 945 |
360 &add ($key,$small_footprint?16:160); | 946 &add ($key,$small_footprint?16:160); |
361 &xor ($s0,&DWP(0,$key)); | 947 &xor ($s0,&DWP(0,$key)); |
362 &xor ($s1,&DWP(4,$key)); | 948 &xor ($s1,&DWP(4,$key)); |
363 &xor ($s2,&DWP(8,$key)); | 949 &xor ($s2,&DWP(8,$key)); |
364 &xor ($s3,&DWP(12,$key)); | 950 &xor ($s3,&DWP(12,$key)); |
365 | 951 |
366 &ret (); | 952 &ret (); |
367 | 953 |
368 &set_label("AES_Te",64); # Yes! I keep it in the code segment! | 954 &set_label("AES_Te",64); # Yes! I keep it in the code segment! |
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
423 &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969); | 1009 &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969); |
424 &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27); | 1010 &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27); |
425 &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122); | 1011 &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122); |
426 &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433); | 1012 &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433); |
427 &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9); | 1013 &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9); |
428 &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5); | 1014 &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5); |
429 &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a); | 1015 &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a); |
430 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); | 1016 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); |
431 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); | 1017 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); |
432 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); | 1018 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); |
| 1019 |
| 1020 #Te4 # four copies of Te4 to choose from to avoid L1 aliasing |
| 1021 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); |
| 1022 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); |
| 1023 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); |
| 1024 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); |
| 1025 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); |
| 1026 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); |
| 1027 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); |
| 1028 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); |
| 1029 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); |
| 1030 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); |
| 1031 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); |
| 1032 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); |
| 1033 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); |
| 1034 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); |
| 1035 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); |
| 1036 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); |
| 1037 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); |
| 1038 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); |
| 1039 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); |
| 1040 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); |
| 1041 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); |
| 1042 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); |
| 1043 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); |
| 1044 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); |
| 1045 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); |
| 1046 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); |
| 1047 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); |
| 1048 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); |
| 1049 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); |
| 1050 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); |
| 1051 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); |
| 1052 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); |
| 1053 |
| 1054 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); |
| 1055 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); |
| 1056 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); |
| 1057 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); |
| 1058 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); |
| 1059 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); |
| 1060 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); |
| 1061 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); |
| 1062 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); |
| 1063 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); |
| 1064 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); |
| 1065 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); |
| 1066 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); |
| 1067 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); |
| 1068 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); |
| 1069 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); |
| 1070 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); |
| 1071 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); |
| 1072 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); |
| 1073 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); |
| 1074 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); |
| 1075 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); |
| 1076 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); |
| 1077 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); |
| 1078 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); |
| 1079 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); |
| 1080 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); |
| 1081 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); |
| 1082 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); |
| 1083 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); |
| 1084 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); |
| 1085 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); |
| 1086 |
| 1087 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); |
| 1088 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); |
| 1089 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); |
| 1090 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); |
| 1091 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); |
| 1092 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); |
| 1093 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); |
| 1094 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); |
| 1095 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); |
| 1096 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); |
| 1097 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); |
| 1098 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); |
| 1099 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); |
| 1100 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); |
| 1101 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); |
| 1102 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); |
| 1103 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); |
| 1104 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); |
| 1105 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); |
| 1106 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); |
| 1107 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); |
| 1108 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); |
| 1109 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); |
| 1110 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); |
| 1111 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); |
| 1112 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); |
| 1113 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); |
| 1114 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); |
| 1115 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); |
| 1116 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); |
| 1117 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); |
| 1118 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); |
| 1119 |
| 1120 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); |
| 1121 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); |
| 1122 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); |
| 1123 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); |
| 1124 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); |
| 1125 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); |
| 1126 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); |
| 1127 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); |
| 1128 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); |
| 1129 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); |
| 1130 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); |
| 1131 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); |
| 1132 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); |
| 1133 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); |
| 1134 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); |
| 1135 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); |
| 1136 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); |
| 1137 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); |
| 1138 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); |
| 1139 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); |
| 1140 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); |
| 1141 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); |
| 1142 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); |
| 1143 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); |
| 1144 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); |
| 1145 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); |
| 1146 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); |
| 1147 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); |
| 1148 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); |
| 1149 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); |
| 1150 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); |
| 1151 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); |
433 #rcon: | 1152 #rcon: |
434 &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008); | 1153 &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008); |
435 &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080); | 1154 &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080); |
436 » &data_word(0x0000001b, 0x00000036, 0, 0, 0, 0, 0, 0); | 1155 » &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000); |
| 1156 » &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000); |
437 &function_end_B("_x86_AES_encrypt"); | 1157 &function_end_B("_x86_AES_encrypt"); |
438 | 1158 |
439 # void AES_encrypt (const void *inp,void *out,const AES_KEY *key); | 1159 # void AES_encrypt (const void *inp,void *out,const AES_KEY *key); |
440 &public_label("AES_Te"); | |
441 &function_begin("AES_encrypt"); | 1160 &function_begin("AES_encrypt"); |
442 &mov ($acc,&wparam(0)); # load inp | 1161 &mov ($acc,&wparam(0)); # load inp |
443 &mov ($key,&wparam(2)); # load key | 1162 &mov ($key,&wparam(2)); # load key |
444 | 1163 |
445 &mov ($s0,"esp"); | 1164 &mov ($s0,"esp"); |
446 » &sub» ("esp",24); | 1165 » &sub» ("esp",36); |
447 » &and» ("esp",-64); | 1166 » &and» ("esp",-64);» » » # align to cache-line |
448 » &add» ("esp",4); | 1167 |
449 » &mov» (&DWP(16,"esp"),$s0); | 1168 » # place stack frame just "above" the key schedule |
| 1169 » &lea» ($s1,&DWP(-64-63,$key)); |
| 1170 » &sub» ($s1,"esp"); |
| 1171 » &neg» ($s1); |
| 1172 » &and» ($s1,0x3C0);» # modulo 1024, but aligned to cache-line |
| 1173 » &sub» ("esp",$s1); |
| 1174 » &add» ("esp",4);» # 4 is reserved for caller's return address |
| 1175 » &mov» ($_esp,$s0);» » » # save stack pointer |
450 | 1176 |
451 &call (&label("pic_point")); # make it PIC! | 1177 &call (&label("pic_point")); # make it PIC! |
452 &set_label("pic_point"); | 1178 &set_label("pic_point"); |
453 » &blindpop("ebp"); | 1179 » &blindpop($tbl); |
454 » &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); | 1180 » &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only); |
| 1181 » &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl)); |
455 | 1182 |
| 1183 # pick Te4 copy which can't "overlap" with stack frame or key schedule |
| 1184 &lea ($s1,&DWP(768-4,"esp")); |
| 1185 &sub ($s1,$tbl); |
| 1186 &and ($s1,0x300); |
| 1187 &lea ($tbl,&DWP(2048+128,$tbl,$s1)); |
| 1188 |
| 1189 if (!$x86only) { |
| 1190 &bt (&DWP(0,$s0),25); # check for SSE bit |
| 1191 &jnc (&label("x86")); |
| 1192 |
| 1193 &movq ("mm0",&QWP(0,$acc)); |
| 1194 &movq ("mm4",&QWP(8,$acc)); |
| 1195 &call ("_sse_AES_encrypt_compact"); |
| 1196 &mov ("esp",$_esp); # restore stack pointer |
| 1197 &mov ($acc,&wparam(1)); # load out |
| 1198 &movq (&QWP(0,$acc),"mm0"); # write output data |
| 1199 &movq (&QWP(8,$acc),"mm4"); |
| 1200 &emms (); |
| 1201 &function_end_A(); |
| 1202 } |
| 1203 &set_label("x86",16); |
| 1204 &mov ($_tbl,$tbl); |
456 &mov ($s0,&DWP(0,$acc)); # load input data | 1205 &mov ($s0,&DWP(0,$acc)); # load input data |
457 &mov ($s1,&DWP(4,$acc)); | 1206 &mov ($s1,&DWP(4,$acc)); |
458 &mov ($s2,&DWP(8,$acc)); | 1207 &mov ($s2,&DWP(8,$acc)); |
459 &mov ($s3,&DWP(12,$acc)); | 1208 &mov ($s3,&DWP(12,$acc)); |
460 | 1209 » &call» ("_x86_AES_encrypt_compact"); |
461 » &call» ("_x86_AES_encrypt"); | 1210 » &mov» ("esp",$_esp);» » » # restore stack pointer |
462 | |
463 » &mov» ("esp",&DWP(16,"esp")); | |
464 | |
465 &mov ($acc,&wparam(1)); # load out | 1211 &mov ($acc,&wparam(1)); # load out |
466 &mov (&DWP(0,$acc),$s0); # write output data | 1212 &mov (&DWP(0,$acc),$s0); # write output data |
467 &mov (&DWP(4,$acc),$s1); | 1213 &mov (&DWP(4,$acc),$s1); |
468 &mov (&DWP(8,$acc),$s2); | 1214 &mov (&DWP(8,$acc),$s2); |
469 &mov (&DWP(12,$acc),$s3); | 1215 &mov (&DWP(12,$acc),$s3); |
470 &function_end("AES_encrypt"); | 1216 &function_end("AES_encrypt"); |
471 | 1217 |
472 #------------------------------------------------------------------# | 1218 #--------------------------------------------------------------------# |
| 1219 |
| 1220 ###################################################################### |
| 1221 # "Compact" block function |
| 1222 ###################################################################### |
| 1223 |
| 1224 sub deccompact() |
| 1225 { my $Fn = mov; |
| 1226 while ($#_>5) { pop(@_); $Fn=sub{}; } |
| 1227 my ($i,$td,@s)=@_; |
| 1228 my $tmp = $key; |
| 1229 my $out = $i==3?$s[0]:$acc; |
| 1230 |
| 1231 # $Fn is used in first compact round and its purpose is to |
| 1232 # void restoration of some values from stack, so that after |
| 1233 # 4xdeccompact with extra argument $key, $s0 and $s1 values |
| 1234 # are left there... |
| 1235 if($i==3) { &$Fn ($key,$__key); } |
| 1236 else { &mov ($out,$s[0]); } |
| 1237 &and ($out,0xFF); |
| 1238 &movz ($out,&BP(-128,$td,$out,1)); |
| 1239 |
| 1240 if ($i==3) { $tmp=$s[1]; } |
| 1241 &movz ($tmp,&HB($s[1])); |
| 1242 &movz ($tmp,&BP(-128,$td,$tmp,1)); |
| 1243 &shl ($tmp,8); |
| 1244 &xor ($out,$tmp); |
| 1245 |
| 1246 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); } |
| 1247 else { mov ($tmp,$s[2]); } |
| 1248 &shr ($tmp,16); |
| 1249 &and ($tmp,0xFF); |
| 1250 &movz ($tmp,&BP(-128,$td,$tmp,1)); |
| 1251 &shl ($tmp,16); |
| 1252 &xor ($out,$tmp); |
| 1253 |
| 1254 if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],$__s1); } |
| 1255 else { &mov ($tmp,$s[3]); } |
| 1256 &shr ($tmp,24); |
| 1257 &movz ($tmp,&BP(-128,$td,$tmp,1)); |
| 1258 &shl ($tmp,24); |
| 1259 &xor ($out,$tmp); |
| 1260 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } |
| 1261 if ($i==3) { &$Fn ($s[3],$__s0); } |
| 1262 } |
| 1263 |
| 1264 # must be called with 2,3,0,1 as argument sequence!!! |
| 1265 sub dectransform() |
| 1266 { my @s = ($s0,$s1,$s2,$s3); |
| 1267 my $i = shift; |
| 1268 my $tmp = $key; |
| 1269 my $tp2 = @s[($i+2)%4]; $tp2 = @s[2] if ($i==1); |
| 1270 my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1); |
| 1271 my $tp8 = $tbl; |
| 1272 |
| 1273 &mov ($acc,$s[$i]); |
| 1274 &and ($acc,0x80808080); |
| 1275 &mov ($tmp,$acc); |
| 1276 &shr ($tmp,7); |
| 1277 &lea ($tp2,&DWP(0,$s[$i],$s[$i])); |
| 1278 &sub ($acc,$tmp); |
| 1279 &and ($tp2,0xfefefefe); |
| 1280 &and ($acc,0x1b1b1b1b); |
| 1281 &xor ($acc,$tp2); |
| 1282 &mov ($tp2,$acc); |
| 1283 |
| 1284 &and ($acc,0x80808080); |
| 1285 &mov ($tmp,$acc); |
| 1286 &shr ($tmp,7); |
| 1287 &lea ($tp4,&DWP(0,$tp2,$tp2)); |
| 1288 &sub ($acc,$tmp); |
| 1289 &and ($tp4,0xfefefefe); |
| 1290 &and ($acc,0x1b1b1b1b); |
| 1291 &xor ($tp2,$s[$i]); # tp2^tp1 |
| 1292 &xor ($acc,$tp4); |
| 1293 &mov ($tp4,$acc); |
| 1294 |
| 1295 &and ($acc,0x80808080); |
| 1296 &mov ($tmp,$acc); |
| 1297 &shr ($tmp,7); |
| 1298 &lea ($tp8,&DWP(0,$tp4,$tp4)); |
| 1299 &sub ($acc,$tmp); |
| 1300 &and ($tp8,0xfefefefe); |
| 1301 &and ($acc,0x1b1b1b1b); |
| 1302 &xor ($tp4,$s[$i]); # tp4^tp1 |
| 1303 &rotl ($s[$i],8); # = ROTATE(tp1,8) |
| 1304 &xor ($tp8,$acc); |
| 1305 |
| 1306 &xor ($s[$i],$tp2); |
| 1307 &xor ($tp2,$tp8); |
| 1308 &rotl ($tp2,24); |
| 1309 &xor ($s[$i],$tp4); |
| 1310 &xor ($tp4,$tp8); |
| 1311 &rotl ($tp4,16); |
| 1312 &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1) |
| 1313 &rotl ($tp8,8); |
| 1314 &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24) |
| 1315 &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16) |
| 1316 &mov ($s[0],$__s0) if($i==2); #prefetch $s0 |
| 1317 &mov ($s[1],$__s1) if($i==3); #prefetch $s1 |
| 1318 &mov ($s[2],$__s2) if($i==1); |
| 1319 &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8) |
| 1320 |
| 1321 &mov ($s[3],$__s3) if($i==1); |
| 1322 &mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2); |
| 1323 } |
| 1324 |
| 1325 &function_begin_B("_x86_AES_decrypt_compact"); |
| 1326 # note that caller is expected to allocate stack frame for me! |
| 1327 &mov ($__key,$key); # save key |
| 1328 |
| 1329 &xor ($s0,&DWP(0,$key)); # xor with key |
| 1330 &xor ($s1,&DWP(4,$key)); |
| 1331 &xor ($s2,&DWP(8,$key)); |
| 1332 &xor ($s3,&DWP(12,$key)); |
| 1333 |
| 1334 &mov ($acc,&DWP(240,$key)); # load key->rounds |
| 1335 |
| 1336 &lea ($acc,&DWP(-2,$acc,$acc)); |
| 1337 &lea ($acc,&DWP(0,$key,$acc,8)); |
| 1338 &mov ($__end,$acc); # end of key schedule |
| 1339 |
| 1340 # prefetch Td4 |
| 1341 &mov ($key,&DWP(0-128,$tbl)); |
| 1342 &mov ($acc,&DWP(32-128,$tbl)); |
| 1343 &mov ($key,&DWP(64-128,$tbl)); |
| 1344 &mov ($acc,&DWP(96-128,$tbl)); |
| 1345 &mov ($key,&DWP(128-128,$tbl)); |
| 1346 &mov ($acc,&DWP(160-128,$tbl)); |
| 1347 &mov ($key,&DWP(192-128,$tbl)); |
| 1348 &mov ($acc,&DWP(224-128,$tbl)); |
| 1349 |
| 1350 &set_label("loop",16); |
| 1351 |
| 1352 &deccompact(0,$tbl,$s0,$s3,$s2,$s1,1); |
| 1353 &deccompact(1,$tbl,$s1,$s0,$s3,$s2,1); |
| 1354 &deccompact(2,$tbl,$s2,$s1,$s0,$s3,1); |
| 1355 &deccompact(3,$tbl,$s3,$s2,$s1,$s0,1); |
| 1356 &dectransform(2); |
| 1357 &dectransform(3); |
| 1358 &dectransform(0); |
| 1359 &dectransform(1); |
| 1360 &mov ($key,$__key); |
| 1361 &mov ($tbl,$__tbl); |
| 1362 &add ($key,16); # advance rd_key |
| 1363 &xor ($s0,&DWP(0,$key)); |
| 1364 &xor ($s1,&DWP(4,$key)); |
| 1365 &xor ($s2,&DWP(8,$key)); |
| 1366 &xor ($s3,&DWP(12,$key)); |
| 1367 |
| 1368 &cmp ($key,$__end); |
| 1369 &mov ($__key,$key); |
| 1370 &jb (&label("loop")); |
| 1371 |
| 1372 &deccompact(0,$tbl,$s0,$s3,$s2,$s1); |
| 1373 &deccompact(1,$tbl,$s1,$s0,$s3,$s2); |
| 1374 &deccompact(2,$tbl,$s2,$s1,$s0,$s3); |
| 1375 &deccompact(3,$tbl,$s3,$s2,$s1,$s0); |
| 1376 |
| 1377 &xor ($s0,&DWP(16,$key)); |
| 1378 &xor ($s1,&DWP(20,$key)); |
| 1379 &xor ($s2,&DWP(24,$key)); |
| 1380 &xor ($s3,&DWP(28,$key)); |
| 1381 |
| 1382 &ret (); |
| 1383 &function_end_B("_x86_AES_decrypt_compact"); |
| 1384 |
| 1385 ###################################################################### |
| 1386 # "Compact" SSE block function. |
| 1387 ###################################################################### |
| 1388 |
| 1389 sub sse_deccompact() |
| 1390 { |
| 1391 &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0 |
| 1392 &movd ("eax","mm1"); # 7, 6, 1, 0 |
| 1393 |
| 1394 &pshufw ("mm5","mm4",0x09); # 13,12,11,10 |
| 1395 &movz ($acc,&LB("eax")); # 0 |
| 1396 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 |
| 1397 &movd ("ebx","mm5"); # 13,12,11,10 |
| 1398 &movz ("edx",&HB("eax")); # 1 |
| 1399 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1 |
| 1400 &shl ("edx",8); # 1 |
| 1401 |
| 1402 &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4 |
| 1403 &movz ($acc,&LB("ebx")); # 10 |
| 1404 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10 |
| 1405 &shl ($acc,16); # 10 |
| 1406 &or ("ecx",$acc); # 10 |
| 1407 &shr ("eax",16); # 7, 6 |
| 1408 &movz ($acc,&HB("ebx")); # 11 |
| 1409 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11 |
| 1410 &shl ($acc,24); # 11 |
| 1411 &or ("edx",$acc); # 11 |
| 1412 &shr ("ebx",16); # 13,12 |
| 1413 |
| 1414 &pshufw ("mm6","mm4",0x03); # 9, 8,15,14 |
| 1415 &movz ($acc,&HB("eax")); # 7 |
| 1416 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7 |
| 1417 &shl ($acc,24); # 7 |
| 1418 &or ("ecx",$acc); # 7 |
| 1419 &movz ($acc,&HB("ebx")); # 13 |
| 1420 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13 |
| 1421 &shl ($acc,8); # 13 |
| 1422 &or ("ecx",$acc); # 13 |
| 1423 &movd ("mm0","ecx"); # t[0] collected |
| 1424 |
| 1425 &movz ($acc,&LB("eax")); # 6 |
| 1426 &movd ("eax","mm2"); # 3, 2, 5, 4 |
| 1427 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6 |
| 1428 &shl ("ecx",16); # 6 |
| 1429 &movz ($acc,&LB("ebx")); # 12 |
| 1430 &movd ("ebx","mm6"); # 9, 8,15,14 |
| 1431 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12 |
| 1432 &or ("ecx",$acc); # 12 |
| 1433 |
| 1434 &movz ($acc,&LB("eax")); # 4 |
| 1435 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4 |
| 1436 &or ("edx",$acc); # 4 |
| 1437 &movz ($acc,&LB("ebx")); # 14 |
| 1438 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14 |
| 1439 &shl ($acc,16); # 14 |
| 1440 &or ("edx",$acc); # 14 |
| 1441 &movd ("mm1","edx"); # t[1] collected |
| 1442 |
| 1443 &movz ($acc,&HB("eax")); # 5 |
| 1444 &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5 |
| 1445 &shl ("edx",8); # 5 |
| 1446 &movz ($acc,&HB("ebx")); # 15 |
| 1447 &shr ("eax",16); # 3, 2 |
| 1448 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15 |
| 1449 &shl ($acc,24); # 15 |
| 1450 &or ("edx",$acc); # 15 |
| 1451 &shr ("ebx",16); # 9, 8 |
| 1452 |
| 1453 &punpckldq ("mm0","mm1"); # t[0,1] collected |
| 1454 |
| 1455 &movz ($acc,&HB("ebx")); # 9 |
| 1456 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9 |
| 1457 &shl ($acc,8); # 9 |
| 1458 &or ("ecx",$acc); # 9 |
| 1459 &and ("ebx",0xff); # 8 |
| 1460 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8 |
| 1461 &or ("edx","ebx"); # 8 |
| 1462 &movz ($acc,&LB("eax")); # 2 |
| 1463 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2 |
| 1464 &shl ($acc,16); # 2 |
| 1465 &or ("edx",$acc); # 2 |
| 1466 &movd ("mm4","edx"); # t[2] collected |
| 1467 &movz ("eax",&HB("eax")); # 3 |
| 1468 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3 |
| 1469 &shl ("eax",24); # 3 |
| 1470 &or ("ecx","eax"); # 3 |
| 1471 &movd ("mm5","ecx"); # t[3] collected |
| 1472 |
| 1473 &punpckldq ("mm4","mm5"); # t[2,3] collected |
| 1474 } |
| 1475 |
| 1476 if (!$x86only) { |
| 1477 &function_begin_B("_sse_AES_decrypt_compact"); |
| 1478 &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0 |
| 1479 &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8 |
| 1480 |
| 1481 # note that caller is expected to allocate stack frame for me! |
| 1482 &mov ($acc,&DWP(240,$key)); # load key->rounds |
| 1483 &lea ($acc,&DWP(-2,$acc,$acc)); |
| 1484 &lea ($acc,&DWP(0,$key,$acc,8)); |
| 1485 &mov ($__end,$acc); # end of key schedule |
| 1486 |
| 1487 &mov ($s0,0x1b1b1b1b); # magic constant |
| 1488 &mov (&DWP(8,"esp"),$s0); |
| 1489 &mov (&DWP(12,"esp"),$s0); |
| 1490 |
| 1491 # prefetch Td4 |
| 1492 &mov ($s0,&DWP(0-128,$tbl)); |
| 1493 &mov ($s1,&DWP(32-128,$tbl)); |
| 1494 &mov ($s2,&DWP(64-128,$tbl)); |
| 1495 &mov ($s3,&DWP(96-128,$tbl)); |
| 1496 &mov ($s0,&DWP(128-128,$tbl)); |
| 1497 &mov ($s1,&DWP(160-128,$tbl)); |
| 1498 &mov ($s2,&DWP(192-128,$tbl)); |
| 1499 &mov ($s3,&DWP(224-128,$tbl)); |
| 1500 |
| 1501 &set_label("loop",16); |
| 1502 &sse_deccompact(); |
| 1503 &add ($key,16); |
| 1504 &cmp ($key,$__end); |
| 1505 &ja (&label("out")); |
| 1506 |
| 1507 # ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N) |
| 1508 &movq ("mm3","mm0"); &movq ("mm7","mm4"); |
| 1509 &movq ("mm2","mm0",1); &movq ("mm6","mm4",1); |
| 1510 &movq ("mm1","mm0"); &movq ("mm5","mm4"); |
| 1511 &pshufw ("mm0","mm0",0xb1); &pshufw ("mm4","mm4",0xb1);# = ROTATE(tp0,16) |
| 1512 &pslld ("mm2",8); &pslld ("mm6",8); |
| 1513 &psrld ("mm3",8); &psrld ("mm7",8); |
| 1514 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<8 |
| 1515 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>8 |
| 1516 &pslld ("mm2",16); &pslld ("mm6",16); |
| 1517 &psrld ("mm3",16); &psrld ("mm7",16); |
| 1518 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<24 |
| 1519 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>24 |
| 1520 |
| 1521 &movq ("mm3",&QWP(8,"esp")); |
| 1522 &pxor ("mm2","mm2"); &pxor ("mm6","mm6"); |
| 1523 &pcmpgtb("mm2","mm1"); &pcmpgtb("mm6","mm5"); |
| 1524 &pand ("mm2","mm3"); &pand ("mm6","mm3"); |
| 1525 &paddb ("mm1","mm1"); &paddb ("mm5","mm5"); |
| 1526 &pxor ("mm1","mm2"); &pxor ("mm5","mm6"); # tp2 |
| 1527 &movq ("mm3","mm1"); &movq ("mm7","mm5"); |
| 1528 &movq ("mm2","mm1"); &movq ("mm6","mm5"); |
| 1529 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp2 |
| 1530 &pslld ("mm3",24); &pslld ("mm7",24); |
| 1531 &psrld ("mm2",8); &psrld ("mm6",8); |
| 1532 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp2<<24 |
| 1533 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp2>>8 |
| 1534 |
| 1535 &movq ("mm2",&QWP(8,"esp")); |
| 1536 &pxor ("mm3","mm3"); &pxor ("mm7","mm7"); |
| 1537 &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5"); |
| 1538 &pand ("mm3","mm2"); &pand ("mm7","mm2"); |
| 1539 &paddb ("mm1","mm1"); &paddb ("mm5","mm5"); |
| 1540 &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4 |
| 1541 &pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1); |
| 1542 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4 |
| 1543 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16) |
| 1544 |
| 1545 &pxor ("mm3","mm3"); &pxor ("mm7","mm7"); |
| 1546 &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5"); |
| 1547 &pand ("mm3","mm2"); &pand ("mm7","mm2"); |
| 1548 &paddb ("mm1","mm1"); &paddb ("mm5","mm5"); |
| 1549 &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp8 |
| 1550 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8 |
| 1551 &movq ("mm3","mm1"); &movq ("mm7","mm5"); |
| 1552 &pshufw ("mm2","mm1",0xb1); &pshufw ("mm6","mm5",0xb1); |
| 1553 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16) |
| 1554 &pslld ("mm1",8); &pslld ("mm5",8); |
| 1555 &psrld ("mm3",8); &psrld ("mm7",8); |
| 1556 &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key)); |
| 1557 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<8 |
| 1558 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>8 |
| 1559 &mov ($s0,&DWP(0-128,$tbl)); |
| 1560 &pslld ("mm1",16); &pslld ("mm5",16); |
| 1561 &mov ($s1,&DWP(64-128,$tbl)); |
| 1562 &psrld ("mm3",16); &psrld ("mm7",16); |
| 1563 &mov ($s2,&DWP(128-128,$tbl)); |
| 1564 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<24 |
| 1565 &mov ($s3,&DWP(192-128,$tbl)); |
| 1566 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>24 |
| 1567 |
| 1568 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); |
| 1569 &jmp (&label("loop")); |
| 1570 |
| 1571 &set_label("out",16); |
| 1572 &pxor ("mm0",&QWP(0,$key)); |
| 1573 &pxor ("mm4",&QWP(8,$key)); |
| 1574 |
| 1575 &ret (); |
| 1576 &function_end_B("_sse_AES_decrypt_compact"); |
| 1577 } |
| 1578 |
| 1579 ###################################################################### |
| 1580 # Vanilla block function. |
| 1581 ###################################################################### |
473 | 1582 |
474 sub decstep() | 1583 sub decstep() |
475 { my ($i,$td,@s) = @_; | 1584 { my ($i,$td,@s) = @_; |
476 my $tmp = $key; | 1585 my $tmp = $key; |
477 my $out = $i==3?$s[0]:$acc; | 1586 my $out = $i==3?$s[0]:$acc; |
478 | 1587 |
479 # no instructions are reordered, as performance appears | 1588 # no instructions are reordered, as performance appears |
480 # optimal... or rather that all attempts to reorder didn't | 1589 # optimal... or rather that all attempts to reorder didn't |
481 # result in better performance [which by the way is not a | 1590 # result in better performance [which by the way is not a |
482 # bit lower than encryption]. | 1591 # bit lower than encryption]. |
483 » if($i==3) {» &mov» ($key,&DWP(12,"esp"));» » } | 1592 » if($i==3) {» &mov» ($key,$__key);» » » } |
484 else { &mov ($out,$s[0]); } | 1593 else { &mov ($out,$s[0]); } |
485 &and ($out,0xFF); | 1594 &and ($out,0xFF); |
486 &mov ($out,&DWP(0,$td,$out,8)); | 1595 &mov ($out,&DWP(0,$td,$out,8)); |
487 | 1596 |
488 if ($i==3) { $tmp=$s[1]; } | 1597 if ($i==3) { $tmp=$s[1]; } |
489 &movz ($tmp,&HB($s[1])); | 1598 &movz ($tmp,&HB($s[1])); |
490 &xor ($out,&DWP(3,$td,$tmp,8)); | 1599 &xor ($out,&DWP(3,$td,$tmp,8)); |
491 | 1600 |
492 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); } | 1601 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); } |
493 else { &mov ($tmp,$s[2]); } | 1602 else { &mov ($tmp,$s[2]); } |
494 &shr ($tmp,16); | 1603 &shr ($tmp,16); |
495 &and ($tmp,0xFF); | 1604 &and ($tmp,0xFF); |
496 &xor ($out,&DWP(2,$td,$tmp,8)); | 1605 &xor ($out,&DWP(2,$td,$tmp,8)); |
497 | 1606 |
498 » if ($i==3) {» $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp"));»} | 1607 » if ($i==3) {» $tmp=$s[3]; &mov ($s[2],$__s1);»» } |
499 else { &mov ($tmp,$s[3]); } | 1608 else { &mov ($tmp,$s[3]); } |
500 &shr ($tmp,24); | 1609 &shr ($tmp,24); |
501 &xor ($out,&DWP(1,$td,$tmp,8)); | 1610 &xor ($out,&DWP(1,$td,$tmp,8)); |
502 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } | 1611 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } |
503 » if ($i==3) {» &mov» ($s[3],&DWP(4,"esp"));» » } | 1612 » if ($i==3) {» &mov» ($s[3],$__s0);» » » } |
504 &comment(); | 1613 &comment(); |
505 } | 1614 } |
506 | 1615 |
507 sub declast() | 1616 sub declast() |
508 { my ($i,$td,@s)=@_; | 1617 { my ($i,$td,@s)=@_; |
509 my $tmp = $key; | 1618 my $tmp = $key; |
510 my $out = $i==3?$s[0]:$acc; | 1619 my $out = $i==3?$s[0]:$acc; |
511 | 1620 |
512 » if($i==3) {» &mov» ($key,&DWP(12,"esp"));» » } | 1621 » if($i==0) {» &lea» ($td,&DWP(2048+128,$td)); |
| 1622 » » » &mov» ($tmp,&DWP(0-128,$td)); |
| 1623 » » » &mov» ($acc,&DWP(32-128,$td)); |
| 1624 » » » &mov» ($tmp,&DWP(64-128,$td)); |
| 1625 » » » &mov» ($acc,&DWP(96-128,$td)); |
| 1626 » » » &mov» ($tmp,&DWP(128-128,$td)); |
| 1627 » » » &mov» ($acc,&DWP(160-128,$td)); |
| 1628 » » » &mov» ($tmp,&DWP(192-128,$td)); |
| 1629 » » » &mov» ($acc,&DWP(224-128,$td)); |
| 1630 » » » &lea» ($td,&DWP(-128,$td));» » } |
| 1631 » if($i==3) {» &mov» ($key,$__key);» » » } |
513 else { &mov ($out,$s[0]); } | 1632 else { &mov ($out,$s[0]); } |
514 &and ($out,0xFF); | 1633 &and ($out,0xFF); |
515 » » » &movz» ($out,&BP(2048,$td,$out,1)); | 1634 » » » &movz» ($out,&BP(0,$td,$out,1)); |
516 | 1635 |
517 if ($i==3) { $tmp=$s[1]; } | 1636 if ($i==3) { $tmp=$s[1]; } |
518 &movz ($tmp,&HB($s[1])); | 1637 &movz ($tmp,&HB($s[1])); |
519 » » » &movz» ($tmp,&BP(2048,$td,$tmp,1)); | 1638 » » » &movz» ($tmp,&BP(0,$td,$tmp,1)); |
520 &shl ($tmp,8); | 1639 &shl ($tmp,8); |
521 &xor ($out,$tmp); | 1640 &xor ($out,$tmp); |
522 | 1641 |
523 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); } | 1642 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); } |
524 else { mov ($tmp,$s[2]); } | 1643 else { mov ($tmp,$s[2]); } |
525 &shr ($tmp,16); | 1644 &shr ($tmp,16); |
526 &and ($tmp,0xFF); | 1645 &and ($tmp,0xFF); |
527 » » » &movz» ($tmp,&BP(2048,$td,$tmp,1)); | 1646 » » » &movz» ($tmp,&BP(0,$td,$tmp,1)); |
528 &shl ($tmp,16); | 1647 &shl ($tmp,16); |
529 &xor ($out,$tmp); | 1648 &xor ($out,$tmp); |
530 | 1649 |
531 » if ($i==3) {» $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp"));»} | 1650 » if ($i==3) {» $tmp=$s[3]; &mov ($s[2],$__s1);»» } |
532 else { &mov ($tmp,$s[3]); } | 1651 else { &mov ($tmp,$s[3]); } |
533 &shr ($tmp,24); | 1652 &shr ($tmp,24); |
534 » » » &movz» ($tmp,&BP(2048,$td,$tmp,1)); | 1653 » » » &movz» ($tmp,&BP(0,$td,$tmp,1)); |
535 &shl ($tmp,24); | 1654 &shl ($tmp,24); |
536 &xor ($out,$tmp); | 1655 &xor ($out,$tmp); |
537 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } | 1656 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } |
538 » if ($i==3) {» &mov» ($s[3],&DWP(4,"esp"));» » } | 1657 » if ($i==3) {» &mov» ($s[3],$__s0); |
| 1658 » » » &lea» ($td,&DWP(-2048,$td));» » } |
539 } | 1659 } |
540 | 1660 |
541 &public_label("AES_Td"); | |
542 &function_begin_B("_x86_AES_decrypt"); | 1661 &function_begin_B("_x86_AES_decrypt"); |
543 # note that caller is expected to allocate stack frame for me! | 1662 # note that caller is expected to allocate stack frame for me! |
544 » &mov» (&DWP(12,"esp"),$key);» » # save key | 1663 » &mov» ($__key,$key);» » » # save key |
545 | 1664 |
546 &xor ($s0,&DWP(0,$key)); # xor with key | 1665 &xor ($s0,&DWP(0,$key)); # xor with key |
547 &xor ($s1,&DWP(4,$key)); | 1666 &xor ($s1,&DWP(4,$key)); |
548 &xor ($s2,&DWP(8,$key)); | 1667 &xor ($s2,&DWP(8,$key)); |
549 &xor ($s3,&DWP(12,$key)); | 1668 &xor ($s3,&DWP(12,$key)); |
550 | 1669 |
551 &mov ($acc,&DWP(240,$key)); # load key->rounds | 1670 &mov ($acc,&DWP(240,$key)); # load key->rounds |
552 | 1671 |
553 if ($small_footprint) { | 1672 if ($small_footprint) { |
554 &lea ($acc,&DWP(-2,$acc,$acc)); | 1673 &lea ($acc,&DWP(-2,$acc,$acc)); |
555 &lea ($acc,&DWP(0,$key,$acc,8)); | 1674 &lea ($acc,&DWP(0,$key,$acc,8)); |
556 » &mov» (&DWP(16,"esp"),$acc);» # end of key schedule | 1675 » &mov» ($__end,$acc);» » # end of key schedule |
557 » &align» (4); | 1676 » &set_label("loop",16); |
558 » &set_label("loop"); | 1677 » » &decstep(0,$tbl,$s0,$s3,$s2,$s1); |
559 » » &decstep(0,"ebp",$s0,$s3,$s2,$s1); | 1678 » » &decstep(1,$tbl,$s1,$s0,$s3,$s2); |
560 » » &decstep(1,"ebp",$s1,$s0,$s3,$s2); | 1679 » » &decstep(2,$tbl,$s2,$s1,$s0,$s3); |
561 » » &decstep(2,"ebp",$s2,$s1,$s0,$s3); | 1680 » » &decstep(3,$tbl,$s3,$s2,$s1,$s0); |
562 » » &decstep(3,"ebp",$s3,$s2,$s1,$s0); | |
563 &add ($key,16); # advance rd_key | 1681 &add ($key,16); # advance rd_key |
564 &xor ($s0,&DWP(0,$key)); | 1682 &xor ($s0,&DWP(0,$key)); |
565 &xor ($s1,&DWP(4,$key)); | 1683 &xor ($s1,&DWP(4,$key)); |
566 &xor ($s2,&DWP(8,$key)); | 1684 &xor ($s2,&DWP(8,$key)); |
567 &xor ($s3,&DWP(12,$key)); | 1685 &xor ($s3,&DWP(12,$key)); |
568 » &cmp» ($key,&DWP(16,"esp")); | 1686 » &cmp» ($key,$__end); |
569 » &mov» (&DWP(12,"esp"),$key); | 1687 » &mov» ($__key,$key); |
570 &jb (&label("loop")); | 1688 &jb (&label("loop")); |
571 } | 1689 } |
572 else { | 1690 else { |
573 &cmp ($acc,10); | 1691 &cmp ($acc,10); |
574 &jle (&label("10rounds")); | 1692 &jle (&label("10rounds")); |
575 &cmp ($acc,12); | 1693 &cmp ($acc,12); |
576 &jle (&label("12rounds")); | 1694 &jle (&label("12rounds")); |
577 | 1695 |
578 » &set_label("14rounds"); | 1696 » &set_label("14rounds",4); |
579 for ($i=1;$i<3;$i++) { | 1697 for ($i=1;$i<3;$i++) { |
580 » » &decstep(0,"ebp",$s0,$s3,$s2,$s1); | 1698 » » &decstep(0,$tbl,$s0,$s3,$s2,$s1); |
581 » » &decstep(1,"ebp",$s1,$s0,$s3,$s2); | 1699 » » &decstep(1,$tbl,$s1,$s0,$s3,$s2); |
582 » » &decstep(2,"ebp",$s2,$s1,$s0,$s3); | 1700 » » &decstep(2,$tbl,$s2,$s1,$s0,$s3); |
583 » » &decstep(3,"ebp",$s3,$s2,$s1,$s0); | 1701 » » &decstep(3,$tbl,$s3,$s2,$s1,$s0); |
584 &xor ($s0,&DWP(16*$i+0,$key)); | 1702 &xor ($s0,&DWP(16*$i+0,$key)); |
585 &xor ($s1,&DWP(16*$i+4,$key)); | 1703 &xor ($s1,&DWP(16*$i+4,$key)); |
586 &xor ($s2,&DWP(16*$i+8,$key)); | 1704 &xor ($s2,&DWP(16*$i+8,$key)); |
587 &xor ($s3,&DWP(16*$i+12,$key)); | 1705 &xor ($s3,&DWP(16*$i+12,$key)); |
588 } | 1706 } |
589 &add ($key,32); | 1707 &add ($key,32); |
590 » &mov» (&DWP(12,"esp"),$key);» # advance rd_key | 1708 » &mov» ($__key,$key);» » # advance rd_key |
591 » &set_label("12rounds"); | 1709 » &set_label("12rounds",4); |
592 for ($i=1;$i<3;$i++) { | 1710 for ($i=1;$i<3;$i++) { |
593 » » &decstep(0,"ebp",$s0,$s3,$s2,$s1); | 1711 » » &decstep(0,$tbl,$s0,$s3,$s2,$s1); |
594 » » &decstep(1,"ebp",$s1,$s0,$s3,$s2); | 1712 » » &decstep(1,$tbl,$s1,$s0,$s3,$s2); |
595 » » &decstep(2,"ebp",$s2,$s1,$s0,$s3); | 1713 » » &decstep(2,$tbl,$s2,$s1,$s0,$s3); |
596 » » &decstep(3,"ebp",$s3,$s2,$s1,$s0); | 1714 » » &decstep(3,$tbl,$s3,$s2,$s1,$s0); |
597 &xor ($s0,&DWP(16*$i+0,$key)); | 1715 &xor ($s0,&DWP(16*$i+0,$key)); |
598 &xor ($s1,&DWP(16*$i+4,$key)); | 1716 &xor ($s1,&DWP(16*$i+4,$key)); |
599 &xor ($s2,&DWP(16*$i+8,$key)); | 1717 &xor ($s2,&DWP(16*$i+8,$key)); |
600 &xor ($s3,&DWP(16*$i+12,$key)); | 1718 &xor ($s3,&DWP(16*$i+12,$key)); |
601 } | 1719 } |
602 &add ($key,32); | 1720 &add ($key,32); |
603 » &mov» (&DWP(12,"esp"),$key);» # advance rd_key | 1721 » &mov» ($__key,$key);» » # advance rd_key |
604 » &set_label("10rounds"); | 1722 » &set_label("10rounds",4); |
605 for ($i=1;$i<10;$i++) { | 1723 for ($i=1;$i<10;$i++) { |
606 » » &decstep(0,"ebp",$s0,$s3,$s2,$s1); | 1724 » » &decstep(0,$tbl,$s0,$s3,$s2,$s1); |
607 » » &decstep(1,"ebp",$s1,$s0,$s3,$s2); | 1725 » » &decstep(1,$tbl,$s1,$s0,$s3,$s2); |
608 » » &decstep(2,"ebp",$s2,$s1,$s0,$s3); | 1726 » » &decstep(2,$tbl,$s2,$s1,$s0,$s3); |
609 » » &decstep(3,"ebp",$s3,$s2,$s1,$s0); | 1727 » » &decstep(3,$tbl,$s3,$s2,$s1,$s0); |
610 &xor ($s0,&DWP(16*$i+0,$key)); | 1728 &xor ($s0,&DWP(16*$i+0,$key)); |
611 &xor ($s1,&DWP(16*$i+4,$key)); | 1729 &xor ($s1,&DWP(16*$i+4,$key)); |
612 &xor ($s2,&DWP(16*$i+8,$key)); | 1730 &xor ($s2,&DWP(16*$i+8,$key)); |
613 &xor ($s3,&DWP(16*$i+12,$key)); | 1731 &xor ($s3,&DWP(16*$i+12,$key)); |
614 } | 1732 } |
615 } | 1733 } |
616 | 1734 |
617 » &declast(0,"ebp",$s0,$s3,$s2,$s1); | 1735 » &declast(0,$tbl,$s0,$s3,$s2,$s1); |
618 » &declast(1,"ebp",$s1,$s0,$s3,$s2); | 1736 » &declast(1,$tbl,$s1,$s0,$s3,$s2); |
619 » &declast(2,"ebp",$s2,$s1,$s0,$s3); | 1737 » &declast(2,$tbl,$s2,$s1,$s0,$s3); |
620 » &declast(3,"ebp",$s3,$s2,$s1,$s0); | 1738 » &declast(3,$tbl,$s3,$s2,$s1,$s0); |
621 | 1739 |
622 &add ($key,$small_footprint?16:160); | 1740 &add ($key,$small_footprint?16:160); |
623 &xor ($s0,&DWP(0,$key)); | 1741 &xor ($s0,&DWP(0,$key)); |
624 &xor ($s1,&DWP(4,$key)); | 1742 &xor ($s1,&DWP(4,$key)); |
625 &xor ($s2,&DWP(8,$key)); | 1743 &xor ($s2,&DWP(8,$key)); |
626 &xor ($s3,&DWP(12,$key)); | 1744 &xor ($s3,&DWP(12,$key)); |
627 | 1745 |
628 &ret (); | 1746 &ret (); |
629 | 1747 |
630 &set_label("AES_Td",64); # Yes! I keep it in the code segment! | 1748 &set_label("AES_Td",64); # Yes! I keep it in the code segment! |
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
685 &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb); | 1803 &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb); |
686 &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d); | 1804 &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d); |
687 &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb); | 1805 &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb); |
688 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a); | 1806 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a); |
689 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773); | 1807 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773); |
690 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478); | 1808 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478); |
691 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2); | 1809 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2); |
692 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); | 1810 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); |
693 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); | 1811 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); |
694 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); | 1812 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); |
695 #Td4: | 1813 |
| 1814 #Td4:» # four copies of Td4 to choose from to avoid L1 aliasing |
696 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | 1815 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); |
697 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | 1816 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); |
698 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | 1817 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); |
699 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | 1818 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); |
700 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | 1819 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); |
701 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | 1820 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); |
702 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | 1821 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); |
703 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | 1822 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); |
704 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | 1823 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); |
705 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | 1824 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); |
(...skipping 12 matching lines...) Expand all Loading... |
718 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | 1837 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); |
719 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | 1838 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); |
720 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | 1839 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); |
721 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | 1840 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); |
722 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | 1841 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); |
723 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | 1842 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); |
724 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | 1843 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); |
725 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | 1844 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); |
726 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | 1845 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); |
727 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | 1846 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); |
| 1847 |
| 1848 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); |
| 1849 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); |
| 1850 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); |
| 1851 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); |
| 1852 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); |
| 1853 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); |
| 1854 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); |
| 1855 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); |
| 1856 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); |
| 1857 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); |
| 1858 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); |
| 1859 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); |
| 1860 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); |
| 1861 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); |
| 1862 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); |
| 1863 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); |
| 1864 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); |
| 1865 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); |
| 1866 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); |
| 1867 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); |
| 1868 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); |
| 1869 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); |
| 1870 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); |
| 1871 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); |
| 1872 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); |
| 1873 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); |
| 1874 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); |
| 1875 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); |
| 1876 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); |
| 1877 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); |
| 1878 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); |
| 1879 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); |
| 1880 |
| 1881 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); |
| 1882 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); |
| 1883 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); |
| 1884 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); |
| 1885 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); |
| 1886 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); |
| 1887 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); |
| 1888 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); |
| 1889 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); |
| 1890 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); |
| 1891 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); |
| 1892 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); |
| 1893 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); |
| 1894 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); |
| 1895 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); |
| 1896 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); |
| 1897 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); |
| 1898 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); |
| 1899 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); |
| 1900 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); |
| 1901 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); |
| 1902 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); |
| 1903 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); |
| 1904 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); |
| 1905 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); |
| 1906 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); |
| 1907 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); |
| 1908 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); |
| 1909 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); |
| 1910 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); |
| 1911 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); |
| 1912 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); |
| 1913 |
| 1914 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); |
| 1915 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); |
| 1916 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); |
| 1917 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); |
| 1918 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); |
| 1919 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); |
| 1920 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); |
| 1921 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); |
| 1922 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); |
| 1923 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); |
| 1924 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); |
| 1925 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); |
| 1926 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); |
| 1927 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); |
| 1928 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); |
| 1929 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); |
| 1930 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); |
| 1931 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); |
| 1932 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); |
| 1933 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); |
| 1934 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); |
| 1935 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); |
| 1936 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); |
| 1937 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); |
| 1938 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); |
| 1939 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); |
| 1940 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); |
| 1941 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); |
| 1942 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); |
| 1943 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); |
| 1944 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); |
| 1945 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); |
728 &function_end_B("_x86_AES_decrypt"); | 1946 &function_end_B("_x86_AES_decrypt"); |
729 | 1947 |
730 # void AES_decrypt (const void *inp,void *out,const AES_KEY *key); | 1948 # void AES_decrypt (const void *inp,void *out,const AES_KEY *key); |
731 &public_label("AES_Td"); | |
732 &function_begin("AES_decrypt"); | 1949 &function_begin("AES_decrypt"); |
733 &mov ($acc,&wparam(0)); # load inp | 1950 &mov ($acc,&wparam(0)); # load inp |
734 &mov ($key,&wparam(2)); # load key | 1951 &mov ($key,&wparam(2)); # load key |
735 | 1952 |
736 &mov ($s0,"esp"); | 1953 &mov ($s0,"esp"); |
737 » &sub» ("esp",24); | 1954 » &sub» ("esp",36); |
738 » &and» ("esp",-64); | 1955 » &and» ("esp",-64);» » » # align to cache-line |
739 » &add» ("esp",4); | 1956 |
740 » &mov» (&DWP(16,"esp"),$s0); | 1957 » # place stack frame just "above" the key schedule |
| 1958 » &lea» ($s1,&DWP(-64-63,$key)); |
| 1959 » &sub» ($s1,"esp"); |
| 1960 » &neg» ($s1); |
| 1961 » &and» ($s1,0x3C0);» # modulo 1024, but aligned to cache-line |
| 1962 » &sub» ("esp",$s1); |
| 1963 » &add» ("esp",4);» # 4 is reserved for caller's return address |
| 1964 » &mov» ($_esp,$s0);» # save stack pointer |
741 | 1965 |
742 &call (&label("pic_point")); # make it PIC! | 1966 &call (&label("pic_point")); # make it PIC! |
743 &set_label("pic_point"); | 1967 &set_label("pic_point"); |
744 » &blindpop("ebp"); | 1968 » &blindpop($tbl); |
745 » &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); | 1969 » &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only); |
| 1970 » &lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl)); |
746 | 1971 |
747 » # prefetch Td4 | 1972 » # pick Td4 copy which can't "overlap" with stack frame or key schedule |
748 » &lea» ("ebp",&DWP(2048+128,"ebp")); | 1973 » &lea» ($s1,&DWP(768-4,"esp")); |
749 » &mov» ($s0,&DWP(0-128,"ebp")); | 1974 » &sub» ($s1,$tbl); |
750 » &mov» ($s1,&DWP(32-128,"ebp")); | 1975 » &and» ($s1,0x300); |
751 » &mov» ($s2,&DWP(64-128,"ebp")); | 1976 » &lea» ($tbl,&DWP(2048+128,$tbl,$s1)); |
752 » &mov» ($s3,&DWP(96-128,"ebp")); | |
753 » &mov» ($s0,&DWP(128-128,"ebp")); | |
754 » &mov» ($s1,&DWP(160-128,"ebp")); | |
755 » &mov» ($s2,&DWP(192-128,"ebp")); | |
756 » &mov» ($s3,&DWP(224-128,"ebp")); | |
757 » &lea» ("ebp",&DWP(-2048-128,"ebp")); | |
758 | 1977 |
| 1978 if (!$x86only) { |
| 1979 &bt (&DWP(0,$s0),25); # check for SSE bit |
| 1980 &jnc (&label("x86")); |
| 1981 |
| 1982 &movq ("mm0",&QWP(0,$acc)); |
| 1983 &movq ("mm4",&QWP(8,$acc)); |
| 1984 &call ("_sse_AES_decrypt_compact"); |
| 1985 &mov ("esp",$_esp); # restore stack pointer |
| 1986 &mov ($acc,&wparam(1)); # load out |
| 1987 &movq (&QWP(0,$acc),"mm0"); # write output data |
| 1988 &movq (&QWP(8,$acc),"mm4"); |
| 1989 &emms (); |
| 1990 &function_end_A(); |
| 1991 } |
| 1992 &set_label("x86",16); |
| 1993 &mov ($_tbl,$tbl); |
759 &mov ($s0,&DWP(0,$acc)); # load input data | 1994 &mov ($s0,&DWP(0,$acc)); # load input data |
760 &mov ($s1,&DWP(4,$acc)); | 1995 &mov ($s1,&DWP(4,$acc)); |
761 &mov ($s2,&DWP(8,$acc)); | 1996 &mov ($s2,&DWP(8,$acc)); |
762 &mov ($s3,&DWP(12,$acc)); | 1997 &mov ($s3,&DWP(12,$acc)); |
763 | 1998 » &call» ("_x86_AES_decrypt_compact"); |
764 » &call» ("_x86_AES_decrypt"); | 1999 » &mov» ("esp",$_esp);» » » # restore stack pointer |
765 | |
766 » &mov» ("esp",&DWP(16,"esp")); | |
767 | |
768 &mov ($acc,&wparam(1)); # load out | 2000 &mov ($acc,&wparam(1)); # load out |
769 &mov (&DWP(0,$acc),$s0); # write output data | 2001 &mov (&DWP(0,$acc),$s0); # write output data |
770 &mov (&DWP(4,$acc),$s1); | 2002 &mov (&DWP(4,$acc),$s1); |
771 &mov (&DWP(8,$acc),$s2); | 2003 &mov (&DWP(8,$acc),$s2); |
772 &mov (&DWP(12,$acc),$s3); | 2004 &mov (&DWP(12,$acc),$s3); |
773 &function_end("AES_decrypt"); | 2005 &function_end("AES_decrypt"); |
774 | 2006 |
775 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out, | 2007 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out, |
776 # size_t length, const AES_KEY *key, | 2008 # size_t length, const AES_KEY *key, |
777 # unsigned char *ivp,const int enc); | 2009 # unsigned char *ivp,const int enc); |
778 { | 2010 { |
779 # stack frame layout | 2011 # stack frame layout |
780 # -4(%esp)» 0(%esp)»» return address | 2012 # -4(%esp)» » # return address» 0(%esp) |
781 # 0(%esp)» 4(%esp)»» tmp1 | 2013 # 0(%esp)» » # s0 backing store» 4(%esp)» |
782 # 4(%esp)» 8(%esp)»» tmp2 | 2014 # 4(%esp)» » # s1 backing store» 8(%esp) |
783 # 8(%esp)» 12(%esp)» key | 2015 # 8(%esp)» » # s2 backing store» 12(%esp) |
784 # 12(%esp)» 16(%esp)» end of key schedule | 2016 # 12(%esp)» » # s3 backing store» 16(%esp) |
785 my $_esp=&DWP(16,"esp");» #saved %esp | 2017 # 16(%esp)» » # key backup» » 20(%esp) |
786 my $_inp=&DWP(20,"esp");» #copy of wparam(0) | 2018 # 20(%esp)» » # end of key schedule» 24(%esp) |
787 my $_out=&DWP(24,"esp");» #copy of wparam(1) | 2019 # 24(%esp)» » # %ebp backup» » 28(%esp) |
788 my $_len=&DWP(28,"esp");» #copy of wparam(2) | 2020 # 28(%esp)» » # %esp backup |
789 my $_key=&DWP(32,"esp");» #copy of wparam(3) | 2021 my $_inp=&DWP(32,"esp");» # copy of wparam(0) |
790 my $_ivp=&DWP(36,"esp");» #copy of wparam(4) | 2022 my $_out=&DWP(36,"esp");» # copy of wparam(1) |
791 my $_tmp=&DWP(40,"esp");» #volatile variable | 2023 my $_len=&DWP(40,"esp");» # copy of wparam(2) |
792 my $ivec=&DWP(44,"esp");» #ivec[16] | 2024 my $_key=&DWP(44,"esp");» # copy of wparam(3) |
793 my $aes_key=&DWP(60,"esp");» #copy of aes_key | 2025 my $_ivp=&DWP(48,"esp");» # copy of wparam(4) |
794 my $mark=&DWP(60+240,"esp");» #copy of aes_key->rounds | 2026 my $_tmp=&DWP(52,"esp");» # volatile variable |
| 2027 # |
| 2028 my $ivec=&DWP(60,"esp");» # ivec[16] |
| 2029 my $aes_key=&DWP(76,"esp");» # copy of aes_key |
| 2030 my $mark=&DWP(76+240,"esp");» # copy of aes_key->rounds |
795 | 2031 |
796 &public_label("AES_Te"); | |
797 &public_label("AES_Td"); | |
798 &function_begin("AES_cbc_encrypt"); | 2032 &function_begin("AES_cbc_encrypt"); |
799 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len | 2033 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len |
800 &cmp ($s2,0); | 2034 &cmp ($s2,0); |
801 » &je» (&label("enc_out")); | 2035 » &je» (&label("drop_out")); |
802 | 2036 |
803 &call (&label("pic_point")); # make it PIC! | 2037 &call (&label("pic_point")); # make it PIC! |
804 &set_label("pic_point"); | 2038 &set_label("pic_point"); |
805 » &blindpop("ebp"); | 2039 » &blindpop($tbl); |
| 2040 » &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only); |
806 | 2041 |
| 2042 &cmp (&wparam(5),0); |
| 2043 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl)); |
| 2044 &jne (&label("picked_te")); |
| 2045 &lea ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl)); |
| 2046 &set_label("picked_te"); |
| 2047 |
| 2048 # one can argue if this is required |
807 &pushf (); | 2049 &pushf (); |
808 &cld (); | 2050 &cld (); |
809 | 2051 |
810 » &cmp» (&wparam(5),0); | 2052 » &cmp» ($s2,$speed_limit); |
811 » &je» (&label("DECRYPT")); | 2053 » &jb» (&label("slow_way")); |
| 2054 » &test» ($s2,15); |
| 2055 » &jnz» (&label("slow_way")); |
| 2056 » » » » » if (!$x86only) { |
| 2057 » &bt» (&DWP(0,$s0),28);» # check for hyper-threading bit |
| 2058 » &jc» (&label("slow_way")); |
| 2059 » » » » » } |
| 2060 » # pre-allocate aligned stack frame... |
| 2061 » &lea» ($acc,&DWP(-80-244,"esp")); |
| 2062 » &and» ($acc,-64); |
812 | 2063 |
813 » &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); | 2064 » # ... and make sure it doesn't alias with $tbl modulo 4096 |
814 | 2065 » &mov» ($s0,$tbl); |
815 » # allocate aligned stack frame... | 2066 » &lea» ($s1,&DWP(2048+256,$tbl)); |
816 » &lea» ($key,&DWP(-64-244,"esp")); | 2067 » &mov» ($s3,$acc); |
817 » &and» ($key,-64); | |
818 | |
819 » # ... and make sure it doesn't alias with AES_Te modulo 4096 | |
820 » &mov» ($s0,"ebp"); | |
821 » &lea» ($s1,&DWP(2048,"ebp")); | |
822 » &mov» ($s3,$key); | |
823 &and ($s0,0xfff); # s = %ebp&0xfff | 2068 &and ($s0,0xfff); # s = %ebp&0xfff |
824 » &and» ($s1,0xfff);» » # e = (%ebp+2048)&0xfff | 2069 » &and» ($s1,0xfff);» » # e = (%ebp+2048+256)&0xfff |
825 &and ($s3,0xfff); # p = %esp&0xfff | 2070 &and ($s3,0xfff); # p = %esp&0xfff |
826 | 2071 |
827 &cmp ($s3,$s1); # if (p>=e) %esp -= (p-e); | 2072 &cmp ($s3,$s1); # if (p>=e) %esp -= (p-e); |
828 » &jb» (&label("te_break_out")); | 2073 » &jb» (&label("tbl_break_out")); |
829 &sub ($s3,$s1); | 2074 &sub ($s3,$s1); |
830 » &sub» ($key,$s3); | 2075 » &sub» ($acc,$s3); |
831 » &jmp» (&label("te_ok")); | 2076 » &jmp» (&label("tbl_ok")); |
832 » &set_label("te_break_out");» # else %esp -= (p-s)&0xfff + framesz; | 2077 » &set_label("tbl_break_out",4);» # else %esp -= (p-s)&0xfff + framesz; |
833 &sub ($s3,$s0); | 2078 &sub ($s3,$s0); |
834 &and ($s3,0xfff); | 2079 &and ($s3,0xfff); |
835 » &add» ($s3,64+256); | 2080 » &add» ($s3,384); |
836 » &sub» ($key,$s3); | 2081 » &sub» ($acc,$s3); |
837 » &align» (4); | 2082 » &set_label("tbl_ok",4); |
838 » &set_label("te_ok"); | |
839 | 2083 |
840 » &mov» ($s0,&wparam(0));» # load inp | 2084 » &lea» ($s3,&wparam(0));» # obtain pointer to parameter block |
841 » &mov» ($s1,&wparam(1));» # load out | 2085 » &exch» ("esp",$acc);» » # allocate stack frame |
842 » &mov» ($s3,&wparam(3));» # load key | 2086 » &add» ("esp",4);» » # reserve for return address! |
843 » &mov» ($acc,&wparam(4));» # load ivp | 2087 » &mov» ($_tbl,$tbl);» » # save %ebp |
| 2088 » &mov» ($_esp,$acc);» » # save %esp |
844 | 2089 |
845 » &exch» ("esp",$key); | 2090 » &mov» ($s0,&DWP(0,$s3));» # load inp |
846 » &add» ("esp",4);» » # reserve for return address! | 2091 » &mov» ($s1,&DWP(4,$s3));» # load out |
847 » &mov» ($_esp,$key);» » # save %esp | 2092 » #&mov» ($s2,&DWP(8,$s3));» # load len |
| 2093 » &mov» ($key,&DWP(12,$s3));» # load key |
| 2094 » &mov» ($acc,&DWP(16,$s3));» # load ivp |
| 2095 » &mov» ($s3,&DWP(20,$s3));» # load enc flag |
848 | 2096 |
849 &mov ($_inp,$s0); # save copy of inp | 2097 &mov ($_inp,$s0); # save copy of inp |
850 &mov ($_out,$s1); # save copy of out | 2098 &mov ($_out,$s1); # save copy of out |
851 &mov ($_len,$s2); # save copy of len | 2099 &mov ($_len,$s2); # save copy of len |
852 » &mov» ($_key,$s3);» » # save copy of key | 2100 » &mov» ($_key,$key);» » # save copy of key |
853 &mov ($_ivp,$acc); # save copy of ivp | 2101 &mov ($_ivp,$acc); # save copy of ivp |
854 | 2102 |
855 &mov ($mark,0); # copy of aes_key->rounds = 0; | 2103 &mov ($mark,0); # copy of aes_key->rounds = 0; |
856 if ($compromise) { | |
857 &cmp ($s2,$compromise); | |
858 &jb (&label("skip_ecopy")); | |
859 } | |
860 # do we copy key schedule to stack? | 2104 # do we copy key schedule to stack? |
861 » &mov» ($s1 eq "ebx" ? $s1 : "",$s3); | 2105 » &mov» ($s1 eq "ebx" ? $s1 : "",$key); |
862 &mov ($s2 eq "ecx" ? $s2 : "",244/4); | 2106 &mov ($s2 eq "ecx" ? $s2 : "",244/4); |
863 » &sub» ($s1,"ebp"); | 2107 » &sub» ($s1,$tbl); |
864 » &mov» ("esi",$s3); | 2108 » &mov» ("esi",$key); |
865 &and ($s1,0xfff); | 2109 &and ($s1,0xfff); |
866 &lea ("edi",$aes_key); | 2110 &lea ("edi",$aes_key); |
867 » &cmp» ($s1,2048); | 2111 » &cmp» ($s1,2048+256); |
868 » &jb» (&label("do_ecopy")); | 2112 » &jb» (&label("do_copy")); |
869 &cmp ($s1,4096-244); | 2113 &cmp ($s1,4096-244); |
870 » &jb» (&label("skip_ecopy")); | 2114 » &jb» (&label("skip_copy")); |
871 » &align» (4); | 2115 » &set_label("do_copy",4); |
872 » &set_label("do_ecopy"); | |
873 &mov ($_key,"edi"); | 2116 &mov ($_key,"edi"); |
874 &data_word(0xA5F3F689); # rep movsd | 2117 &data_word(0xA5F3F689); # rep movsd |
875 » &set_label("skip_ecopy"); | 2118 » &set_label("skip_copy"); |
876 | 2119 |
877 &mov ($acc,$s0); | |
878 &mov ($key,16); | 2120 &mov ($key,16); |
879 » &align» (4); | 2121 » &set_label("prefetch_tbl",4); |
880 » &set_label("prefetch_te"); | 2122 » » &mov» ($s0,&DWP(0,$tbl)); |
881 » » &mov» ($s0,&DWP(0,"ebp")); | 2123 » » &mov» ($s1,&DWP(32,$tbl)); |
882 » » &mov» ($s1,&DWP(32,"ebp")); | 2124 » » &mov» ($s2,&DWP(64,$tbl)); |
883 » » &mov» ($s2,&DWP(64,"ebp")); | 2125 » » &mov» ($acc,&DWP(96,$tbl)); |
884 » » &mov» ($s3,&DWP(96,"ebp")); | 2126 » » &lea» ($tbl,&DWP(128,$tbl)); |
885 » » &lea» ("ebp",&DWP(128,"ebp")); | 2127 » » &sub» ($key,1); |
886 » » &dec» ($key); | 2128 » &jnz» (&label("prefetch_tbl")); |
887 » &jnz» (&label("prefetch_te")); | 2129 » &sub» ($tbl,2048); |
888 » &sub» ("ebp",2048); | |
889 | 2130 |
890 » &mov» ($s2,$_len); | 2131 » &mov» ($acc,$_inp); |
891 &mov ($key,$_ivp); | 2132 &mov ($key,$_ivp); |
892 &test ($s2,0xFFFFFFF0); | |
893 &jz (&label("enc_tail")); # short input... | |
894 | 2133 |
| 2134 &cmp ($s3,0); |
| 2135 &je (&label("fast_decrypt")); |
| 2136 |
| 2137 #----------------------------- ENCRYPT -----------------------------# |
895 &mov ($s0,&DWP(0,$key)); # load iv | 2138 &mov ($s0,&DWP(0,$key)); # load iv |
896 &mov ($s1,&DWP(4,$key)); | 2139 &mov ($s1,&DWP(4,$key)); |
897 | 2140 |
898 » &align» (4); | 2141 » &set_label("fast_enc_loop",16); |
899 » &set_label("enc_loop"); | |
900 &mov ($s2,&DWP(8,$key)); | 2142 &mov ($s2,&DWP(8,$key)); |
901 &mov ($s3,&DWP(12,$key)); | 2143 &mov ($s3,&DWP(12,$key)); |
902 | 2144 |
903 &xor ($s0,&DWP(0,$acc)); # xor input data | 2145 &xor ($s0,&DWP(0,$acc)); # xor input data |
904 &xor ($s1,&DWP(4,$acc)); | 2146 &xor ($s1,&DWP(4,$acc)); |
905 &xor ($s2,&DWP(8,$acc)); | 2147 &xor ($s2,&DWP(8,$acc)); |
906 &xor ($s3,&DWP(12,$acc)); | 2148 &xor ($s3,&DWP(12,$acc)); |
907 | 2149 |
908 &mov ($key,$_key); # load key | 2150 &mov ($key,$_key); # load key |
909 &call ("_x86_AES_encrypt"); | 2151 &call ("_x86_AES_encrypt"); |
910 | 2152 |
911 &mov ($acc,$_inp); # load inp | 2153 &mov ($acc,$_inp); # load inp |
912 &mov ($key,$_out); # load out | 2154 &mov ($key,$_out); # load out |
913 | 2155 |
914 &mov (&DWP(0,$key),$s0); # save output data | 2156 &mov (&DWP(0,$key),$s0); # save output data |
915 &mov (&DWP(4,$key),$s1); | 2157 &mov (&DWP(4,$key),$s1); |
916 &mov (&DWP(8,$key),$s2); | 2158 &mov (&DWP(8,$key),$s2); |
917 &mov (&DWP(12,$key),$s3); | 2159 &mov (&DWP(12,$key),$s3); |
918 | 2160 |
| 2161 &lea ($acc,&DWP(16,$acc)); # advance inp |
919 &mov ($s2,$_len); # load len | 2162 &mov ($s2,$_len); # load len |
920 | |
921 &lea ($acc,&DWP(16,$acc)); | |
922 &mov ($_inp,$acc); # save inp | 2163 &mov ($_inp,$acc); # save inp |
923 | 2164 » » &lea» ($s3,&DWP(16,$key));» # advance out |
924 » » &lea» ($s3,&DWP(16,$key)); | |
925 &mov ($_out,$s3); # save out | 2165 &mov ($_out,$s3); # save out |
926 | 2166 » » &sub» ($s2,16);» » # decrease len |
927 » » &sub» ($s2,16); | |
928 » » &test» ($s2,0xFFFFFFF0); | |
929 &mov ($_len,$s2); # save len | 2167 &mov ($_len,$s2); # save len |
930 » &jnz» (&label("enc_loop")); | 2168 » &jnz» (&label("fast_enc_loop")); |
931 » &test» ($s2,15); | |
932 » &jnz» (&label("enc_tail")); | |
933 &mov ($acc,$_ivp); # load ivp | 2169 &mov ($acc,$_ivp); # load ivp |
934 » &mov» ($s2,&DWP(8,$key));» # restore last dwords | 2170 » &mov» ($s2,&DWP(8,$key));» # restore last 2 dwords |
935 &mov ($s3,&DWP(12,$key)); | 2171 &mov ($s3,&DWP(12,$key)); |
936 &mov (&DWP(0,$acc),$s0); # save ivec | 2172 &mov (&DWP(0,$acc),$s0); # save ivec |
937 &mov (&DWP(4,$acc),$s1); | 2173 &mov (&DWP(4,$acc),$s1); |
938 &mov (&DWP(8,$acc),$s2); | 2174 &mov (&DWP(8,$acc),$s2); |
939 &mov (&DWP(12,$acc),$s3); | 2175 &mov (&DWP(12,$acc),$s3); |
940 | 2176 |
941 &cmp ($mark,0); # was the key schedule copied? | 2177 &cmp ($mark,0); # was the key schedule copied? |
942 &mov ("edi",$_key); | 2178 &mov ("edi",$_key); |
943 &je (&label("skip_ezero")); | 2179 &je (&label("skip_ezero")); |
944 # zero copy of key schedule | 2180 # zero copy of key schedule |
945 &mov ("ecx",240/4); | 2181 &mov ("ecx",240/4); |
946 &xor ("eax","eax"); | 2182 &xor ("eax","eax"); |
947 &align (4); | 2183 &align (4); |
948 &data_word(0xABF3F689); # rep stosd | 2184 &data_word(0xABF3F689); # rep stosd |
949 &set_label("skip_ezero") | 2185 &set_label("skip_ezero") |
950 &mov ("esp",$_esp); | 2186 &mov ("esp",$_esp); |
951 &popf (); | 2187 &popf (); |
952 &set_label("enc_out"); | 2188 &set_label("drop_out"); |
953 &function_end_A(); | 2189 &function_end_A(); |
954 &pushf (); # kludge, never executed | 2190 &pushf (); # kludge, never executed |
955 | 2191 |
956 &align (4); | |
957 &set_label("enc_tail"); | |
958 &mov ($s0,$key eq "edi" ? $key : ""); | |
959 &mov ($key,$_out); # load out | |
960 &push ($s0); # push ivp | |
961 &mov ($s1,16); | |
962 &sub ($s1,$s2); | |
963 &cmp ($key,$acc); # compare with inp | |
964 &je (&label("enc_in_place")); | |
965 &align (4); | |
966 &data_word(0xA4F3F689); # rep movsb # copy input | |
967 &jmp (&label("enc_skip_in_place")); | |
968 &set_label("enc_in_place"); | |
969 &lea ($key,&DWP(0,$key,$s2)); | |
970 &set_label("enc_skip_in_place"); | |
971 &mov ($s2,$s1); | |
972 &xor ($s0,$s0); | |
973 &align (4); | |
974 &data_word(0xAAF3F689); # rep stosb # zero tail | |
975 &pop ($key); # pop ivp | |
976 | |
977 &mov ($acc,$_out); # output as input | |
978 &mov ($s0,&DWP(0,$key)); | |
979 &mov ($s1,&DWP(4,$key)); | |
980 &mov ($_len,16); # len=16 | |
981 &jmp (&label("enc_loop")); # one more spin... | |
982 | |
983 #----------------------------- DECRYPT -----------------------------# | 2192 #----------------------------- DECRYPT -----------------------------# |
984 &align» (4); | 2193 &set_label("fast_decrypt",16); |
985 &set_label("DECRYPT"); | |
986 » &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); | |
987 | |
988 » # allocate aligned stack frame... | |
989 » &lea» ($key,&DWP(-64-244,"esp")); | |
990 » &and» ($key,-64); | |
991 | |
992 » # ... and make sure it doesn't alias with AES_Td modulo 4096 | |
993 » &mov» ($s0,"ebp"); | |
994 » &lea» ($s1,&DWP(2048+256,"ebp")); | |
995 » &mov» ($s3,$key); | |
996 » &and» ($s0,0xfff);» » # s = %ebp&0xfff | |
997 » &and» ($s1,0xfff);» » # e = (%ebp+2048+256)&0xfff | |
998 » &and» ($s3,0xfff);» » # p = %esp&0xfff | |
999 | |
1000 » &cmp» ($s3,$s1);» » # if (p>=e) %esp =- (p-e); | |
1001 » &jb» (&label("td_break_out")); | |
1002 » &sub» ($s3,$s1); | |
1003 » &sub» ($key,$s3); | |
1004 » &jmp» (&label("td_ok")); | |
1005 » &set_label("td_break_out");» # else %esp -= (p-s)&0xfff + framesz; | |
1006 » &sub» ($s3,$s0); | |
1007 » &and» ($s3,0xfff); | |
1008 » &add» ($s3,64+256); | |
1009 » &sub» ($key,$s3); | |
1010 » &align» (4); | |
1011 » &set_label("td_ok"); | |
1012 | |
1013 » &mov» ($s0,&wparam(0));» # load inp | |
1014 » &mov» ($s1,&wparam(1));» # load out | |
1015 » &mov» ($s3,&wparam(3));» # load key | |
1016 » &mov» ($acc,&wparam(4));» # load ivp | |
1017 | |
1018 » &exch» ("esp",$key); | |
1019 » &add» ("esp",4);» » # reserve for return address! | |
1020 » &mov» ($_esp,$key);» » # save %esp | |
1021 | |
1022 » &mov» ($_inp,$s0);» » # save copy of inp | |
1023 » &mov» ($_out,$s1);» » # save copy of out | |
1024 » &mov» ($_len,$s2);» » # save copy of len | |
1025 » &mov» ($_key,$s3);» » # save copy of key | |
1026 » &mov» ($_ivp,$acc);» » # save copy of ivp | |
1027 | |
1028 » &mov» ($mark,0);» » # copy of aes_key->rounds = 0; | |
1029 » if ($compromise) { | |
1030 » » &cmp» ($s2,$compromise); | |
1031 » » &jb» (&label("skip_dcopy")); | |
1032 » } | |
1033 » # do we copy key schedule to stack? | |
1034 » &mov» ($s1 eq "ebx" ? $s1 : "",$s3); | |
1035 » &mov» ($s2 eq "ecx" ? $s2 : "",244/4); | |
1036 » &sub» ($s1,"ebp"); | |
1037 » &mov» ("esi",$s3); | |
1038 » &and» ($s1,0xfff); | |
1039 » &lea» ("edi",$aes_key); | |
1040 » &cmp» ($s1,2048+256); | |
1041 » &jb» (&label("do_dcopy")); | |
1042 » &cmp» ($s1,4096-244); | |
1043 » &jb» (&label("skip_dcopy")); | |
1044 » &align» (4); | |
1045 » &set_label("do_dcopy"); | |
1046 » » &mov» ($_key,"edi"); | |
1047 » » &data_word(0xA5F3F689);»# rep movsd | |
1048 » &set_label("skip_dcopy"); | |
1049 | |
1050 » &mov» ($acc,$s0); | |
1051 » &mov» ($key,18); | |
1052 » &align» (4); | |
1053 » &set_label("prefetch_td"); | |
1054 » » &mov» ($s0,&DWP(0,"ebp")); | |
1055 » » &mov» ($s1,&DWP(32,"ebp")); | |
1056 » » &mov» ($s2,&DWP(64,"ebp")); | |
1057 » » &mov» ($s3,&DWP(96,"ebp")); | |
1058 » » &lea» ("ebp",&DWP(128,"ebp")); | |
1059 » » &dec» ($key); | |
1060 » &jnz» (&label("prefetch_td")); | |
1061 » &sub» ("ebp",2048+256); | |
1062 | 2194 |
1063 &cmp ($acc,$_out); | 2195 &cmp ($acc,$_out); |
1064 » &je» (&label("dec_in_place"));» # in-place processing... | 2196 » &je» (&label("fast_dec_in_place"));» # in-place processing... |
1065 | 2197 |
1066 &mov ($key,$_ivp); # load ivp | |
1067 &mov ($_tmp,$key); | 2198 &mov ($_tmp,$key); |
1068 | 2199 |
1069 &align (4); | 2200 &align (4); |
1070 » &set_label("dec_loop"); | 2201 » &set_label("fast_dec_loop",16); |
1071 &mov ($s0,&DWP(0,$acc)); # read input | 2202 &mov ($s0,&DWP(0,$acc)); # read input |
1072 &mov ($s1,&DWP(4,$acc)); | 2203 &mov ($s1,&DWP(4,$acc)); |
1073 &mov ($s2,&DWP(8,$acc)); | 2204 &mov ($s2,&DWP(8,$acc)); |
1074 &mov ($s3,&DWP(12,$acc)); | 2205 &mov ($s3,&DWP(12,$acc)); |
1075 | 2206 |
1076 &mov ($key,$_key); # load key | 2207 &mov ($key,$_key); # load key |
1077 &call ("_x86_AES_decrypt"); | 2208 &call ("_x86_AES_decrypt"); |
1078 | 2209 |
1079 &mov ($key,$_tmp); # load ivp | 2210 &mov ($key,$_tmp); # load ivp |
1080 &mov ($acc,$_len); # load len | 2211 &mov ($acc,$_len); # load len |
1081 &xor ($s0,&DWP(0,$key)); # xor iv | 2212 &xor ($s0,&DWP(0,$key)); # xor iv |
1082 &xor ($s1,&DWP(4,$key)); | 2213 &xor ($s1,&DWP(4,$key)); |
1083 &xor ($s2,&DWP(8,$key)); | 2214 &xor ($s2,&DWP(8,$key)); |
1084 &xor ($s3,&DWP(12,$key)); | 2215 &xor ($s3,&DWP(12,$key)); |
1085 | 2216 |
1086 » » &sub» ($acc,16); | 2217 » » &mov» ($key,$_out);» » # load out |
1087 » » &jc» (&label("dec_partial")); | |
1088 » » &mov» ($_len,$acc);» » # save len | |
1089 &mov ($acc,$_inp); # load inp | 2218 &mov ($acc,$_inp); # load inp |
1090 &mov ($key,$_out); # load out | |
1091 | 2219 |
1092 &mov (&DWP(0,$key),$s0); # write output | 2220 &mov (&DWP(0,$key),$s0); # write output |
1093 &mov (&DWP(4,$key),$s1); | 2221 &mov (&DWP(4,$key),$s1); |
1094 &mov (&DWP(8,$key),$s2); | 2222 &mov (&DWP(8,$key),$s2); |
1095 &mov (&DWP(12,$key),$s3); | 2223 &mov (&DWP(12,$key),$s3); |
1096 | 2224 |
| 2225 &mov ($s2,$_len); # load len |
1097 &mov ($_tmp,$acc); # save ivp | 2226 &mov ($_tmp,$acc); # save ivp |
1098 » » &lea» ($acc,&DWP(16,$acc)); | 2227 » » &lea» ($acc,&DWP(16,$acc));» # advance inp |
1099 &mov ($_inp,$acc); # save inp | 2228 &mov ($_inp,$acc); # save inp |
1100 | 2229 » » &lea» ($key,&DWP(16,$key));» # advance out |
1101 » » &lea» ($key,&DWP(16,$key)); | |
1102 &mov ($_out,$key); # save out | 2230 &mov ($_out,$key); # save out |
1103 | 2231 » » &sub» ($s2,16);» » # decrease len |
1104 » &jnz» (&label("dec_loop")); | 2232 » » &mov» ($_len,$s2);» » # save len |
| 2233 » &jnz» (&label("fast_dec_loop")); |
1105 &mov ($key,$_tmp); # load temp ivp | 2234 &mov ($key,$_tmp); # load temp ivp |
1106 &set_label("dec_end"); | |
1107 &mov ($acc,$_ivp); # load user ivp | 2235 &mov ($acc,$_ivp); # load user ivp |
1108 &mov ($s0,&DWP(0,$key)); # load iv | 2236 &mov ($s0,&DWP(0,$key)); # load iv |
1109 &mov ($s1,&DWP(4,$key)); | 2237 &mov ($s1,&DWP(4,$key)); |
1110 &mov ($s2,&DWP(8,$key)); | 2238 &mov ($s2,&DWP(8,$key)); |
1111 &mov ($s3,&DWP(12,$key)); | 2239 &mov ($s3,&DWP(12,$key)); |
1112 &mov (&DWP(0,$acc),$s0); # copy back to user | 2240 &mov (&DWP(0,$acc),$s0); # copy back to user |
1113 &mov (&DWP(4,$acc),$s1); | 2241 &mov (&DWP(4,$acc),$s1); |
1114 &mov (&DWP(8,$acc),$s2); | 2242 &mov (&DWP(8,$acc),$s2); |
1115 &mov (&DWP(12,$acc),$s3); | 2243 &mov (&DWP(12,$acc),$s3); |
1116 » &jmp» (&label("dec_out")); | 2244 » &jmp» (&label("fast_dec_out")); |
1117 | 2245 |
1118 &align» (4); | 2246 &set_label("fast_dec_in_place",16); |
1119 &set_label("dec_partial"); | 2247 » &set_label("fast_dec_in_place_loop"); |
1120 » &lea» ($key,$ivec); | |
1121 » &mov» (&DWP(0,$key),$s0);» # dump output to stack | |
1122 » &mov» (&DWP(4,$key),$s1); | |
1123 » &mov» (&DWP(8,$key),$s2); | |
1124 » &mov» (&DWP(12,$key),$s3); | |
1125 » &lea» ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc)); | |
1126 » &mov» ($acc eq "esi" ? $acc : "",$key); | |
1127 » &mov» ($key eq "edi" ? $key : "",$_out);» # load out | |
1128 » &data_word(0xA4F3F689);»# rep movsb» » # copy output | |
1129 » &mov» ($key,$_inp);» » » » # use inp as temp ivp | |
1130 » &jmp» (&label("dec_end")); | |
1131 | |
1132 &align» (4); | |
1133 &set_label("dec_in_place"); | |
1134 » &set_label("dec_in_place_loop"); | |
1135 » » &lea» ($key,$ivec); | |
1136 &mov ($s0,&DWP(0,$acc)); # read input | 2248 &mov ($s0,&DWP(0,$acc)); # read input |
1137 &mov ($s1,&DWP(4,$acc)); | 2249 &mov ($s1,&DWP(4,$acc)); |
1138 &mov ($s2,&DWP(8,$acc)); | 2250 &mov ($s2,&DWP(8,$acc)); |
1139 &mov ($s3,&DWP(12,$acc)); | 2251 &mov ($s3,&DWP(12,$acc)); |
1140 | 2252 |
| 2253 &lea ($key,$ivec); |
1141 &mov (&DWP(0,$key),$s0); # copy to temp | 2254 &mov (&DWP(0,$key),$s0); # copy to temp |
1142 &mov (&DWP(4,$key),$s1); | 2255 &mov (&DWP(4,$key),$s1); |
1143 &mov (&DWP(8,$key),$s2); | 2256 &mov (&DWP(8,$key),$s2); |
1144 &mov (&DWP(12,$key),$s3); | 2257 &mov (&DWP(12,$key),$s3); |
1145 | 2258 |
1146 &mov ($key,$_key); # load key | 2259 &mov ($key,$_key); # load key |
1147 &call ("_x86_AES_decrypt"); | 2260 &call ("_x86_AES_decrypt"); |
1148 | 2261 |
1149 &mov ($key,$_ivp); # load ivp | 2262 &mov ($key,$_ivp); # load ivp |
1150 &mov ($acc,$_out); # load out | 2263 &mov ($acc,$_out); # load out |
1151 &xor ($s0,&DWP(0,$key)); # xor iv | 2264 &xor ($s0,&DWP(0,$key)); # xor iv |
1152 &xor ($s1,&DWP(4,$key)); | 2265 &xor ($s1,&DWP(4,$key)); |
1153 &xor ($s2,&DWP(8,$key)); | 2266 &xor ($s2,&DWP(8,$key)); |
1154 &xor ($s3,&DWP(12,$key)); | 2267 &xor ($s3,&DWP(12,$key)); |
1155 | 2268 |
1156 &mov (&DWP(0,$acc),$s0); # write output | 2269 &mov (&DWP(0,$acc),$s0); # write output |
1157 &mov (&DWP(4,$acc),$s1); | 2270 &mov (&DWP(4,$acc),$s1); |
1158 &mov (&DWP(8,$acc),$s2); | 2271 &mov (&DWP(8,$acc),$s2); |
1159 &mov (&DWP(12,$acc),$s3); | 2272 &mov (&DWP(12,$acc),$s3); |
1160 | 2273 |
1161 » » &lea» ($acc,&DWP(16,$acc)); | 2274 » » &lea» ($acc,&DWP(16,$acc));» # advance out |
1162 &mov ($_out,$acc); # save out | 2275 &mov ($_out,$acc); # save out |
1163 | 2276 |
1164 &lea ($acc,$ivec); | 2277 &lea ($acc,$ivec); |
1165 &mov ($s0,&DWP(0,$acc)); # read temp | 2278 &mov ($s0,&DWP(0,$acc)); # read temp |
1166 &mov ($s1,&DWP(4,$acc)); | 2279 &mov ($s1,&DWP(4,$acc)); |
1167 &mov ($s2,&DWP(8,$acc)); | 2280 &mov ($s2,&DWP(8,$acc)); |
1168 &mov ($s3,&DWP(12,$acc)); | 2281 &mov ($s3,&DWP(12,$acc)); |
1169 | 2282 |
1170 &mov (&DWP(0,$key),$s0); # copy iv | 2283 &mov (&DWP(0,$key),$s0); # copy iv |
1171 &mov (&DWP(4,$key),$s1); | 2284 &mov (&DWP(4,$key),$s1); |
1172 &mov (&DWP(8,$key),$s2); | 2285 &mov (&DWP(8,$key),$s2); |
1173 &mov (&DWP(12,$key),$s3); | 2286 &mov (&DWP(12,$key),$s3); |
1174 | 2287 |
1175 &mov ($acc,$_inp); # load inp | 2288 &mov ($acc,$_inp); # load inp |
1176 | |
1177 &lea ($acc,&DWP(16,$acc)); | |
1178 &mov ($_inp,$acc); # save inp | |
1179 | |
1180 &mov ($s2,$_len); # load len | 2289 &mov ($s2,$_len); # load len |
1181 » » &sub» ($s2,16); | 2290 » » &lea» ($acc,&DWP(16,$acc));» # advance inp |
1182 » » &jc» (&label("dec_in_place_partial")); | 2291 » » &mov» ($_inp,$acc);» » # save inp |
| 2292 » » &sub» ($s2,16);» » # decrease len |
1183 &mov ($_len,$s2); # save len | 2293 &mov ($_len,$s2); # save len |
1184 » &jnz» (&label("dec_in_place_loop")); | 2294 » &jnz» (&label("fast_dec_in_place_loop")); |
1185 » &jmp» (&label("dec_out")); | 2295 |
1186 | 2296 &set_label("fast_dec_out",4); |
1187 &align» (4); | 2297 » &cmp» ($mark,0);» » # was the key schedule copied? |
1188 &set_label("dec_in_place_partial"); | 2298 » &mov» ("edi",$_key); |
1189 » # one can argue if this is actually required... | 2299 » &je» (&label("skip_dzero")); |
1190 » &mov» ($key eq "edi" ? $key : "",$_out); | 2300 » # zero copy of key schedule |
1191 » &lea» ($acc eq "esi" ? $acc : "",$ivec); | 2301 » &mov» ("ecx",240/4); |
| 2302 » &xor» ("eax","eax"); |
| 2303 » &align» (4); |
| 2304 » &data_word(0xABF3F689);»# rep stosd |
| 2305 » &set_label("skip_dzero") |
| 2306 » &mov» ("esp",$_esp); |
| 2307 » &popf» (); |
| 2308 » &function_end_A(); |
| 2309 » &pushf» ();» » » # kludge, never executed |
| 2310 |
| 2311 #--------------------------- SLOW ROUTINE ---------------------------# |
| 2312 &set_label("slow_way",16); |
| 2313 |
| 2314 » &mov» ($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap |
| 2315 » &mov» ($key,&wparam(3));» # load key |
| 2316 |
| 2317 » # pre-allocate aligned stack frame... |
| 2318 » &lea» ($acc,&DWP(-80,"esp")); |
| 2319 » &and» ($acc,-64); |
| 2320 |
| 2321 » # ... and make sure it doesn't alias with $key modulo 1024 |
| 2322 » &lea» ($s1,&DWP(-80-63,$key)); |
| 2323 » &sub» ($s1,$acc); |
| 2324 » &neg» ($s1); |
| 2325 » &and» ($s1,0x3C0);» # modulo 1024, but aligned to cache-line |
| 2326 » &sub» ($acc,$s1); |
| 2327 |
| 2328 » # pick S-box copy which can't overlap with stack frame or $key |
| 2329 » &lea» ($s1,&DWP(768,$acc)); |
| 2330 » &sub» ($s1,$tbl); |
| 2331 » &and» ($s1,0x300); |
| 2332 » &lea» ($tbl,&DWP(2048+128,$tbl,$s1)); |
| 2333 |
| 2334 » &lea» ($s3,&wparam(0));» # pointer to parameter block |
| 2335 |
| 2336 » &exch» ("esp",$acc); |
| 2337 » &add» ("esp",4);» » # reserve for return address! |
| 2338 » &mov» ($_tbl,$tbl);» » # save %ebp |
| 2339 » &mov» ($_esp,$acc);» » # save %esp |
| 2340 » &mov» ($_tmp,$s0);» » # save OPENSSL_ia32cap |
| 2341 |
| 2342 » &mov» ($s0,&DWP(0,$s3));» # load inp |
| 2343 » &mov» ($s1,&DWP(4,$s3));» # load out |
| 2344 » #&mov» ($s2,&DWP(8,$s3));» # load len |
| 2345 » #&mov» ($key,&DWP(12,$s3));» # load key |
| 2346 » &mov» ($acc,&DWP(16,$s3));» # load ivp |
| 2347 » &mov» ($s3,&DWP(20,$s3));» # load enc flag |
| 2348 |
| 2349 » &mov» ($_inp,$s0);» » # save copy of inp |
| 2350 » &mov» ($_out,$s1);» » # save copy of out |
| 2351 » &mov» ($_len,$s2);» » # save copy of len |
| 2352 » &mov» ($_key,$key);» » # save copy of key |
| 2353 » &mov» ($_ivp,$acc);» » # save copy of ivp |
| 2354 |
| 2355 » &mov» ($key,$acc); |
| 2356 » &mov» ($acc,$s0); |
| 2357 |
| 2358 » &cmp» ($s3,0); |
| 2359 » &je» (&label("slow_decrypt")); |
| 2360 |
| 2361 #--------------------------- SLOW ENCRYPT ---------------------------# |
| 2362 » &cmp» ($s2,16); |
| 2363 » &mov» ($s3,$s1); |
| 2364 » &jb» (&label("slow_enc_tail")); |
| 2365 |
| 2366 » » » » » if (!$x86only) { |
| 2367 » &bt» ($_tmp,25);» » # check for SSE bit |
| 2368 » &jnc» (&label("slow_enc_x86")); |
| 2369 |
| 2370 » &movq» ("mm0",&QWP(0,$key));» # load iv |
| 2371 » &movq» ("mm4",&QWP(8,$key)); |
| 2372 |
| 2373 » &set_label("slow_enc_loop_sse",16); |
| 2374 » » &pxor» ("mm0",&QWP(0,$acc));» # xor input data |
| 2375 » » &pxor» ("mm4",&QWP(8,$acc)); |
| 2376 |
| 2377 » » &mov» ($key,$_key); |
| 2378 » » &call» ("_sse_AES_encrypt_compact"); |
| 2379 |
| 2380 » » &mov» ($acc,$_inp);» » # load inp |
| 2381 » » &mov» ($key,$_out);» » # load out |
| 2382 » » &mov» ($s2,$_len);» » # load len |
| 2383 |
| 2384 » » &movq» (&QWP(0,$key),"mm0");» # save output data |
| 2385 » » &movq» (&QWP(8,$key),"mm4"); |
| 2386 |
| 2387 » » &lea» ($acc,&DWP(16,$acc));» # advance inp |
| 2388 » » &mov» ($_inp,$acc);» » # save inp |
| 2389 » » &lea» ($s3,&DWP(16,$key));» # advance out |
| 2390 » » &mov» ($_out,$s3);» » # save out |
| 2391 » » &sub» ($s2,16);» » # decrease len |
| 2392 » » &cmp» ($s2,16); |
| 2393 » » &mov» ($_len,$s2);» » # save len |
| 2394 » &jae» (&label("slow_enc_loop_sse")); |
| 2395 » &test» ($s2,15); |
| 2396 » &jnz» (&label("slow_enc_tail")); |
| 2397 » &mov» ($acc,$_ivp);» » # load ivp |
| 2398 » &movq» (&QWP(0,$acc),"mm0");» # save ivec |
| 2399 » &movq» (&QWP(8,$acc),"mm4"); |
| 2400 » &emms» (); |
| 2401 » &mov» ("esp",$_esp); |
| 2402 » &popf» (); |
| 2403 » &function_end_A(); |
| 2404 » &pushf» ();» » » # kludge, never executed |
| 2405 » » » » » } |
| 2406 &set_label("slow_enc_x86",16); |
| 2407 » &mov» ($s0,&DWP(0,$key));» # load iv |
| 2408 » &mov» ($s1,&DWP(4,$key)); |
| 2409 |
| 2410 » &set_label("slow_enc_loop_x86",4); |
| 2411 » » &mov» ($s2,&DWP(8,$key)); |
| 2412 » » &mov» ($s3,&DWP(12,$key)); |
| 2413 |
| 2414 » » &xor» ($s0,&DWP(0,$acc));» # xor input data |
| 2415 » » &xor» ($s1,&DWP(4,$acc)); |
| 2416 » » &xor» ($s2,&DWP(8,$acc)); |
| 2417 » » &xor» ($s3,&DWP(12,$acc)); |
| 2418 |
| 2419 » » &mov» ($key,$_key);» » # load key |
| 2420 » » &call» ("_x86_AES_encrypt_compact"); |
| 2421 |
| 2422 » » &mov» ($acc,$_inp);» » # load inp |
| 2423 » » &mov» ($key,$_out);» » # load out |
| 2424 |
| 2425 » » &mov» (&DWP(0,$key),$s0);» # save output data |
| 2426 » » &mov» (&DWP(4,$key),$s1); |
| 2427 » » &mov» (&DWP(8,$key),$s2); |
| 2428 » » &mov» (&DWP(12,$key),$s3); |
| 2429 |
| 2430 » » &mov» ($s2,$_len);» » # load len |
| 2431 » » &lea» ($acc,&DWP(16,$acc));» # advance inp |
| 2432 » » &mov» ($_inp,$acc);» » # save inp |
| 2433 » » &lea» ($s3,&DWP(16,$key));» # advance out |
| 2434 » » &mov» ($_out,$s3);» » # save out |
| 2435 » » &sub» ($s2,16);» » # decrease len |
| 2436 » » &cmp» ($s2,16); |
| 2437 » » &mov» ($_len,$s2);» » # save len |
| 2438 » &jae» (&label("slow_enc_loop_x86")); |
| 2439 » &test» ($s2,15); |
| 2440 » &jnz» (&label("slow_enc_tail")); |
| 2441 » &mov» ($acc,$_ivp);» » # load ivp |
| 2442 » &mov» ($s2,&DWP(8,$key));» # restore last dwords |
| 2443 » &mov» ($s3,&DWP(12,$key)); |
| 2444 » &mov» (&DWP(0,$acc),$s0);» # save ivec |
| 2445 » &mov» (&DWP(4,$acc),$s1); |
| 2446 » &mov» (&DWP(8,$acc),$s2); |
| 2447 » &mov» (&DWP(12,$acc),$s3); |
| 2448 |
| 2449 » &mov» ("esp",$_esp); |
| 2450 » &popf» (); |
| 2451 » &function_end_A(); |
| 2452 » &pushf» ();» » » # kludge, never executed |
| 2453 |
| 2454 &set_label("slow_enc_tail",16); |
| 2455 » &emms» ()» if (!$x86only); |
| 2456 » &mov» ($key eq "edi"? $key:"",$s3);» # load out to edi |
| 2457 » &mov» ($s1,16); |
| 2458 » &sub» ($s1,$s2); |
| 2459 » &cmp» ($key,$acc eq "esi"? $acc:"");» # compare with inp |
| 2460 » &je» (&label("enc_in_place")); |
| 2461 » &align» (4); |
| 2462 » &data_word(0xA4F3F689);»# rep movsb» # copy input |
| 2463 » &jmp» (&label("enc_skip_in_place")); |
| 2464 &set_label("enc_in_place"); |
1192 &lea ($key,&DWP(0,$key,$s2)); | 2465 &lea ($key,&DWP(0,$key,$s2)); |
1193 » &lea» ($acc,&DWP(16,$acc,$s2)); | 2466 &set_label("enc_skip_in_place"); |
1194 » &neg» ($s2 eq "ecx" ? $s2 : ""); | 2467 » &mov» ($s2,$s1); |
1195 » &data_word(0xA4F3F689);»# rep movsb» # restore tail | 2468 » &xor» ($s0,$s0); |
1196 | 2469 » &align» (4); |
1197 &align» (4); | 2470 » &data_word(0xAAF3F689);»# rep stosb» # zero tail |
1198 &set_label("dec_out"); | 2471 |
1199 &cmp» ($mark,0);» » # was the key schedule copied? | 2472 » &mov» ($key,$_ivp);» » » # restore ivp |
1200 &mov» ("edi",$_key); | 2473 » &mov» ($acc,$s3);» » » # output as input |
1201 &je»» (&label("skip_dzero")); | 2474 » &mov» ($s0,&DWP(0,$key)); |
1202 # zero copy of key schedule | 2475 » &mov» ($s1,&DWP(4,$key)); |
1203 &mov» ("ecx",240/4); | 2476 » &mov» ($_len,16);» » » # len=16 |
1204 &xor» ("eax","eax"); | 2477 » &jmp» (&label("slow_enc_loop_x86"));» # one more spin... |
1205 &align» (4); | 2478 |
1206 &data_word(0xABF3F689);» # rep stosd | 2479 #--------------------------- SLOW DECRYPT ---------------------------# |
1207 &set_label("skip_dzero") | 2480 &set_label("slow_decrypt",16); |
1208 &mov» ("esp",$_esp); | 2481 » » » » » if (!$x86only) { |
1209 &popf» (); | 2482 » &bt» ($_tmp,25);» » # check for SSE bit |
| 2483 » &jnc» (&label("slow_dec_loop_x86")); |
| 2484 |
| 2485 » &set_label("slow_dec_loop_sse",4); |
| 2486 » » &movq» ("mm0",&QWP(0,$acc));» # read input |
| 2487 » » &movq» ("mm4",&QWP(8,$acc)); |
| 2488 |
| 2489 » » &mov» ($key,$_key); |
| 2490 » » &call» ("_sse_AES_decrypt_compact"); |
| 2491 |
| 2492 » » &mov» ($acc,$_inp);» » # load inp |
| 2493 » » &lea» ($s0,$ivec); |
| 2494 » » &mov» ($s1,$_out);» » # load out |
| 2495 » » &mov» ($s2,$_len);» » # load len |
| 2496 » » &mov» ($key,$_ivp);» » # load ivp |
| 2497 |
| 2498 » » &movq» ("mm1",&QWP(0,$acc));» # re-read input |
| 2499 » » &movq» ("mm5",&QWP(8,$acc)); |
| 2500 |
| 2501 » » &pxor» ("mm0",&QWP(0,$key));» # xor iv |
| 2502 » » &pxor» ("mm4",&QWP(8,$key)); |
| 2503 |
| 2504 » » &movq» (&QWP(0,$key),"mm1");» # copy input to iv |
| 2505 » » &movq» (&QWP(8,$key),"mm5"); |
| 2506 |
| 2507 » » &sub» ($s2,16);» » # decrease len |
| 2508 » » &jc» (&label("slow_dec_partial_sse")); |
| 2509 |
| 2510 » » &movq» (&QWP(0,$s1),"mm0");» # write output |
| 2511 » » &movq» (&QWP(8,$s1),"mm4"); |
| 2512 |
| 2513 » » &lea» ($s1,&DWP(16,$s1));» # advance out |
| 2514 » » &mov» ($_out,$s1);» » # save out |
| 2515 » » &lea» ($acc,&DWP(16,$acc));» # advance inp |
| 2516 » » &mov» ($_inp,$acc);» » # save inp |
| 2517 » » &mov» ($_len,$s2);» » # save len |
| 2518 » &jnz» (&label("slow_dec_loop_sse")); |
| 2519 » &emms» (); |
| 2520 » &mov» ("esp",$_esp); |
| 2521 » &popf» (); |
| 2522 » &function_end_A(); |
| 2523 » &pushf» ();» » » # kludge, never executed |
| 2524 |
| 2525 &set_label("slow_dec_partial_sse",16); |
| 2526 » &movq» (&QWP(0,$s0),"mm0");» # save output to temp |
| 2527 » &movq» (&QWP(8,$s0),"mm4"); |
| 2528 » &emms» (); |
| 2529 |
| 2530 » &add» ($s2 eq "ecx" ? "ecx":"",16); |
| 2531 » &mov» ("edi",$s1);» » # out |
| 2532 » &mov» ("esi",$s0);» » # temp |
| 2533 » &align» (4); |
| 2534 » &data_word(0xA4F3F689);»» # rep movsb # copy partial output |
| 2535 |
| 2536 » &mov» ("esp",$_esp); |
| 2537 » &popf» (); |
| 2538 » &function_end_A(); |
| 2539 » &pushf» ();» » » # kludge, never executed |
| 2540 » » » » » } |
| 2541 » &set_label("slow_dec_loop_x86",16); |
| 2542 » » &mov» ($s0,&DWP(0,$acc));» # read input |
| 2543 » » &mov» ($s1,&DWP(4,$acc)); |
| 2544 » » &mov» ($s2,&DWP(8,$acc)); |
| 2545 » » &mov» ($s3,&DWP(12,$acc)); |
| 2546 |
| 2547 » » &lea» ($key,$ivec); |
| 2548 » » &mov» (&DWP(0,$key),$s0);» # copy to temp |
| 2549 » » &mov» (&DWP(4,$key),$s1); |
| 2550 » » &mov» (&DWP(8,$key),$s2); |
| 2551 » » &mov» (&DWP(12,$key),$s3); |
| 2552 |
| 2553 » » &mov» ($key,$_key);» » # load key |
| 2554 » » &call» ("_x86_AES_decrypt_compact"); |
| 2555 |
| 2556 » » &mov» ($key,$_ivp);» » # load ivp |
| 2557 » » &mov» ($acc,$_len);» » # load len |
| 2558 » » &xor» ($s0,&DWP(0,$key));» # xor iv |
| 2559 » » &xor» ($s1,&DWP(4,$key)); |
| 2560 » » &xor» ($s2,&DWP(8,$key)); |
| 2561 » » &xor» ($s3,&DWP(12,$key)); |
| 2562 |
| 2563 » » &sub» ($acc,16); |
| 2564 » » &jc» (&label("slow_dec_partial_x86")); |
| 2565 |
| 2566 » » &mov» ($_len,$acc);» » # save len |
| 2567 » » &mov» ($acc,$_out);» » # load out |
| 2568 |
| 2569 » » &mov» (&DWP(0,$acc),$s0);» # write output |
| 2570 » » &mov» (&DWP(4,$acc),$s1); |
| 2571 » » &mov» (&DWP(8,$acc),$s2); |
| 2572 » » &mov» (&DWP(12,$acc),$s3); |
| 2573 |
| 2574 » » &lea» ($acc,&DWP(16,$acc));» # advance out |
| 2575 » » &mov» ($_out,$acc);» » # save out |
| 2576 |
| 2577 » » &lea» ($acc,$ivec); |
| 2578 » » &mov» ($s0,&DWP(0,$acc));» # read temp |
| 2579 » » &mov» ($s1,&DWP(4,$acc)); |
| 2580 » » &mov» ($s2,&DWP(8,$acc)); |
| 2581 » » &mov» ($s3,&DWP(12,$acc)); |
| 2582 |
| 2583 » » &mov» (&DWP(0,$key),$s0);» # copy it to iv |
| 2584 » » &mov» (&DWP(4,$key),$s1); |
| 2585 » » &mov» (&DWP(8,$key),$s2); |
| 2586 » » &mov» (&DWP(12,$key),$s3); |
| 2587 |
| 2588 » » &mov» ($acc,$_inp);» » # load inp |
| 2589 » » &lea» ($acc,&DWP(16,$acc));» # advance inp |
| 2590 » » &mov» ($_inp,$acc);» » # save inp |
| 2591 » &jnz» (&label("slow_dec_loop_x86")); |
| 2592 » &mov» ("esp",$_esp); |
| 2593 » &popf» (); |
| 2594 » &function_end_A(); |
| 2595 » &pushf» ();» » » # kludge, never executed |
| 2596 |
| 2597 &set_label("slow_dec_partial_x86",16); |
| 2598 » &lea» ($acc,$ivec); |
| 2599 » &mov» (&DWP(0,$acc),$s0);» # save output to temp |
| 2600 » &mov» (&DWP(4,$acc),$s1); |
| 2601 » &mov» (&DWP(8,$acc),$s2); |
| 2602 » &mov» (&DWP(12,$acc),$s3); |
| 2603 |
| 2604 » &mov» ($acc,$_inp); |
| 2605 » &mov» ($s0,&DWP(0,$acc));» # re-read input |
| 2606 » &mov» ($s1,&DWP(4,$acc)); |
| 2607 » &mov» ($s2,&DWP(8,$acc)); |
| 2608 » &mov» ($s3,&DWP(12,$acc)); |
| 2609 |
| 2610 » &mov» (&DWP(0,$key),$s0);» # copy it to iv |
| 2611 » &mov» (&DWP(4,$key),$s1); |
| 2612 » &mov» (&DWP(8,$key),$s2); |
| 2613 » &mov» (&DWP(12,$key),$s3); |
| 2614 |
| 2615 » &mov» ("ecx",$_len); |
| 2616 » &mov» ("edi",$_out); |
| 2617 » &lea» ("esi",$ivec); |
| 2618 » &align» (4); |
| 2619 » &data_word(0xA4F3F689);»» # rep movsb # copy partial output |
| 2620 |
| 2621 » &mov» ("esp",$_esp); |
| 2622 » &popf» (); |
1210 &function_end("AES_cbc_encrypt"); | 2623 &function_end("AES_cbc_encrypt"); |
1211 } | 2624 } |
1212 | 2625 |
1213 #------------------------------------------------------------------# | 2626 #------------------------------------------------------------------# |
1214 | 2627 |
1215 sub enckey() | 2628 sub enckey() |
1216 { | 2629 { |
1217 &movz ("esi",&LB("edx")); # rk[i]>>0 | 2630 &movz ("esi",&LB("edx")); # rk[i]>>0 |
1218 » &mov» ("ebx",&DWP(2,"ebp","esi",8)); | 2631 » &movz» ("ebx",&BP(-128,$tbl,"esi",1)); |
1219 &movz ("esi",&HB("edx")); # rk[i]>>8 | 2632 &movz ("esi",&HB("edx")); # rk[i]>>8 |
1220 » &and» ("ebx",0xFF000000); | 2633 » &shl» ("ebx",24); |
1221 &xor ("eax","ebx"); | 2634 &xor ("eax","ebx"); |
1222 | 2635 |
1223 » &mov» ("ebx",&DWP(2,"ebp","esi",8)); | 2636 » &movz» ("ebx",&BP(-128,$tbl,"esi",1)); |
1224 &shr ("edx",16); | 2637 &shr ("edx",16); |
1225 &and ("ebx",0x000000FF); | |
1226 &movz ("esi",&LB("edx")); # rk[i]>>16 | 2638 &movz ("esi",&LB("edx")); # rk[i]>>16 |
1227 &xor ("eax","ebx"); | 2639 &xor ("eax","ebx"); |
1228 | 2640 |
1229 » &mov» ("ebx",&DWP(0,"ebp","esi",8)); | 2641 » &movz» ("ebx",&BP(-128,$tbl,"esi",1)); |
1230 &movz ("esi",&HB("edx")); # rk[i]>>24 | 2642 &movz ("esi",&HB("edx")); # rk[i]>>24 |
1231 » &and» ("ebx",0x0000FF00); | 2643 » &shl» ("ebx",8); |
1232 &xor ("eax","ebx"); | 2644 &xor ("eax","ebx"); |
1233 | 2645 |
1234 » &mov» ("ebx",&DWP(0,"ebp","esi",8)); | 2646 » &movz» ("ebx",&BP(-128,$tbl,"esi",1)); |
1235 » &and» ("ebx",0x00FF0000); | 2647 » &shl» ("ebx",16); |
1236 &xor ("eax","ebx"); | 2648 &xor ("eax","ebx"); |
1237 | 2649 |
1238 » &xor» ("eax",&DWP(2048,"ebp","ecx",4));» # rcon | 2650 » &xor» ("eax",&DWP(1024-128,$tbl,"ecx",4));» # rcon |
1239 } | 2651 } |
1240 | 2652 |
1241 # int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | 2653 &function_begin("_x86_AES_set_encrypt_key"); |
1242 # AES_KEY *key) | 2654 » &mov» ("esi",&wparam(1));» » # user supplied key |
1243 &public_label("AES_Te"); | 2655 » &mov» ("edi",&wparam(3));» » # private key schedule |
1244 &function_begin("AES_set_encrypt_key"); | |
1245 » &mov» ("esi",&wparam(0));» » # user supplied key | |
1246 » &mov» ("edi",&wparam(2));» » # private key schedule | |
1247 | 2656 |
1248 &test ("esi",-1); | 2657 &test ("esi",-1); |
1249 &jz (&label("badpointer")); | 2658 &jz (&label("badpointer")); |
1250 &test ("edi",-1); | 2659 &test ("edi",-1); |
1251 &jz (&label("badpointer")); | 2660 &jz (&label("badpointer")); |
1252 | 2661 |
1253 &call (&label("pic_point")); | 2662 &call (&label("pic_point")); |
1254 &set_label("pic_point"); | 2663 &set_label("pic_point"); |
1255 » &blindpop("ebp"); | 2664 » &blindpop($tbl); |
1256 » &lea» ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); | 2665 » &lea» ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl)); |
| 2666 » &lea» ($tbl,&DWP(2048+128,$tbl)); |
1257 | 2667 |
1258 » &mov» ("ecx",&wparam(1));» » # number of bits in key | 2668 » # prefetch Te4 |
| 2669 » &mov» ("eax",&DWP(0-128,$tbl)); |
| 2670 » &mov» ("ebx",&DWP(32-128,$tbl)); |
| 2671 » &mov» ("ecx",&DWP(64-128,$tbl)); |
| 2672 » &mov» ("edx",&DWP(96-128,$tbl)); |
| 2673 » &mov» ("eax",&DWP(128-128,$tbl)); |
| 2674 » &mov» ("ebx",&DWP(160-128,$tbl)); |
| 2675 » &mov» ("ecx",&DWP(192-128,$tbl)); |
| 2676 » &mov» ("edx",&DWP(224-128,$tbl)); |
| 2677 |
| 2678 » &mov» ("ecx",&wparam(2));» » # number of bits in key |
1259 &cmp ("ecx",128); | 2679 &cmp ("ecx",128); |
1260 &je (&label("10rounds")); | 2680 &je (&label("10rounds")); |
1261 &cmp ("ecx",192); | 2681 &cmp ("ecx",192); |
1262 &je (&label("12rounds")); | 2682 &je (&label("12rounds")); |
1263 &cmp ("ecx",256); | 2683 &cmp ("ecx",256); |
1264 &je (&label("14rounds")); | 2684 &je (&label("14rounds")); |
1265 &mov ("eax",-2); # invalid number of bits | 2685 &mov ("eax",-2); # invalid number of bits |
1266 &jmp (&label("exit")); | 2686 &jmp (&label("exit")); |
1267 | 2687 |
1268 &set_label("10rounds"); | 2688 &set_label("10rounds"); |
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1387 &xor ("eax",&DWP(12,"edi")); | 2807 &xor ("eax",&DWP(12,"edi")); |
1388 &mov (&DWP(44,"edi"),"eax"); # rk[11] | 2808 &mov (&DWP(44,"edi"),"eax"); # rk[11] |
1389 | 2809 |
1390 &cmp ("ecx",6); | 2810 &cmp ("ecx",6); |
1391 &je (&label("14break")); | 2811 &je (&label("14break")); |
1392 &inc ("ecx"); | 2812 &inc ("ecx"); |
1393 | 2813 |
1394 &mov ("edx","eax"); | 2814 &mov ("edx","eax"); |
1395 &mov ("eax",&DWP(16,"edi")); # rk[4] | 2815 &mov ("eax",&DWP(16,"edi")); # rk[4] |
1396 &movz ("esi",&LB("edx")); # rk[11]>>0 | 2816 &movz ("esi",&LB("edx")); # rk[11]>>0 |
1397 » » &mov» ("ebx",&DWP(2,"ebp","esi",8)); | 2817 » » &movz» ("ebx",&BP(-128,$tbl,"esi",1)); |
1398 &movz ("esi",&HB("edx")); # rk[11]>>8 | 2818 &movz ("esi",&HB("edx")); # rk[11]>>8 |
1399 &and ("ebx",0x000000FF); | |
1400 &xor ("eax","ebx"); | 2819 &xor ("eax","ebx"); |
1401 | 2820 |
1402 » » &mov» ("ebx",&DWP(0,"ebp","esi",8)); | 2821 » » &movz» ("ebx",&BP(-128,$tbl,"esi",1)); |
1403 &shr ("edx",16); | 2822 &shr ("edx",16); |
1404 » » &and» ("ebx",0x0000FF00); | 2823 » » &shl» ("ebx",8); |
1405 &movz ("esi",&LB("edx")); # rk[11]>>16 | 2824 &movz ("esi",&LB("edx")); # rk[11]>>16 |
1406 &xor ("eax","ebx"); | 2825 &xor ("eax","ebx"); |
1407 | 2826 |
1408 » » &mov» ("ebx",&DWP(0,"ebp","esi",8)); | 2827 » » &movz» ("ebx",&BP(-128,$tbl,"esi",1)); |
1409 &movz ("esi",&HB("edx")); # rk[11]>>24 | 2828 &movz ("esi",&HB("edx")); # rk[11]>>24 |
1410 » » &and» ("ebx",0x00FF0000); | 2829 » » &shl» ("ebx",16); |
1411 &xor ("eax","ebx"); | 2830 &xor ("eax","ebx"); |
1412 | 2831 |
1413 » » &mov» ("ebx",&DWP(2,"ebp","esi",8)); | 2832 » » &movz» ("ebx",&BP(-128,$tbl,"esi",1)); |
1414 » » &and» ("ebx",0xFF000000); | 2833 » » &shl» ("ebx",24); |
1415 &xor ("eax","ebx"); | 2834 &xor ("eax","ebx"); |
1416 | 2835 |
1417 &mov (&DWP(48,"edi"),"eax"); # rk[12] | 2836 &mov (&DWP(48,"edi"),"eax"); # rk[12] |
1418 &xor ("eax",&DWP(20,"edi")); | 2837 &xor ("eax",&DWP(20,"edi")); |
1419 &mov (&DWP(52,"edi"),"eax"); # rk[13] | 2838 &mov (&DWP(52,"edi"),"eax"); # rk[13] |
1420 &xor ("eax",&DWP(24,"edi")); | 2839 &xor ("eax",&DWP(24,"edi")); |
1421 &mov (&DWP(56,"edi"),"eax"); # rk[14] | 2840 &mov (&DWP(56,"edi"),"eax"); # rk[14] |
1422 &xor ("eax",&DWP(28,"edi")); | 2841 &xor ("eax",&DWP(28,"edi")); |
1423 &mov (&DWP(60,"edi"),"eax"); # rk[15] | 2842 &mov (&DWP(60,"edi"),"eax"); # rk[15] |
1424 | 2843 |
1425 &add ("edi",32); | 2844 &add ("edi",32); |
1426 &jmp (&label("14loop")); | 2845 &jmp (&label("14loop")); |
1427 | 2846 |
1428 &set_label("14break"); | 2847 &set_label("14break"); |
1429 &mov (&DWP(48,"edi"),14); # setup number of rounds | 2848 &mov (&DWP(48,"edi"),14); # setup number of rounds |
1430 &xor ("eax","eax"); | 2849 &xor ("eax","eax"); |
1431 &jmp (&label("exit")); | 2850 &jmp (&label("exit")); |
1432 | 2851 |
1433 &set_label("badpointer"); | 2852 &set_label("badpointer"); |
1434 &mov ("eax",-1); | 2853 &mov ("eax",-1); |
1435 &set_label("exit"); | 2854 &set_label("exit"); |
1436 &function_end("AES_set_encrypt_key"); | 2855 &function_end("_x86_AES_set_encrypt_key"); |
| 2856 |
| 2857 # int AES_set_encrypt_key(const unsigned char *userKey, const int bits, |
| 2858 # AES_KEY *key) |
| 2859 &function_begin_B("AES_set_encrypt_key"); |
| 2860 » &call» ("_x86_AES_set_encrypt_key"); |
| 2861 » &ret» (); |
| 2862 &function_end_B("AES_set_encrypt_key"); |
1437 | 2863 |
1438 sub deckey() | 2864 sub deckey() |
1439 { my ($i,$ptr,$te,$td) = @_; | 2865 { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; |
| 2866 my $tmp = $tbl; |
1440 | 2867 |
1441 » &mov» ("eax",&DWP($i,$ptr)); | 2868 » &mov» ($acc,$tp1); |
1442 » &mov» ("edx","eax"); | 2869 » &and» ($acc,0x80808080); |
1443 » &movz» ("ebx",&HB("eax")); | 2870 » &mov» ($tmp,$acc); |
1444 » &shr» ("edx",16); | 2871 » &shr» ($tmp,7); |
1445 » &and» ("eax",0xFF); | 2872 » &lea» ($tp2,&DWP(0,$tp1,$tp1)); |
1446 » &movz» ("eax",&BP(2,$te,"eax",8)); | 2873 » &sub» ($acc,$tmp); |
1447 » &movz» ("ebx",&BP(2,$te,"ebx",8)); | 2874 » &and» ($tp2,0xfefefefe); |
1448 » &mov» ("eax",&DWP(0,$td,"eax",8)); | 2875 » &and» ($acc,0x1b1b1b1b); |
1449 » &xor» ("eax",&DWP(3,$td,"ebx",8)); | 2876 » &xor» ($acc,$tp2); |
1450 » &movz» ("ebx",&HB("edx")); | 2877 » &mov» ($tp2,$acc); |
1451 » &and» ("edx",0xFF); | 2878 |
1452 » &movz» ("edx",&BP(2,$te,"edx",8)); | 2879 » &and» ($acc,0x80808080); |
1453 » &movz» ("ebx",&BP(2,$te,"ebx",8)); | 2880 » &mov» ($tmp,$acc); |
1454 » &xor» ("eax",&DWP(2,$td,"edx",8)); | 2881 » &shr» ($tmp,7); |
1455 » &xor» ("eax",&DWP(1,$td,"ebx",8)); | 2882 » &lea» ($tp4,&DWP(0,$tp2,$tp2)); |
1456 » &mov» (&DWP($i,$ptr),"eax"); | 2883 » &sub» ($acc,$tmp); |
| 2884 » &and» ($tp4,0xfefefefe); |
| 2885 » &and» ($acc,0x1b1b1b1b); |
| 2886 » &xor» ($tp2,$tp1);» # tp2^tp1 |
| 2887 » &xor» ($acc,$tp4); |
| 2888 » &mov» ($tp4,$acc); |
| 2889 |
| 2890 » &and» ($acc,0x80808080); |
| 2891 » &mov» ($tmp,$acc); |
| 2892 » &shr» ($tmp,7); |
| 2893 » &lea» ($tp8,&DWP(0,$tp4,$tp4)); |
| 2894 » &xor» ($tp4,$tp1);» # tp4^tp1 |
| 2895 » &sub» ($acc,$tmp); |
| 2896 » &and» ($tp8,0xfefefefe); |
| 2897 » &and» ($acc,0x1b1b1b1b); |
| 2898 » &rotl» ($tp1,8);» # = ROTATE(tp1,8) |
| 2899 » &xor» ($tp8,$acc); |
| 2900 |
| 2901 » &mov» ($tmp,&DWP(4*($i+1),$key));» # modulo-scheduled load |
| 2902 |
| 2903 » &xor» ($tp1,$tp2); |
| 2904 » &xor» ($tp2,$tp8); |
| 2905 » &xor» ($tp1,$tp4); |
| 2906 » &rotl» ($tp2,24); |
| 2907 » &xor» ($tp4,$tp8); |
| 2908 » &xor» ($tp1,$tp8);» # ^= tp8^(tp4^tp1)^(tp2^tp1) |
| 2909 » &rotl» ($tp4,16); |
| 2910 » &xor» ($tp1,$tp2);» # ^= ROTATE(tp8^tp2^tp1,24) |
| 2911 » &rotl» ($tp8,8); |
| 2912 » &xor» ($tp1,$tp4);» # ^= ROTATE(tp8^tp4^tp1,16) |
| 2913 » &mov» ($tp2,$tmp); |
| 2914 » &xor» ($tp1,$tp8);» # ^= ROTATE(tp8,8) |
| 2915 |
| 2916 » &mov» (&DWP(4*$i,$key),$tp1); |
1457 } | 2917 } |
1458 | 2918 |
1459 # int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | 2919 # int AES_set_decrypt_key(const unsigned char *userKey, const int bits, |
1460 # AES_KEY *key) | 2920 # AES_KEY *key) |
1461 &public_label("AES_Td"); | |
1462 &public_label("AES_Te"); | |
1463 &function_begin_B("AES_set_decrypt_key"); | 2921 &function_begin_B("AES_set_decrypt_key"); |
1464 » &mov» ("eax",&wparam(0)); | 2922 » &call» ("_x86_AES_set_encrypt_key"); |
1465 » &mov» ("ecx",&wparam(1)); | |
1466 » &mov» ("edx",&wparam(2)); | |
1467 » &sub» ("esp",12); | |
1468 » &mov» (&DWP(0,"esp"),"eax"); | |
1469 » &mov» (&DWP(4,"esp"),"ecx"); | |
1470 » &mov» (&DWP(8,"esp"),"edx"); | |
1471 » &call» ("AES_set_encrypt_key"); | |
1472 » &add» ("esp",12); | |
1473 &cmp ("eax",0); | 2923 &cmp ("eax",0); |
1474 &je (&label("proceed")); | 2924 &je (&label("proceed")); |
1475 &ret (); | 2925 &ret (); |
1476 | 2926 |
1477 &set_label("proceed"); | 2927 &set_label("proceed"); |
1478 &push ("ebp"); | 2928 &push ("ebp"); |
1479 &push ("ebx"); | 2929 &push ("ebx"); |
1480 &push ("esi"); | 2930 &push ("esi"); |
1481 &push ("edi"); | 2931 &push ("edi"); |
1482 | 2932 |
1483 &mov ("esi",&wparam(2)); | 2933 &mov ("esi",&wparam(2)); |
1484 &mov ("ecx",&DWP(240,"esi")); # pull number of rounds | 2934 &mov ("ecx",&DWP(240,"esi")); # pull number of rounds |
1485 &lea ("ecx",&DWP(0,"","ecx",4)); | 2935 &lea ("ecx",&DWP(0,"","ecx",4)); |
1486 &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk | 2936 &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk |
1487 | 2937 |
1488 » &align» (4); | 2938 » &set_label("invert",4);»» » # invert order of chunks |
1489 » &set_label("invert");» » » # invert order of chunks | |
1490 &mov ("eax",&DWP(0,"esi")); | 2939 &mov ("eax",&DWP(0,"esi")); |
1491 &mov ("ebx",&DWP(4,"esi")); | 2940 &mov ("ebx",&DWP(4,"esi")); |
1492 &mov ("ecx",&DWP(0,"edi")); | 2941 &mov ("ecx",&DWP(0,"edi")); |
1493 &mov ("edx",&DWP(4,"edi")); | 2942 &mov ("edx",&DWP(4,"edi")); |
1494 &mov (&DWP(0,"edi"),"eax"); | 2943 &mov (&DWP(0,"edi"),"eax"); |
1495 &mov (&DWP(4,"edi"),"ebx"); | 2944 &mov (&DWP(4,"edi"),"ebx"); |
1496 &mov (&DWP(0,"esi"),"ecx"); | 2945 &mov (&DWP(0,"esi"),"ecx"); |
1497 &mov (&DWP(4,"esi"),"edx"); | 2946 &mov (&DWP(4,"esi"),"edx"); |
1498 &mov ("eax",&DWP(8,"esi")); | 2947 &mov ("eax",&DWP(8,"esi")); |
1499 &mov ("ebx",&DWP(12,"esi")); | 2948 &mov ("ebx",&DWP(12,"esi")); |
1500 &mov ("ecx",&DWP(8,"edi")); | 2949 &mov ("ecx",&DWP(8,"edi")); |
1501 &mov ("edx",&DWP(12,"edi")); | 2950 &mov ("edx",&DWP(12,"edi")); |
1502 &mov (&DWP(8,"edi"),"eax"); | 2951 &mov (&DWP(8,"edi"),"eax"); |
1503 &mov (&DWP(12,"edi"),"ebx"); | 2952 &mov (&DWP(12,"edi"),"ebx"); |
1504 &mov (&DWP(8,"esi"),"ecx"); | 2953 &mov (&DWP(8,"esi"),"ecx"); |
1505 &mov (&DWP(12,"esi"),"edx"); | 2954 &mov (&DWP(12,"esi"),"edx"); |
1506 &add ("esi",16); | 2955 &add ("esi",16); |
1507 &sub ("edi",16); | 2956 &sub ("edi",16); |
1508 &cmp ("esi","edi"); | 2957 &cmp ("esi","edi"); |
1509 &jne (&label("invert")); | 2958 &jne (&label("invert")); |
1510 | 2959 |
1511 » &call» (&label("pic_point")); | 2960 » &mov» ($key,&wparam(2)); |
1512 » &set_label("pic_point"); | 2961 » &mov» ($acc,&DWP(240,$key));» » # pull number of rounds |
1513 » blindpop("ebp"); | 2962 » &lea» ($acc,&DWP(-2,$acc,$acc)); |
1514 » &lea» ("edi",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); | 2963 » &lea» ($acc,&DWP(0,$key,$acc,8)); |
1515 » &lea» ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); | 2964 » &mov» (&wparam(2),$acc); |
1516 | 2965 |
1517 » &mov» ("esi",&wparam(2)); | 2966 » &mov» ($s0,&DWP(16,$key));» » # modulo-scheduled load |
1518 » &mov» ("ecx",&DWP(240,"esi"));» # pull number of rounds | 2967 » &set_label("permute",4);» » # permute the key schedule |
1519 » &dec» ("ecx"); | 2968 » » &add» ($key,16); |
1520 » &align» (4); | 2969 » » &deckey»(0,$key,$s0,$s1,$s2,$s3); |
1521 » &set_label("permute");» » » # permute the key schedule | 2970 » » &deckey»(1,$key,$s1,$s2,$s3,$s0); |
1522 » » &add» ("esi",16); | 2971 » » &deckey»(2,$key,$s2,$s3,$s0,$s1); |
1523 » » &deckey»(0,"esi","ebp","edi"); | 2972 » » &deckey»(3,$key,$s3,$s0,$s1,$s2); |
1524 » » &deckey»(4,"esi","ebp","edi"); | 2973 » » &cmp» ($key,&wparam(2)); |
1525 » » &deckey»(8,"esi","ebp","edi"); | 2974 » &jb» (&label("permute")); |
1526 » » &deckey»(12,"esi","ebp","edi"); | |
1527 » » &dec» ("ecx"); | |
1528 » &jnz» (&label("permute")); | |
1529 | 2975 |
1530 &xor ("eax","eax"); # return success | 2976 &xor ("eax","eax"); # return success |
1531 &function_end("AES_set_decrypt_key"); | 2977 &function_end("AES_set_decrypt_key"); |
| 2978 &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>"); |
1532 | 2979 |
1533 &asm_finish(); | 2980 &asm_finish(); |
OLD | NEW |