openssl/crypto/aes/asm/aes-586.pl - Issue 2072073002: Delete bundled copy of OpenSSL and replace with README.

Side by Side Diff: openssl/crypto/aes/asm/aes-586.pl

Issue 2072073002: Delete bundled copy of OpenSSL and replace with README. (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/openssl@master

Patch Set: Delete bundled copy of OpenSSL and replace with README. Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 #!/usr/bin/env perl

2 #

3 # ====================================================================

4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL

5 # project. The module is, however, dual licensed under OpenSSL and

6 # CRYPTOGAMS licenses depending on where you obtain it. For further

7 # details see http://www.openssl.org/~appro/cryptogams/.

8 # ====================================================================

9 #

10 # Version 4.3.

11 #

12 # You might fail to appreciate this module performance from the first

13 # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered

14 # to be the best Intel C compiler without -KPIC, performance appears

15 # to be virtually identical... But try to re-configure with shared

16 # library support... Aha! Intel compiler "suddenly" lags behind by 30%

17 # [on P4, more on others]:-) And if compared to position-independent

18 # code generated by GNU C, this code performs more than twice as

19 # fast! Yes, all this buzz about PIC means that unlike other hand-

20 # coded implementations, this one was explicitly designed to be safe

21 # to use even in shared library context... This also means that this

22 # code isn't necessarily absolutely fastest "ever," because in order

23 # to achieve position independence an extra register has to be

24 # off-loaded to stack, which affects the benchmark result.

25 #

26 # Special note about instruction choice. Do you recall RC4_INT code

27 # performing poorly on P4? It might be the time to figure out why.

28 # RC4_INT code implies effective address calculations in base+offset*4

29 # form. Trouble is that it seems that offset scaling turned to be

30 # critical path... At least eliminating scaling resulted in 2.8x RC4

31 # performance improvement [as you might recall]. As AES code is hungry

32 # for scaling too, I [try to] avoid the latter by favoring off-by-2

33 # shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.

34 #

35 # As was shown by Dean Gaudet <dean@arctic.org>, the above note turned

36 # void. Performance improvement with off-by-2 shifts was observed on

37 # intermediate implementation, which was spilling yet another register

38 # to stack... Final offset*4 code below runs just a tad faster on P4,

39 # but exhibits up to 10% improvement on other cores.

40 #

41 # Second version is "monolithic" replacement for aes_core.c, which in

42 # addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key.

43 # This made it possible to implement little-endian variant of the

44 # algorithm without modifying the base C code. Motivating factor for

45 # the undertaken effort was that it appeared that in tight IA-32

46 # register window little-endian flavor could achieve slightly higher

47 # Instruction Level Parallelism, and it indeed resulted in up to 15%

48 # better performance on most recent µ-archs...

49 #

50 # Third version adds AES_cbc_encrypt implementation, which resulted in

51 # up to 40% performance improvement of CBC benchmark results. 40% was

52 # observed on P4 core, where "overall" improvement coefficient, i.e. if

53 # compared to PIC generated by GCC and in CBC mode, was observed to be

54 # as large as 4x:-) CBC performance is virtually identical to ECB now

55 # and on some platforms even better, e.g. 17.6 "small" cycles/byte on

56 # Opteron, because certain function prologues and epilogues are

57 # effectively taken out of the loop...

58 #

59 # Version 3.2 implements compressed tables and prefetch of these tables

60 # in CBC[!] mode. Former means that 3/4 of table references are now

61 # misaligned, which unfortunately has negative impact on elder IA-32

62 # implementations, Pentium suffered 30% penalty, PIII - 10%.

63 #

64 # Version 3.3 avoids L1 cache aliasing between stack frame and

65 # S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The

66 # latter is achieved by copying the key schedule to controlled place in

67 # stack. This unfortunately has rather strong impact on small block CBC

68 # performance, ~2x deterioration on 16-byte block if compared to 3.3.

69 #

70 # Version 3.5 checks if there is L1 cache aliasing between user-supplied

71 # key schedule and S-boxes and abstains from copying the former if

72 # there is no. This allows end-user to consciously retain small block

73 # performance by aligning key schedule in specific manner.

74 #

75 # Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB.

76 #

77 # Current ECB performance numbers for 128-bit key in CPU cycles per

78 # processed byte [measure commonly used by AES benchmarkers] are:

79 #

80 #            small footprint    fully unrolled

81 # P4         24                 22

82 # AMD K8     20                 19

83 # PIII       25                 23

84 # Pentium    81                 78

85 #

86 # Version 3.7 reimplements outer rounds as "compact." Meaning that

87 # first and last rounds reference compact 256 bytes S-box. This means

88 # that first round consumes a lot more CPU cycles and that encrypt

89 # and decrypt performance becomes asymmetric. Encrypt performance

90 # drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is

91 # aggressively pre-fetched.

92 #

93 # Version 4.0 effectively rolls back to 3.6 and instead implements

94 # additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,

95 # which use exclusively 256 byte S-box. These functions are to be

96 # called in modes not concealing plain text, such as ECB, or when

97 # we're asked to process smaller amount of data [or unconditionally

98 # on hyper-threading CPU]. Currently it's called unconditionally from

99 # AES_[en|de]crypt, which affects all modes, but CBC. CBC routine

100 # still needs to be modified to switch between slower and faster

101 # mode when appropriate... But in either case benchmark landscape

102 # changes dramatically and below numbers are CPU cycles per processed

103 # byte for 128-bit key.

104 #

105 #            ECB encrypt    ECB decrypt    CBC large chunk

106 # P4         56[60]         84[100]        23

107 # AMD K8     48[44]         70[79]         18

108 # PIII       41[50]         61[91]         24

109 # Core 2     32[38]         45[70]         18.5

110 # Pentium    120            160            77

111 #

112 # Version 4.1 switches to compact S-box even in key schedule setup.

113 #

114 # Version 4.2 prefetches compact S-box in every SSE round or in other

115 # words every cache-line is guaranteed to be accessed within ~50

116 # cycles window. Why just SSE? Because it's needed on hyper-threading

117 # CPU! Which is also why it's prefetched with 64 byte stride. Best

118 # part is that it has no negative effect on performance:-)

119 #

120 # Version 4.3 implements switch between compact and non-compact block

121 # functions in AES_cbc_encrypt depending on how much data was asked

122 # to be processed in one stroke.

123 #

124 ######################################################################

125 # Timing attacks are classified in two classes: synchronous when

126 # attacker consciously initiates cryptographic operation and collects

127 # timing data of various character afterwards, and asynchronous when

128 # malicious code is executed on same CPU simultaneously with AES,

129 # instruments itself and performs statistical analysis of this data.

130 #

131 # As far as synchronous attacks go the root to the AES timing

132 # vulnerability is twofold. Firstly, of 256 S-box elements at most 160

133 # are referred to in single 128-bit block operation. Well, in C

134 # implementation with 4 distinct tables it's actually as little as 40

135 # references per 256 elements table, but anyway... Secondly, even

136 # though S-box elements are clustered into smaller amount of cache-

137 # lines, smaller than 160 and even 40, it turned out that for certain

138 # plain-text pattern[s] or simply put chosen plain-text and given key

139 # few cache-lines remain unaccessed during block operation. Now, if

140 # attacker can figure out this access pattern, he can deduce the key

141 # [or at least part of it]. The natural way to mitigate this kind of

142 # attacks is to minimize the amount of cache-lines in S-box and/or

143 # prefetch them to ensure that every one is accessed for more uniform

144 # timing. But note that if plain-text was concealed in such way that

145 # input to block function is distributed uniformly, then attack

146 # wouldn't apply. Now note that some encryption modes, most notably

147 # CBC, do mask the plain-text in this exact way [secure cipher output

148 # is distributed uniformly]. Yes, one still might find input that

149 # would reveal the information about given key, but if amount of

150 # candidate inputs to be tried is larger than amount of possible key

151 # combinations then attack becomes infeasible. This is why revised

152 # AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk

153 # of data is to be processed in one stroke. The current size limit of

154 # 512 bytes is chosen to provide same [diminishingly low] probability

155 # for cache-line to remain untouched in large chunk operation with

156 # large S-box as for single block operation with compact S-box and

157 # surely needs more careful consideration...

158 #

159 # As for asynchronous attacks. There are two flavours: attacker code

160 # being interleaved with AES on hyper-threading CPU at instruction

161 # level, and two processes time sharing single core. As for latter.

162 # Two vectors. 1. Given that attacker process has higher priority,

163 # yield execution to process performing AES just before timer fires

164 # off the scheduler, immediately regain control of CPU and analyze the

165 # cache state. For this attack to be efficient attacker would have to

166 # effectively slow down the operation by several orders of magnitude,

167 # by ratio of time slice to duration of handful of AES rounds, which

168 # is unlikely to remain unnoticed. Not to mention that this also means

169 # that he would spend correspondingly more time to collect enough

170 # statistical data to mount the attack. It's probably appropriate to

171 # say that if adversary reckons that this attack is beneficial and

172 # risks to be noticed, you probably have larger problems having him

173 # mere opportunity. In other words suggested code design expects you

174 # to preclude/mitigate this attack by overall system security design.

175 # 2. Attacker manages to make his code interrupt driven. In order for

176 # this kind of attack to be feasible, interrupt rate has to be high

177 # enough, again comparable to duration of handful of AES rounds. But

178 # is there interrupt source of such rate? Hardly, not even 1Gbps NIC

179 # generates interrupts at such raging rate...

180 #

181 # And now back to the former, hyper-threading CPU or more specifically

182 # Intel P4. Recall that asynchronous attack implies that malicious

183 # code instruments itself. And naturally instrumentation granularity

184 # has to be noticeably lower than duration of codepath accessing S-box.

185 # Given that all cache-lines are accessed during that time that is.

186 # Current implementation accesses all cache-lines within ~50 cycles

187 # window, which is actually less than RDTSC latency on Intel P4!

188

# Split the script path in $0: $dir receives everything up to and including
# the last '/' or '\' path separator.  Both $dir and $dir../../perlasm are
# appended to @INC so that the "x86asm.pl" helper can be require'd.
189 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;

190 push(@INC,"${dir}","${dir}../../perlasm");

191 require "x86asm.pl";

192

# $x86only is set (as a side effect of building the argument list) to true
# when the LAST command-line argument is the string "386"; later in this
# file it gates generation of the SSE/MMX routine (if (!$x86only) {...}).
# The meaning asm_init/static_label attach to their arguments is defined in
# x86asm.pl, which is not part of this file.
193 &asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");

194 &static_label("AES_Te");

195 &static_label("AES_Td");

196

# Symbolic names for the x86 registers used by every routine below.
# $s0..$s3 carry the four 32-bit words of the AES state.
197 $s0="eax";

198 $s1="ebx";

199 $s2="ecx";

200 $s3="edx";

201 $key="edi"; # key schedule pointer; doubles as scratch inside rounds

202 $acc="esi"; # scratch/accumulator

203 $tbl="ebp"; # lookup table base; doubles as scratch in enctransform

204

205 # stack frame layout in _[x86|sse]_AES_* routines, frame is allocated

206 # by caller

207 $__ra=&DWP(0,"esp"); # return address

208 $__s0=&DWP(4,"esp"); # s0 backing store

209 $__s1=&DWP(8,"esp"); # s1 backing store

210 $__s2=&DWP(12,"esp"); # s2 backing store

211 $__s3=&DWP(16,"esp"); # s3 backing store

212 $__key=&DWP(20,"esp"); # pointer to key schedule

213 $__end=&DWP(24,"esp"); # pointer to end of key schedule

214 $__tbl=&DWP(28,"esp"); # %ebp backing store

215

216 # stack frame layout in AES_[en|de]crypt routines, which differs from

217 # above by 4 and overlaps by %ebp backing store

218 $_tbl=&DWP(24,"esp"); # same slot as $__tbl once the return address is pushed

219 $_esp=&DWP(28,"esp"); # NOTE(review): presumably the saved caller %esp - confirm in AES_encrypt

220

# Emit every argument twice via &data_word(x,x); stops at the first
# undefined argument or when the argument list is exhausted.
221 sub _data_word() { while (@_) { my $w = shift; last unless defined($w); &data_word($w,$w); } }

222

# Code-generation tunables.
223 $speed_limit=512; # chunks smaller than $speed_limit are

224 # processed with compact routine in CBC mode

225 $small_footprint=1; # $small_footprint=1 code is ~5% slower [on

226 # recent µ-archs], but ~5 times smaller!

227 # I favor compact code to minimize cache

228 # contention and in hope to "collect" 5% back

229 # in real-life applications...

230

231 $vertical_spin=0; # shift "vertically" defaults to 0, because of

232 # its proof-of-concept status...

233 # Note that there is no decvert(), as well as last encryption round is

234 # performed with "horizontal" shifts. This is because this "vertical"

235 # implementation [one which groups shifts on a given $s[i] to form a

236 # "column," unlike "horizontal" one, which groups shifts on different

237 # $s[i] to form a "row"] is work in progress. It was observed to run

238 # few percents faster on Intel cores, but not AMD. On AMD K8 core it's

239 # whole 12% slower:-( So we face a trade-off... Shall it be resolved

240 # some day? Till then the code is considered experimental and by

241 # default remains dormant...

242

# encvert($te,@s): emit one "vertical" encryption round.
#   $te - register holding the lookup table base
#   @s  - the four state registers in the order s0,s1,s2,s3
# Each state word is consumed byte by byte; every byte indexes the table
# as $te+byte*8 with a displacement of 0..3, and the loaded words are
# moved/XORed into the new @s.  $acc and $key are used as scratch
# ($v0/$v1); s2 and s1 are parked at 4(%esp) and 8(%esp) while their
# registers are being overwritten, and $key is reloaded from $__key
# before returning.  The round key is NOT applied here.
243 sub encvert()

244 { my ($te,@s) = @_;

# both scratch aliases are lexical; the former "my $v0 = ..., $v1 = ...;"
# declared only $v0 and leaked $v1 as a package global
245 my ($v0,$v1) = ($acc,$key);

246

247 &mov ($v0,$s[3]); # copy s3

248 &mov (&DWP(4,"esp"),$s[2]); # save s2

249 &mov ($v1,$s[0]); # copy s0

250 &mov (&DWP(8,"esp"),$s[1]); # save s1

251

252 &movz ($s[2],&HB($s[0]));

253 &and ($s[0],0xFF);

254 &mov ($s[0],&DWP(0,$te,$s[0],8)); # s0>>0

255 &shr ($v1,16);

256 &mov ($s[3],&DWP(3,$te,$s[2],8)); # s0>>8

257 &movz ($s[1],&HB($v1));

258 &and ($v1,0xFF);

259 &mov ($s[2],&DWP(2,$te,$v1,8)); # s0>>16

260 &mov ($v1,$v0);

261 &mov ($s[1],&DWP(1,$te,$s[1],8)); # s0>>24

262

263 &and ($v0,0xFF);

264 &xor ($s[3],&DWP(0,$te,$v0,8)); # s3>>0

265 &movz ($v0,&HB($v1));

266 &shr ($v1,16);

267 &xor ($s[2],&DWP(3,$te,$v0,8)); # s3>>8

268 &movz ($v0,&HB($v1));

269 &and ($v1,0xFF);

270 &xor ($s[1],&DWP(2,$te,$v1,8)); # s3>>16

271 &mov ($v1,&DWP(4,"esp")); # restore s2

272 &xor ($s[0],&DWP(1,$te,$v0,8)); # s3>>24

273

274 &mov ($v0,$v1);

275 &and ($v1,0xFF);

276 &xor ($s[2],&DWP(0,$te,$v1,8)); # s2>>0

277 &movz ($v1,&HB($v0));

278 &shr ($v0,16);

279 &xor ($s[1],&DWP(3,$te,$v1,8)); # s2>>8

280 &movz ($v1,&HB($v0));

281 &and ($v0,0xFF);

282 &xor ($s[0],&DWP(2,$te,$v0,8)); # s2>>16

283 &mov ($v0,&DWP(8,"esp")); # restore s1

284 &xor ($s[3],&DWP(1,$te,$v1,8)); # s2>>24

285

286 &mov ($v1,$v0);

287 &and ($v0,0xFF);

288 &xor ($s[1],&DWP(0,$te,$v0,8)); # s1>>0

289 &movz ($v0,&HB($v1));

290 &shr ($v1,16);

291 &xor ($s[0],&DWP(3,$te,$v0,8)); # s1>>8

292 &movz ($v0,&HB($v1));

293 &and ($v1,0xFF);

294 &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16

295 &mov ($key,$__key); # reincarnate v1 as key

296 &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24

297 }

298

299 # Another experimental routine, which features "horizontal spin," but

300 # eliminates one reference to stack. Strangely enough runs slower...

# enchoriz(): emit one "horizontal" encryption round operating directly on
# the global register aliases $s0..$s3.  $key and $acc are used as scratch
# ($v0/$v1); t[0] is parked in $__s0 and $key is reloaded from $__key near
# the end.  The numbers in the right-hand comments are state byte indexes,
# '*' marking the byte consumed by that instruction.
# NOTE(review): $te is neither a parameter nor declared in this routine,
# so it resolves to a package variable that is not assigned anywhere in
# the visible part of this file - confirm (or pass the table register in)
# before enabling this experimental routine.
301 sub enchoriz()

# both scratch aliases are lexical; the former "my $v0 = ..., $v1 = ...;"
# declared only $v0 and leaked $v1 as a package global
302 { my ($v0,$v1) = ($key,$acc);

303

304 &movz ($v0,&LB($s0)); # 3, 2, 1, 0*

305 &rotr ($s2,8); # 8,11,10, 9

306 &mov ($v1,&DWP(0,$te,$v0,8)); # 0

307 &movz ($v0,&HB($s1)); # 7, 6, 5*, 4

308 &rotr ($s3,16); # 13,12,15,14

309 &xor ($v1,&DWP(3,$te,$v0,8)); # 5

310 &movz ($v0,&HB($s2)); # 8,11,10*, 9

311 &rotr ($s0,16); # 1, 0, 3, 2

312 &xor ($v1,&DWP(2,$te,$v0,8)); # 10

313 &movz ($v0,&HB($s3)); # 13,12,15*,14

314 &xor ($v1,&DWP(1,$te,$v0,8)); # 15, t[0] collected

315 &mov ($__s0,$v1); # t[0] saved

316

317 &movz ($v0,&LB($s1)); # 7, 6, 5, 4*

318 &shr ($s1,16); # -, -, 7, 6

319 &mov ($v1,&DWP(0,$te,$v0,8)); # 4

320 &movz ($v0,&LB($s3)); # 13,12,15,14*

321 &xor ($v1,&DWP(2,$te,$v0,8)); # 14

322 &movz ($v0,&HB($s0)); # 1, 0, 3*, 2

323 &and ($s3,0xffff0000); # 13,12, -, -

324 &xor ($v1,&DWP(1,$te,$v0,8)); # 3

325 &movz ($v0,&LB($s2)); # 8,11,10, 9*

326 &or ($s3,$s1); # 13,12, 7, 6

327 &xor ($v1,&DWP(3,$te,$v0,8)); # 9, t[1] collected

328 &mov ($s1,$v1); # s[1]=t[1]

329

330 &movz ($v0,&LB($s0)); # 1, 0, 3, 2*

331 &shr ($s2,16); # -, -, 8,11

332 &mov ($v1,&DWP(2,$te,$v0,8)); # 2

333 &movz ($v0,&HB($s3)); # 13,12, 7*, 6

334 &xor ($v1,&DWP(1,$te,$v0,8)); # 7

335 &movz ($v0,&HB($s2)); # -, -, 8*,11

336 &xor ($v1,&DWP(0,$te,$v0,8)); # 8

337 &mov ($v0,$s3);

338 &shr ($v0,24); # 13

339 &xor ($v1,&DWP(3,$te,$v0,8)); # 13, t[2] collected

340

341 &movz ($v0,&LB($s2)); # -, -, 8,11*

342 &shr ($s0,24); # 1*

343 &mov ($s2,&DWP(1,$te,$v0,8)); # 11

344 &xor ($s2,&DWP(3,$te,$s0,8)); # 1

345 &mov ($s0,$__s0); # s[0]=t[0]

346 &movz ($v0,&LB($s3)); # 13,12, 7, 6*

347 &shr ($s3,16); # , ,13,12

348 &xor ($s2,&DWP(2,$te,$v0,8)); # 6

349 &mov ($key,$__key); # reincarnate v0 as key

350 &and ($s3,0xff); # , ,13,12*

351 &mov ($s3,&DWP(0,$te,$s3,8)); # 12

352 &xor ($s3,$s2); # s[3]=t[3] collected

353 &mov ($s2,$v1); # s[2]=t[2]

354 }

355

356 # More experimental code... SSE one... Even though this one eliminates

357 # all references to stack, it's not faster...

# sse_encbody(): emit the body of one table-driven encryption round in the
# experimental MMX/SSE flavour.  Per the byte-index comments, eax enters
# holding state bytes 5,4,1,0 and ebx bytes 15,14,11,10; the remaining
# bytes are fetched from mm0/mm4 through pshufw+movd.  Lookups are dword
# loads at $tbl+byte*8 with displacement 0..3.  The round key is read at
# 16($key)..28($key) and folded in here.  On exit mm0 holds t[0,1],
# mm4 holds t[2,3], and eax/ebx have been re-primed from the new state.
# ecx, edx, $acc and mm1,mm2,mm3,mm5,mm6 are overwritten.
358 sub sse_encbody()

359 {

360 &movz ($acc,&LB("eax")); # 0

361 &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 0

362 &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2

363 &movz ("edx",&HB("eax")); # 1

364 &mov ("edx",&DWP(3,$tbl,"edx",8)); # 1

365 &shr ("eax",16); # 5, 4

366

367 &movz ($acc,&LB("ebx")); # 10

368 &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 10

369 &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8

370 &movz ($acc,&HB("ebx")); # 11

371 &xor ("edx",&DWP(1,$tbl,$acc,8)); # 11

372 &shr ("ebx",16); # 15,14

373

374 &movz ($acc,&HB("eax")); # 5

375 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 5

376 &movq ("mm3",QWP(16,$key)); # round key words 0,1 - XORed into mm0 below

377 &movz ($acc,&HB("ebx")); # 15

378 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 15

379 &movd ("mm0","ecx"); # t[0] collected

380

381 &movz ($acc,&LB("eax")); # 4

382 &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 4

383 &movd ("eax","mm2"); # 7, 6, 3, 2

384 &movz ($acc,&LB("ebx")); # 14

385 &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 14

386 &movd ("ebx","mm6"); # 13,12, 9, 8

387

388 &movz ($acc,&HB("eax")); # 3

389 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 3

390 &movz ($acc,&HB("ebx")); # 9

391 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 9

392 &movd ("mm1","ecx"); # t[1] collected

393

394 &movz ($acc,&LB("eax")); # 2

395 &mov ("ecx",&DWP(2,$tbl,$acc,8)); # 2

396 &shr ("eax",16); # 7, 6

397 &punpckldq ("mm0","mm1"); # t[0,1] collected

398 &movz ($acc,&LB("ebx")); # 8

399 &xor ("ecx",&DWP(0,$tbl,$acc,8)); # 8

400 &shr ("ebx",16); # 13,12

401

402 &movz ($acc,&HB("eax")); # 7

403 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 7

404 &pxor ("mm0","mm3"); # t[0,1] ^= round key words 0,1

405 &movz ("eax",&LB("eax")); # 6

406 &xor ("edx",&DWP(2,$tbl,"eax",8)); # 6

407 &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0

408 &movz ($acc,&HB("ebx")); # 13

409 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 13

410 &xor ("ecx",&DWP(24,$key)); # t[2]

411 &movd ("mm4","ecx"); # t[2] collected

412 &movz ("ebx",&LB("ebx")); # 12

413 &xor ("edx",&DWP(0,$tbl,"ebx",8)); # 12

414 &shr ("ecx",16);

415 &movd ("eax","mm1"); # 5, 4, 1, 0

416 &mov ("ebx",&DWP(28,$key)); # t[3]

417 &xor ("ebx","edx");

418 &movd ("mm5","ebx"); # t[3] collected

419 &and ("ebx",0xffff0000);

420 &or ("ebx","ecx"); # ebx = upper half of t[3] : upper half of t[2]

421

422 &punpckldq ("mm4","mm5"); # t[2,3] collected

423 }

424

425 ######################################################################

426 # "Compact" block function

427 ######################################################################

428

# enccompact($i,$te,@s[,$first]): emit the byte substitution + row shift
# producing output word $i (0..3) of a "compact" encryption round.
#   $i     - index of the output word being generated
#   $te    - table register; every lookup is a byte load at -128($te,idx)
#   @s     - the four state registers, rotated by the caller so that
#            $s[0] supplies bits 0-7, $s[1] bits 8-15, $s[2] bits 16-23
#            and $s[3] bits 24-31 of the result
#   $first - any extra (7th) argument turns the $__key reload performed in
#            the $i==3 step into a no-op
# Words 0 and 1 are stored at 4(%esp)/8(%esp); the $i==3 step reloads them
# into $s[1]/$s[2] and moves word 2 from $acc into $s[3], so after four
# calls the registers hold the complete new state.
429 sub enccompact()

# real code reference instead of the bareword "mov" (which only worked
# as a symbolic reference and would not survive "use strict")
430 { my $Fn = \&mov;

431 while ($#_>5) { pop(@_); $Fn=sub{}; }

432 my ($i,$te,@s)=@_;

433 my $tmp = $key;

434 my $out = $i==3?$s[0]:$acc;

435

436 # $Fn is used in first compact round and its purpose is to

437 # void restoration of some values from stack, so that after

438 # 4xenccompact with extra argument $key value is left there...

439 if ($i==3) { &$Fn ($key,$__key); }##%edx

440 else { &mov ($out,$s[0]); }

441 &and ($out,0xFF);

442 if ($i==1) { &shr ($s[0],16); }#%ebx[1]

443 if ($i==2) { &shr ($s[0],24); }#%ecx[2]

444 &movz ($out,&BP(-128,$te,$out,1));

445

446 if ($i==3) { $tmp=$s[1]; }##%eax

447 &movz ($tmp,&HB($s[1]));

448 &movz ($tmp,&BP(-128,$te,$tmp,1));

449 &shl ($tmp,8);

450 &xor ($out,$tmp);

451

452 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx

453 else { &mov ($tmp,$s[2]);

454 &shr ($tmp,16); }

455 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]

456 &and ($tmp,0xFF);

457 &movz ($tmp,&BP(-128,$te,$tmp,1));

458 &shl ($tmp,16);

459 &xor ($out,$tmp);

460

461 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx

462 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]

463 else { &mov ($tmp,$s[3]);

464 &shr ($tmp,24); }

465 &movz ($tmp,&BP(-128,$te,$tmp,1));

466 &shl ($tmp,24);

467 &xor ($out,$tmp);

468 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }

469 if ($i==3) { &mov ($s[3],$acc); }

470 &comment();

471 }

472

# enctransform($i): emit the column-mixing transform of state word $s[$i],
# in place, for the "compact" round (the MixColumns step on packed bytes).
#   $i - index 0..3 into ($s0,$s1,$s2,$s3)
# With r0 the input word and r2 its byte-wise doubling (shift left by one
# inside every byte, bytes whose top bit was set get 0x1b XORed in), the
# result is  ROTATE(r2^r0,24) ^ r2 ^ ROTR(r0,16) ^ ROTR(r0,24).
# Clobbers $acc, $tbl (used as $tmp) and $key (used as $r2); the caller is
# responsible for reloading $tbl and $key afterwards.
473 sub enctransform()

474 { my @s = ($s0,$s1,$s2,$s3);

475 my $i = shift;

476 my $tmp = $tbl;

477 my $r2 = $key ;

478

479 &mov ($acc,$s[$i]);

480 &and ($acc,0x80808080);

481 &mov ($tmp,$acc);

482 &shr ($tmp,7);

483 &lea ($r2,&DWP(0,$s[$i],$s[$i]));

484 &sub ($acc,$tmp);

485 &and ($r2,0xfefefefe);

486 &and ($acc,0x1b1b1b1b);

487 &mov ($tmp,$s[$i]);

488 &xor ($acc,$r2); # r2

489

490 &xor ($s[$i],$acc); # r0 ^ r2

491 &rotl ($s[$i],24);

# the terminating ';' was missing here, which made the following &rotr
# call the right-hand operand of a binary '&' instead of its own statement
492 &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2

493 &rotr ($tmp,16);

494 &xor ($s[$i],$tmp);

495 &rotr ($tmp,8);

496 &xor ($s[$i],$tmp);

497 }

498

# Emit _x86_AES_encrypt_compact: encrypts the block held in $s0..$s3 using
# byte lookups at -128($tbl,idx) only.  On entry $key points at the key
# schedule and $tbl at the table; the result is left in $s0..$s3.
# $acc and $key are overwritten, $tbl is clobbered by enctransform and
# reloaded from $__tbl on every loop iteration.
499 &function_begin_B("_x86_AES_encrypt_compact");

500 # note that caller is expected to allocate stack frame for me!

501 &mov ($__key,$key); # save key

502

503 &xor ($s0,&DWP(0,$key)); # xor with key

504 &xor ($s1,&DWP(4,$key));

505 &xor ($s2,&DWP(8,$key));

506 &xor ($s3,&DWP(12,$key));

507

# $__end = $key + 8*(2*rounds-2) = $key + 16*(rounds-1)
508 &mov ($acc,&DWP(240,$key)); # load key->rounds

509 &lea ($acc,&DWP(-2,$acc,$acc));

510 &lea ($acc,&DWP(0,$key,$acc,8));

511 &mov ($__end,$acc); # end of key schedule

512

# the loaded values are discarded; the loads merely touch the table at
# 32-byte intervals, from -128($tbl) through 96($tbl)
513 # prefetch Te4

514 &mov ($key,&DWP(0-128,$tbl));

515 &mov ($acc,&DWP(32-128,$tbl));

516 &mov ($key,&DWP(64-128,$tbl));

517 &mov ($acc,&DWP(96-128,$tbl));

518 &mov ($key,&DWP(128-128,$tbl));

519 &mov ($acc,&DWP(160-128,$tbl));

520 &mov ($key,&DWP(192-128,$tbl));

521 &mov ($acc,&DWP(224-128,$tbl));

522

523 &set_label("loop",16);

524

# the trailing extra argument suppresses the $key reload inside
# enccompact; $key is reloaded explicitly after the transforms below
525 &enccompact(0,$tbl,$s0,$s1,$s2,$s3,1);

526 &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);

527 &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);

528 &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);

529 &enctransform(2);

530 &enctransform(3);

531 &enctransform(0);

532 &enctransform(1);

533 &mov ($key,$__key);

534 &mov ($tbl,$__tbl);

535 &add ($key,16); # advance rd_key

536 &xor ($s0,&DWP(0,$key));

537 &xor ($s1,&DWP(4,$key));

538 &xor ($s2,&DWP(8,$key));

539 &xor ($s3,&DWP(12,$key));

540

541 &cmp ($key,$__end);

542 &mov ($__key,$key);

543 &jb (&label("loop"));

544

# last round: substitution/row shift only, no enctransform
545 &enccompact(0,$tbl,$s0,$s1,$s2,$s3);

546 &enccompact(1,$tbl,$s1,$s2,$s3,$s0);

547 &enccompact(2,$tbl,$s2,$s3,$s0,$s1);

548 &enccompact(3,$tbl,$s3,$s0,$s1,$s2);

549

# $key was reloaded from $__key by the last enccompact call; the final
# round key sits 16 bytes further on
550 &xor ($s0,&DWP(16,$key));

551 &xor ($s1,&DWP(20,$key));

552 &xor ($s2,&DWP(24,$key));

553 &xor ($s3,&DWP(28,$key));

554

555 &ret ();

556 &function_end_B("_x86_AES_encrypt_compact");

557

558 ######################################################################

559 # "Compact" SSE block function.

560 ######################################################################

561 #

562 # Performance is not actually extraordinary in comparison to pure

563 # x86 code. In particular encrypt performance is virtually the same.

564 # Decrypt performance on the other hand is 15-20% better on newer

565 # µ-archs [but we're thankful for any improvement here], and ~50%

566 # better on PIII:-) And additionally on the pros side this code

567 # eliminates redundant references to stack and thus relieves/

568 # minimizes the pressure on the memory bus.

569 #

570 # MMX register layout                           lsb

571 # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+

572 # |          mm4          |          mm0          |

573 # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+

574 # |     s3    |     s2    |     s1    |     s0    |

575 # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+

576 # |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|

577 # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+

578 #

579 # Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8.

580 # In this terms encryption and decryption "compact" permutation

581 # matrices can be depicted as following:

582 #

583 # encryption lsb # decryption lsb

584 # +----++----+----+----+----+ # +----++----+----+----+----+

585 # \| t0 \|\| 15 \| 10 \| 5 \| 0 \| # \| t0 \|\| 7 \| 10 \| 13 \| 0 \|

586 # +----++----+----+----+----+ # +----++----+----+----+----+

587 # \| t1 \|\| 3 \| 14 \| 9 \| 4 \| # \| t1 \|\| 11 \| 14 \| 1 \| 4 \|

588 # +----++----+----+----+----+ # +----++----+----+----+----+

589 # \| t2 \|\| 7 \| 2 \| 13 \| 8 \| # \| t2 \|\| 15 \| 2 \| 5 \| 8 \|

590 # +----++----+----+----+----+ # +----++----+----+----+----+

591 # \| t3 \|\| 11 \| 6 \| 1 \| 12 \| # \| t3 \|\| 3 \| 6 \| 9 \| 12 \|

592 # +----++----+----+----+----+ # +----++----+----+----+----+

593 #

594 ######################################################################

595 # Why not xmm registers? Short answer. It was actually tested and

596 # was not any faster, but contrary, most notably on Intel CPUs.

597 # Longer answer. Main advantage of using mm registers is that movd

598 # latency is lower, especially on Intel P4. While arithmetic

599 # instructions are twice as many, they can be scheduled every cycle

600 # and not every second one when they are operating on xmm register,

601 # so that "arithmetic throughput" remains virtually the same. And

602 # finally the code can be executed even on elder SSE-only CPUs:-)

603

# sse_enccompact(): emit the byte substitution + row shift of one
# "compact" round for the MMX/SSE flavour.  The state is taken from mm0
# (bytes 7..0) and mm4 (bytes 15..8); every byte is replaced through a
# byte load at -128($tbl,idx) and placed at its new position by shl/or.
# On exit mm0 holds t[0,1] and mm4 holds t[2,3].  Neither the round key
# nor the column transform is applied here.
# Overwrites eax, ebx, ecx, edx, $acc and mm1, mm2, mm5, mm6.
604 sub sse_enccompact()

605 {

606 &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0

607 &pshufw ("mm5","mm4",0x0d); # 15,14,11,10

608 &movd ("eax","mm1"); # 5, 4, 1, 0

609 &movd ("ebx","mm5"); # 15,14,11,10

610

# ecx accumulates t[0], edx accumulates t[3] (bytes 1,11,6,12)
611 &movz ($acc,&LB("eax")); # 0

612 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0

613 &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2

614 &movz ("edx",&HB("eax")); # 1

615 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1

616 &shl ("edx",8); # 1

617 &shr ("eax",16); # 5, 4

618

619 &movz ($acc,&LB("ebx")); # 10

620 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10

621 &shl ($acc,16); # 10

622 &or ("ecx",$acc); # 10

623 &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8

624 &movz ($acc,&HB("ebx")); # 11

625 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11

626 &shl ($acc,24); # 11

627 &or ("edx",$acc); # 11

628 &shr ("ebx",16); # 15,14

629

630 &movz ($acc,&HB("eax")); # 5

631 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5

632 &shl ($acc,8); # 5

633 &or ("ecx",$acc); # 5

634 &movz ($acc,&HB("ebx")); # 15

635 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15

636 &shl ($acc,24); # 15

637 &or ("ecx",$acc); # 15

638 &movd ("mm0","ecx"); # t[0] collected

639

# ecx now accumulates t[1]
640 &movz ($acc,&LB("eax")); # 4

641 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4

642 &movd ("eax","mm2"); # 7, 6, 3, 2

643 &movz ($acc,&LB("ebx")); # 14

644 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14

645 &shl ($acc,16); # 14

646 &or ("ecx",$acc); # 14

647

648 &movd ("ebx","mm6"); # 13,12, 9, 8

649 &movz ($acc,&HB("eax")); # 3

650 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3

651 &shl ($acc,24); # 3

652 &or ("ecx",$acc); # 3

653 &movz ($acc,&HB("ebx")); # 9

654 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9

655 &shl ($acc,8); # 9

656 &or ("ecx",$acc); # 9

657 &movd ("mm1","ecx"); # t[1] collected

658

# ecx now accumulates t[2]
659 &movz ($acc,&LB("ebx")); # 8

660 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8

661 &shr ("ebx",16); # 13,12

662 &movz ($acc,&LB("eax")); # 2

663 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2

664 &shl ($acc,16); # 2

665 &or ("ecx",$acc); # 2

666 &shr ("eax",16); # 7, 6

667

668 &punpckldq ("mm0","mm1"); # t[0,1] collected

669

670 &movz ($acc,&HB("eax")); # 7

671 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7

672 &shl ($acc,24); # 7

673 &or ("ecx",$acc); # 7

674 &and ("eax",0xff); # 6

675 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6

676 &shl ("eax",16); # 6

677 &or ("edx","eax"); # 6

678 &movz ($acc,&HB("ebx")); # 13

679 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13

680 &shl ($acc,8); # 13

681 &or ("ecx",$acc); # 13

682 &movd ("mm4","ecx"); # t[2] collected

683 &and ("ebx",0xff); # 12

684 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12

685 &or ("edx","ebx"); # 12

686 &movd ("mm5","edx"); # t[3] collected

687

688 &punpckldq ("mm4","mm5"); # t[2,3] collected

689 }

690

# Emit _sse_AES_encrypt_compact unless the generator was invoked with a
# trailing "386" argument.  The block is passed and returned in mm0
# (state bytes 7..0) and mm4 (bytes 15..8); $key points at the key
# schedule and $tbl at the table addressed as -128($tbl,idx).
# $s0..$s3, $acc and mm1-mm3, mm5-mm7 are used as scratch.
691 if (!$x86only) {

692 &function_begin_B("_sse_AES_encrypt_compact");

693 &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0

694 &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8

695

696 # note that caller is expected to allocate stack frame for me!

# $__end = $key + 8*(2*rounds-2) = $key + 16*(rounds-1)
697 &mov ($acc,&DWP(240,$key)); # load key->rounds

698 &lea ($acc,&DWP(-2,$acc,$acc));

699 &lea ($acc,&DWP(0,$key,$acc,8));

700 &mov ($__end,$acc); # end of key schedule

701

# 8 bytes of 0x1b at 8(%esp); fetched as one quadword into mm2 in the loop
702 &mov ($s0,0x1b1b1b1b); # magic constant

703 &mov (&DWP(8,"esp"),$s0);

704 &mov (&DWP(12,"esp"),$s0);

705

# loaded values are discarded; the loads touch the table every 32 bytes
706 # prefetch Te4

707 &mov ($s0,&DWP(0-128,$tbl));

708 &mov ($s1,&DWP(32-128,$tbl));

709 &mov ($s2,&DWP(64-128,$tbl));

710 &mov ($s3,&DWP(96-128,$tbl));

711 &mov ($s0,&DWP(128-128,$tbl));

712 &mov ($s1,&DWP(160-128,$tbl));

713 &mov ($s2,&DWP(192-128,$tbl));

714 &mov ($s3,&DWP(224-128,$tbl));

715

716 &set_label("loop",16);

717 &sse_enccompact();

718 &add ($key,16);

719 &cmp ($key,$__end);

# past $__end: the substitution just performed belonged to the last
# round, which has no column transform
720 &ja (&label("out"));

721

# column transform on both halves of the state at once (mm0 and mm4
# in parallel); pcmpgtb yields 0xff for bytes with the top bit set,
# which selects the 0x1b reduction for the byte-wise doubling
722 &movq ("mm2",&QWP(8,"esp"));

723 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");

724 &movq ("mm1","mm0"); &movq ("mm5","mm4"); # r0

725 &pcmpgtb("mm3","mm0"); &pcmpgtb("mm7","mm4");

726 &pand ("mm3","mm2"); &pand ("mm7","mm2");

727 &pshufw ("mm2","mm0",0xb1); &pshufw ("mm6","mm4",0xb1);# ROTATE(r0,16)

728 &paddb ("mm0","mm0"); &paddb ("mm4","mm4");

729 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # = r2

730 &pshufw ("mm3","mm2",0xb1); &pshufw ("mm7","mm6",0xb1);# r0

731 &pxor ("mm1","mm0"); &pxor ("mm5","mm4"); # r0^r2

732 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(r0,16)

733

734 &movq ("mm2","mm3"); &movq ("mm6","mm7");

735 &pslld ("mm3",8); &pslld ("mm7",8);

736 &psrld ("mm2",24); &psrld ("mm6",24);

737 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= r0<<8

738 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= r0>>24

739

740 &movq ("mm3","mm1"); &movq ("mm7","mm5");

# next round key ($key was already advanced above)
741 &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));

742 &psrld ("mm1",8); &psrld ("mm5",8);

# the interleaved integer loads re-touch the table every 64 bytes;
# their results are not used
743 &mov ($s0,&DWP(0-128,$tbl));

744 &pslld ("mm3",24); &pslld ("mm7",24);

745 &mov ($s1,&DWP(64-128,$tbl));

746 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= (r2^r0)<<8

747 &mov ($s2,&DWP(128-128,$tbl));

748 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= (r2^r0)>>24

749 &mov ($s3,&DWP(192-128,$tbl));

750

751 &pxor ("mm0","mm2"); &pxor ("mm4","mm6");

752 &jmp (&label("loop"));

753

754 &set_label("out",16);

# final round key
755 &pxor ("mm0",&QWP(0,$key));

756 &pxor ("mm4",&QWP(8,$key));

757

758 &ret ();

759 &function_end_B("_sse_AES_encrypt_compact");

760 }

761

762 ######################################################################

763 # Vanilla block function.

764 ######################################################################

765

# encstep($i,$te,@s): emit the instructions that compute output word $i
# (0..3) of one table-driven encryption round.
#
# One byte index is derived from each of $s[0]..$s[3] and used to address
# table $te with an 8-byte stride, at byte offsets 0, 3, 2 and 1
# respectively; the four loaded words are XORed together in $out.
# $out is $acc for $i<3 and $s[0] for $i==3.
#
# Side effects visible here:
#  - for $i<2 the result is spilled to 4+4*$i(%esp);
#  - for $i==1 and $i==2 some input registers are shifted/masked in place
#    (the "reordered" instructions), i.e. inputs are destroyed on behalf of
#    later steps, so the 0,1,2,3 call sequence must be kept intact;
#  - for $i==3 $key is reloaded from $__key, $s[1]/$s[2] are reloaded from
#    $__s0/$__s1 and $acc is copied into $s[3].
# NOTE(review): $__key/$__s0/$__s1 are defined outside this chunk;
# presumably $__s0/$__s1 alias the spill slots written for $i<2 -- confirm.

766 sub encstep()

767 { my ($i,$te,@s) = @_;

768 my $tmp = $key;

769 my $out = $i==3?$s[0]:$acc;

770

771 # lines marked with #%e?x[i] denote "reordered" instructions...

772 if ($i==3) { &mov ($key,$__key); }##%edx

773 else { &mov ($out,$s[0]);

774 &and ($out,0xFF); }

775 if ($i==1) { &shr ($s[0],16); }#%ebx[1]

776 if ($i==2) { &shr ($s[0],24); }#%ecx[2]

# first lookup: table word at offset 0, indexed by $out
777 &mov ($out,&DWP(0,$te,$out,8));

778

779 if ($i==3) { $tmp=$s[1]; }##%eax

# second lookup: index is the high byte of the low word of $s[1]
780 &movz ($tmp,&HB($s[1]));

781 &xor ($out,&DWP(3,$te,$tmp,8));

782

783 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx

784 else { &mov ($tmp,$s[2]);

785 &shr ($tmp,16); }

786 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]

787 &and ($tmp,0xFF);

788 &xor ($out,&DWP(2,$te,$tmp,8));

789

790 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx

791 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]

792 else { &mov ($tmp,$s[3]);

793 &shr ($tmp,24) }

794 &xor ($out,&DWP(1,$te,$tmp,8));

# words 0 and 1 are parked on the stack; word 3 leaves $acc (word 2) in $s[3]
795 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }

796 if ($i==3) { &mov ($s[3],$acc); }

797 &comment();

798 }

799

# enclast($i,$te,@s): emit the instructions that compute output word $i
# (0..3) of the last encryption round.
#
# Same register/argument conventions as encstep(), but instead of XORing
# whole table words it keeps exactly one byte lane from each lookup:
#   $s[0] index -> word at offset 2, masked with 0x000000ff
#   $s[1] index -> word at offset 0, masked with 0x0000ff00
#   $s[2] index -> word at offset 0, masked with 0x00ff0000
#   $s[3] index -> word at offset 2, masked with 0xff000000
# and XORs the four masked values together in $out.
# As in encstep(), words 0 and 1 are spilled to 4+4*$i(%esp) and the $i==3
# step reloads $key from $__key and $s[1]/$s[2] from $__s0/$__s1, then
# copies $acc into $s[3].

800 sub enclast()

801 { my ($i,$te,@s)=@_;

802 my $tmp = $key;

803 my $out = $i==3?$s[0]:$acc;

804

805 if ($i==3) { &mov ($key,$__key); }##%edx

806 else { &mov ($out,$s[0]); }

807 &and ($out,0xFF);

808 if ($i==1) { &shr ($s[0],16); }#%ebx[1]

809 if ($i==2) { &shr ($s[0],24); }#%ecx[2]

810 &mov ($out,&DWP(2,$te,$out,8));

811 &and ($out,0x000000ff);

812

813 if ($i==3) { $tmp=$s[1]; }##%eax

814 &movz ($tmp,&HB($s[1]));

815 &mov ($tmp,&DWP(0,$te,$tmp,8));

816 &and ($tmp,0x0000ff00);

817 &xor ($out,$tmp);

818

819 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx

820 else { &mov ($tmp,$s[2]);

821 &shr ($tmp,16); }

822 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]

823 &and ($tmp,0xFF);

824 &mov ($tmp,&DWP(0,$te,$tmp,8));

825 &and ($tmp,0x00ff0000);

826 &xor ($out,$tmp);

827

828 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx

829 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]

830 else { &mov ($tmp,$s[3]);

831 &shr ($tmp,24); }

832 &mov ($tmp,&DWP(2,$te,$tmp,8));

833 &and ($tmp,0xff000000);

834 &xor ($out,$tmp);

835 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }

836 if ($i==3) { &mov ($s[3],$acc); }

837 }

838

# Emit the body of the internal routine _x86_AES_encrypt.
#
# On entry the state is expected in $s0..$s3, the key schedule pointer in
# $key and the table pointer in $tbl; the caller provides the stack frame.
# The code XORs in round key 0, runs the middle rounds through encstep()
# (or encvert() when $vertical_spin is set), finishes with four enclast()
# steps and XORs in the final round key, leaving the result in $s0..$s3.
#
# Two code shapes are generated:
#  - $small_footprint: a single looped round, terminated by comparing $key
#    against $__end;
#  - otherwise: fully unrolled rounds entered at the 14/12/10-round label
#    chosen from the round count loaded from offset 240 of the key schedule.

839 &function_begin_B("_x86_AES_encrypt");

840 if ($vertical_spin) {

841 # I need high parts of volatile registers to be accessible...

# (the assignments inside the argument lists also re-point the Perl-side
# register aliases $s1/$key and $s2/$acc for the code generated below)
842 &exch ($s1="edi",$key="ebx");

843 &mov ($s2="esi",$acc="ecx");

844 }

845

846 # note that caller is expected to allocate stack frame for me!

847 &mov ($__key,$key); # save key

848

849 &xor ($s0,&DWP(0,$key)); # xor with key

850 &xor ($s1,&DWP(4,$key));

851 &xor ($s2,&DWP(8,$key));

852 &xor ($s3,&DWP(12,$key));

853

854 &mov ($acc,&DWP(240,$key)); # load key->rounds

855

856 if ($small_footprint) {

# $acc = 2*rounds-2, then $acc = $key + 8*$acc = $key + 16*(rounds-1)
857 &lea ($acc,&DWP(-2,$acc,$acc));

858 &lea ($acc,&DWP(0,$key,$acc,8));

859 &mov ($__end,$acc); # end of key schedule

860

861 &set_label("loop",16);

862 if ($vertical_spin) {

863 &encvert($tbl,$s0,$s1,$s2,$s3);

864 } else {

865 &encstep(0,$tbl,$s0,$s1,$s2,$s3);

866 &encstep(1,$tbl,$s1,$s2,$s3,$s0);

867 &encstep(2,$tbl,$s2,$s3,$s0,$s1);

868 &encstep(3,$tbl,$s3,$s0,$s1,$s2);

869 }

870 &add ($key,16); # advance rd_key

871 &xor ($s0,&DWP(0,$key));

872 &xor ($s1,&DWP(4,$key));

873 &xor ($s2,&DWP(8,$key));

874 &xor ($s3,&DWP(12,$key));

875 &cmp ($key,$__end);

876 &mov ($__key,$key);

877 &jb (&label("loop"));

878 }

879 else {

# dispatch on the round count: <=10 and <=12 jump past the extra rounds
880 &cmp ($acc,10);

881 &jle (&label("10rounds"));

882 &cmp ($acc,12);

883 &jle (&label("12rounds"));

884

# two extra rounds, using round keys at offsets 16 and 32
885 &set_label("14rounds",4);

886 for ($i=1;$i<3;$i++) {

887 if ($vertical_spin) {

888 &encvert($tbl,$s0,$s1,$s2,$s3);

889 } else {

890 &encstep(0,$tbl,$s0,$s1,$s2,$s3);

891 &encstep(1,$tbl,$s1,$s2,$s3,$s0);

892 &encstep(2,$tbl,$s2,$s3,$s0,$s1);

893 &encstep(3,$tbl,$s3,$s0,$s1,$s2);

894 }

895 &xor ($s0,&DWP(16*$i+0,$key));

896 &xor ($s1,&DWP(16*$i+4,$key));

897 &xor ($s2,&DWP(16*$i+8,$key));

898 &xor ($s3,&DWP(16*$i+12,$key));

899 }

900 &add ($key,32);

901 &mov ($__key,$key); # advance rd_key

# two more rounds, again followed by a 32-byte key pointer advance
902 &set_label("12rounds",4);

903 for ($i=1;$i<3;$i++) {

904 if ($vertical_spin) {

905 &encvert($tbl,$s0,$s1,$s2,$s3);

906 } else {

907 &encstep(0,$tbl,$s0,$s1,$s2,$s3);

908 &encstep(1,$tbl,$s1,$s2,$s3,$s0);

909 &encstep(2,$tbl,$s2,$s3,$s0,$s1);

910 &encstep(3,$tbl,$s3,$s0,$s1,$s2);

911 }

912 &xor ($s0,&DWP(16*$i+0,$key));

913 &xor ($s1,&DWP(16*$i+4,$key));

914 &xor ($s2,&DWP(16*$i+8,$key));

915 &xor ($s3,&DWP(16*$i+12,$key));

916 }

917 &add ($key,32);

918 &mov ($__key,$key); # advance rd_key

# nine unrolled rounds using round keys at offsets 16..144; $key itself is
# not advanced here, hence the +160 applied after enclast() below
919 &set_label("10rounds",4);

920 for ($i=1;$i<10;$i++) {

921 if ($vertical_spin) {

922 &encvert($tbl,$s0,$s1,$s2,$s3);

923 } else {

924 &encstep(0,$tbl,$s0,$s1,$s2,$s3);

925 &encstep(1,$tbl,$s1,$s2,$s3,$s0);

926 &encstep(2,$tbl,$s2,$s3,$s0,$s1);

927 &encstep(3,$tbl,$s3,$s0,$s1,$s2);

928 }

929 &xor ($s0,&DWP(16*$i+0,$key));

930 &xor ($s1,&DWP(16*$i+4,$key));

931 &xor ($s2,&DWP(16*$i+8,$key));

932 &xor ($s3,&DWP(16*$i+12,$key));

933 }

934 }

935

936 if ($vertical_spin) {

937 # "reincarnate" some registers for "horizontal" spin...

938 &mov ($s1="ebx",$key="edi");

939 &mov ($s2="ecx",$acc="esi");

940 }

941 &enclast(0,$tbl,$s0,$s1,$s2,$s3);

942 &enclast(1,$tbl,$s1,$s2,$s3,$s0);

943 &enclast(2,$tbl,$s2,$s3,$s0,$s1);

944 &enclast(3,$tbl,$s3,$s0,$s1,$s2);

945

# step $key to the final round key: one slot in the looped variant,
# ten slots (160 bytes) in the unrolled variant
946 &add ($key,$small_footprint?16:160);

947 &xor ($s0,&DWP(0,$key));

948 &xor ($s1,&DWP(4,$key));

949 &xor ($s2,&DWP(8,$key));

950 &xor ($s3,&DWP(12,$key));

951

952 &ret ();

953

954 &set_label("AES_Te",64); # Yes! I keep it in the code segment!

955 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);

956 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);

957 &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);

958 &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);

959 &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);

960 &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);

961 &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);

962 &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);

963 &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);

964 &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);

965 &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);

966 &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);

967 &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);

968 &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);

969 &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);

970 &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);

971 &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);

972 &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);

973 &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);

974 &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);

975 &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);

976 &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);

977 &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);

978 &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);

979 &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);

980 &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);

981 &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);

982 &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);

983 &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);

984 &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);

985 &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);

986 &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);

987 &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);

988 &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);

989 &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);

990 &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);

991 &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);

992 &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);

993 &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);

994 &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);

995 &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);

996 &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);

997 &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);

998 &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);

999 &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);

1000 &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);

1001 &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);

1002 &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);

1003 &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);

1004 &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);

1005 &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);

1006 &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);

1007 &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);

1008 &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);

1009 &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);

1010 &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);

1011 &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);

1012 &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);

1013 &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);

1014 &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);

1015 &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);

1016 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);

1017 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);

1018 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);

1019

1020 #Te4 # four copies of Te4 to choose from to avoid L1 aliasing

1021 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);

1022 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);

1023 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);

1024 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);

1025 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);

1026 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);

1027 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);

1028 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);

1029 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);

1030 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);

1031 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);

1032 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);

1033 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);

1034 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);

1035 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);

1036 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);

1037 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);

1038 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);

1039 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);

1040 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);

1041 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);

1042 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);

1043 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);

1044 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);

1045 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);

1046 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);

1047 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);

1048 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);

1049 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);

1050 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);

1051 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);

1052 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);

1053

1054 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);

1055 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);

1056 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);

1057 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);

1058 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);

1059 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);

1060 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);

1061 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);

1062 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);

1063 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);

1064 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);

1065 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);

1066 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);

1067 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);

1068 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);

1069 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);

1070 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);

1071 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);

1072 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);

1073 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);

1074 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);

1075 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);

1076 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);

1077 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);

1078 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);

1079 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);

1080 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);

1081 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);

1082 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);

1083 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);

1084 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);

1085 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);

1086

1087 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);

1088 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);

1089 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);

1090 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);

1091 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);

1092 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);

1093 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);

1094 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);

1095 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);

1096 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);

1097 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);

1098 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);

1099 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);

1100 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);

1101 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);

1102 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);

1103 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);

1104 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);

1105 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);

1106 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);

1107 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);

1108 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);

1109 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);

1110 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);

1111 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);

1112 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);

1113 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);

1114 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);

1115 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);

1116 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);

1117 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);

1118 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);

1119

1120 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);

1121 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);

1122 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);

1123 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);

1124 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);

1125 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);

1126 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);

1127 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);

1128 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);

1129 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);

1130 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);

1131 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);

1132 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);

1133 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);

1134 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);

1135 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);

1136 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);

1137 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);

1138 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);

1139 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);

1140 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);

1141 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);

1142 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);

1143 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);

1144 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);

1145 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);

1146 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);

1147 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);

1148 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);

1149 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);

1150 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);

1151 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);

1152 #rcon:

1153 &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);

1154 &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);

1155 &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000);

1156 &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);

1157 &function_end_B("_x86_AES_encrypt");

1158

1159 # void AES_encrypt (const void *inp,void *out,const AES_KEY *key);

# Emit the public entry point AES_encrypt.
#
# Generated code: loads the input pointer (argument 0) and key schedule
# pointer (argument 2), carves out a cache-line-aligned stack frame
# positioned relative to the key schedule, locates AES_Te position-
# independently via a call/pop pair, selects one of the four Te4 copies,
# and then encrypts one 16-byte block with either the SSE compact routine
# (when bit 25 of the OPENSSL_ia32cap_P word is set and $x86only is false)
# or the integer compact routine.  The result is written through the output
# pointer (argument 1) after the original stack pointer is restored.

1160 &function_begin("AES_encrypt");

1161 &mov ($acc,&wparam(0)); # load inp

1162 &mov ($key,&wparam(2)); # load key

1163

# keep the incoming %esp in $s0 until it can be parked in $_esp below
1164 &mov ($s0,"esp");

1165 &sub ("esp",36);

1166 &and ("esp",-64); # align to cache-line

1167

1168 # place stack frame just "above" the key schedule

1169 &lea ($s1,&DWP(-64-63,$key));

1170 &sub ($s1,"esp");

1171 &neg ($s1);

1172 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line

1173 &sub ("esp",$s1);

1174 &add ("esp",4); # 4 is reserved for caller's return address

1175 &mov ($_esp,$s0); # save stack pointer

1176

1177 &call (&label("pic_point")); # make it PIC!

1178 &set_label("pic_point");

1179 &blindpop($tbl);

1180 &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only );

1181 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));

1182

1183 # pick Te4 copy which can't "overlap" with stack frame or key schedule

1184 &lea ($s1,&DWP(768-4,"esp"));

1185 &sub ($s1,$tbl);

# $s1 becomes 0, 256, 512 or 768, i.e. a multiple of 256 bytes
1186 &and ($s1,0x300);

# NOTE(review): 2048 presumably skips the Te table that precedes Te4 and
# +128 matches the -128 bias used by the compact routines -- confirm.
1187 &lea ($tbl,&DWP(2048+128,$tbl,$s1));

1188

1189 if (!$x86only) {

1190 &bt (&DWP(0,$s0),25); # check for SSE bit

1191 &jnc (&label("x86"));

1192

1193 &movq ("mm0",&QWP(0,$acc));

1194 &movq ("mm4",&QWP(8,$acc));

1195 &call ("_sse_AES_encrypt_compact");

1196 &mov ("esp",$_esp); # restore stack pointer

1197 &mov ($acc,&wparam(1)); # load out

1198 &movq (&QWP(0,$acc),"mm0"); # write output data

1199 &movq (&QWP(8,$acc),"mm4");

1200 &emms ();

1201 &function_end_A();

1202 }

# integer-only path; the SSE path above branches here when bit 25 is clear
1203 &set_label("x86",16);

1204 &mov ($_tbl,$tbl);

1205 &mov ($s0,&DWP(0,$acc)); # load input data

1206 &mov ($s1,&DWP(4,$acc));

1207 &mov ($s2,&DWP(8,$acc));

1208 &mov ($s3,&DWP(12,$acc));

1209 &call ("_x86_AES_encrypt_compact");

1210 &mov ("esp",$_esp); # restore stack pointer

1211 &mov ($acc,&wparam(1)); # load out

1212 &mov (&DWP(0,$acc),$s0); # write output data

1213 &mov (&DWP(4,$acc),$s1);

1214 &mov (&DWP(8,$acc),$s2);

1215 &mov (&DWP(12,$acc),$s3);

1216 &function_end("AES_encrypt");

1217

1218 #--------------------------------------------------------------------#

1219

1220 ######################################################################

1221 # "Compact" block function

1222 ######################################################################

1223

# deccompact($i,$td,@s[,extra]): emit the instructions that compute output
# word $i (0..3) of one "compact" decryption round.
#
# Each of the four state registers in @s contributes one byte (bits 0-7 of
# $s[0], 8-15 of $s[1], 16-23 of $s[2], 24-31 of $s[3]); every byte is
# replaced through the 256-byte table addressed as -128($td,index) and the
# results are shifted back into their lanes and XORed together in $out.
# $out is $acc for $i<3 and $s[0] for $i==3; words 0 and 1 are spilled to
# 4+4*$i(%esp).
#
# When more than six arguments are passed, the surplus ones are discarded
# and $Fn becomes a no-op, which suppresses the reloads of $key, $s[2] and
# $s[3] from the stack in the $i==3 step.
#
# $Fn holds a real code reference (not a bareword/symbolic name) and every
# emitter call carries the & sigil, consistent with the rest of the file.

1224 sub deccompact()

1225 { my $Fn = \&mov;

1226 while ($#_>5) { pop(@_); $Fn=sub{}; }

1227 my ($i,$td,@s)=@_;

1228 my $tmp = $key;

1229 my $out = $i==3?$s[0]:$acc;

1230

1231 # $Fn is used in first compact round and its purpose is to

1232 # void restoration of some values from stack, so that after

1233 # 4xdeccompact with extra argument $key, $s0 and $s1 values

1234 # are left there...

1235 if($i==3) { &$Fn ($key,$__key); }

1236 else { &mov ($out,$s[0]); }

1237 &and ($out,0xFF);

1238 &movz ($out,&BP(-128,$td,$out,1));

1239

1240 if ($i==3) { $tmp=$s[1]; }

1241 &movz ($tmp,&HB($s[1]));

1242 &movz ($tmp,&BP(-128,$td,$tmp,1));

1243 &shl ($tmp,8);

1244 &xor ($out,$tmp);

1245

1246 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }

1247 else { &mov ($tmp,$s[2]); }

1248 &shr ($tmp,16);

1249 &and ($tmp,0xFF);

1250 &movz ($tmp,&BP(-128,$td,$tmp,1));

1251 &shl ($tmp,16);

1252 &xor ($out,$tmp);

1253

1254 if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],$__s1); }

1255 else { &mov ($tmp,$s[3]); }

1256 &shr ($tmp,24);

1257 &movz ($tmp,&BP(-128,$td,$tmp,1));

1258 &shl ($tmp,24);

1259 &xor ($out,$tmp);

1260 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }

1261 if ($i==3) { &$Fn ($s[3],$__s0); }

1262 }

1263

# dectransform($i): emit the instructions that transform state word $i
# in place, combining it with its packed doublings tp2, tp4 and tp8.
#
# Each doubling step works on four bytes at once: the top bit of every
# byte is isolated with 0x80808080, turned into a per-byte mask, ANDed with
# 0x1b1b1b1b and XORed into the word shifted left by one (0xfefefefe keeps
# bits from crossing byte boundaries).
#
# Scratch usage: $acc and $key ($tmp) are clobbered, $tbl is used as tp8
# (so the caller must reload $tbl afterwards), and two of the other state
# registers serve as tp2/tp4.  Depending on $i the routine also reloads
# state words from $__s0..$__s3 and, for $i>=2, stores the result to
# 4+4*$i(%esp); this bookkeeping is what ties it to the call order below.
#
# Scalar element access ($s[...]) is used for the register aliases rather
# than single-element array slices; the selected registers are unchanged.

1264 # must be called with 2,3,0,1 as argument sequence!!!

1265 sub dectransform()

1266 { my @s = ($s0,$s1,$s2,$s3);

1267 my $i = shift;

1268 my $tmp = $key;

1269 my $tp2 = $s[($i+2)%4]; $tp2 = $s[2] if ($i==1);

1270 my $tp4 = $s[($i+3)%4]; $tp4 = $s[3] if ($i==1);

1271 my $tp8 = $tbl;

1272

# tp2 = doubled tp1 (tp1 is $s[$i])
1273 &mov ($acc,$s[$i]);

1274 &and ($acc,0x80808080);

1275 &mov ($tmp,$acc);

1276 &shr ($tmp,7);

1277 &lea ($tp2,&DWP(0,$s[$i],$s[$i]));

1278 &sub ($acc,$tmp);

1279 &and ($tp2,0xfefefefe);

1280 &and ($acc,0x1b1b1b1b);

1281 &xor ($acc,$tp2);

1282 &mov ($tp2,$acc);

1283

# tp4 = doubled tp2
1284 &and ($acc,0x80808080);

1285 &mov ($tmp,$acc);

1286 &shr ($tmp,7);

1287 &lea ($tp4,&DWP(0,$tp2,$tp2));

1288 &sub ($acc,$tmp);

1289 &and ($tp4,0xfefefefe);

1290 &and ($acc,0x1b1b1b1b);

1291 &xor ($tp2,$s[$i]); # tp2^tp1

1292 &xor ($acc,$tp4);

1293 &mov ($tp4,$acc);

1294

# tp8 = doubled tp4
1295 &and ($acc,0x80808080);

1296 &mov ($tmp,$acc);

1297 &shr ($tmp,7);

1298 &lea ($tp8,&DWP(0,$tp4,$tp4));

1299 &sub ($acc,$tmp);

1300 &and ($tp8,0xfefefefe);

1301 &and ($acc,0x1b1b1b1b);

1302 &xor ($tp4,$s[$i]); # tp4^tp1

1303 &rotl ($s[$i],8); # = ROTATE(tp1,8)

1304 &xor ($tp8,$acc);

1305

1306 &xor ($s[$i],$tp2);

1307 &xor ($tp2,$tp8);

1308 &rotl ($tp2,24);

1309 &xor ($s[$i],$tp4);

1310 &xor ($tp4,$tp8);

1311 &rotl ($tp4,16);

1312 &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)

1313 &rotl ($tp8,8);

1314 &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)

1315 &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)

1316 &mov ($s[0],$__s0) if($i==2); #prefetch $s0

1317 &mov ($s[1],$__s1) if($i==3); #prefetch $s1

1318 &mov ($s[2],$__s2) if($i==1);

1319 &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8)

1320

1321 &mov ($s[3],$__s3) if($i==1);

1322 &mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2);

1323 }

1324

# Emit the body of the internal routine _x86_AES_decrypt_compact.
#
# On entry the state is expected in $s0..$s3, the key schedule pointer in
# $key and the (biased) table pointer in $tbl; the caller provides the
# stack frame.  The code XORs in round key 0, then loops: four
# deccompact() steps, four dectransform() steps, reload of $key/$tbl from
# the stack, XOR with the next round key.  After the loop a final set of
# deccompact() steps and one more round-key XOR leave the result in
# $s0..$s3.

1325 &function_begin_B("_x86_AES_decrypt_compact");

1326 # note that caller is expected to allocate stack frame for me!

1327 &mov ($__key,$key); # save key

1328

1329 &xor ($s0,&DWP(0,$key)); # xor with key

1330 &xor ($s1,&DWP(4,$key));

1331 &xor ($s2,&DWP(8,$key));

1332 &xor ($s3,&DWP(12,$key));

1333

1334 &mov ($acc,&DWP(240,$key)); # load key->rounds

1335

# $acc = $key + 16*(rounds-1): loop bound compared against $key below
1336 &lea ($acc,&DWP(-2,$acc,$acc));

1337 &lea ($acc,&DWP(0,$key,$acc,8));

1338 &mov ($__end,$acc); # end of key schedule

1339

# the loaded values are discarded; the loads only touch eight locations
# 32 bytes apart, covering the 256 bytes starting at -128($tbl)
1340 # prefetch Td4

1341 &mov ($key,&DWP(0-128,$tbl));

1342 &mov ($acc,&DWP(32-128,$tbl));

1343 &mov ($key,&DWP(64-128,$tbl));

1344 &mov ($acc,&DWP(96-128,$tbl));

1345 &mov ($key,&DWP(128-128,$tbl));

1346 &mov ($acc,&DWP(160-128,$tbl));

1347 &mov ($key,&DWP(192-128,$tbl));

1348 &mov ($acc,&DWP(224-128,$tbl));

1349

1350 &set_label("loop",16);

1351

# the trailing extra argument makes deccompact() skip its stack reloads
1352 &deccompact(0,$tbl,$s0,$s3,$s2,$s1,1);

1353 &deccompact(1,$tbl,$s1,$s0,$s3,$s2,1);

1354 &deccompact(2,$tbl,$s2,$s1,$s0,$s3,1);

1355 &deccompact(3,$tbl,$s3,$s2,$s1,$s0,1);

1356 &dectransform(2);

1357 &dectransform(3);

1358 &dectransform(0);

1359 &dectransform(1);

# dectransform() used $key and $tbl as scratch, so restore both
1360 &mov ($key,$__key);

1361 &mov ($tbl,$__tbl);

1362 &add ($key,16); # advance rd_key

1363 &xor ($s0,&DWP(0,$key));

1364 &xor ($s1,&DWP(4,$key));

1365 &xor ($s2,&DWP(8,$key));

1366 &xor ($s3,&DWP(12,$key));

1367

1368 &cmp ($key,$__end);

1369 &mov ($__key,$key);

1370 &jb (&label("loop"));

1371

1372 &deccompact(0,$tbl,$s0,$s3,$s2,$s1);

1373 &deccompact(1,$tbl,$s1,$s0,$s3,$s2);

1374 &deccompact(2,$tbl,$s2,$s1,$s0,$s3);

1375 &deccompact(3,$tbl,$s3,$s2,$s1,$s0);

1376

# final round key sits one 16-byte slot past the last value of $key
1377 &xor ($s0,&DWP(16,$key));

1378 &xor ($s1,&DWP(20,$key));

1379 &xor ($s2,&DWP(24,$key));

1380 &xor ($s3,&DWP(28,$key));

1381

1382 &ret ();

1383 &function_end_B("_x86_AES_decrypt_compact");

1384

1385 ######################################################################

1386 # "Compact" SSE block function.

1387 ######################################################################

1388

# sse_deccompact(): emit one byte-substitution-and-permutation step on the
# 16-byte state held in mm0 (bytes 0-7) and mm4 (bytes 8-15).
#
# State bytes are moved to eax/ebx in pairs via pshufw/movd, each byte is
# replaced through the 256-byte table addressed as -128($tbl,index), and
# the substituted bytes are merged with shl/or into ecx and edx.  The four
# result words are regathered as mm0 = t[0],t[1] and mm4 = t[2],t[3].
# The trailing numbers in the comments name the state byte being handled.
#
# Clobbers eax, ebx, ecx, edx and $acc, plus mm1, mm2, mm5 and mm6.
# Takes no arguments; MMX and integer instructions are deliberately
# interleaved, so the statement order should not be rearranged.

1389 sub sse_deccompact()

1390 {

1391 &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0

1392 &movd ("eax","mm1"); # 7, 6, 1, 0

1393

1394 &pshufw ("mm5","mm4",0x09); # 13,12,11,10

1395 &movz ($acc,&LB("eax")); # 0

1396 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0

1397 &movd ("ebx","mm5"); # 13,12,11,10

1398 &movz ("edx",&HB("eax")); # 1

1399 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1

1400 &shl ("edx",8); # 1

1401

1402 &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4

1403 &movz ($acc,&LB("ebx")); # 10

1404 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10

1405 &shl ($acc,16); # 10

1406 &or ("ecx",$acc); # 10

1407 &shr ("eax",16); # 7, 6

1408 &movz ($acc,&HB("ebx")); # 11

1409 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11

1410 &shl ($acc,24); # 11

1411 &or ("edx",$acc); # 11

1412 &shr ("ebx",16); # 13,12

1413

1414 &pshufw ("mm6","mm4",0x03); # 9, 8,15,14

1415 &movz ($acc,&HB("eax")); # 7

1416 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7

1417 &shl ($acc,24); # 7

1418 &or ("ecx",$acc); # 7

1419 &movz ($acc,&HB("ebx")); # 13

1420 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13

1421 &shl ($acc,8); # 13

1422 &or ("ecx",$acc); # 13

1423 &movd ("mm0","ecx"); # t[0] collected

1424

1425 &movz ($acc,&LB("eax")); # 6

1426 &movd ("eax","mm2"); # 3, 2, 5, 4

1427 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6

1428 &shl ("ecx",16); # 6

1429 &movz ($acc,&LB("ebx")); # 12

1430 &movd ("ebx","mm6"); # 9, 8,15,14

1431 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12

1432 &or ("ecx",$acc); # 12

1433

1434 &movz ($acc,&LB("eax")); # 4

1435 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4

1436 &or ("edx",$acc); # 4

1437 &movz ($acc,&LB("ebx")); # 14

1438 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14

1439 &shl ($acc,16); # 14

1440 &or ("edx",$acc); # 14

1441 &movd ("mm1","edx"); # t[1] collected

1442

1443 &movz ($acc,&HB("eax")); # 5

1444 &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5

1445 &shl ("edx",8); # 5

1446 &movz ($acc,&HB("ebx")); # 15

1447 &shr ("eax",16); # 3, 2

1448 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15

1449 &shl ($acc,24); # 15

1450 &or ("edx",$acc); # 15

1451 &shr ("ebx",16); # 9, 8

1452

1453 &punpckldq ("mm0","mm1"); # t[0,1] collected

1454

1455 &movz ($acc,&HB("ebx")); # 9

1456 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9

1457 &shl ($acc,8); # 9

1458 &or ("ecx",$acc); # 9

1459 &and ("ebx",0xff); # 8

1460 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8

1461 &or ("edx","ebx"); # 8

1462 &movz ($acc,&LB("eax")); # 2

1463 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2

1464 &shl ($acc,16); # 2

1465 &or ("edx",$acc); # 2

1466 &movd ("mm4","edx"); # t[2] collected

1467 &movz ("eax",&HB("eax")); # 3

1468 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3

1469 &shl ("eax",24); # 3

1470 &or ("ecx","eax"); # 3

1471 &movd ("mm5","ecx"); # t[3] collected

1472

1473 &punpckldq ("mm4","mm5"); # t[2,3] collected

1474 }

1475

1476 if (!$x86only) {

1477 &function_begin_B("_sse_AES_decrypt_compact");

1478 &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0

1479 &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8

1480

1481 # note that caller is expected to allocate stack frame for me!

1482 &mov ($acc,&DWP(240,$key)); # load key->rounds

1483 &lea ($acc,&DWP(-2,$acc,$acc));

1484 &lea ($acc,&DWP(0,$key,$acc,8));

1485 &mov ($__end,$acc); # end of key schedule

1486

1487 &mov ($s0,0x1b1b1b1b); # magic constant

1488 &mov (&DWP(8,"esp"),$s0);

1489 &mov (&DWP(12,"esp"),$s0);

1490

1491 # prefetch Td4

1492 &mov ($s0,&DWP(0-128,$tbl));

1493 &mov ($s1,&DWP(32-128,$tbl));

1494 &mov ($s2,&DWP(64-128,$tbl));

1495 &mov ($s3,&DWP(96-128,$tbl));

1496 &mov ($s0,&DWP(128-128,$tbl));

1497 &mov ($s1,&DWP(160-128,$tbl));

1498 &mov ($s2,&DWP(192-128,$tbl));

1499 &mov ($s3,&DWP(224-128,$tbl));

1500

1501 &set_label("loop",16);

1502 &sse_deccompact();

1503 &add ($key,16);

1504 &cmp ($key,$__end);

1505 &ja (&label("out"));

1506

1507 # ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)

1508 &movq ("mm3","mm0"); &movq ("mm7","mm4");

1509 &movq ("mm2","mm0",1); &movq ("mm6","mm4",1);

1510 &movq ("mm1","mm0"); &movq ("mm5","mm4");

1511 &pshufw ("mm0","mm0",0xb1); &pshufw ("mm4","mm4",0xb1);# = R OTATE(tp0,16)

1512 &pslld ("mm2",8); &pslld ("mm6",8);

1513 &psrld ("mm3",8); &psrld ("mm7",8);

1514 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0 <<8

1515 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0 >>8

1516 &pslld ("mm2",16); &pslld ("mm6",16);

1517 &psrld ("mm3",16); &psrld ("mm7",16);

1518 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0 <<24

1519 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0 >>24

1520

1521 &movq ("mm3",&QWP(8,"esp"));

1522 &pxor ("mm2","mm2"); &pxor ("mm6","mm6");

1523 &pcmpgtb("mm2","mm1"); &pcmpgtb("mm6","mm5");

1524 &pand ("mm2","mm3"); &pand ("mm6","mm3");

1525 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");

1526 &pxor ("mm1","mm2"); &pxor ("mm5","mm6"); # tp2

1527 &movq ("mm3","mm1"); &movq ("mm7","mm5");

1528 &movq ("mm2","mm1"); &movq ("mm6","mm5");

1529 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp2

1530 &pslld ("mm3",24); &pslld ("mm7",24);

1531 &psrld ("mm2",8); &psrld ("mm6",8);

1532 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp2 <<24

1533 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp2 >>8

1534

1535 &movq ("mm2",&QWP(8,"esp"));

1536 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");

1537 &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");

1538 &pand ("mm3","mm2"); &pand ("mm7","mm2");

1539 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");

1540 &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4

1541 &pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);

1542 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4

1543 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROT ATE(tp4,16)

1544

1545 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");

1546 &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");

1547 &pand ("mm3","mm2"); &pand ("mm7","mm2");

1548 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");

1549 &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp8

1550 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8

1551 &movq ("mm3","mm1"); &movq ("mm7","mm5");

1552 &pshufw ("mm2","mm1",0xb1); &pshufw ("mm6","mm5",0xb1);

1553 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16)

1554 &pslld ("mm1",8); &pslld ("mm5",8);

1555 &psrld ("mm3",8); &psrld ("mm7",8);

1556 &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));

1557 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8 <<8

1558 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8 >>8

1559 &mov ($s0,&DWP(0-128,$tbl));

1560 &pslld ("mm1",16); &pslld ("mm5",16);

1561 &mov ($s1,&DWP(64-128,$tbl));

1562 &psrld ("mm3",16); &psrld ("mm7",16);

1563 &mov ($s2,&DWP(128-128,$tbl));

1564 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8 <<24

1565 &mov ($s3,&DWP(192-128,$tbl));

1566 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8 >>24

1567

1568 &pxor ("mm0","mm2"); &pxor ("mm4","mm6");

1569 &jmp (&label("loop"));

1570

1571 &set_label("out",16);

1572 &pxor ("mm0",&QWP(0,$key));

1573 &pxor ("mm4",&QWP(8,$key));

1574

1575 &ret ();

1576 &function_end_B("_sse_AES_decrypt_compact");

1577 }

1578

1579 ######################################################################

1580 # Vanilla block function.

1581 ######################################################################

1582

sub decstep()
{
	# Emit the instructions that build one 32-bit output word of a full
	# (table-driven) decryption round.
	#
	#   $col    step index, 0..3
	#   $table  register holding the lookup-table base
	#   @in     the four state registers; the low byte of $in[0], the
	#           second byte of $in[1], the third byte of $in[2] and the
	#           top byte of $in[3] are the four table indices
	#
	# Every lookup uses index scale 8 with byte displacements 0, 3, 2
	# and 1 into the table.
	#
	# Where the result goes:
	#   steps 0 and 1  built in $acc, then stored at 4+4*$col(%esp)
	#   step 2         left in $acc
	#   step 3         built in place in $in[0]; afterwards $acc is
	#                  moved to $in[1] and $in[2]/$in[3] are reloaded
	#                  from $__s1/$__s0 (defined outside this sub;
	#                  presumably the slots written by steps 1 and 0)
	#
	# The scratch register is $key itself, so steps 0..2 clobber the
	# key pointer; step 3 starts by reloading it from $__key.
	my ($col,$table,@in) = @_;
	my $is_last = ($col == 3);
	my $scratch = $key;
	my $result  = $is_last ? $in[0] : $acc;

	# no instructions are reordered, as performance appears
	# optimal... or rather that all attempts to reorder didn't
	# result in better performance [which by the way is not a
	# bit lower than encryption].

	# --- low byte of $in[0] ---
	if ($is_last)	{ &mov	($key,$__key); }
	else		{ &mov	($result,$in[0]); }
	&and	($result,0xFF);
	&mov	($result,&DWP(0,$table,$result,8));

	# --- second byte of $in[1] ---
	$scratch = $in[1] if ($is_last);
	&movz	($scratch,&HB($in[1]));
	&xor	($result,&DWP(3,$table,$scratch,8));

	# --- third byte of $in[2] ---
	if ($is_last) {
		$scratch = $in[2];
		&mov	($in[1],$acc);
	}
	else {
		&mov	($scratch,$in[2]);
	}
	&shr	($scratch,16);
	&and	($scratch,0xFF);
	&xor	($result,&DWP(2,$table,$scratch,8));

	# --- top byte of $in[3] ---
	if ($is_last) {
		$scratch = $in[3];
		&mov	($in[2],$__s1);
	}
	else {
		&mov	($scratch,$in[3]);
	}
	&shr	($scratch,24);
	&xor	($result,&DWP(1,$table,$scratch,8));

	&mov	(&DWP(4+4*$col,"esp"),$result)	if ($col < 2);
	&mov	($in[3],$__s0)			if ($is_last);
	&comment();
}

1615

sub declast()
{
	# Emit the instructions that build one 32-bit output word of the
	# final decryption round, using byte-wide table lookups.
	#
	#   $i   step index, 0..3
	#   $td  register holding the table base; on entry to step 0 it
	#        points at the same base the full rounds use
	#   @s   the four state registers; the low byte of $s[0], second
	#        byte of $s[1], third byte of $s[2] and top byte of $s[3]
	#        are the four table indices
	#
	# Step 0 first moves $td forward by 2048 bytes (via +2048+128, then
	# -128) and issues eight loads at 32-byte stride across the 256
	# bytes found there.  The loaded values are discarded, so this is
	# presumably a cache prefetch of the byte table.  Step 3 moves $td
	# back by 2048 so the caller sees it unchanged.
	#
	# Result placement and register recycling mirror decstep(): steps
	# 0 and 1 store to 4+4*$i(%esp), step 2 leaves its word in $acc,
	# and step 3 works in place in $s[0], moves $acc into $s[1] and
	# reloads $s[2]/$s[3] from $__s1/$__s0.  $tmp aliases $key, so the
	# key pointer is clobbered by steps 0..2 and reloaded from $__key
	# at the top of step 3.
	my ($i,$td,@s) = @_;
	my $tmp = $key;
	my $out = $i==3 ? $s[0] : $acc;

	if ($i==0) {
		&lea	($td,&DWP(2048+128,$td));
		# touch every 32 bytes of the 256-byte table; results unused
		&mov	($tmp,&DWP(0-128,$td));
		&mov	($acc,&DWP(32-128,$td));
		&mov	($tmp,&DWP(64-128,$td));
		&mov	($acc,&DWP(96-128,$td));
		&mov	($tmp,&DWP(128-128,$td));
		&mov	($acc,&DWP(160-128,$td));
		&mov	($tmp,&DWP(192-128,$td));
		&mov	($acc,&DWP(224-128,$td));
		&lea	($td,&DWP(-128,$td));
	}

	# byte 0: low byte of $s[0]
	if ($i==3)	{ &mov	($key,$__key); }
	else		{ &mov	($out,$s[0]); }
	&and	($out,0xFF);
	&movz	($out,&BP(0,$td,$out,1));

	# byte 1: second byte of $s[1]
	if ($i==3)	{ $tmp=$s[1]; }
	&movz	($tmp,&HB($s[1]));
	&movz	($tmp,&BP(0,$td,$tmp,1));
	&shl	($tmp,8);
	&xor	($out,$tmp);

	# byte 2: third byte of $s[2]
	if ($i==3)	{ $tmp=$s[2]; &mov ($s[1],$acc); }
	else		{ &mov	($tmp,$s[2]); }	# was a bare "mov (...)"; sigil added to match every other emitter call
	&shr	($tmp,16);
	&and	($tmp,0xFF);
	&movz	($tmp,&BP(0,$td,$tmp,1));
	&shl	($tmp,16);
	&xor	($out,$tmp);

	# byte 3: top byte of $s[3]
	if ($i==3)	{ $tmp=$s[3]; &mov ($s[2],$__s1); }
	else		{ &mov	($tmp,$s[3]); }
	&shr	($tmp,24);
	&movz	($tmp,&BP(0,$td,$tmp,1));
	&shl	($tmp,24);
	&xor	($out,$tmp);

	if ($i<2)	{ &mov	(&DWP(4+4*$i,"esp"),$out); }
	if ($i==3)	{ &mov	($s[3],$__s0);
			  &lea	($td,&DWP(-2048,$td)); }
}

1660

&function_begin_B("_x86_AES_decrypt");
{
	# Integer-only block decryption core.
	#
	# On entry the state is in $s0..$s3, $key points at the key
	# schedule and $tbl at the decryption table.  The caller is
	# expected to have allocated the stack frame ($__key, $__end and
	# the slots decstep()/declast() spill to).  On exit $s0..$s3 hold
	# the result.

	my @state = ($s0,$s1,$s2,$s3);

	# state[w] ^= rd_key[w] for the four words at $disp($key)
	my $xor_round_key = sub {
		my ($disp) = @_;
		foreach my $w (0..3) {
			&xor	($state[$w],&DWP($disp+4*$w,$key));
		}
	};

	# Run $step (decstep or declast) for columns 0..3.  The register
	# order starts as (s0,s3,s2,s1) and is rotated right by one for
	# each following column: (s1,s0,s3,s2), (s2,s1,s0,s3), (s3,s2,s1,s0).
	my $emit_round = sub {
		my ($step) = @_;
		my @order = ($s0,$s3,$s2,$s1);
		foreach my $col (0..3) {
			$step->($col,$tbl,@order);
			unshift(@order,pop(@order));
		}
	};

	&mov	($__key,$key);			# save key
	$xor_round_key->(0);			# xor with key
	&mov	($acc,&DWP(240,$key));		# load key->rounds

	if (!$small_footprint) {
		# Unrolled variant: enter the chain of rounds at the
		# label matching key->rounds and fall through to the end.
		&cmp	($acc,10);
		&jle	(&label("10rounds"));
		&cmp	($acc,12);
		&jle	(&label("12rounds"));

		&set_label("14rounds",4);
		for ($i=1;$i<3;$i++) {
			$emit_round->(\&decstep);
			$xor_round_key->(16*$i);
		}
		&add	($key,32);
		&mov	($__key,$key);		# advance rd_key

		&set_label("12rounds",4);
		for ($i=1;$i<3;$i++) {
			$emit_round->(\&decstep);
			$xor_round_key->(16*$i);
		}
		&add	($key,32);
		&mov	($__key,$key);		# advance rd_key

		&set_label("10rounds",4);
		for ($i=1;$i<10;$i++) {
			$emit_round->(\&decstep);
			$xor_round_key->(16*$i);
		}
	}
	else {
		# Looped variant: $__end = $key + 16*(rounds-1), computed
		# as $key + 8*(2*rounds-2).
		&lea	($acc,&DWP(-2,$acc,$acc));
		&lea	($acc,&DWP(0,$key,$acc,8));
		&mov	($__end,$acc);		# end of key schedule

		&set_label("loop",16);
		$emit_round->(\&decstep);
		&add	($key,16);		# advance rd_key
		$xor_round_key->(0);
		&cmp	($key,$__end);
		&mov	($__key,$key);
		&jb	(&label("loop"));
	}

	# final round, then the last round key
	$emit_round->(\&declast);

	&add	($key,$small_footprint?16:160);
	$xor_round_key->(0);

	&ret	();
}

1747

1748 &set_label("AES_Td",64); # Yes! I keep it in the code segment!

1749 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);

1750 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);

1751 &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);

1752 &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);

1753 &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);

1754 &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);

1755 &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);

1756 &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);

1757 &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);

1758 &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);

1759 &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);

1760 &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);

1761 &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);

1762 &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);

1763 &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);

1764 &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);

1765 &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);

1766 &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);

1767 &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);

1768 &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);

1769 &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);

1770 &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);

1771 &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);

1772 &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);

1773 &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);

1774 &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);

1775 &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);

1776 &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);

1777 &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);

1778 &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);

1779 &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);

1780 &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);

1781 &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);

1782 &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);

1783 &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);

1784 &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);

1785 &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);

1786 &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);

1787 &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);

1788 &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);

1789 &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);

1790 &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);

1791 &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);

1792 &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);

1793 &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);

1794 &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);

1795 &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);

1796 &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);

1797 &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);

1798 &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);

1799 &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);

1800 &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);

1801 &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);

1802 &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);

1803 &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);

1804 &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);

1805 &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);

1806 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);

1807 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);

1808 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);

1809 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);

1810 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);

1811 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);

1812 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);

1813

#Td4:	# four copies of Td4 to choose from to avoid L1 aliasing
{
	# One 256-byte table, emitted four times back to back.  Each
	# &data_byte call receives eight consecutive entries, so the
	# generated directives match listing the four copies by hand.
	my @Td4 = (
		0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
		0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
		0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
		0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
		0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
		0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
		0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
		0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
		0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
		0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
		0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
		0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
		0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
		0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
		0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
		0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
		0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
		0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
		0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
		0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
		0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
		0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
		0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
		0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
		0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
		0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
		0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
		0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
		0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
		0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
		0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
		0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d,
	);

	foreach my $copy (1..4) {
		for (my $row = 0; $row < scalar(@Td4); $row += 8) {
			&data_byte(@Td4[$row .. $row+7]);
		}
	}
}

1946 &function_end_B("_x86_AES_decrypt");

1947

# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Public single-block decryption entry point.  It reads 16 bytes from
# inp, decrypts them with one of the compact cores and writes 16 bytes
# to out.  Unless built with $x86only, bit 25 of OPENSSL_ia32cap_P
# selects the SSE core; otherwise the integer core runs.
&function_begin("AES_decrypt");
{
	my @state = ($s0,$s1,$s2,$s3);

	&mov	($acc,&wparam(0));		# inp
	&mov	($key,&wparam(2));		# key

	# Carve out a cache-line aligned frame; the caller's %esp is
	# kept in $s0 until it can be parked in $_esp.
	&mov	($s0,"esp");
	&sub	("esp",36);
	&and	("esp",-64);			# align to cache-line

	# place stack frame just "above" the key schedule
	&lea	($s1,&DWP(-(64+63),$key));
	&sub	($s1,"esp");
	&neg	($s1);
	&and	($s1,0x3C0);			# modulo 1024, but aligned to cache-line
	&sub	("esp",$s1);
	&add	("esp",4);			# 4 is reserved for caller's return address
	&mov	($_esp,$s0);			# save stack pointer

	# make it PIC: call/pop yields the address of pic_point, from
	# which the AES_Td label is reached by a link-time difference
	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($tbl);
	unless ($x86only) {
		&picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point"));
	}
	&lea	($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));

	# pick Td4 copy which can't "overlap" with stack frame or key schedule
	&lea	($s1,&DWP(764,"esp"));		# 768-4
	&sub	($s1,$tbl);
	&and	($s1,0x300);
	&lea	($tbl,&DWP(2176,$tbl,$s1));	# 2048+128

	unless ($x86only) {
		&bt	(&DWP(0,$s0),25);		# check for SSE bit
		&jnc	(&label("x86"));

		# SSE path: the block travels in mm0/mm4
		&movq	("mm0",&QWP(0,$acc));
		&movq	("mm4",&QWP(8,$acc));
		&call	("_sse_AES_decrypt_compact");
		&mov	("esp",$_esp);			# restore stack pointer
		&mov	($acc,&wparam(1));		# out
		&movq	(&QWP(0,$acc),"mm0");		# write output data
		&movq	(&QWP(8,$acc),"mm4");
		&emms	();
		&function_end_A();
	}

	# integer path: the block travels in $s0..$s3
	&set_label("x86",16);
	&mov	($_tbl,$tbl);
	foreach my $w (0..3) {
		&mov	($state[$w],&DWP(4*$w,$acc));	# load input data
	}
	&call	("_x86_AES_decrypt_compact");
	&mov	("esp",$_esp);			# restore stack pointer
	&mov	($acc,&wparam(1));		# out
	foreach my $w (0..3) {
		&mov	(&DWP(4*$w,$acc),$state[$w]);	# write output data
	}
}
&function_end("AES_decrypt");

2006

2007 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,

2008 # size_t length, const AES_KEY *key,

2009 # unsigned char *ivp,const int enc);

2010 {

2011 # stack frame layout

2012 # -4(%esp) # return address 0(%esp)

2013 # 0(%esp) # s0 backing store 4(%esp)

2014 # 4(%esp) # s1 backing store 8(%esp)

2015 # 8(%esp) # s2 backing store 12(%esp)

2016 # 12(%esp) # s3 backing store 16(%esp)

2017 # 16(%esp) # key backup 20(%esp)

2018 # 20(%esp) # end of key schedule 24(%esp)

2019 # 24(%esp) # %ebp backup 28(%esp)

2020 # 28(%esp) # %esp backup

2021 my $_inp=&DWP(32,"esp"); # copy of wparam(0)

2022 my $_out=&DWP(36,"esp"); # copy of wparam(1)

2023 my $_len=&DWP(40,"esp"); # copy of wparam(2)

2024 my $_key=&DWP(44,"esp"); # copy of wparam(3)

2025 my $_ivp=&DWP(48,"esp"); # copy of wparam(4)

2026 my $_tmp=&DWP(52,"esp"); # volatile variable

2027 #

2028 my $ivec=&DWP(60,"esp"); # ivec[16]

2029 my $aes_key=&DWP(76,"esp"); # copy of aes_key

2030 my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds

2031

2032 &function_begin("AES_cbc_encrypt");

2033 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len

2034 &cmp ($s2,0);

2035 &je (&label("drop_out"));

2036

2037 &call (&label("pic_point")); # make it PIC!

2038 &set_label("pic_point");

2039 &blindpop($tbl);

2040 &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only) ;

2041

2042 &cmp (&wparam(5),0);

2043 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));

2044 &jne (&label("picked_te"));

2045 &lea ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl));

2046 &set_label("picked_te");

2047

2048 # one can argue if this is required

2049 &pushf ();

2050 &cld ();

2051

2052 &cmp ($s2,$speed_limit);

2053 &jb (&label("slow_way"));

2054 &test ($s2,15);

2055 &jnz (&label("slow_way"));

2056 if (!$x86only) {

2057 &bt (&DWP(0,$s0),28); # check for hyper-threading bit

2058 &jc (&label("slow_way"));

2059 }

2060 # pre-allocate aligned stack frame...

2061 &lea ($acc,&DWP(-80-244,"esp"));

2062 &and ($acc,-64);

2063

2064 # ... and make sure it doesn't alias with $tbl modulo 4096

2065 &mov ($s0,$tbl);

2066 &lea ($s1,&DWP(2048+256,$tbl));

2067 &mov ($s3,$acc);

2068 &and ($s0,0xfff); # s = %ebp&0xfff

2069 &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff

2070 &and ($s3,0xfff); # p = %esp&0xfff

2071

2072 &cmp ($s3,$s1); # if (p>=e) %esp -= (p-e);

2073 &jb (&label("tbl_break_out"));

2074 &sub ($s3,$s1);

2075 &sub ($acc,$s3);

2076 &jmp (&label("tbl_ok"));

2077 &set_label("tbl_break_out",4); # else %esp -= (p-s)&0xfff + framesz;

2078 &sub ($s3,$s0);

2079 &and ($s3,0xfff);

2080 &add ($s3,384);

2081 &sub ($acc,$s3);

2082 &set_label("tbl_ok",4);

2083

2084 &lea ($s3,&wparam(0)); # obtain pointer to parameter block

2085 &exch ("esp",$acc); # allocate stack frame

2086 &add ("esp",4); # reserve for return address!

2087 &mov ($_tbl,$tbl); # save %ebp

2088 &mov ($_esp,$acc); # save %esp

2089

2090 &mov ($s0,&DWP(0,$s3)); # load inp

2091 &mov ($s1,&DWP(4,$s3)); # load out

2092 #&mov ($s2,&DWP(8,$s3)); # load len

2093 &mov ($key,&DWP(12,$s3)); # load key

2094 &mov ($acc,&DWP(16,$s3)); # load ivp

2095 &mov ($s3,&DWP(20,$s3)); # load enc flag

2096

2097 &mov ($_inp,$s0); # save copy of inp

2098 &mov ($_out,$s1); # save copy of out

2099 &mov ($_len,$s2); # save copy of len

2100 &mov ($_key,$key); # save copy of key

2101 &mov ($_ivp,$acc); # save copy of ivp

2102

2103 &mov ($mark,0); # copy of aes_key->rounds = 0;

2104 # do we copy key schedule to stack?

2105 &mov ($s1 eq "ebx" ? $s1 : "",$key);

2106 &mov ($s2 eq "ecx" ? $s2 : "",244/4);

2107 &sub ($s1,$tbl);

2108 &mov ("esi",$key);

2109 &and ($s1,0xfff);

2110 &lea ("edi",$aes_key);

2111 &cmp ($s1,2048+256);

2112 &jb (&label("do_copy"));

2113 &cmp ($s1,4096-244);

2114 &jb (&label("skip_copy"));

2115 &set_label("do_copy",4);

2116 &mov ($_key,"edi");

2117 &data_word(0xA5F3F689); # rep movsd

2118 &set_label("skip_copy");

2119

2120 &mov ($key,16);

2121 &set_label("prefetch_tbl",4);

2122 &mov ($s0,&DWP(0,$tbl));

2123 &mov ($s1,&DWP(32,$tbl));

2124 &mov ($s2,&DWP(64,$tbl));

2125 &mov ($acc,&DWP(96,$tbl));

2126 &lea ($tbl,&DWP(128,$tbl));

2127 &sub ($key,1);

2128 &jnz (&label("prefetch_tbl"));

2129 &sub ($tbl,2048);

2130

2131 &mov ($acc,$_inp);

2132 &mov ($key,$_ivp);

2133

2134 &cmp ($s3,0);

2135 &je (&label("fast_decrypt"));

2136

2137 #----------------------------- ENCRYPT -----------------------------#

2138 &mov ($s0,&DWP(0,$key)); # load iv

2139 &mov ($s1,&DWP(4,$key));

2140

2141 &set_label("fast_enc_loop",16);

2142 &mov ($s2,&DWP(8,$key));

2143 &mov ($s3,&DWP(12,$key));

2144

2145 &xor ($s0,&DWP(0,$acc)); # xor input data

2146 &xor ($s1,&DWP(4,$acc));

2147 &xor ($s2,&DWP(8,$acc));

2148 &xor ($s3,&DWP(12,$acc));

2149

2150 &mov ($key,$_key); # load key

2151 &call ("_x86_AES_encrypt");

2152

2153 &mov ($acc,$_inp); # load inp

2154 &mov ($key,$_out); # load out

2155

2156 &mov (&DWP(0,$key),$s0); # save output data

2157 &mov (&DWP(4,$key),$s1);

2158 &mov (&DWP(8,$key),$s2);

2159 &mov (&DWP(12,$key),$s3);

2160

2161 &lea ($acc,&DWP(16,$acc)); # advance inp

2162 &mov ($s2,$_len); # load len

2163 &mov ($_inp,$acc); # save inp

2164 &lea ($s3,&DWP(16,$key)); # advance out

2165 &mov ($_out,$s3); # save out

2166 &sub ($s2,16); # decrease len

2167 &mov ($_len,$s2); # save len

2168 &jnz (&label("fast_enc_loop"));

2169 &mov ($acc,$_ivp); # load ivp

2170 &mov ($s2,&DWP(8,$key)); # restore last 2 dwords

2171 &mov ($s3,&DWP(12,$key));

2172 &mov (&DWP(0,$acc),$s0); # save ivec

2173 &mov (&DWP(4,$acc),$s1);

2174 &mov (&DWP(8,$acc),$s2);

2175 &mov (&DWP(12,$acc),$s3);

2176

# Fast-path encrypt epilogue: wipe the on-stack key schedule copy if one
# was made, then restore %esp and the saved flags and return.
# $mark is cleared during frame setup and overlays the rounds field of
# the stack copy, so it reads non-zero only after the schedule was copied.
&cmp	($mark,0);			# was the key schedule copied?
&mov	("edi",$_key);
&je	(&label("skip_ezero"));
# zero copy of key schedule
&mov	("ecx",240/4);
&xor	("eax","eax");
&align	(4);
&data_word(0xABF3F689);			# rep stosd
# The ';' below was missing, which made Perl parse this line and the
# next as one expression, "&set_label(...) & mov(...)" (bitwise AND of
# two calls).  Both calls still ran in order, but only by accident.
&set_label("skip_ezero");
&mov	("esp",$_esp);
&popf	();
&set_label("drop_out");
&function_end_A();
&pushf	();				# kludge, never executed

2191

2192 #----------------------------- DECRYPT -----------------------------#

2193 &set_label("fast_decrypt",16);

2194

2195 &cmp ($acc,$_out);

2196 &je (&label("fast_dec_in_place")); # in-place processing...

2197

2198 &mov ($_tmp,$key);

2199

2200 &align (4);

2201 &set_label("fast_dec_loop",16);

2202 &mov ($s0,&DWP(0,$acc)); # read input

2203 &mov ($s1,&DWP(4,$acc));

2204 &mov ($s2,&DWP(8,$acc));

2205 &mov ($s3,&DWP(12,$acc));

2206

2207 &mov ($key,$_key); # load key

2208 &call ("_x86_AES_decrypt");

2209

2210 &mov ($key,$_tmp); # load ivp

2211 &mov ($acc,$_len); # load len

2212 &xor ($s0,&DWP(0,$key)); # xor iv

2213 &xor ($s1,&DWP(4,$key));

2214 &xor ($s2,&DWP(8,$key));

2215 &xor ($s3,&DWP(12,$key));

2216

2217 &mov ($key,$_out); # load out

2218 &mov ($acc,$_inp); # load inp

2219

2220 &mov (&DWP(0,$key),$s0); # write output

2221 &mov (&DWP(4,$key),$s1);

2222 &mov (&DWP(8,$key),$s2);

2223 &mov (&DWP(12,$key),$s3);

2224

2225 &mov ($s2,$_len); # load len

2226 &mov ($_tmp,$acc); # save ivp

2227 &lea ($acc,&DWP(16,$acc)); # advance inp

2228 &mov ($_inp,$acc); # save inp

2229 &lea ($key,&DWP(16,$key)); # advance out

2230 &mov ($_out,$key); # save out

2231 &sub ($s2,16); # decrease len

2232 &mov ($_len,$s2); # save len

2233 &jnz (&label("fast_dec_loop"));

2234 &mov ($key,$_tmp); # load temp ivp

2235 &mov ($acc,$_ivp); # load user ivp

2236 &mov ($s0,&DWP(0,$key)); # load iv

2237 &mov ($s1,&DWP(4,$key));

2238 &mov ($s2,&DWP(8,$key));

2239 &mov ($s3,&DWP(12,$key));

2240 &mov (&DWP(0,$acc),$s0); # copy back to user

2241 &mov (&DWP(4,$acc),$s1);

2242 &mov (&DWP(8,$acc),$s2);

2243 &mov (&DWP(12,$acc),$s3);

2244 &jmp (&label("fast_dec_out"));

2245

2246 &set_label("fast_dec_in_place",16);

2247 &set_label("fast_dec_in_place_loop");

2248 &mov ($s0,&DWP(0,$acc)); # read input

2249 &mov ($s1,&DWP(4,$acc));

2250 &mov ($s2,&DWP(8,$acc));

2251 &mov ($s3,&DWP(12,$acc));

2252

2253 &lea ($key,$ivec);

2254 &mov (&DWP(0,$key),$s0); # copy to temp

2255 &mov (&DWP(4,$key),$s1);

2256 &mov (&DWP(8,$key),$s2);

2257 &mov (&DWP(12,$key),$s3);

2258

2259 &mov ($key,$_key); # load key

2260 &call ("_x86_AES_decrypt");

2261

2262 &mov ($key,$_ivp); # load ivp

2263 &mov ($acc,$_out); # load out

2264 &xor ($s0,&DWP(0,$key)); # xor iv

2265 &xor ($s1,&DWP(4,$key));

2266 &xor ($s2,&DWP(8,$key));

2267 &xor ($s3,&DWP(12,$key));

2268

2269 &mov (&DWP(0,$acc),$s0); # write output

2270 &mov (&DWP(4,$acc),$s1);

2271 &mov (&DWP(8,$acc),$s2);

2272 &mov (&DWP(12,$acc),$s3);

2273

2274 &lea ($acc,&DWP(16,$acc)); # advance out

2275 &mov ($_out,$acc); # save out

2276

2277 &lea ($acc,$ivec);

2278 &mov ($s0,&DWP(0,$acc)); # read temp

2279 &mov ($s1,&DWP(4,$acc));

2280 &mov ($s2,&DWP(8,$acc));

2281 &mov ($s3,&DWP(12,$acc));

2282

2283 &mov (&DWP(0,$key),$s0); # copy iv

2284 &mov (&DWP(4,$key),$s1);

2285 &mov (&DWP(8,$key),$s2);

2286 &mov (&DWP(12,$key),$s3);

2287

2288 &mov ($acc,$_inp); # load inp

2289 &mov ($s2,$_len); # load len

2290 &lea ($acc,&DWP(16,$acc)); # advance inp

2291 &mov ($_inp,$acc); # save inp

2292 &sub ($s2,16); # decrease len

2293 &mov ($_len,$s2); # save len

2294 &jnz (&label("fast_dec_in_place_loop"));

2295

# Fast-path decrypt epilogue, shared by the out-of-place and in-place
# loops: wipe the on-stack key schedule copy if one was made, then
# restore %esp and the saved flags and return.
&set_label("fast_dec_out",4);
&cmp	($mark,0);			# was the key schedule copied?
&mov	("edi",$_key);
&je	(&label("skip_dzero"));
# zero copy of key schedule
&mov	("ecx",240/4);
&xor	("eax","eax");
&align	(4);
&data_word(0xABF3F689);			# rep stosd
# The ';' below was missing, which made Perl parse this line and the
# next as one expression, "&set_label(...) & mov(...)" (bitwise AND of
# two calls).  Both calls still ran in order, but only by accident.
&set_label("skip_dzero");
&mov	("esp",$_esp);
&popf	();
&function_end_A();
&pushf	();				# kludge, never executed

2310

2311 #--------------------------- SLOW ROUTINE ---------------------------#

2312 &set_label("slow_way",16);

2313

2314 &mov ($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap

2315 &mov ($key,&wparam(3)); # load key

2316

2317 # pre-allocate aligned stack frame...

2318 &lea ($acc,&DWP(-80,"esp"));

2319 &and ($acc,-64);

2320

2321 # ... and make sure it doesn't alias with $key modulo 1024

2322 &lea ($s1,&DWP(-80-63,$key));

2323 &sub ($s1,$acc);

2324 &neg ($s1);

2325 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line

2326 &sub ($acc,$s1);

2327

2328 # pick S-box copy which can't overlap with stack frame or $key

2329 &lea ($s1,&DWP(768,$acc));

2330 &sub ($s1,$tbl);

2331 &and ($s1,0x300);

2332 &lea ($tbl,&DWP(2048+128,$tbl,$s1));

2333

2334 &lea ($s3,&wparam(0)); # pointer to parameter block

2335

2336 &exch ("esp",$acc);

2337 &add ("esp",4); # reserve for return address!

2338 &mov ($_tbl,$tbl); # save %ebp

2339 &mov ($_esp,$acc); # save %esp

2340 &mov ($_tmp,$s0); # save OPENSSL_ia32cap

2341

2342 &mov ($s0,&DWP(0,$s3)); # load inp

2343 &mov ($s1,&DWP(4,$s3)); # load out

2344 #&mov ($s2,&DWP(8,$s3)); # load len

2345 #&mov ($key,&DWP(12,$s3)); # load key

2346 &mov ($acc,&DWP(16,$s3)); # load ivp

2347 &mov ($s3,&DWP(20,$s3)); # load enc flag

2348

2349 &mov ($_inp,$s0); # save copy of inp

2350 &mov ($_out,$s1); # save copy of out

2351 &mov ($_len,$s2); # save copy of len

2352 &mov ($_key,$key); # save copy of key

2353 &mov ($_ivp,$acc); # save copy of ivp

2354

2355 &mov ($key,$acc);

2356 &mov ($acc,$s0);

2357

2358 &cmp ($s3,0);

2359 &je (&label("slow_decrypt"));

2360

2361 #--------------------------- SLOW ENCRYPT ---------------------------#

2362 &cmp ($s2,16);

2363 &mov ($s3,$s1);

2364 &jb (&label("slow_enc_tail"));

2365

2366 if (!$x86only) {

2367 &bt ($_tmp,25); # check for SSE bit

2368 &jnc (&label("slow_enc_x86"));

2369

2370 &movq ("mm0",&QWP(0,$key)); # load iv

2371 &movq ("mm4",&QWP(8,$key));

2372

2373 &set_label("slow_enc_loop_sse",16);

2374 &pxor ("mm0",&QWP(0,$acc)); # xor input data

2375 &pxor ("mm4",&QWP(8,$acc));

2376

2377 &mov ($key,$_key);

2378 &call ("_sse_AES_encrypt_compact");

2379

2380 &mov ($acc,$_inp); # load inp

2381 &mov ($key,$_out); # load out

2382 &mov ($s2,$_len); # load len

2383

2384 &movq (&QWP(0,$key),"mm0"); # save output data

2385 &movq (&QWP(8,$key),"mm4");

2386

2387 &lea ($acc,&DWP(16,$acc)); # advance inp

2388 &mov ($_inp,$acc); # save inp

2389 &lea ($s3,&DWP(16,$key)); # advance out

2390 &mov ($_out,$s3); # save out

2391 &sub ($s2,16); # decrease len

2392 &cmp ($s2,16);

2393 &mov ($_len,$s2); # save len

2394 &jae (&label("slow_enc_loop_sse"));

2395 &test ($s2,15);

2396 &jnz (&label("slow_enc_tail"));

2397 &mov ($acc,$_ivp); # load ivp

2398 &movq (&QWP(0,$acc),"mm0"); # save ivec

2399 &movq (&QWP(8,$acc),"mm4");

2400 &emms ();

2401 &mov ("esp",$_esp);

2402 &popf ();

2403 &function_end_A();

2404 &pushf (); # kludge, never executed

2405 }

2406 &set_label("slow_enc_x86",16);

2407 &mov ($s0,&DWP(0,$key)); # load iv

2408 &mov ($s1,&DWP(4,$key));

2409

2410 &set_label("slow_enc_loop_x86",4);

2411 &mov ($s2,&DWP(8,$key));

2412 &mov ($s3,&DWP(12,$key));

2413

2414 &xor ($s0,&DWP(0,$acc)); # xor input data

2415 &xor ($s1,&DWP(4,$acc));

2416 &xor ($s2,&DWP(8,$acc));

2417 &xor ($s3,&DWP(12,$acc));

2418

2419 &mov ($key,$_key); # load key

2420 &call ("_x86_AES_encrypt_compact");

2421

2422 &mov ($acc,$_inp); # load inp

2423 &mov ($key,$_out); # load out

2424

2425 &mov (&DWP(0,$key),$s0); # save output data

2426 &mov (&DWP(4,$key),$s1);

2427 &mov (&DWP(8,$key),$s2);

2428 &mov (&DWP(12,$key),$s3);

2429

2430 &mov ($s2,$_len); # load len

2431 &lea ($acc,&DWP(16,$acc)); # advance inp

2432 &mov ($_inp,$acc); # save inp

2433 &lea ($s3,&DWP(16,$key)); # advance out

2434 &mov ($_out,$s3); # save out

2435 &sub ($s2,16); # decrease len

2436 &cmp ($s2,16);

2437 &mov ($_len,$s2); # save len

2438 &jae (&label("slow_enc_loop_x86"));

2439 &test ($s2,15);

2440 &jnz (&label("slow_enc_tail"));

2441 &mov ($acc,$_ivp); # load ivp

2442 &mov ($s2,&DWP(8,$key)); # restore last dwords

2443 &mov ($s3,&DWP(12,$key));

2444 &mov (&DWP(0,$acc),$s0); # save ivec

2445 &mov (&DWP(4,$acc),$s1);

2446 &mov (&DWP(8,$acc),$s2);

2447 &mov (&DWP(12,$acc),$s3);

2448

2449 &mov ("esp",$_esp);

2450 &popf ();

2451 &function_end_A();

2452 &pushf (); # kludge, never executed

2453

2454 &set_label("slow_enc_tail",16);

2455 &emms () if (!$x86only);

2456 &mov ($key eq "edi"? $key:"",$s3); # load out to edi

2457 &mov ($s1,16);

2458 &sub ($s1,$s2);

2459 &cmp ($key,$acc eq "esi"? $acc:""); # compare with inp

2460 &je (&label("enc_in_place"));

2461 &align (4);

2462 &data_word(0xA4F3F689); # rep movsb # copy input

2463 &jmp (&label("enc_skip_in_place"));

2464 &set_label("enc_in_place");

2465 &lea ($key,&DWP(0,$key,$s2));

2466 &set_label("enc_skip_in_place");

2467 &mov ($s2,$s1);

2468 &xor ($s0,$s0);

2469 &align (4);

2470 &data_word(0xAAF3F689); # rep stosb # zero tail

2471

2472 &mov ($key,$_ivp); # restore ivp

2473 &mov ($acc,$s3); # output as input

2474 &mov ($s0,&DWP(0,$key));

2475 &mov ($s1,&DWP(4,$key));

2476 &mov ($_len,16); # len=16

2477 &jmp (&label("slow_enc_loop_x86")); # one more spin...

2478

2479 #--------------------------- SLOW DECRYPT ---------------------------#

2480 &set_label("slow_decrypt",16);

2481 if (!$x86only) {

2482 &bt ($_tmp,25); # check for SSE bit

2483 &jnc (&label("slow_dec_loop_x86"));

2484

2485 &set_label("slow_dec_loop_sse",4);

2486 &movq ("mm0",&QWP(0,$acc)); # read input

2487 &movq ("mm4",&QWP(8,$acc));

2488

2489 &mov ($key,$_key);

2490 &call ("_sse_AES_decrypt_compact");

2491

2492 &mov ($acc,$_inp); # load inp

2493 &lea ($s0,$ivec);

2494 &mov ($s1,$_out); # load out

2495 &mov ($s2,$_len); # load len

2496 &mov ($key,$_ivp); # load ivp

2497

2498 &movq ("mm1",&QWP(0,$acc)); # re-read input

2499 &movq ("mm5",&QWP(8,$acc));

2500

2501 &pxor ("mm0",&QWP(0,$key)); # xor iv

2502 &pxor ("mm4",&QWP(8,$key));

2503

2504 &movq (&QWP(0,$key),"mm1"); # copy input to iv

2505 &movq (&QWP(8,$key),"mm5");

2506

2507 &sub ($s2,16); # decrease len

2508 &jc (&label("slow_dec_partial_sse"));

2509

2510 &movq (&QWP(0,$s1),"mm0"); # write output

2511 &movq (&QWP(8,$s1),"mm4");

2512

2513 &lea ($s1,&DWP(16,$s1)); # advance out

2514 &mov ($_out,$s1); # save out

2515 &lea ($acc,&DWP(16,$acc)); # advance inp

2516 &mov ($_inp,$acc); # save inp

2517 &mov ($_len,$s2); # save len

2518 &jnz (&label("slow_dec_loop_sse"));

2519 &emms ();

2520 &mov ("esp",$_esp);

2521 &popf ();

2522 &function_end_A();

2523 &pushf (); # kludge, never executed

2524

2525 &set_label("slow_dec_partial_sse",16);

2526 &movq (&QWP(0,$s0),"mm0"); # save output to temp

2527 &movq (&QWP(8,$s0),"mm4");

2528 &emms ();

2529

2530 &add ($s2 eq "ecx" ? "ecx":"",16);

2531 &mov ("edi",$s1); # out

2532 &mov ("esi",$s0); # temp

2533 &align (4);

2534 &data_word(0xA4F3F689); # rep movsb # copy partial output

2535

2536 &mov ("esp",$_esp);

2537 &popf ();

2538 &function_end_A();

2539 &pushf (); # kludge, never executed

2540 }

2541 &set_label("slow_dec_loop_x86",16);

2542 &mov ($s0,&DWP(0,$acc)); # read input

2543 &mov ($s1,&DWP(4,$acc));

2544 &mov ($s2,&DWP(8,$acc));

2545 &mov ($s3,&DWP(12,$acc));

2546

2547 &lea ($key,$ivec);

2548 &mov (&DWP(0,$key),$s0); # copy to temp

2549 &mov (&DWP(4,$key),$s1);

2550 &mov (&DWP(8,$key),$s2);

2551 &mov (&DWP(12,$key),$s3);

2552

2553 &mov ($key,$_key); # load key

2554 &call ("_x86_AES_decrypt_compact");

2555

2556 &mov ($key,$_ivp); # load ivp

2557 &mov ($acc,$_len); # load len

2558 &xor ($s0,&DWP(0,$key)); # xor iv

2559 &xor ($s1,&DWP(4,$key));

2560 &xor ($s2,&DWP(8,$key));

2561 &xor ($s3,&DWP(12,$key));

2562

2563 &sub ($acc,16);

2564 &jc (&label("slow_dec_partial_x86"));

2565

2566 &mov ($_len,$acc); # save len

2567 &mov ($acc,$_out); # load out

2568

2569 &mov (&DWP(0,$acc),$s0); # write output

2570 &mov (&DWP(4,$acc),$s1);

2571 &mov (&DWP(8,$acc),$s2);

2572 &mov (&DWP(12,$acc),$s3);

2573

2574 &lea ($acc,&DWP(16,$acc)); # advance out

2575 &mov ($_out,$acc); # save out

2576

2577 &lea ($acc,$ivec);

2578 &mov ($s0,&DWP(0,$acc)); # read temp

2579 &mov ($s1,&DWP(4,$acc));

2580 &mov ($s2,&DWP(8,$acc));

2581 &mov ($s3,&DWP(12,$acc));

2582

2583 &mov (&DWP(0,$key),$s0); # copy it to iv

2584 &mov (&DWP(4,$key),$s1);

2585 &mov (&DWP(8,$key),$s2);

2586 &mov (&DWP(12,$key),$s3);

2587

2588 &mov ($acc,$_inp); # load inp

2589 &lea ($acc,&DWP(16,$acc)); # advance inp

2590 &mov ($_inp,$acc); # save inp

2591 &jnz (&label("slow_dec_loop_x86"));

2592 &mov ("esp",$_esp);

2593 &popf ();

2594 &function_end_A();

2595 &pushf (); # kludge, never executed

2596

2597 &set_label("slow_dec_partial_x86",16);

2598 &lea ($acc,$ivec);

2599 &mov (&DWP(0,$acc),$s0); # save output to temp

2600 &mov (&DWP(4,$acc),$s1);

2601 &mov (&DWP(8,$acc),$s2);

2602 &mov (&DWP(12,$acc),$s3);

2603

2604 &mov ($acc,$_inp);

2605 &mov ($s0,&DWP(0,$acc)); # re-read input

2606 &mov ($s1,&DWP(4,$acc));

2607 &mov ($s2,&DWP(8,$acc));

2608 &mov ($s3,&DWP(12,$acc));

2609

2610 &mov (&DWP(0,$key),$s0); # copy it to iv

2611 &mov (&DWP(4,$key),$s1);

2612 &mov (&DWP(8,$key),$s2);

2613 &mov (&DWP(12,$key),$s3);

2614

2615 &mov ("ecx",$_len);

2616 &mov ("edi",$_out);

2617 &lea ("esi",$ivec);

2618 &align (4);

2619 &data_word(0xA4F3F689); # rep movsb # copy partial output

2620

2621 &mov ("esp",$_esp);

2622 &popf ();

2623 &function_end("AES_cbc_encrypt");

2624 }

2625

2626 #------------------------------------------------------------------#

2627

# enckey: emit the byte-substitution core used by every key-expansion loop
# below.  Takes no Perl arguments; it works on fixed registers:
#   in : edx = schedule word to substitute, eax = word to XOR the result into,
#        ecx = index of the round constant, $tbl = biased table pointer
#   out: eax updated; ebx and esi are scratch; edx is shifted right by 16.
# Each byte of edx indexes the byte table at $tbl-128.  The substituted bytes
# are placed one byte position "rotated": byte 0 -> bits 24..31, byte 1 ->
# bits 0..7, byte 2 -> bits 8..15, byte 3 -> bits 16..23, and XORed into eax.
# Finally the dword at $tbl+1024-128+4*ecx (labelled rcon) is XORed in.
2628 sub enckey()

2629 {

2630 &movz ("esi",&LB("edx")); # rk[i]>>0

2631 &movz ("ebx",&BP(-128,$tbl,"esi",1));

2632 &movz ("esi",&HB("edx")); # rk[i]>>8

# substituted byte 0 lands in the top byte of the result
2633 &shl ("ebx",24);

2634 &xor ("eax","ebx");

2635

# substituted byte 1 is XORed in unshifted (bits 0..7)
2636 &movz ("ebx",&BP(-128,$tbl,"esi",1));

# bring the upper half of edx down so LB/HB can address bytes 2 and 3
2637 &shr ("edx",16);

2638 &movz ("esi",&LB("edx")); # rk[i]>>16

2639 &xor ("eax","ebx");

2640

2641 &movz ("ebx",&BP(-128,$tbl,"esi",1));

2642 &movz ("esi",&HB("edx")); # rk[i]>>24

# substituted byte 2 goes to bits 8..15
2643 &shl ("ebx",8);

2644 &xor ("eax","ebx");

2645

2646 &movz ("ebx",&BP(-128,$tbl,"esi",1));

# substituted byte 3 goes to bits 16..23
2647 &shl ("ebx",16);

2648 &xor ("eax","ebx");

2649

# round constant selected by the caller's iteration counter in ecx
2650 &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon

2651 }

2652

# _x86_AES_set_encrypt_key: internal key-expansion routine called by both
# private_AES_set_encrypt_key and private_AES_set_decrypt_key.
# Arguments are fetched with wparam(1..3) rather than wparam(0..2) --
# presumably because the public wrappers reach this routine through an extra
# `call`, leaving one more return address on the stack (TODO confirm against
# the perlasm wparam/function_begin definitions, which are not visible here).
# Result in eax: 0 on success, -1 if either pointer is zero, -2 if the bit
# count is not 128, 192 or 256.  On success the round count (10, 12 or 14) is
# stored 240 bytes past the start of the key schedule.
2653 &function_begin("_x86_AES_set_encrypt_key");

2654 &mov ("esi",&wparam(1)); # user supplied key

2655 &mov ("edi",&wparam(3)); # private key schedule

2656

# test reg,-1 sets ZF only when reg is zero: reject NULL pointers
2657 &test ("esi",-1);

2658 &jz (&label("badpointer"));

2659 &test ("edi",-1);

2660 &jz (&label("badpointer"));

2661

# call/pop pair to learn the current address, so the table address below is
# formed relative to pic_point instead of as an absolute reference.
# blindpop is a perlasm helper not visible here -- presumably a pop into $tbl.
2662 &call (&label("pic_point"));

2663 &set_label("pic_point");

2664 &blindpop($tbl);

2665 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));

# bias the pointer so the byte table used by enckey sits at $tbl-128
# NOTE(review): the extra +128 presumably keeps displacements small -- confirm
2666 &lea ($tbl,&DWP(2048+128,$tbl));

2667

# Touch eight dwords 32 bytes apart, spanning $tbl-128 .. $tbl+127.  The
# loaded values themselves are overwritten before anything reads them.
2668 # prefetch Te4

2669 &mov ("eax",&DWP(0-128,$tbl));

2670 &mov ("ebx",&DWP(32-128,$tbl));

2671 &mov ("ecx",&DWP(64-128,$tbl));

2672 &mov ("edx",&DWP(96-128,$tbl));

2673 &mov ("eax",&DWP(128-128,$tbl));

2674 &mov ("ebx",&DWP(160-128,$tbl));

2675 &mov ("ecx",&DWP(192-128,$tbl));

2676 &mov ("edx",&DWP(224-128,$tbl));

2677

# dispatch on key size; anything other than 128/192/256 returns -2
2678 &mov ("ecx",&wparam(2)); # number of bits in key

2679 &cmp ("ecx",128);

2680 &je (&label("10rounds"));

2681 &cmp ("ecx",192);

2682 &je (&label("12rounds"));

2683 &cmp ("ecx",256);

2684 &je (&label("14rounds"));

2685 &mov ("eax",-2); # invalid number of bits

2686 &jmp (&label("exit"));

2687

# ---- 128-bit key: 10 passes, each producing 4 new schedule words ----
2688 &set_label("10rounds");

2689 &mov ("eax",&DWP(0,"esi")); # copy first 4 dwords

2690 &mov ("ebx",&DWP(4,"esi"));

2691 &mov ("ecx",&DWP(8,"esi"));

2692 &mov ("edx",&DWP(12,"esi"));

2693 &mov (&DWP(0,"edi"),"eax");

2694 &mov (&DWP(4,"edi"),"ebx");

2695 &mov (&DWP(8,"edi"),"ecx");

2696 &mov (&DWP(12,"edi"),"edx");

2697

# ecx = pass counter, also the rcon index consumed inside enckey.
# eax (rk[0]) and edx (rk[3]) are still live from the copy above, so the
# first pass enters at 10shortcut and skips the reloads.
2698 &xor ("ecx","ecx");

2699 &jmp (&label("10shortcut"));

2700

2701 &align (4);

2702 &set_label("10loop");

2703 &mov ("eax",&DWP(0,"edi")); # rk[0]

2704 &mov ("edx",&DWP(12,"edi")); # rk[3]

2705 &set_label("10shortcut");

2706 &enckey ();

2707

# chain: each new word is the previous new word XOR the word 4 positions back
2708 &mov (&DWP(16,"edi"),"eax"); # rk[4]

2709 &xor ("eax",&DWP(4,"edi"));

2710 &mov (&DWP(20,"edi"),"eax"); # rk[5]

2711 &xor ("eax",&DWP(8,"edi"));

2712 &mov (&DWP(24,"edi"),"eax"); # rk[6]

2713 &xor ("eax",&DWP(12,"edi"));

2714 &mov (&DWP(28,"edi"),"eax"); # rk[7]

2715 &inc ("ecx");

2716 &add ("edi",16);

2717 &cmp ("ecx",10);

2718 &jl (&label("10loop"));

2719

# edi has advanced 10*16 = 160 bytes, so 80+160 = offset 240 of the schedule
2720 &mov (&DWP(80,"edi"),10); # setup number of rounds

2721 &xor ("eax","eax");

2722 &jmp (&label("exit"));

2723

# ---- 192-bit key: 6 words per pass, last pass stops after 4 words ----
2724 &set_label("12rounds");

2725 &mov ("eax",&DWP(0,"esi")); # copy first 6 dwords

2726 &mov ("ebx",&DWP(4,"esi"));

2727 &mov ("ecx",&DWP(8,"esi"));

2728 &mov ("edx",&DWP(12,"esi"));

2729 &mov (&DWP(0,"edi"),"eax");

2730 &mov (&DWP(4,"edi"),"ebx");

2731 &mov (&DWP(8,"edi"),"ecx");

2732 &mov (&DWP(12,"edi"),"edx");

# ecx/edx are reused for dwords 4 and 5; eax keeps rk[0] and edx ends up
# holding rk[5], which is exactly what 12shortcut expects
2733 &mov ("ecx",&DWP(16,"esi"));

2734 &mov ("edx",&DWP(20,"esi"));

2735 &mov (&DWP(16,"edi"),"ecx");

2736 &mov (&DWP(20,"edi"),"edx");

2737

2738 &xor ("ecx","ecx");

2739 &jmp (&label("12shortcut"));

2740

2741 &align (4);

2742 &set_label("12loop");

2743 &mov ("eax",&DWP(0,"edi")); # rk[0]

2744 &mov ("edx",&DWP(20,"edi")); # rk[5]

2745 &set_label("12shortcut");

2746 &enckey ();

2747

2748 &mov (&DWP(24,"edi"),"eax"); # rk[6]

2749 &xor ("eax",&DWP(4,"edi"));

2750 &mov (&DWP(28,"edi"),"eax"); # rk[7]

2751 &xor ("eax",&DWP(8,"edi"));

2752 &mov (&DWP(32,"edi"),"eax"); # rk[8]

2753 &xor ("eax",&DWP(12,"edi"));

2754 &mov (&DWP(36,"edi"),"eax"); # rk[9]

2755

# the eighth pass (ecx == 7) exits here after writing only 4 words
2756 &cmp ("ecx",7);

2757 &je (&label("12break"));

2758 &inc ("ecx");

2759

2760 &xor ("eax",&DWP(16,"edi"));

2761 &mov (&DWP(40,"edi"),"eax"); # rk[10]

2762 &xor ("eax",&DWP(20,"edi"));

2763 &mov (&DWP(44,"edi"),"eax"); # rk[11]

2764

2765 &add ("edi",24);

2766 &jmp (&label("12loop"));

2767

2768 &set_label("12break");

# edi has advanced 7*24 = 168 bytes, so 72+168 = offset 240 of the schedule
2769 &mov (&DWP(72,"edi"),12); # setup number of rounds

2770 &xor ("eax","eax");

2771 &jmp (&label("exit"));

2772

# ---- 256-bit key: 8 words per pass, last pass stops after 4 words ----
2773 &set_label("14rounds");

2774 &mov ("eax",&DWP(0,"esi")); # copy first 8 dwords

2775 &mov ("ebx",&DWP(4,"esi"));

2776 &mov ("ecx",&DWP(8,"esi"));

2777 &mov ("edx",&DWP(12,"esi"));

2778 &mov (&DWP(0,"edi"),"eax");

2779 &mov (&DWP(4,"edi"),"ebx");

2780 &mov (&DWP(8,"edi"),"ecx");

2781 &mov (&DWP(12,"edi"),"edx");

2782 &mov ("eax",&DWP(16,"esi"));

2783 &mov ("ebx",&DWP(20,"esi"));

2784 &mov ("ecx",&DWP(24,"esi"));

2785 &mov ("edx",&DWP(28,"esi"));

2786 &mov (&DWP(16,"edi"),"eax");

2787 &mov (&DWP(20,"edi"),"ebx");

2788 &mov (&DWP(24,"edi"),"ecx");

2789 &mov (&DWP(28,"edi"),"edx");

2790

# After the copy edx still holds rk[7] but eax was overwritten with dword 4,
# so the shortcut label sits after the rk[7] reload and before the rk[0] one.
2791 &xor ("ecx","ecx");

2792 &jmp (&label("14shortcut"));

2793

2794 &align (4);

2795 &set_label("14loop");

2796 &mov ("edx",&DWP(28,"edi")); # rk[7]

2797 &set_label("14shortcut");

2798 &mov ("eax",&DWP(0,"edi")); # rk[0]

2799

2800 &enckey ();

2801

2802 &mov (&DWP(32,"edi"),"eax"); # rk[8]

2803 &xor ("eax",&DWP(4,"edi"));

2804 &mov (&DWP(36,"edi"),"eax"); # rk[9]

2805 &xor ("eax",&DWP(8,"edi"));

2806 &mov (&DWP(40,"edi"),"eax"); # rk[10]

2807 &xor ("eax",&DWP(12,"edi"));

2808 &mov (&DWP(44,"edi"),"eax"); # rk[11]

2809

# the seventh pass (ecx == 6) exits here after writing only 4 words
2810 &cmp ("ecx",6);

2811 &je (&label("14break"));

2812 &inc ("ecx");

2813

# Second half of a 256-bit pass: substitute each byte of rk[11] through the
# same byte table, keeping every byte in its own position (shifts 0/8/16/24,
# no rotation) and without a round constant, then XOR into rk[4].
2814 &mov ("edx","eax");

2815 &mov ("eax",&DWP(16,"edi")); # rk[4]

2816 &movz ("esi",&LB("edx")); # rk[11]>>0

2817 &movz ("ebx",&BP(-128,$tbl,"esi",1));

2818 &movz ("esi",&HB("edx")); # rk[11]>>8

2819 &xor ("eax","ebx");

2820

2821 &movz ("ebx",&BP(-128,$tbl,"esi",1));

2822 &shr ("edx",16);

2823 &shl ("ebx",8);

2824 &movz ("esi",&LB("edx")); # rk[11]>>16

2825 &xor ("eax","ebx");

2826

2827 &movz ("ebx",&BP(-128,$tbl,"esi",1));

2828 &movz ("esi",&HB("edx")); # rk[11]>>24

2829 &shl ("ebx",16);

2830 &xor ("eax","ebx");

2831

2832 &movz ("ebx",&BP(-128,$tbl,"esi",1));

2833 &shl ("ebx",24);

2834 &xor ("eax","ebx");

2835

2836 &mov (&DWP(48,"edi"),"eax"); # rk[12]

2837 &xor ("eax",&DWP(20,"edi"));

2838 &mov (&DWP(52,"edi"),"eax"); # rk[13]

2839 &xor ("eax",&DWP(24,"edi"));

2840 &mov (&DWP(56,"edi"),"eax"); # rk[14]

2841 &xor ("eax",&DWP(28,"edi"));

2842 &mov (&DWP(60,"edi"),"eax"); # rk[15]

2843

2844 &add ("edi",32);

2845 &jmp (&label("14loop"));

2846

2847 &set_label("14break");

# edi has advanced 6*32 = 192 bytes, so 48+192 = offset 240 of the schedule
2848 &mov (&DWP(48,"edi"),14); # setup number of rounds

2849 &xor ("eax","eax");

2850 &jmp (&label("exit"));

2851

# NULL userKey or key pointer: return -1 and fall through to the common exit
2852 &set_label("badpointer");

2853 &mov ("eax",-1);

2854 &set_label("exit");

# function_end is a perlasm helper not visible here -- presumably it emits the
# epilogue matching function_begin and returns with eax as set above
2855 &function_end("_x86_AES_set_encrypt_key");

2856

2857 # int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,

2858 # AES_KEY *key)

# Public entry point: a bare call into _x86_AES_set_encrypt_key followed by
# ret, so whatever that routine leaves in eax (0, -1 or -2) is returned
# unchanged.  No registers are saved or arguments copied here; the callee
# reads the caller's stack arguments directly.
2859 &function_begin_B("private_AES_set_encrypt_key");

2860 &call ("_x86_AES_set_encrypt_key");

2861 &ret ();

2862 &function_end_B("private_AES_set_encrypt_key");

2863

# deckey: emit the in-place transform of one 32-bit schedule word for the
# decryption key schedule.
#   $i    - index (0..3) of the word inside the 16-byte chunk at $key
#   $key  - register pointing at the chunk being transformed
#   $tp1  - register already holding word $i on entry; receives the result
#   $tp2, $tp4, $tp8 - work registers
# Uses $acc and $tbl as scratch (both are clobbered).  On exit the result has
# been stored to 4*$i($key) and $tp2 holds the next word, 4*($i+1)($key), so
# the caller can pass this call's $tp2 as the next call's $tp1.
# Three times in a row a packed per-byte doubling is computed: shift each byte
# left by one (lea x+x, mask 0xfefefefe) and XOR 0x1b into every byte whose
# top bit was set, giving tp2, tp4 and tp8 from tp1.
2864 sub deckey()

2865 { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;

# the $tbl register is only borrowed as a temporary; no table is read here
2866 my $tmp = $tbl;

2867

# --- tp2 = doubled tp1 ---
2868 &mov ($acc,$tp1);

2869 &and ($acc,0x80808080);

2870 &mov ($tmp,$acc);

2871 &shr ($tmp,7);

2872 &lea ($tp2,&DWP(0,$tp1,$tp1));

# 0x80 - 0x01 = 0x7f in each flagged byte; masked to 0x1b just below
2873 &sub ($acc,$tmp);

2874 &and ($tp2,0xfefefefe);

2875 &and ($acc,0x1b1b1b1b);

2876 &xor ($acc,$tp2);

2877 &mov ($tp2,$acc);

2878

# --- tp4 = doubled tp2 ($acc still holds the plain tp2 value) ---
2879 &and ($acc,0x80808080);

2880 &mov ($tmp,$acc);

2881 &shr ($tmp,7);

2882 &lea ($tp4,&DWP(0,$tp2,$tp2));

2883 &sub ($acc,$tmp);

2884 &and ($tp4,0xfefefefe);

2885 &and ($acc,0x1b1b1b1b);

2886 &xor ($tp2,$tp1); # tp2^tp1

2887 &xor ($acc,$tp4);

2888 &mov ($tp4,$acc);

2889

# --- tp8 = doubled tp4 ($acc still holds the plain tp4 value) ---
2890 &and ($acc,0x80808080);

2891 &mov ($tmp,$acc);

2892 &shr ($tmp,7);

2893 &lea ($tp8,&DWP(0,$tp4,$tp4));

2894 &xor ($tp4,$tp1); # tp4^tp1

2895 &sub ($acc,$tmp);

2896 &and ($tp8,0xfefefefe);

2897 &and ($acc,0x1b1b1b1b);

2898 &rotl ($tp1,8); # = ROTATE(tp1,8)

2899 &xor ($tp8,$acc);

2900

# fetch the following word early; it is handed over in $tp2 at the end
2901 &mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load

2902

# combine: result = ROTATE(tp1,8) ^ (tp8^tp4^tp2) and the three rotated terms
2903 &xor ($tp1,$tp2);

2904 &xor ($tp2,$tp8);

2905 &xor ($tp1,$tp4);

2906 &rotl ($tp2,24);

2907 &xor ($tp4,$tp8);

2908 &xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)

2909 &rotl ($tp4,16);

2910 &xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24)

2911 &rotl ($tp8,8);

2912 &xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16)

# $tp2 is no longer needed: park the pre-loaded next word in it
2913 &mov ($tp2,$tmp);

2914 &xor ($tp1,$tp8); # ^= ROTATE(tp8,8)

2915

2916 &mov (&DWP(4*$i,$key),$tp1);

2917 }

2918

2919 # int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,

2920 # AES_KEY *key)

# Public entry point.  First builds the encryption schedule by calling
# _x86_AES_set_encrypt_key; if that returns non-zero in eax the error code is
# returned as is.  Otherwise the schedule is converted in place in two steps:
#   1. reverse the order of its 16-byte chunks;
#   2. run deckey over every word of every chunk except the first and last.
# Returns 0 in eax on success.
2921 &function_begin_B("private_AES_set_decrypt_key");

2922 &call ("_x86_AES_set_encrypt_key");

2923 &cmp ("eax",0);

2924 &je (&label("proceed"));

# failure path returns before anything has been pushed, so the stack is balanced
2925 &ret ();

2926

# function_begin_B was used above, so callee-saved registers are pushed by hand
# NOTE(review): function_end below presumably emits the matching pops and ret
# -- it is a perlasm helper not visible here; confirm.
2927 &set_label("proceed");

2928 &push ("ebp");

2929 &push ("ebx");

2930 &push ("esi");

2931 &push ("edi");

2932

# esi = first chunk, edi = esi + 16*rounds = last chunk
# (ecx = rounds*4, then scaled by 4 again in the address computation)
2933 &mov ("esi",&wparam(2));

2934 &mov ("ecx",&DWP(240,"esi")); # pull number of rounds

2935 &lea ("ecx",&DWP(0,"","ecx",4));

2936 &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk

2937

# Swap the 16-byte chunks at esi and edi, 8 bytes at a time, walking the two
# pointers toward each other.  The loop ends when they are equal, leaving the
# middle chunk untouched; this relies on the round count being even, as the
# values 10/12/14 written by the expansion routine are.
2938 &set_label("invert",4); # invert order of chunks

2939 &mov ("eax",&DWP(0,"esi"));

2940 &mov ("ebx",&DWP(4,"esi"));

2941 &mov ("ecx",&DWP(0,"edi"));

2942 &mov ("edx",&DWP(4,"edi"));

2943 &mov (&DWP(0,"edi"),"eax");

2944 &mov (&DWP(4,"edi"),"ebx");

2945 &mov (&DWP(0,"esi"),"ecx");

2946 &mov (&DWP(4,"esi"),"edx");

2947 &mov ("eax",&DWP(8,"esi"));

2948 &mov ("ebx",&DWP(12,"esi"));

2949 &mov ("ecx",&DWP(8,"edi"));

2950 &mov ("edx",&DWP(12,"edi"));

2951 &mov (&DWP(8,"edi"),"eax");

2952 &mov (&DWP(12,"edi"),"ebx");

2953 &mov (&DWP(8,"esi"),"ecx");

2954 &mov (&DWP(12,"esi"),"edx");

2955 &add ("esi",16);

2956 &sub ("edi",16);

2957 &cmp ("esi","edi");

2958 &jne (&label("invert"));

2959

# Compute the address of the last chunk to transform:
#   $acc = 2*rounds - 2, then $key + 8*$acc = $key + 16*(rounds-1).
# deckey uses $acc as scratch, so the bound is parked in the stack slot of
# the third argument (overwriting the stack copy of the key pointer).
2960 &mov ($key,&wparam(2));

2961 &mov ($acc,&DWP(240,$key)); # pull number of rounds

2962 &lea ($acc,&DWP(-2,$acc,$acc));

2963 &lea ($acc,&DWP(0,$key,$acc,8));

2964 &mov (&wparam(2),$acc);

2965

# prime $s0 with word 0 of chunk 1; every deckey call expects its input word
# to be already loaded and pre-loads the next one itself
2966 &mov ($s0,&DWP(16,$key)); # modulo-scheduled load

2967 &set_label("permute",4); # permute the key schedule

# advancing before the first pass means chunk 0 is never transformed
2968 &add ($key,16);

# register roles rotate by one per call so that the word a call pre-loads into
# its $tp2 is the next call's $tp1; the fourth call pre-loads word 0 of the
# following chunk back into $s0 for the next loop pass
2969 &deckey (0,$key,$s0,$s1,$s2,$s3);

2970 &deckey (1,$key,$s1,$s2,$s3,$s0);

2971 &deckey (2,$key,$s2,$s3,$s0,$s1);

2972 &deckey (3,$key,$s3,$s0,$s1,$s2);

# unsigned pointer compare against the parked bound; the last pass handles the
# chunk at $key + 16*(rounds-1), so the final chunk is left as is
2973 &cmp ($key,&wparam(2));

2974 &jb (&label("permute"));

2975

2976 &xor ("eax","eax"); # return success

2977 &function_end("private_AES_set_decrypt_key");

# Module trailer.  asciz and asm_finish are perlasm helpers defined outside
# this view: presumably asciz embeds the identification string below as a
# NUL-terminated constant in the generated output and asm_finish completes
# and writes out the accumulated assembly -- confirm against the perlasm
# framework.  The string itself is runtime data and must not be altered.
2978 &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");

2979

2980 &asm_finish();

OLD	NEW

« no previous file with comments | « openssl/crypto/aes/asm/aes-586.S ('k') | openssl/crypto/aes/asm/aes-586-mac.S » ('j') | no next file with comments »