openssl/crypto/rc4/asm/rc4-ia64.pl - Issue 2072073002: Delete bundled copy of OpenSSL and replace with README.

Side by Side Diff: openssl/crypto/rc4/asm/rc4-ia64.pl

Issue 2072073002: Delete bundled copy of OpenSSL and replace with README. (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/openssl@master

Patch Set: Delete bundled copy of OpenSSL and replace with README. Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 #!/usr/bin/env perl

2 #

3 # ====================================================================

4 # Written by David Mosberger <David.Mosberger@acm.org> based on the

5 # Itanium optimized Crypto code which was released by HP Labs at

6 # http://www.hpl.hp.com/research/linux/crypto/.

7 #

8 # Copyright (c) 2005 Hewlett-Packard Development Company, L.P.

9 #

10 # Permission is hereby granted, free of charge, to any person obtaining

11 # a copy of this software and associated documentation files (the

12 # "Software"), to deal in the Software without restriction, including

13 # without limitation the rights to use, copy, modify, merge, publish,

14 # distribute, sublicense, and/or sell copies of the Software, and to

15 # permit persons to whom the Software is furnished to do so, subject to

16 # the following conditions:

17 #

18 # The above copyright notice and this permission notice shall be

19 # included in all copies or substantial portions of the Software.

20

21 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,

22 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF

23 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND

24 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE

25 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION

26 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION

27 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */

28

29

30

31 # This is a little helper program which generates a software-pipelined

32 # loop for RC4 encryption. The basic algorithm looks like this:

33 #

34 # for (counter = 0; counter < len; ++counter)

35 # {

36 # in = inp[counter];

37 # SI = S[I];

38 # J = (SI + J) & 0xff;

39 # SJ = S[J];

40 # T = (SI + SJ) & 0xff;

41 # S[I] = SJ, S[J] = SI;

42 # ST = S[T];

43 # outp[counter] = in ^ ST;

44 # I = (I + 1) & 0xff;

45 # }

46 #

47 # Pipelining this loop isn't easy, because the stores to the S[] array

48 # need to be observed in the right order. The loop generated by the

49 # code below has the following pipeline diagram:

50 #

51 #      cycle

52 #     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |

53 # iter

54 #   1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx

55 #   2:             xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx

56 #   3:                         xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx

57 #

58 # where:

59 # LDI = load of S[I]

60 # LDJ = load of S[J]

61 # SWP = swap of S[I] and S[J]

62 # LDT = load of S[T]

63 #

64 # Note that in the above diagram, the major trouble-spot is that LDI

65 # of the 2nd iteration is performed BEFORE the SWP of the first

66 # iteration. Fortunately, this is easy to detect (I of the 1st

67 # iteration will be equal to J of the 2nd iteration) and when this

68 # happens, we simply forward the proper value from the 1st iteration

69 # to the 2nd one. The proper value in this case is simply the value

70 # of S[I] from the first iteration (thanks to the fact that SWP

71 # simply swaps the contents of S[I] and S[J]).

72 #

73 # Another potential trouble-spot is in cycle 7, where SWP of the 1st

74 # iteration issues at the same time as the LDI of the 3rd iteration.

75 # However, thanks to IA-64 execution semantics, this can be taken

76 # care of simply by placing LDI later in the instruction-group than

77 # SWP. IA-64 CPUs will automatically forward the value if they

78 # detect that the SWP and LDI are accessing the same memory-location.

79

80 # The core-loop that can be pipelined then looks like this (annotated

81 # with McKinley/Madison issue port & latency numbers, assuming L1

82 # cache hits for the most part):

83

84 # operation: instruction: issue-ports: latency

85 # ------------------ ----------------------------- ------------- -------

86

87 # Data = *inp++ ld1 data = [inp], 1 M0-M1 1 cyc c0

88 # shladd Iptr = I, KeyTable, 3 M0-M3, I0, I1 1 cyc

89 # I = (I + 1) & 0xff padd1 nextI = I, one M0-M3, I0, I1 3 cyc

90 # ;;

91 # SI = S[I] ld8 SI = [Iptr] M0-M1 1 cyc c1 * after SWAP!

92 # ;;

93 # cmp.eq.unc pBypass = I, J * after J is valid!

94 # J = SI + J add J = J, SI M0-M3, I0, I1 1 cyc c2

95 # (pBypass) br.cond.spnt Bypass

96 # ;;

97 # ------------------------------------------------------------------------------ ---------

98 # J = J & 0xff zxt1 J = J I0, I1, 1 cyc c3

99 # ;;

100 # shladd Jptr = J, KeyTable, 3 M0-M3, I0, I1 1 cyc c4

101 # ;;

102 # SJ = S[J] ld8 SJ = [Jptr] M0-M1 1 cyc c5

103 # ;;

104 # ------------------------------------------------------------------------------ ---------

105 # T = (SI + SJ) add T = SI, SJ M0-M3, I0, I1 1 cyc c6

106 # ;;

107 # T = T & 0xff zxt1 T = T I0, I1 1 cyc

108 # S[I] = SJ st8 [Iptr] = SJ M2-M3 c7

109 # S[J] = SI st8 [Jptr] = SI M2-M3

110 # ;;

111 # shladd Tptr = T, KeyTable, 3 M0-M3, I0, I1 1 cyc c8

112 # ;;

113 # ------------------------------------------------------------------------------ ---------

114 # T = S[T] ld8 T = [Tptr] M0-M1 1 cyc c9

115 # ;;

116 # data ^= T xor data = data, T M0-M3, I0, I1 1 cyc c1 0

117 # ;;

118 # *out++ = Data ^ T dep word = word, data, 8, POS I0, I1 1 cyc c1 1

119 # ;;

120 # ------------------------------------------------------------------------------ ---------

121

122 # There are several points worth making here:

123

124 # - Note that due to the bypass/forwarding-path, the first two

125 # phases of the loop are strangely mingled together. In

126 # particular, note that the first stage of the pipeline is

127 # using the value of "J", as calculated by the second stage.

128 # - Each bundle-pair will have exactly 6 instructions.

129 # - Pipelined, the loop can execute in 3 cycles/iteration and

130 # 4 stages. However, McKinley/Madison can issue "st1" to

131 # the same bank at a rate of at most one per 4 cycles. Thus,

132 # instead of storing each byte, we accumulate them in a word

133 # and then write them back at once with a single "st8" (this

134 # implies that the setup code needs to ensure that the output

135 # buffer is properly aligned, if need be, by encoding the

136 # first few bytes separately).

137 # - There is no space for a "br.ctop" instruction. For this

138 # reason we can't use modulo-loop support in IA-64 and have

139 # to do a traditional, purely software-pipelined loop.

140 # - We can't replace any of the remaining "add/zxt1" pairs with

141 # "padd1" because the latency for that instruction is too high

142 # and would push the loop to the point where more bypasses

143 # would be needed, which we don't have space for.

144 # - The above loop runs at around 3.26 cycles/byte, or roughly

145 # 440 MByte/sec on a 1.5GHz Madison. This is well below the

146 # system bus bandwidth and hence with judicious use of

147 # "lfetch" this loop can run at (almost) peak speed even when

148 # the input and output data reside in memory. The

149 # max. latency that can be tolerated is (PREFETCH_DISTANCE *

150 # L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at

151 # least) 1-ahead prefetching of 128 byte cache-lines. Note

152 # that we do NOT prefetch into L1, since that would only

153 # interfere with the S[] table values stored there. This is

154 # acceptable because there is a 10 cycle latency between

155 # load and first use of the input data.

156 # - We use a branch to out-of-line bypass-code because of cycle-pressure:

157 # we calculate the next J, check for the need to activate the

158 # bypass path, and activate the bypass path ALL IN THE SAME

159 # CYCLE. If we didn't have these constraints, we could do

160 # the bypass with a simple conditional move instruction.

161 # Fortunately, the bypass paths get activated relatively

162 # infrequently, so the extra branches don't cost all that much

163 # (about 0.04 cycles/byte, measured on a 16396 byte file with

164 # random input data).

165 #

166

# Shape of the generated software-pipelined, unrolled loop.
167 $phases = 4; # number of stages/phases in the pipelined-loop

168 $unroll_count = 6; # number of times we unrolled it

# One bit per pipeline stage.  emit_body() tests these bits in its predicate
# mask argument to decide which stage's instructions to emit.
169 $pComI = (1 << 0);

170 $pComJ = (1 << 1);

171 $pComT = (1 << 2);

172 $pOut = (1 << 3);

173

# Element counts of the rotating register arrays.  They mirror the sizes in
# the ".rotr" directive of the assembly template (Data[4], I[2], IPr[3],
# SI[3], JP[2], SJ[2], T[2], OutWord[2]) and are used by emit_body() as the
# moduli when selecting an array element for a given iteration.
174 $NData = 4;

175 $NIP = 3;

176 $NJP = 2;

177 $NI = 2;

178 $NSI = 3;

179 $NSJ = 2;

180 $NT = 2;

181 $NOutWord = 2;

182

183 #

184 # $threshold is the minimum length before we attempt to use the

185 # big software-pipelined loop. It MUST be greater-or-equal

186 # to:

187 # PHASES * (UNROLL_COUNT + 1) + 7

188 #

189 # The "+ 7" comes from the fact we may have to encode up to

190 # 7 bytes separately before the output pointer is aligned.

191 #

# NOTE: the expression below multiplies the PHASES * (UNROLL_COUNT + 1) term
# by 3, so the value actually used (91) is larger than the stated minimum (35).
192 $threshold = (3 * ($phases * ($unroll_count + 1)) + 7);

193

# I(\$buf, $format, @args): append one instruction line to the string that
# \$buf refers to.  The line is sprintf($format, @args), prefixed with two
# tabs and terminated by a newline.
194 sub I {

# Alias $code to the caller's buffer for the duration of this call.
195 local *code = shift;

196 local $format = shift;

197 $code .= sprintf ("\t\t".$format."\n", @_);

198 }

199

# P(\$buf, $format, @args): like I(), but the formatted line is appended
# without the leading tabs.  emit_body() uses it for labels and for the
# predicated bypass branch.
200 sub P {

# Alias $code to the caller's buffer for the duration of this call.
201 local *code = shift;

202 local $format = shift;

203 $code .= sprintf ($format."\n", @_);

204 }

205

# STOP(\$buf): append a line holding the ";;" token to the string that \$buf
# refers to.  emit_body() calls it after each of its three cycles.
206 sub STOP {

# Alias $code to the caller's buffer for the duration of this call.
207 local *code = shift;

208 $code .=<<___;

209 ;;

210 ___

211 }

212

# emit_body(\$code, \$bypass, $iteration, $p): emit the three-cycle group of
# instructions for one iteration of the unrolled loop.
#
#   \$code      buffer that receives the in-line instructions
#   \$bypass    buffer that receives the out-of-line bypass stub, if any
#   $iteration  running iteration number (0 for the first prologue call)
#   $p          stage mask built from $pComI/$pComJ/$pComT/$pOut; only the
#               low four bits are examined
#
# Each stage works on a different in-flight byte, so the register-array
# indices below are the iteration number minus the stage's lag, reduced
# modulo the size of the corresponding rotating array.
213 sub emit_body {

214 local *c = shift;

215 local *bypass = shift;

216 local ($iteration, $p) = @_;

217

# Iteration numbers as seen by stages that lag by 0, 1, 2 and 3 iterations.
# Negative values are fine: Perl's % takes the sign of its right operand, so
# e.g. at iteration 0 "$i1 % $NI" selects I[1].
218 local $i0 = $iteration;

219 local $i1 = $iteration - 1;

220 local $i2 = $iteration - 2;

221 local $i3 = $iteration - 3;

# Output-word numbers used on the destination ($iw0) and source ($iw1) side
# of the "dep" below.  "/" is not integer division in Perl; the values are
# only ever used through "% $NOutWord", which works on integers.
222 local $iw0 = ($iteration - 3) / 8;

223 local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1;

# Byte position (0..7) inside the output word for the byte leaving the
# pipeline in this iteration.
224 local $byte_num = ($iteration - 3) % 8;

# Suffix for this iteration's .rc4Bypass/.rc4Resume label pair.
225 local $label = $iteration + 1;

# NOTE: despite its name, $pAny is true only when ALL four stage bits are
# set; the bundle braces/templates are therefore emitted only in that case.
226 local $pAny = ($p & 0xf) == 0xf;

# The I == J bypass check is emitted when the ComputeI stage is active, except
# for iteration 0.
227 local $pByp = (($p & $pComI) && ($iteration > 0));

228

229 $c.=<<___;

230 //////////////////////////////////////////////////

231 ___

232

# No stage active (the last epilogue call): only store the pending output
# word with st4, shifting it right by 32 first on big-endian hosts, and stop.
233 if (($p & 0xf) == 0) {

234 $c.="#ifdef HOST_IS_BIG_ENDIAN\n";

235 &I(\$c,"shr.u OutWord[%u] = OutWord[%u], 32;;",

236 $iw1 % $NOutWord, $iw1 % $NOutWord);

237 $c.="#endif\n";

238 &I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);

239 return;

240 }

241

242 # Cycle 0

243 &I(\$c, "{ .mmi") if ($pAny);

244 &I(\$c, "ld1 Data[%u] = [InPtr], 1", $i0 % $NData) if ($p & $pComI);

245 &I(\$c, "padd1 I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)if ($p & $pComI);

246 &I(\$c, "zxt1 J = J") if ($p & $pComJ);

247 &I(\$c, "}") if ($pAny);

248 &I(\$c, "{ .mmi") if ($pAny);

249 &I(\$c, "LKEY T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT) if ($p & $pOut);

250 &I(\$c, "add T[%u] = SI[%u], SJ[%u]",

251 $i0 % $NT, $i2 % $NSI, $i1 % $NSJ) if ($p & $pComT);

252 &I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI);

253 &I(\$c, "}") if ($pAny);

254 &STOP(\$c);

255

256 # Cycle 1

257 &I(\$c, "{ .mmi") if ($pAny);

# The two SKEY stores are the swap of S[I] and S[J]; the LKEY of SI in the
# second bundle is emitted after them within the same cycle.
258 &I(\$c, "SKEY [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1%$NSJ)if ($p & $pComT);

259 &I(\$c, "SKEY [JP[%u]] = SI[%u]", $i1 % $NJP, $i2%$NSI) if ($p & $pComT);

260 &I(\$c, "zxt1 T[%u] = T[%u]", $i0 % $NT, $i0 % $NT) if ($p & $pComT);

261 &I(\$c, "}") if ($pAny);

262 &I(\$c, "{ .mmi") if ($pAny);

263 &I(\$c, "LKEY SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0%$NIP)if ($p & $pComI);

264 &I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP) if ($p & $pComJ);

265 &I(\$c, "xor Data[%u] = Data[%u], T[%u]",

266 $i3 % $NData, $i3 % $NData, $i1 % $NT) if ($p & $pOut);

267 &I(\$c, "}") if ($pAny);

268 &STOP(\$c);

269

270 # Cycle 2

271 &I(\$c, "{ .mmi") if ($pAny);

272 &I(\$c, "LKEY SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0%$NJP) if ($p & $pComJ);

273 &I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI) if ($pByp);

# Merge the finished byte into the 8-byte output word at its byte position.
274 &I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",

275 $iw0%$NOutWord, $i3%$NData, $iw1%$NOutWord, $byte_num) if ($p & $pOut);

276 &I(\$c, "}") if ($pAny);

277 &I(\$c, "{ .mmb") if ($pAny);

278 &I(\$c, "add J = J, SI[%u]", $i0 % $NSI) if ($p & $pComI);

279 &I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT) if ($p & $pComT);

280 &P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u",$label)if ($pByp);

281 &I(\$c, "}") if ($pAny);

282 &STOP(\$c);

283

# Point at which the out-of-line bypass stub branches back in.
284 &P(\$c, ".rc4Resume%u:", $label) if ($pByp);

# Once past the prologue, whenever the byte position has wrapped to 0 the
# source-side output word is written out with a single st8.
285 if ($byte_num == 0 && $iteration >= $phases) {

286 &I(\$c, "st8 [OutPtr] = OutWord[%u], 8",

287 $iw1 % $NOutWord) if ($p & $pOut);

# Last iteration of the unrolled body: emit the prefetches and the
# loop-closing branch back to .rc4Loop.
288 if ($iteration == (1 + $unroll_count) * $phases - 1) {

289 if ($unroll_count == 6) {

290 &I(\$c, "mov OutWord[%u] = OutWord[%u]",

291 $iw1 % $NOutWord, $iw0 % $NOutWord);

292 }

293 &I(\$c, "lfetch.nt1 [InPrefetch], %u",

294 $unroll_count * $phases);

295 &I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",

296 $unroll_count * $phases);

297 &I(\$c, "br.cloop.sptk.few .rc4Loop");

298 }

299 }

300

# Out-of-line stub taken when pBypass is set: it subtracts the SI value that
# was just added to J, adds the previous iteration's SI instead, copies that
# SI into this iteration's SI register, and branches back to the resume label.
301 if ($pByp) {

302 &P(\$bypass, ".rc4Bypass%u:", $label);

303 &I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);

304 &I(\$bypass, "nop 0");

305 &I(\$bypass, "nop 0");

306 &I(\$bypass, ";;");

307 &I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);

308 &I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);

309 &I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);

310 &I(\$bypass, ";;");

311 }

312 }

313

314 $code=<<___;

315 .ident \"rc4-ia64.s, version 3.0\"

316 .ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"

317

318 #define LCSave r8

319 #define PRSave r9

320

321 /* Inputs become invalid once rotation begins! */

322

323 #define StateTable in0

324 #define DataLen in1

325 #define InputBuffer in2

326 #define OutputBuffer in3

327

328 #define KTable r14

329 #define J r15

330 #define InPtr r16

331 #define OutPtr r17

332 #define InPrefetch r18

333 #define OutPrefetch r19

334 #define One r20

335 #define LoopCount r21

336 #define Remainder r22

337 #define IFinal r23

338 #define EndPtr r24

339

340 #define tmp0 r25

341 #define tmp1 r26

342

343 #define pBypass p6

344 #define pDone p7

345 #define pSmall p8

346 #define pAligned p9

347 #define pUnaligned p10

348

349 #define pComputeI pPhase[0]

350 #define pComputeJ pPhase[1]

351 #define pComputeT pPhase[2]

352 #define pOutput pPhase[3]

353

354 #define RetVal r8

355 #define L_OK p7

356 #define L_NOK p8

357

358 #define _NINPUTS 4

359 #define _NOUTPUT 0

360

361 #define _NROTATE 24

362 #define _NLOCALS (_NROTATE - _NINPUTS - _NOUTPUT)

363

364 #ifndef SZ

365 # define SZ 4 // this must be set to sizeof(RC4_INT)

366 #endif

367

368 #if SZ == 1

369 # define LKEY ld1

370 # define SKEY st1

371 # define KEYADDR(dst, i) add dst = i, KTable

372 #elif SZ == 2

373 # define LKEY ld2

374 # define SKEY st2

375 # define KEYADDR(dst, i) shladd dst = i, 1, KTable

376 #elif SZ == 4

377 # define LKEY ld4

378 # define SKEY st4

379 # define KEYADDR(dst, i) shladd dst = i, 2, KTable

380 #else

381 # define LKEY ld8

382 # define SKEY st8

383 # define KEYADDR(dst, i) shladd dst = i, 3, KTable

384 #endif

385

386 #if defined(_HPUX_SOURCE) && !defined(_LP64)

387 # define ADDP addp4

388 #else

389 # define ADDP add

390 #endif

391

392 /* Define a macro for the bit number of the n-th byte: */

393

/* BYTE_POS(n) is used as the bit offset of the "dep" that merges output
   byte n into an 8-byte OutWord.  On big-endian hosts byte 0 goes into
   bits 56..63, otherwise into bits 0..7. */
394 #if defined(_HPUX_SOURCE) || defined(B_ENDIAN)

395 # define HOST_IS_BIG_ENDIAN

396 # define BYTE_POS(n) (56 - (8 * (n)))

397 #else

398 # define BYTE_POS(n) (8 * (n))

399 #endif

400

401 /*

402 We must perform the first phase of the pipeline explicitly since

403 we will always load from the state table the first time. The br.cexit

404 will never be taken, regardless of the number of bytes, because

405 the epilogue count is 4.

406 */

407 /* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX

408 assembler failed on original macro with syntax error. <appro> */

409 #define MODSCHED_RC4_PROLOGUE \\

410 { \\

411 ld1 Data[0] = [InPtr], 1; \\

412 add IFinal = 1, I[1]; \\

413 KEYADDR(IPr[0], I[1]); \\

414 } ;; \\

415 { \\

416 LKEY SI[0] = [IPr[0]]; \\

417 mov pr.rot = 0x10000; \\

418 mov ar.ec = 4; \\

419 } ;; \\

420 { \\

421 add J = J, SI[0]; \\

422 zxt1 I[0] = IFinal; \\

423 br.cexit.spnt.few .+16; /* never taken */ \\

424 } ;;

425 #define MODSCHED_RC4_LOOP(label) \\

426 label: \\

427 { .mmi; \\

428 (pComputeI) ld1 Data[0] = [InPtr], 1; \\

429 (pComputeI) add IFinal = 1, I[1]; \\

430 (pComputeJ) zxt1 J = J; \\

431 }{ .mmi; \\

432 (pOutput) LKEY T[1] = [T[1]]; \\

433 (pComputeT) add T[0] = SI[2], SJ[1]; \\

434 (pComputeI) KEYADDR(IPr[0], I[1]); \\

435 } ;; \\

436 { .mmi; \\

437 (pComputeT) SKEY [IPr[2]] = SJ[1]; \\

438 (pComputeT) SKEY [JP[1]] = SI[2]; \\

439 (pComputeT) zxt1 T[0] = T[0]; \\

440 }{ .mmi; \\

441 (pComputeI) LKEY SI[0] = [IPr[0]]; \\

442 (pComputeJ) KEYADDR(JP[0], J); \\

443 (pComputeI) cmp.eq.unc pBypass, p0 = I[1], J; \\

444 } ;; \\

445 { .mmi; \\

446 (pComputeJ) LKEY SJ[0] = [JP[0]]; \\

447 (pOutput) xor Data[3] = Data[3], T[1]; \\

448 nop 0x0; \\

449 }{ .mmi; \\

450 (pComputeT) KEYADDR(T[0], T[0]); \\

451 (pBypass) mov SI[0] = SI[1]; \\

452 (pComputeI) zxt1 I[0] = IFinal; \\

453 } ;; \\

454 { .mmb; \\

455 (pOutput) st1 [OutPtr] = Data[3], 1; \\

456 (pComputeI) add J = J, SI[0]; \\

457 br.ctop.sptk.few label; \\

458 } ;;

459

460 .text

461

462 .align 32

463

464 .type RC4, \@function

465 .global RC4

466

467 .proc RC4

468 .prologue

469

470 RC4:

471 {

472 .mmi

473 alloc r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE

474

475 .rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\

476 OutWord[2]

477 .rotp pPhase[4]

478

479 ADDP InPrefetch = 0, InputBuffer

480 ADDP KTable = 0, StateTable

481 }

482 {

483 .mmi

484 ADDP InPtr = 0, InputBuffer

485 ADDP OutPtr = 0, OutputBuffer

486 mov RetVal = r0

487 }

488 ;;

489 {

490 .mmi

491 lfetch.nt1 [InPrefetch], 0x80

492 ADDP OutPrefetch = 0, OutputBuffer

493 }

494 { // Return 0 if the input length is nonsensical

495 .mib

496 ADDP StateTable = 0, StateTable

497 cmp.ge.unc L_NOK, L_OK = r0, DataLen

498 (L_NOK) br.ret.sptk.few rp

499 }

500 ;;

501 {

502 .mib

503 cmp.eq.or L_NOK, L_OK = r0, InPtr

504 cmp.eq.or L_NOK, L_OK = r0, OutPtr

505 nop 0x0

506 }

507 {

508 .mib

509 cmp.eq.or L_NOK, L_OK = r0, StateTable

510 nop 0x0

511 (L_NOK) br.ret.sptk.few rp

512 }

513 ;;

514 LKEY I[1] = [KTable], SZ

515 /* Prefetch the state-table. It contains 256 elements of size SZ */

516

517 #if SZ == 1

518 ADDP tmp0 = 1*128, StateTable

519 #elif SZ == 2

520 ADDP tmp0 = 3*128, StateTable

521 ADDP tmp1 = 2*128, StateTable

522 #elif SZ == 4

523 ADDP tmp0 = 7*128, StateTable

524 ADDP tmp1 = 6*128, StateTable

525 #elif SZ == 8

526 ADDP tmp0 = 15*128, StateTable

527 ADDP tmp1 = 14*128, StateTable

528 #endif

529 ;;

530 #if SZ >= 8

531 lfetch.fault.nt1 [tmp0], -256 // 15

532 lfetch.fault.nt1 [tmp1], -256;;

533 lfetch.fault.nt1 [tmp0], -256 // 13

534 lfetch.fault.nt1 [tmp1], -256;;

535 lfetch.fault.nt1 [tmp0], -256 // 11

536 lfetch.fault.nt1 [tmp1], -256;;

537 lfetch.fault.nt1 [tmp0], -256 // 9

538 lfetch.fault.nt1 [tmp1], -256;;

539 #endif

540 #if SZ >= 4

541 lfetch.fault.nt1 [tmp0], -256 // 7

542 lfetch.fault.nt1 [tmp1], -256;;

543 lfetch.fault.nt1 [tmp0], -256 // 5

544 lfetch.fault.nt1 [tmp1], -256;;

545 #endif

546 #if SZ >= 2

547 lfetch.fault.nt1 [tmp0], -256 // 3

548 lfetch.fault.nt1 [tmp1], -256;;

549 #endif

550 {

551 .mii

552 lfetch.fault.nt1 [tmp0] // 1

553 add I[1]=1,I[1];;

554 zxt1 I[1]=I[1]

555 }

556 {

557 .mmi

558 lfetch.nt1 [InPrefetch], 0x80

559 lfetch.excl.nt1 [OutPrefetch], 0x80

560 .save pr, PRSave

561 mov PRSave = pr

562 } ;;

563 {

564 .mmi

565 lfetch.excl.nt1 [OutPrefetch], 0x80

566 LKEY J = [KTable], SZ

567 ADDP EndPtr = DataLen, InPtr

568 } ;;

569 {

570 .mmi

571 ADDP EndPtr = -1, EndPtr // Make it point to

572 // last data byte.

573 mov One = 1

574 .save ar.lc, LCSave

575 mov LCSave = ar.lc

576 .body

577 } ;;

578 {

579 .mmb

580 sub Remainder = 0, OutPtr

581 cmp.gtu pSmall, p0 = $threshold, DataLen

582 (pSmall) br.cond.dpnt .rc4Remainder // Data too small for

583 // big loop.

584 } ;;

585 {

586 .mmi

587 and Remainder = 0x7, Remainder

588 ;;

589 cmp.eq pAligned, pUnaligned = Remainder, r0

590 nop 0x0

591 } ;;

592 {

593 .mmb

594 .pred.rel "mutex",pUnaligned,pAligned

595 (pUnaligned) add Remainder = -1, Remainder

596 (pAligned) sub Remainder = EndPtr, InPtr

597 (pAligned) br.cond.dptk.many .rc4Aligned

598 } ;;

599 {

600 .mmi

601 nop 0x0

602 nop 0x0

603 mov.i ar.lc = Remainder

604 }

605

606 /* Do the initial few bytes via the compact, modulo-scheduled loop

607 until the output pointer is 8-byte-aligned. */

608

609 MODSCHED_RC4_PROLOGUE

610 MODSCHED_RC4_LOOP(.RC4AlignLoop)

611

612 {

613 .mib

614 sub Remainder = EndPtr, InPtr

615 zxt1 IFinal = IFinal

616 clrrrb // Clear CFM.rrb.pr so

617 ;; // next "mov pr.rot = N"

618 // does the right thing.

619 }

620 {

621 .mmi

622 mov I[1] = IFinal

623 nop 0x0

624 nop 0x0

625 } ;;

626

627

628 .rc4Aligned:

629

630 /*

631 Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)

632 */

633

634 {

635 .mlx

636 add LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder

637 movl Remainder = 0xaaaaaaaaaaaaaaab

638 } ;;

639 {

640 .mmi

641 setf.sig f6 = LoopCount // M2, M3 6 cyc

642 setf.sig f7 = Remainder // M2, M3 6 cyc

643 nop 0x0

644 } ;;

645 {

646 .mfb

647 nop 0x0

648 xmpy.hu f6 = f6, f7

649 nop 0x0

650 } ;;

651 {

652 .mmi

653 getf.sig LoopCount = f6;; // M2 5 cyc

654 nop 0x0

655 shr.u LoopCount = LoopCount, 4

656 } ;;

657 {

658 .mmi

659 nop 0x0

660 nop 0x0

661 mov.i ar.lc = LoopCount

662 } ;;

663

664 /* Now comes the unrolled loop: */

665

666 .rc4Prologue:

667 ___

668

# Generate the unrolled, software-pipelined loop in three parts.  $iteration
# counts emit_body() calls across all three parts; $predicates is the stage
# mask passed to emit_body(), which looks only at its low four bits.
669 $iteration = 0;

670

671 # Generate the prologue:

# Fill the pipeline: one more stage becomes active on each call, so the mask
# passed in is 0x1, 0x3, 0x7, 0xf.
672 $predicates = 1;

673 for ($i = 0; $i < $phases; ++$i) {

674 &emit_body (\$code, \$bypass, $iteration++, $predicates);

675 $predicates = ($predicates << 1) | 1;

676 }

677

678 $code.=<<___;

679 .rc4Loop:

680 ___

681

682 # Generate the body:

# Steady state: every call sees all four stage bits set.
683 for ($i = 0; $i < $unroll_count*$phases; ++$i) {

684 &emit_body (\$code, \$bypass, $iteration++, $predicates);

685 }

686

687 $code.=<<___;

688 .rc4Epilogue:

689 ___

690

691 # Generate the epilogue:

# Drain the pipeline: shifting left clears one low stage bit per call, so the
# low four bits go 0xe, 0xc, 0x8, 0x0.  The final call, with no stage bit set,
# makes emit_body() emit only the trailing st4.
692 for ($i = 0; $i < $phases; ++$i) {

693 $predicates <<= 1;

694 &emit_body (\$code, \$bypass, $iteration++, $predicates);

695 }

696

697 $code.=<<___;

698 {

699 .mmi

700 lfetch.nt1 [EndPtr] // fetch line with last byte

701 mov IFinal = I[1]

702 nop 0x0

703 }

704

705 .rc4Remainder:

706 {

707 .mmi

708 sub Remainder = EndPtr, InPtr // Calculate

709 // # of bytes

710 // left - 1

711 nop 0x0

712 nop 0x0

713 } ;;

714 {

715 .mib

716 cmp.eq pDone, p0 = -1, Remainder // done already?

717 mov.i ar.lc = Remainder

718 (pDone) br.cond.dptk.few .rc4Complete

719 }

720

721 /* Do the remaining bytes via the compact, modulo-scheduled loop */

722

723 MODSCHED_RC4_PROLOGUE

724 MODSCHED_RC4_LOOP(.RC4RestLoop)

725

726 .rc4Complete:

727 {

728 .mmi

729 add KTable = -SZ, KTable

730 add IFinal = -1, IFinal

731 mov ar.lc = LCSave

732 } ;;

733 {

734 .mii

735 SKEY [KTable] = J,-SZ

736 zxt1 IFinal = IFinal

737 mov pr = PRSave, 0x1FFFF

738 } ;;

739 {

740 .mib

741 SKEY [KTable] = IFinal

742 add RetVal = 1, r0

743 br.ret.sptk.few rp

744 } ;;

745 ___

746

747 # Last but not least, emit the code for the bypass-code of the unrolled loop:

748

# $bypass holds the .rc4BypassN stubs collected by emit_body(); they are
# appended after the function's return sequence and before ".endp".
749 $code.=$bypass;

750

751 $code.=<<___;

752 .endp RC4

753 ___

754

# Write the complete generated assembly source to standard output.
755 print $code;

OLD	NEW

« no previous file with comments | « openssl/crypto/rc4/asm/rc4-586.pl ('k') | openssl/crypto/rc4/asm/rc4-md5-x86_64.S » ('j') | no next file with comments »