openssl/crypto/rc4/asm/rc4-ia64.pl - Issue 9254031: Upgrade chrome's OpenSSL to same version Android ships with.

Side by Side Diff: openssl/crypto/rc4/asm/rc4-ia64.pl

Issue 9254031: Upgrade chrome's OpenSSL to same version Android ships with. (Closed) Base URL: http://src.chromium.org/svn/trunk/deps/third_party/openssl/

Patch Set: '' Created 8 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 #!/usr/bin/env perl

	2 #

	3 # ====================================================================

	4 # Written by David Mosberger <David.Mosberger@acm.org> based on the

	5 # Itanium optimized Crypto code which was released by HP Labs at

	6 # http://www.hpl.hp.com/research/linux/crypto/.

	7 #

	8 # Copyright (c) 2005 Hewlett-Packard Development Company, L.P.

	9 #

	10 # Permission is hereby granted, free of charge, to any person obtaining

	11 # a copy of this software and associated documentation files (the

	12 # "Software"), to deal in the Software without restriction, including

	13 # without limitation the rights to use, copy, modify, merge, publish,

	14 # distribute, sublicense, and/or sell copies of the Software, and to

	15 # permit persons to whom the Software is furnished to do so, subject to

	16 # the following conditions:

	17 #

	18 # The above copyright notice and this permission notice shall be

	19 # included in all copies or substantial portions of the Software.

	20

	21 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,

	22 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF

	23 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND

	24 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE

	25 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION

	26 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION

	27 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */

	28

	29

	30

	31 # This is a little helper program which generates a software-pipelined

	32 # loop for RC4 encryption. The basic algorithm looks like this:

	33 #

	34 # for (counter = 0; counter < len; ++counter)

	35 # {

	36 # in = inp[counter];

	37 # SI = S[I];

	38 # J = (SI + J) & 0xff;

	39 # SJ = S[J];

	40 # T = (SI + SJ) & 0xff;

	41 # S[I] = SJ, S[J] = SI;

	42 # ST = S[T];

	43 # outp[counter] = in ^ ST;

	44 # I = (I + 1) & 0xff;

	45 # }

	46 #

	47 # Pipelining this loop isn't easy, because the stores to the S[] array

	48 # need to be observed in the right order. The loop generated by the

	49 # code below has the following pipeline diagram:

	50 #

	51 #   cycle

	52 #      | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |

	53 # iter

	54 #   1:  xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx

	55 #   2:              xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx

	56 #   3:                          xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx

	57 #

	58 # where:

	59 # LDI = load of S[I]

	60 # LDJ = load of S[J]

	61 # SWP = swap of S[I] and S[J]

	62 # LDT = load of S[T]

	63 #

	64 # Note that in the above diagram, the major trouble-spot is that LDI

	65 # of the 2nd iteration is performed BEFORE the SWP of the first

	66 # iteration. Fortunately, this is easy to detect (I of the 1st

	67 # iteration will be equal to J of the 2nd iteration) and when this

	68 # happens, we simply forward the proper value from the 1st iteration

	69 # to the 2nd one. The proper value in this case is simply the value

	70 # of S[I] from the first iteration (thanks to the fact that SWP

	71 # simply swaps the contents of S[I] and S[J]).

	72 #

	73 # Another potential trouble-spot is in cycle 7, where SWP of the 1st

	74 # iteration issues at the same time as the LDI of the 3rd iteration.

	75 # However, thanks to IA-64 execution semantics, this can be taken

	76 # care of simply by placing LDI later in the instruction-group than

	77 # SWP. IA-64 CPUs will automatically forward the value if they

	78 # detect that the SWP and LDI are accessing the same memory-location.

	79

	80 # The core-loop that can be pipelined then looks like this (annotated

	81 # with McKinley/Madison issue port & latency numbers, assuming L1

	82 # cache hits for the most part):

	83

	84 # operation: instruction: issue-ports: latency

	85 # ------------------ ----------------------------- ------------- -------

	86

	87 # Data = *inp++ ld1 data = [inp], 1 M0-M1 1 cyc c0

	88 # shladd Iptr = I, KeyTable, 3 M0-M3, I0, I1 1 cyc

	89 # I = (I + 1) & 0xff padd1 nextI = I, one M0-M3, I0, I1 3 cyc

	90 # ;;

	91 # SI = S[I] ld8 SI = [Iptr] M0-M1 1 cyc c1 * after SWAP!

	92 # ;;

	93 # cmp.eq.unc pBypass = I, J * after J is valid!

	94 # J = SI + J add J = J, SI M0-M3, I0, I1 1 cyc c2

	95 # (pBypass) br.cond.spnt Bypass

	96 # ;;

	97 # ------------------------------------------------------------------------------ ---------

	98 # J = J & 0xff zxt1 J = J I0, I1, 1 cyc c3

	99 # ;;

	100 # shladd Jptr = J, KeyTable, 3 M0-M3, I0, I1 1 cyc c4

	101 # ;;

	102 # SJ = S[J] ld8 SJ = [Jptr] M0-M1 1 cyc c5

	103 # ;;

	104 # ------------------------------------------------------------------------------ ---------

	105 # T = (SI + SJ) add T = SI, SJ M0-M3, I0, I1 1 cyc c6

	106 # ;;

	107 # T = T & 0xff zxt1 T = T I0, I1 1 cyc

	108 # S[I] = SJ st8 [Iptr] = SJ M2-M3 c7

	109 # S[J] = SI st8 [Jptr] = SI M2-M3

	110 # ;;

	111 # shladd Tptr = T, KeyTable, 3 M0-M3, I0, I1 1 cyc c8

	112 # ;;

	113 # ------------------------------------------------------------------------------ ---------

	114 # T = S[T] ld8 T = [Tptr] M0-M1 1 cyc c9

	115 # ;;

	116 # data ^= T xor data = data, T M0-M3, I0, I1 1 cyc c1 0

	117 # ;;

	118 # *out++ = Data ^ T dep word = word, data, 8, POS I0, I1 1 cyc c1 1

	119 # ;;

	120 # ------------------------------------------------------------------------------ ---------

	121

	122 # There are several points worth making here:

	123

	124 # - Note that due to the bypass/forwarding-path, the first two

	125 # phases of the loop are strangely mingled together. In

	126 # particular, note that the first stage of the pipeline is

	127 # using the value of "J", as calculated by the second stage.

	128 # - Each bundle-pair will have exactly 6 instructions.

	129 # - Pipelined, the loop can execute in 3 cycles/iteration and

	130 # 4 stages. However, McKinley/Madison can issue "st1" to

	131 # the same bank at a rate of at most one per 4 cycles. Thus,

	132 # instead of storing each byte, we accumulate them in a word

	133 # and then write them back at once with a single "st8" (this

	134 # implies that the setup code needs to ensure that the output

	135 # buffer is properly aligned, if need be, by encoding the

	136 # first few bytes separately).

	137 # - There is no space for a "br.ctop" instruction. For this

	138 # reason we can't use modulo-loop support in IA-64 and have

	139 # to do a traditional, purely software-pipelined loop.

	140 # - We can't replace any of the remaining "add/zxt1" pairs with

	141 # "padd1" because the latency for that instruction is too high

	142 # and would push the loop to the point where more bypasses

	143 # would be needed, which we don't have space for.

	144 # - The above loop runs at around 3.26 cycles/byte, or roughly

	145 # 440 MByte/sec on a 1.5GHz Madison. This is well below the

	146 # system bus bandwidth and hence with judicious use of

	147 # "lfetch" this loop can run at (almost) peak speed even when

	148 # the input and output data reside in memory. The

	149 # max. latency that can be tolerated is (PREFETCH_DISTANCE *

	150 # L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at

	151 # least) 1-ahead prefetching of 128 byte cache-lines. Note

	152 # that we do NOT prefetch into L1, since that would only

	153 # interfere with the S[] table values stored there. This is

	154 # acceptable because there is a 10 cycle latency between

	155 # load and first use of the input data.

	156 # - We use a branch to out-of-line bypass-code because of cycle-pressure:

	157 # we calculate the next J, check for the need to activate the

	158 # bypass path, and activate the bypass path ALL IN THE SAME

	159 # CYCLE. If we didn't have these constraints, we could do

	160 # the bypass with a simple conditional move instruction.

	161 # Fortunately, the bypass paths get activated relatively

	162 # infrequently, so the extra branches don't cost all that much

	163 # (about 0.04 cycles/byte, measured on a 16396 byte file with

	164 # random input data).

	165 #

	166

	167 $phases = 4; # number of stages/phases in the pipelined-loop

	168 $unroll_count = 6; # number of times we unrolled it

	169 $pComI = (1 << 0); # stage bit: load Data and S[I], advance I, add SI into J (tested as $p & $pComI in emit_body)

	170 $pComJ = (1 << 1); # stage bit: zero-extend J, form its address and load S[J]

	171 $pComT = (1 << 2); # stage bit: compute T and store the swapped S[I]/S[J]

	172 $pOut = (1 << 3); # stage bit: load S[T], xor it into Data, deposit the byte into OutWord

	173 # Ring sizes of the rotating register names; emit_body reduces iteration numbers modulo these (they mirror the .rotr declaration in the RC4 prologue).

	174 $NData = 4;

	175 $NIP = 3;

	176 $NJP = 2;

	177 $NI = 2;

	178 $NSI = 3;

	179 $NSJ = 2;

	180 $NT = 2;

	181 $NOutWord = 2;

	182

	183 #

	184 # $threshold is the minimum length before we attempt to use the

	185 # big software-pipelined loop. It MUST be greater-or-equal

	186 # to:

	187 # PHASES * (UNROLL_COUNT + 1) + 7

	188 #

	189 # The "+ 7" comes from the fact we may have to encode up to

	190 # 7 bytes separately before the output pointer is aligned.

	191 #

	192 $threshold = (3 * ($phases * ($unroll_count + 1)) + 7); # = 91 with the values above; shorter inputs branch straight to .rc4Remainder

	193 # I(\$buf, $fmt, @args): append one sprintf-formatted instruction line, prefixed by two tabs, to the buffer $buf.

	194 sub I {

	195 local *code = shift; # alias $code to the scalar whose reference was passed in

	196 local $format = shift; # sprintf template; remaining @_ are its arguments

	197 $code .= sprintf ("\t\t".$format."\n", @_);

	198 }

	199 # P(\$buf, $fmt, @args): append one sprintf-formatted line to $buf without the leading tabs I() adds (used for labels and predicated branches).

	200 sub P {

	201 local *code = shift; # alias $code to the scalar whose reference was passed in

	202 local $format = shift; # sprintf template; remaining @_ are its arguments

	203 $code .= sprintf ($format."\n", @_);

	204 }

	205 # STOP(\$buf): append a ";;" line (the separator emit_body places between its Cycle 0/1/2 groups) to $buf.

	206 sub STOP {

	207 local *code = shift; # alias $code to the scalar whose reference was passed in

	208 $code .=<<___;

	209 ;;

	210 ___

	211 }

	212 # emit_body(\$c, \$bypass, $iteration, $p): append one three-cycle step of the unrolled loop to $c for the stage bits set in $p; out-of-line bypass code is appended to $bypass.

	213 sub emit_body {

	214 local *c = shift;

	215 local *bypass = shift;

	216 local ($iteration, $p) = @_;

	217 # $i0..$i3: iteration numbers whose stages are emitted in this step ($i0 is the newest).

	218 local $i0 = $iteration;

	219 local $i1 = $iteration - 1;

	220 local $i2 = $iteration - 2;

	221 local $i3 = $iteration - 3;

	222 local $iw0 = ($iteration - 3) / 8; # index of the OutWord written by this step's dep

	223 local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1; # index of the OutWord read by dep and stored by st8/st4

	224 local $byte_num = ($iteration - 3) % 8; # byte slot inside the output word, passed to BYTE_POS

	225 local $label = $iteration + 1; # numeric suffix of the .rc4Bypass/.rc4Resume labels

	226 local $pAny = ($p & 0xf) == 0xf; # true only when all four stage bits are set; gates the explicit bundle braces

	227 local $pByp = (($p & $pComI) && ($iteration > 0)); # emit the I/J bypass check for every I-stage step except iteration 0

	228 # Separator comment line in the generated assembly.

	229 $c.=<<___;

	230 //////////////////////////////////////////////////

	231 ___

	232 # No stage bit set in the low nibble: only emit the final st4 of OutWord (shifted right by 32 first on big-endian hosts) and return.

	233 if (($p & 0xf) == 0) {

	234 $c.="#ifdef HOST_IS_BIG_ENDIAN\n";

	235 &I(\$c,"shr.u OutWord[%u] = OutWord[%u], 32;;",

	236 $iw1 % $NOutWord, $iw1 % $NOutWord);

	237 $c.="#endif\n";

	238 &I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);

	239 return;

	240 }

	241

	242 # Cycle 0

	243 &I(\$c, "{ .mmi") if ($pAny);

	244 &I(\$c, "ld1 Data[%u] = [InPtr], 1", $i0 % $NData) if ($p & $pComI);

	245 &I(\$c, "padd1 I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)if ($p & $pComI);

	246 &I(\$c, "zxt1 J = J") if ($p & $pComJ);

	247 &I(\$c, "}") if ($pAny);

	248 &I(\$c, "{ .mmi") if ($pAny);

	249 &I(\$c, "LKEY T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT) if ($p & $pOut);

	250 &I(\$c, "add T[%u] = SI[%u], SJ[%u]",

	251 $i0 % $NT, $i2 % $NSI, $i1 % $NSJ) if ($p & $pComT);

	252 &I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI);

	253 &I(\$c, "}") if ($pAny);

	254 &STOP(\$c);

	255

	256 # Cycle 1

	257 &I(\$c, "{ .mmi") if ($pAny);

	258 &I(\$c, "SKEY [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1%$NSJ)if ($p & $pComT);

	259 &I(\$c, "SKEY [JP[%u]] = SI[%u]", $i1 % $NJP, $i2%$NSI) if ($p & $pComT);

	260 &I(\$c, "zxt1 T[%u] = T[%u]", $i0 % $NT, $i0 % $NT) if ($p & $pComT);

	261 &I(\$c, "}") if ($pAny);

	262 &I(\$c, "{ .mmi") if ($pAny);

	263 &I(\$c, "LKEY SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0%$NIP)if ($p & $pComI);

	264 &I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP) if ($p & $pComJ);

	265 &I(\$c, "xor Data[%u] = Data[%u], T[%u]",

	266 $i3 % $NData, $i3 % $NData, $i1 % $NT) if ($p & $pOut);

	267 &I(\$c, "}") if ($pAny);

	268 &STOP(\$c);

	269

	270 # Cycle 2

	271 &I(\$c, "{ .mmi") if ($pAny);

	272 &I(\$c, "LKEY SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0%$NJP) if ($p & $pComJ);

	273 &I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI) if ($pByp);

	274 &I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",

	275 $iw0%$NOutWord, $i3%$NData, $iw1%$NOutWord, $byte_num) if ($p & $pOut);

	276 &I(\$c, "}") if ($pAny);

	277 &I(\$c, "{ .mmb") if ($pAny);

	278 &I(\$c, "add J = J, SI[%u]", $i0 % $NSI) if ($p & $pComI);

	279 &I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT) if ($p & $pComT);

	280 &P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u",$label)if ($pByp);

	281 &I(\$c, "}") if ($pAny);

	282 &STOP(\$c);

	283 # The bypass code jumps back to the label below; when this step starts a new output word (and we are past the prologue), the previous word is stored.

	284 &P(\$c, ".rc4Resume%u:", $label) if ($pByp);

	285 if ($byte_num == 0 && $iteration >= $phases) {

	286 &I(\$c, "st8 [OutPtr] = OutWord[%u], 8",

	287 $iw1 % $NOutWord) if ($p & $pOut);

	288 if ($iteration == (1 + $unroll_count) * $phases - 1) { # last step of the unrolled body: prefetch and branch back to .rc4Loop

	289 if ($unroll_count == 6) {

	290 &I(\$c, "mov OutWord[%u] = OutWord[%u]",

	291 $iw1 % $NOutWord, $iw0 % $NOutWord);

	292 }

	293 &I(\$c, "lfetch.nt1 [InPrefetch], %u",

	294 $unroll_count * $phases);

	295 &I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",

	296 $unroll_count * $phases);

	297 &I(\$c, "br.cloop.sptk.few .rc4Loop");

	298 }

	299 }

	300 # Out-of-line bypass: back out the SI just added to J, add the previous iteration's SI instead, copy it into the current SI, then resume.

	301 if ($pByp) {

	302 &P(\$bypass, ".rc4Bypass%u:", $label);

	303 &I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);

	304 &I(\$bypass, "nop 0");

	305 &I(\$bypass, "nop 0");

	306 &I(\$bypass, ";;");

	307 &I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);

	308 &I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);

	309 &I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);

	310 &I(\$bypass, ";;");

	311 }

	312 }

	313

	314 $code=<<___;

	315 .ident \"rc4-ia64.s, version 3.0\"

	316 .ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"

	317

	318 #define LCSave r8

	319 #define PRSave r9

	320

	321 /* Inputs become invalid once rotation begins! */

	322

	323 #define StateTable in0

	324 #define DataLen in1

	325 #define InputBuffer in2

	326 #define OutputBuffer in3

	327

	328 #define KTable r14

	329 #define J r15

	330 #define InPtr r16

	331 #define OutPtr r17

	332 #define InPrefetch r18

	333 #define OutPrefetch r19

	334 #define One r20

	335 #define LoopCount r21

	336 #define Remainder r22

	337 #define IFinal r23

	338 #define EndPtr r24

	339

	340 #define tmp0 r25

	341 #define tmp1 r26

	342

	343 #define pBypass p6

	344 #define pDone p7

	345 #define pSmall p8

	346 #define pAligned p9

	347 #define pUnaligned p10

	348

	349 #define pComputeI pPhase[0]

	350 #define pComputeJ pPhase[1]

	351 #define pComputeT pPhase[2]

	352 #define pOutput pPhase[3]

	353

	354 #define RetVal r8

	355 #define L_OK p7

	356 #define L_NOK p8

	357

	358 #define _NINPUTS 4

	359 #define _NOUTPUT 0

	360

	361 #define _NROTATE 24

	362 #define _NLOCALS (_NROTATE - _NINPUTS - _NOUTPUT)

	363

	364 #ifndef SZ

	365 # define SZ 4 // this must be set to sizeof(RC4_INT)

	366 #endif

	367

	368 #if SZ == 1

	369 # define LKEY ld1

	370 # define SKEY st1

	371 # define KEYADDR(dst, i) add dst = i, KTable

	372 #elif SZ == 2

	373 # define LKEY ld2

	374 # define SKEY st2

	375 # define KEYADDR(dst, i) shladd dst = i, 1, KTable

	376 #elif SZ == 4

	377 # define LKEY ld4

	378 # define SKEY st4

	379 # define KEYADDR(dst, i) shladd dst = i, 2, KTable

	380 #else

	381 # define LKEY ld8

	382 # define SKEY st8

	383 # define KEYADDR(dst, i) shladd dst = i, 3, KTable

	384 #endif

	385

	386 #if defined(_HPUX_SOURCE) && !defined(_LP64)

	387 # define ADDP addp4

	388 #else

	389 # define ADDP add

	390 #endif

	391

	392 /* Define a macro for the bit number of the n-th byte: */

	393

	394 #if defined(_HPUX_SOURCE) || defined(B_ENDIAN)

	395 # define HOST_IS_BIG_ENDIAN

	396 # define BYTE_POS(n) (56 - (8 * (n)))

	397 #else

	398 # define BYTE_POS(n) (8 * (n))

	399 #endif

	400

	401 /*

	402 We must perform the first phase of the pipeline explicitly since

	403 we will always load from the stable the first time. The br.cexit

	404 will never be taken since regardless of the number of bytes because

	405 the epilogue count is 4.

	406 */

	407 /* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX

	408 assembler failed on original macro with syntax error. <appro> */

	409 #define MODSCHED_RC4_PROLOGUE \\

	410 { \\

	411 ld1 Data[0] = [InPtr], 1; \\

	412 add IFinal = 1, I[1]; \\

	413 KEYADDR(IPr[0], I[1]); \\

	414 } ;; \\

	415 { \\

	416 LKEY SI[0] = [IPr[0]]; \\

	417 mov pr.rot = 0x10000; \\

	418 mov ar.ec = 4; \\

	419 } ;; \\

	420 { \\

	421 add J = J, SI[0]; \\

	422 zxt1 I[0] = IFinal; \\

	423 br.cexit.spnt.few .+16; /* never taken */ \\

	424 } ;;

	425 #define MODSCHED_RC4_LOOP(label) \\

	426 label: \\

	427 { .mmi; \\

	428 (pComputeI) ld1 Data[0] = [InPtr], 1; \\

	429 (pComputeI) add IFinal = 1, I[1]; \\

	430 (pComputeJ) zxt1 J = J; \\

	431 }{ .mmi; \\

	432 (pOutput) LKEY T[1] = [T[1]]; \\

	433 (pComputeT) add T[0] = SI[2], SJ[1]; \\

	434 (pComputeI) KEYADDR(IPr[0], I[1]); \\

	435 } ;; \\

	436 { .mmi; \\

	437 (pComputeT) SKEY [IPr[2]] = SJ[1]; \\

	438 (pComputeT) SKEY [JP[1]] = SI[2]; \\

	439 (pComputeT) zxt1 T[0] = T[0]; \\

	440 }{ .mmi; \\

	441 (pComputeI) LKEY SI[0] = [IPr[0]]; \\

	442 (pComputeJ) KEYADDR(JP[0], J); \\

	443 (pComputeI) cmp.eq.unc pBypass, p0 = I[1], J; \\

	444 } ;; \\

	445 { .mmi; \\

	446 (pComputeJ) LKEY SJ[0] = [JP[0]]; \\

	447 (pOutput) xor Data[3] = Data[3], T[1]; \\

	448 nop 0x0; \\

	449 }{ .mmi; \\

	450 (pComputeT) KEYADDR(T[0], T[0]); \\

	451 (pBypass) mov SI[0] = SI[1]; \\

	452 (pComputeI) zxt1 I[0] = IFinal; \\

	453 } ;; \\

	454 { .mmb; \\

	455 (pOutput) st1 [OutPtr] = Data[3], 1; \\

	456 (pComputeI) add J = J, SI[0]; \\

	457 br.ctop.sptk.few label; \\

	458 } ;;

	459

	460 .text

	461

	462 .align 32

	463

	464 .type RC4, \@function

	465 .global RC4

	466

	467 .proc RC4

	468 .prologue

	469

	470 RC4:

	471 {

	472 .mmi

	473 alloc r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE

	474

	475 .rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\

	476 OutWord[2]

	477 .rotp pPhase[4]

	478

	479 ADDP InPrefetch = 0, InputBuffer

	480 ADDP KTable = 0, StateTable

	481 }

	482 {

	483 .mmi

	484 ADDP InPtr = 0, InputBuffer

	485 ADDP OutPtr = 0, OutputBuffer

	486 mov RetVal = r0

	487 }

	488 ;;

	489 {

	490 .mmi

	491 lfetch.nt1 [InPrefetch], 0x80

	492 ADDP OutPrefetch = 0, OutputBuffer

	493 }

	494 { // Return 0 if the input length is nonsensical

	495 .mib

	496 ADDP StateTable = 0, StateTable

	497 cmp.ge.unc L_NOK, L_OK = r0, DataLen

	498 (L_NOK) br.ret.sptk.few rp

	499 }

	500 ;;

	501 {

	502 .mib

	503 cmp.eq.or L_NOK, L_OK = r0, InPtr

	504 cmp.eq.or L_NOK, L_OK = r0, OutPtr

	505 nop 0x0

	506 }

	507 {

	508 .mib

	509 cmp.eq.or L_NOK, L_OK = r0, StateTable

	510 nop 0x0

	511 (L_NOK) br.ret.sptk.few rp

	512 }

	513 ;;

	514 LKEY I[1] = [KTable], SZ

	515 /* Prefetch the state-table. It contains 256 elements of size SZ */

	516

	517 #if SZ == 1

	518 ADDP tmp0 = 1*128, StateTable

	519 #elif SZ == 2

	520 ADDP tmp0 = 3*128, StateTable

	521 ADDP tmp1 = 2*128, StateTable

	522 #elif SZ == 4

	523 ADDP tmp0 = 7*128, StateTable

	524 ADDP tmp1 = 6*128, StateTable

	525 #elif SZ == 8

	526 ADDP tmp0 = 15*128, StateTable

	527 ADDP tmp1 = 14*128, StateTable

	528 #endif

	529 ;;

	530 #if SZ >= 8

	531 lfetch.fault.nt1 [tmp0], -256 // 15

	532 lfetch.fault.nt1 [tmp1], -256;;

	533 lfetch.fault.nt1 [tmp0], -256 // 13

	534 lfetch.fault.nt1 [tmp1], -256;;

	535 lfetch.fault.nt1 [tmp0], -256 // 11

	536 lfetch.fault.nt1 [tmp1], -256;;

	537 lfetch.fault.nt1 [tmp0], -256 // 9

	538 lfetch.fault.nt1 [tmp1], -256;;

	539 #endif

	540 #if SZ >= 4

	541 lfetch.fault.nt1 [tmp0], -256 // 7

	542 lfetch.fault.nt1 [tmp1], -256;;

	543 lfetch.fault.nt1 [tmp0], -256 // 5

	544 lfetch.fault.nt1 [tmp1], -256;;

	545 #endif

	546 #if SZ >= 2

	547 lfetch.fault.nt1 [tmp0], -256 // 3

	548 lfetch.fault.nt1 [tmp1], -256;;

	549 #endif

	550 {

	551 .mii

	552 lfetch.fault.nt1 [tmp0] // 1

	553 add I[1]=1,I[1];;

	554 zxt1 I[1]=I[1]

	555 }

	556 {

	557 .mmi

	558 lfetch.nt1 [InPrefetch], 0x80

	559 lfetch.excl.nt1 [OutPrefetch], 0x80

	560 .save pr, PRSave

	561 mov PRSave = pr

	562 } ;;

	563 {

	564 .mmi

	565 lfetch.excl.nt1 [OutPrefetch], 0x80

	566 LKEY J = [KTable], SZ

	567 ADDP EndPtr = DataLen, InPtr

	568 } ;;

	569 {

	570 .mmi

	571 ADDP EndPtr = -1, EndPtr // Make it point to

	572 // last data byte.

	573 mov One = 1

	574 .save ar.lc, LCSave

	575 mov LCSave = ar.lc

	576 .body

	577 } ;;

	578 {

	579 .mmb

	580 sub Remainder = 0, OutPtr

	581 cmp.gtu pSmall, p0 = $threshold, DataLen

	582 (pSmall) br.cond.dpnt .rc4Remainder // Data too small for

	583 // big loop.

	584 } ;;

	585 {

	586 .mmi

	587 and Remainder = 0x7, Remainder

	588 ;;

	589 cmp.eq pAligned, pUnaligned = Remainder, r0

	590 nop 0x0

	591 } ;;

	592 {

	593 .mmb

	594 .pred.rel "mutex",pUnaligned,pAligned

	595 (pUnaligned) add Remainder = -1, Remainder

	596 (pAligned) sub Remainder = EndPtr, InPtr

	597 (pAligned) br.cond.dptk.many .rc4Aligned

	598 } ;;

	599 {

	600 .mmi

	601 nop 0x0

	602 nop 0x0

	603 mov.i ar.lc = Remainder

	604 }

	605

	606 /* Do the initial few bytes via the compact, modulo-scheduled loop

	607 until the output pointer is 8-byte-aligned. */

	608

	609 MODSCHED_RC4_PROLOGUE

	610 MODSCHED_RC4_LOOP(.RC4AlignLoop)

	611

	612 {

	613 .mib

	614 sub Remainder = EndPtr, InPtr

	615 zxt1 IFinal = IFinal

	616 clrrrb // Clear CFM.rrb.pr so

	617 ;; // next "mov pr.rot = N"

	618 // does the right thing.

	619 }

	620 {

	621 .mmi

	622 mov I[1] = IFinal

	623 nop 0x0

	624 nop 0x0

	625 } ;;

	626

	627

	628 .rc4Aligned:

	629

	630 /*

	631 Unrolled loop count = (Remainder - ($unroll_count+1)$phases)/($unroll_count $phases)

	632 */

	633

	634 {

	635 .mlx

	636 add LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder

	637 movl Remainder = 0xaaaaaaaaaaaaaaab

	638 } ;;

	639 {

	640 .mmi

	641 setf.sig f6 = LoopCount // M2, M3 6 cyc

	642 setf.sig f7 = Remainder // M2, M3 6 cyc

	643 nop 0x0

	644 } ;;

	645 {

	646 .mfb

	647 nop 0x0

	648 xmpy.hu f6 = f6, f7

	649 nop 0x0

	650 } ;;

	651 {

	652 .mmi

	653 getf.sig LoopCount = f6;; // M2 5 cyc

	654 nop 0x0

	655 shr.u LoopCount = LoopCount, 4

	656 } ;;

	657 {

	658 .mmi

	659 nop 0x0

	660 nop 0x0

	661 mov.i ar.lc = LoopCount

	662 } ;;

	663

	664 /* Now comes the unrolled loop: */

	665

	666 .rc4Prologue:

	667 ___

	668

	669 $iteration = 0;

	670 # $iteration keeps counting across the prologue, body and epilogue steps.

	671 # Generate the prologue: each step enables one more pipeline stage (stage masks 0x1, 0x3, 0x7, 0xf).

	672 $predicates = 1;

	673 for ($i = 0; $i < $phases; ++$i) {

	674 &emit_body (\$code, \$bypass, $iteration++, $predicates);

	675 $predicates = ($predicates << 1) | 1; # bitwise OR: switch on the next stage bit, keep the earlier ones

	676 }

	677

	678 $code.=<<___;

	679 .rc4Loop:

	680 ___

	681

	682 # Generate the body: $unroll_count*$phases steady-state steps; $predicates is left unchanged here, so every step sees the same stage mask.

	683 for ($i = 0; $i < $unroll_count*$phases; ++$i) {

	684 &emit_body (\$code, \$bypass, $iteration++, $predicates);

	685 }

	686

	687 $code.=<<___;

	688 .rc4Epilogue:

	689 ___

	690

	691 # Generate the epilogue: the stage mask is shifted left once before each step, so the low-nibble stage bits that emit_body tests drop out one at a time.

	692 for ($i = 0; $i < $phases; ++$i) {

	693 $predicates <<= 1;

	694 &emit_body (\$code, \$bypass, $iteration++, $predicates);

	695 }

	696

	697 $code.=<<___;

	698 {

	699 .mmi

	700 lfetch.nt1 [EndPtr] // fetch line with last byte

	701 mov IFinal = I[1]

	702 nop 0x0

	703 }

	704

	705 .rc4Remainder:

	706 {

	707 .mmi

	708 sub Remainder = EndPtr, InPtr // Calculate

	709 // # of bytes

	710 // left - 1

	711 nop 0x0

	712 nop 0x0

	713 } ;;

	714 {

	715 .mib

	716 cmp.eq pDone, p0 = -1, Remainder // done already?

	717 mov.i ar.lc = Remainder

	718 (pDone) br.cond.dptk.few .rc4Complete

	719 }

	720

	721 /* Do the remaining bytes via the compact, modulo-scheduled loop */

	722

	723 MODSCHED_RC4_PROLOGUE

	724 MODSCHED_RC4_LOOP(.RC4RestLoop)

	725

	726 .rc4Complete:

	727 {

	728 .mmi

	729 add KTable = -SZ, KTable

	730 add IFinal = -1, IFinal

	731 mov ar.lc = LCSave

	732 } ;;

	733 {

	734 .mii

	735 SKEY [KTable] = J,-SZ

	736 zxt1 IFinal = IFinal

	737 mov pr = PRSave, 0x1FFFF

	738 } ;;

	739 {

	740 .mib

	741 SKEY [KTable] = IFinal

	742 add RetVal = 1, r0

	743 br.ret.sptk.few rp

	744 } ;;

	745 ___

	746

	747 # Last but not least, emit the code for the bypass-code of the unrolled loop:

	748 # ($bypass was filled by emit_body with the .rc4Bypass<N> sequences.)

	749 $code.=$bypass;

	750 # Close the procedure that the template opened with ".proc RC4".

	751 $code.=<<___;

	752 .endp RC4

	753 ___

	754 # Write the complete generated assembly to the currently selected output handle.

	755 print $code;

OLD	NEW

« no previous file with comments | « openssl/crypto/rc4/asm/rc4-ia64.S ('k') | openssl/crypto/rc4/asm/rc4-s390x.pl » ('j') | no next file with comments »