OLD | NEW |
| (Empty) |
1 /* Copyright (c) 2005 Hewlett-Packard Development Company, L.P. | |
2 | |
3 Permission is hereby granted, free of charge, to any person obtaining | |
4 a copy of this software and associated documentation files (the | |
5 "Software"), to deal in the Software without restriction, including | |
6 without limitation the rights to use, copy, modify, merge, publish, | |
7 distribute, sublicense, and/or sell copies of the Software, and to | |
8 permit persons to whom the Software is furnished to do so, subject to | |
9 the following conditions: | |
10 | |
11 The above copyright notice and this permission notice shall be | |
12 included in all copies or substantial portions of the Software. | |
13 | |
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
15 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
16 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
17 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE | |
18 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | |
19 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION | |
20 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ | |
21 | |
22 // Common registers are assigned as follows: | |
23 // | |
24 // COMMON | |
25 // | |
26 // t0 Const Tbl Ptr TPtr | |
27 // t1 Round Constant TRound | |
28 // t4 Block residual LenResid | |
29 // t5 Residual Data DTmp | |
30 // | |
31 // {in,out}0 Block 0 Cycle RotateM0 | |
32 // {in,out}1 Block Value 12 M12 | |
33 // {in,out}2 Block Value 8 M8 | |
34 // {in,out}3 Block Value 4 M4 | |
35 // {in,out}4 Block Value 0 M0 | |
36 // {in,out}5 Block 1 Cycle RotateM1 | |
37 // {in,out}6 Block Value 13 M13 | |
38 // {in,out}7 Block Value 9 M9 | |
39 // {in,out}8 Block Value 5 M5 | |
40 // {in,out}9 Block Value 1 M1 | |
41 // {in,out}10 Block 2 Cycle RotateM2 | |
42 // {in,out}11 Block Value 14 M14 | |
43 // {in,out}12 Block Value 10 M10 | |
44 // {in,out}13 Block Value 6 M6 | |
45 // {in,out}14 Block Value 2 M2 | |
46 // {in,out}15 Block 3 Cycle RotateM3 | |
47 // {in,out}16 Block Value 15 M15 | |
48 // {in,out}17 Block Value 11 M11 | |
49 // {in,out}18 Block Value 7 M7 | |
50 // {in,out}19 Block Value 3 M3 | |
51 // {in,out}20 Scratch Z | |
52 // {in,out}21 Scratch Y | |
53 // {in,out}22 Scratch X | |
54 // {in,out}23 Scratch W | |
55 // {in,out}24 Digest A A | |
56 // {in,out}25 Digest B B | |
57 // {in,out}26 Digest C C | |
58 // {in,out}27 Digest D D | |
59 // {in,out}28 Active Data Ptr DPtr | |
60 // in28 Dummy Value - | |
61 // out28 Dummy Value - | |
62 // bt0 Coroutine Link QUICK_RTN | |
63 // | |
64 /// These predicates are used for computing the padding block(s) and | |
65 /// are shared between the driver and digest co-routines | |
66 // | |
67 // pt0 Extra Pad Block pExtra | |
68 // pt1 Load next word pLoad | |
69 // pt2 Skip next word pSkip | |
70 // pt3 Search for Pad pNoPad | |
71 // pt4 Pad Word 0 pPad0 | |
72 // pt5 Pad Word 1 pPad1 | |
73 // pt6 Pad Word 2 pPad2 | |
74 // pt7 Pad Word 3 pPad3 | |
75 | |
// Fixed (non-stacked) registers and predicates shared between the driver
// and the digest co-routines; see the COMMON register table above.
76 #define DTmp r19 | |
77 #define LenResid r18 | |
78 #define QUICK_RTN b6 | |
79 #define TPtr r14 | |
80 #define TRound r15 | |
81 #define pExtra p6 | |
82 #define pLoad p7 | |
83 #define pNoPad p9 | |
84 #define pPad0 p10 | |
85 #define pPad1 p11 | |
86 #define pPad2 p12 | |
87 #define pPad3 p13 | |
88 #define pSkip p8 | |
89 | |
// Caller-side names: a trailing underscore denotes the outN view of a slot
// from the {in,out}N table above. These are the names the driver
// (md5_block_asm_data_order) uses to fill in a block before calling a
// digest routine.
90 #define A_ out24 | |
91 #define B_ out25 | |
92 #define C_ out26 | |
93 #define D_ out27 | |
94 #define DPtr_ out28 | |
95 #define M0_ out4 | |
96 #define M1_ out9 | |
97 #define M10_ out12 | |
98 #define M11_ out17 | |
99 #define M12_ out1 | |
100 #define M13_ out6 | |
101 #define M14_ out11 | |
102 #define M15_ out16 | |
103 #define M2_ out14 | |
104 #define M3_ out19 | |
105 #define M4_ out3 | |
106 #define M5_ out8 | |
107 #define M6_ out13 | |
108 #define M7_ out18 | |
109 #define M8_ out2 | |
110 #define M9_ out7 | |
111 #define RotateM0_ out0 | |
112 #define RotateM1_ out5 | |
113 #define RotateM2_ out10 | |
114 #define RotateM3_ out15 | |
115 #define W_ out23 | |
116 #define X_ out22 | |
117 #define Y_ out21 | |
118 #define Z_ out20 | |
119 | |
// Callee-side names: the same slot numbers as the underscore-suffixed
// defines, but expressed as inN. These are the names the digest routines
// (md5_digest_block*, md5_digest_GHI) operate on.
120 #define A in24 | |
121 #define B in25 | |
122 #define C in26 | |
123 #define D in27 | |
124 #define DPtr in28 | |
125 #define M0 in4 | |
126 #define M1 in9 | |
127 #define M10 in12 | |
128 #define M11 in17 | |
129 #define M12 in1 | |
130 #define M13 in6 | |
131 #define M14 in11 | |
132 #define M15 in16 | |
133 #define M2 in14 | |
134 #define M3 in19 | |
135 #define M4 in3 | |
136 #define M5 in8 | |
137 #define M6 in13 | |
138 #define M7 in18 | |
139 #define M8 in2 | |
140 #define M9 in7 | |
141 #define RotateM0 in0 | |
142 #define RotateM1 in5 | |
143 #define RotateM2 in10 | |
144 #define RotateM3 in15 | |
145 #define W in23 | |
146 #define X in22 | |
147 #define Y in21 | |
148 #define Z in20 | |
149 | |
150 /* register stack configuration for md5_block_asm_data_order(): */ | |
// Driver frame: 3 inputs (context pointer, data pointer, block count),
// no locals, 29 outputs (out0..out28), no rotating registers.
151 #define MD5_NINP 3 | |
152 #define MD5_NLOC 0 | |
153 #define MD5_NOUT 29 | |
154 #define MD5_NROT 0 | |
155 | |
156 /* register stack configuration for helpers: */ | |
// The helpers receive all of the driver's outputs as their inputs and
// declare the first 24 of them as rotating.
157 #define _NINPUTS MD5_NOUT | |
158 #define _NLOCALS 0 | |
159 #define _NOUTPUT 0 | |
160 #define _NROTATE 24 /* this must be <= _NINPUTS */ | |
161 | |
// ADDP is applied to the incoming context and data pointers. It expands to
// addp4 only when _HPUX_SOURCE is defined and _LP64 is not; otherwise it is
// a plain add.
162 #if defined(_HPUX_SOURCE) && !defined(_LP64) | |
163 #define ADDP addp4 | |
164 #else | |
165 #define ADDP add | |
166 #endif | |
167 | |
// When HOST_IS_BIG_ENDIAN is defined the driver executes "rum psr.be" on
// entry and "sum psr.be" on exit, so the block loads run little-endian.
168 #if defined(_HPUX_SOURCE) || defined(B_ENDIAN) | |
169 #define HOST_IS_BIG_ENDIAN | |
170 #endif | |
171 | |
172 // Macros for getting the left and right portions of little-endian words | |
173 | |
// GETLW(dst, src, align): dep.z -- dst = the low 8*align bits of src placed
//   at bit position 32 - 8*align, all other bits zero.
// GETRW(dst, src, align): extr.u -- dst = the 32 - 8*align bits of src that
//   start at bit position 8*align, zero-extended.
// OR-ing GETRW of one aligned word with GETLW of the following aligned word
// yields the 32-bit little-endian value at byte offset `align`.
174 #define GETLW(dst, src, align) dep.z dst = src, 32 - 8 * align, 8 * align | |
175 #define GETRW(dst, src, align) extr.u dst = src, 8 * align, 32 - 8 * align | |
176 | |
177 // MD5 driver | |
178 // | |
179 // Reads an input block, then calls the digest block | |
180 // subroutine and adds the results to the accumulated | |
181 // digest. It allocates MD5_NOUT (29) outs which the subroutine | |
182 // uses as its inputs and rotating | |
183 // registers. Initializes the round constant pointer and | |
184 // takes care of saving/restoring ar.lc | |
185 // | |
186 /// INPUT | |
187 // | |
188 // in0 Context Ptr CtxPtr0 | |
189 // in1 Input Data Ptr DPtrIn | |
190 // in2 Integral Blocks BlockCount | |
191 // rp Return Address - | |
192 // | |
193 /// CODE | |
194 // | |
195 // v2 Input Align InAlign | |
196 // t0 Shared w/digest - | |
197 // t1 Shared w/digest - | |
198 // t2 Shared w/digest - | |
199 // t3 Shared w/digest - | |
200 // t4 Shared w/digest - | |
201 // t5 Shared w/digest - | |
202 // t6 PFS Save PFSSave | |
203 // t7 ar.lc Save LCSave | |
204 // t8 Saved PR PRSave | |
205 // t9 2nd CtxPtr CtxPtr1 | |
206 // t10 Table Base CTable | |
207 // t11 Table[0] CTable0 | |
208 // t13 Accumulator A AccumA | |
209 // t14 Accumulator B AccumB | |
210 // t15 Accumulator C AccumC | |
211 // t16 Accumulator D AccumD | |
212 // pt0 Shared w/digest - | |
213 // pt1 Shared w/digest - | |
214 // pt2 Shared w/digest - | |
215 // pt3 Shared w/digest - | |
216 // pt4 Shared w/digest - | |
217 // pt5 Shared w/digest - | |
218 // pt6 Shared w/digest - | |
219 // pt7 Shared w/digest - | |
220 // pt8 Not Aligned pOff | |
221 // pt8 Blocks Left pAgain | |
222 | |
// Register names used only by the driver (see the CODE table above).
// Note that pAgain and pOff both map to p63.
223 #define AccumA r27 | |
224 #define AccumB r28 | |
225 #define AccumC r29 | |
226 #define AccumD r30 | |
227 #define CTable r24 | |
228 #define CTable0 r25 | |
229 #define CtxPtr0 in0 | |
230 #define CtxPtr1 r23 | |
231 #define DPtrIn in1 | |
232 #define BlockCount in2 | |
233 #define InAlign r10 | |
234 #define LCSave r21 | |
235 #define PFSSave r20 | |
236 #define PRSave r22 | |
237 #define pAgain p63 | |
238 #define pOff p63 | |
239 | |
240 .text | |
241 | |
242 /* md5_block_asm_data_order(MD5_CTX *c, const void *data, size_t num) | |
243 | |
244 where: | |
245 c: a pointer to a structure of this type: | |
246 | |
247 typedef struct MD5state_st | |
248 { | |
249 MD5_LONG A,B,C,D; | |
250 MD5_LONG Nl,Nh; | |
251 MD5_LONG data[MD5_LBLOCK]; | |
252 unsigned int num; | |
253 } | |
254 MD5_CTX; | |
255 | |
256 data: a pointer to the input data (may be misaligned) | |
257 num: the number of 64-byte blocks to hash (i.e., the length | |
258 of DATA is 64*NUM). | |
259 | |
260 */ | |
261 | |
// md5_block_asm_data_order -- driver entry point.
//
// Loads the four digest words from the context into AccumA..AccumD, then
// for each block: seeds A_..D_ with the accumulators, loads the first four
// message words, calls the matching digest co-routine through QUICK_RTN and
// adds the returned A_..D_ back into the accumulators. The low two bits of
// the data pointer select the aligned loop or one of three misaligned loops.
// On exit the accumulators are stored back to the context and pr, ar.lc and
// ar.pfs are restored from PRSave, LCSave and PFSSave.
262 .type md5_block_asm_data_order, @function | |
263 .global md5_block_asm_data_order | |
264 .align 32 | |
265 .proc md5_block_asm_data_order | |
266 md5_block_asm_data_order: | |
267 .md5_block: | |
268 .prologue | |
// CtxPtr1 = CtxPtr0 + 8, so the two pointers walk A,B and C,D in parallel.
// CTable starts as ip and is turned into the table address below by adding
// the link-time distance between .md5_block and .md5_tbl_data_order.
269 { .mmi | |
270 .save ar.pfs, PFSSave | |
271 alloc PFSSave = ar.pfs, MD5_NINP, MD5_NLOC, MD5_NOUT, MD5_NROT | |
272 ADDP CtxPtr1 = 8, CtxPtr0 | |
273 mov CTable = ip | |
274 } | |
275 { .mmi | |
276 ADDP DPtrIn = 0, DPtrIn | |
277 ADDP CtxPtr0 = 0, CtxPtr0 | |
278 .save ar.lc, LCSave | |
279 mov LCSave = ar.lc | |
280 } | |
281 ;; | |
// InAlign = byte offset of the input within its 4-byte word (0..3).
282 { .mmi | |
283 add CTable = .md5_tbl_data_order#-.md5_block#, CTable | |
284 and InAlign = 0x3, DPtrIn | |
285 } | |
286 | |
287 { .mmi | |
288 ld4 AccumA = [CtxPtr0], 4 | |
289 ld4 AccumC = [CtxPtr1], 4 | |
290 .save pr, PRSave | |
291 mov PRSave = pr | |
292 .body | |
293 } | |
294 ;; | |
// DPtr_ = DPtrIn with its low two bits cleared (word-aligned load pointer).
// After these loads CtxPtr0/CtxPtr1 are left pointing at B and D.
295 { .mmi | |
296 ld4 AccumB = [CtxPtr0] | |
297 ld4 AccumD = [CtxPtr1] | |
298 dep DPtr_ = 0, DPtrIn, 0, 2 | |
299 } ;; | |
300 #ifdef HOST_IS_BIG_ENDIAN | |
301 rum psr.be;; // switch to little-endian | |
302 #endif | |
// CTable0 caches table entry 0; CTable is advanced to entry 1. Each block
// restarts from these via TRound = CTable0 and TPtr = CTable.
303 { .mmb | |
304 ld4 CTable0 = [CTable], 4 | |
305 cmp.ne pOff, p0 = 0, InAlign | |
306 (pOff) br.cond.spnt.many .md5_unaligned | |
307 } ;; | |
308 | |
309 // The FF load/compute loop rotates values three times, so that | |
310 // loading into M12 here produces the M0 value, M13 -> M1, etc. | |
311 | |
312 .md5_block_loop0: | |
313 { .mmi | |
314 ld4 M12_ = [DPtr_], 4 | |
315 mov TPtr = CTable | |
316 mov TRound = CTable0 | |
317 } ;; | |
318 { .mmi | |
319 ld4 M13_ = [DPtr_], 4 | |
320 mov A_ = AccumA | |
321 mov B_ = AccumB | |
322 } ;; | |
323 { .mmi | |
324 ld4 M14_ = [DPtr_], 4 | |
325 mov C_ = AccumC | |
326 mov D_ = AccumD | |
327 } ;; | |
// BlockCount is decremented before the call; the co-routine returns via
// QUICK_RTN (b6) rather than rp.
328 { .mmb | |
329 ld4 M15_ = [DPtr_], 4 | |
330 add BlockCount = -1, BlockCount | |
331 br.call.sptk.many QUICK_RTN = md5_digest_block0 | |
332 } ;; | |
333 | |
334 // Now, we add the new digest values and do some clean-up | |
335 // before checking if there's another full block to process | |
336 | |
337 { .mmi | |
338 add AccumA = AccumA, A_ | |
339 add AccumB = AccumB, B_ | |
340 cmp.ne pAgain, p0 = 0, BlockCount | |
341 } | |
342 { .mib | |
343 add AccumC = AccumC, C_ | |
344 add AccumD = AccumD, D_ | |
345 (pAgain) br.cond.dptk.many .md5_block_loop0 | |
346 } ;; | |
347 | |
// Common exit, also reached from the misaligned loops. B and D are stored
// first with a post-decrement of 4 so the same pointers then address A and C.
348 .md5_exit: | |
349 #ifdef HOST_IS_BIG_ENDIAN | |
350 sum psr.be;; // switch back to big-endian mode | |
351 #endif | |
352 { .mmi | |
353 st4 [CtxPtr0] = AccumB, -4 | |
354 st4 [CtxPtr1] = AccumD, -4 | |
355 mov pr = PRSave, 0x1ffff ;; | |
356 } | |
357 { .mmi | |
358 st4 [CtxPtr0] = AccumA | |
359 st4 [CtxPtr1] = AccumC | |
360 mov ar.lc = LCSave | |
361 } ;; | |
362 { .mib | |
363 mov ar.pfs = PFSSave | |
364 br.ret.sptk.few rp | |
365 } ;; | |
366 | |
// MD5UNALIGNED(offset): block loop for input at byte offset `offset` (1..3)
// within a 4-byte word.
//
// .md5_process##offset is entered once, with DTmp holding the first aligned
// word; it reduces DTmp to that word's upper 32 - 8*offset bits (GETRW).
// Each pass of .md5_block_loop##offset then loads the next aligned word into
// Y_, forms the first message word as GETLW(Y_) | DTmp, and leaves GETRW(Y_)
// in DTmp. M13_..M15_ are loaded as raw aligned words here; they are merged
// with DTmp inside md5_digest_block##offset (see FFLOADU/FFLOOPU).
// When no blocks remain the macro branches to .md5_exit.
367 #define MD5UNALIGNED(offset) \ | |
368 .md5_process##offset: \ | |
369 { .mib ; \ | |
370 nop 0x0 ; \ | |
371 GETRW(DTmp, DTmp, offset) ; \ | |
372 } ;; \ | |
373 .md5_block_loop##offset: \ | |
374 { .mmi ; \ | |
375 ld4 Y_ = [DPtr_], 4 ; \ | |
376 mov TPtr = CTable ; \ | |
377 mov TRound = CTable0 ; \ | |
378 } ;; \ | |
379 { .mmi ; \ | |
380 ld4 M13_ = [DPtr_], 4 ; \ | |
381 mov A_ = AccumA ; \ | |
382 mov B_ = AccumB ; \ | |
383 } ;; \ | |
384 { .mii ; \ | |
385 ld4 M14_ = [DPtr_], 4 ; \ | |
386 GETLW(W_, Y_, offset) ; \ | |
387 mov C_ = AccumC ; \ | |
388 } \ | |
389 { .mmi ; \ | |
390 mov D_ = AccumD ;; \ | |
391 or M12_ = W_, DTmp ; \ | |
392 GETRW(DTmp, Y_, offset) ; \ | |
393 } \ | |
394 { .mib ; \ | |
395 ld4 M15_ = [DPtr_], 4 ; \ | |
396 add BlockCount = -1, BlockCount ; \ | |
397 br.call.sptk.many QUICK_RTN = md5_digest_block##offset; \ | |
398 } ;; \ | |
399 { .mmi ; \ | |
400 add AccumA = AccumA, A_ ; \ | |
401 add AccumB = AccumB, B_ ; \ | |
402 cmp.ne pAgain, p0 = 0, BlockCount ; \ | |
403 } \ | |
404 { .mib ; \ | |
405 add AccumC = AccumC, C_ ; \ | |
406 add AccumD = AccumD, D_ ; \ | |
407 (pAgain) br.cond.dptk.many .md5_block_loop##offset ; \ | |
408 } ;; \ | |
409 { .mib ; \ | |
410 nop 0x0 ; \ | |
411 nop 0x0 ; \ | |
412 br.cond.sptk.many .md5_exit ; \ | |
413 } ;; | |
414 | |
415 .align 32 | |
416 .md5_unaligned: | |
417 // | |
418 // Because variable shifts are expensive, we special case each of | |
419 // the four alignments. In practice, this won't hurt too much | |
420 // since only one working set of code will be loaded. | |
421 // | |
// DTmp receives the first aligned word. Offsets 1 and 2 branch to their own
// expansion; offset 3 falls through into MD5UNALIGNED(3) directly below.
422 { .mib | |
423 ld4 DTmp = [DPtr_], 4 | |
424 cmp.eq pOff, p0 = 1, InAlign | |
425 (pOff) br.cond.dpnt.many .md5_process1 | |
426 } ;; | |
427 { .mib | |
428 cmp.eq pOff, p0 = 2, InAlign | |
429 nop 0x0 | |
430 (pOff) br.cond.dpnt.many .md5_process2 | |
431 } ;; | |
432 MD5UNALIGNED(3) | |
433 MD5UNALIGNED(1) | |
434 MD5UNALIGNED(2) | |
435 | |
436 .endp md5_block_asm_data_order | |
437 | |
438 | |
439 // MD5 Perform the F function and load | |
440 // | |
441 // Passed the first 4 words (M0 - M3) and initial (A, B, C, D) values, | |
442 // computes the FF() round of functions, then branches to the common | |
443 // digest code to finish up with GG(), HH, and II(). | |
444 // | |
445 // INPUT | |
446 // | |
447 // rp Return Address - | |
448 // | |
449 // CODE | |
450 // | |
451 // v0 PFS bit bucket PFS | |
452 // v1 Loop Trip Count LTrip | |
453 // pt0 Load next word pMore | |
454 | |
455 /* For F round: */ | |
// Scratch names local to the digest routines. Note the register sharing:
// pMore is p6 (the same predicate as pExtra), and T reuses r9 (LTrip),
// which is only needed during the F round.
456 #define LTrip r9 | |
457 #define PFS r8 | |
458 #define pMore p6 | |
459 | |
460 /* For GHI rounds: */ | |
461 #define T r9 | |
462 #define U r10 | |
463 #define V r11 | |
464 | |
// COMPUTE(a, b, s, M, R): second half of a G/H/I step; expects the step's
// sum in Z.
//   - loads the next round constant into TRound and post-increments TPtr
//   - rotates the low 32 bits of Z left by s: dep.z copies them into the
//     upper half of Y, then shrp shifts the Z:Y pair right by 64 - s
//   - a = Z + b
//   - copies message word M into R
// Clobbers Y.
465 #define COMPUTE(a, b, s, M, R) \ | |
466 { \ | |
467 .mii ; \ | |
468 ld4 TRound = [TPtr], 4 ; \ | |
469 dep.z Y = Z, 32, 32 ;; \ | |
470 shrp Z = Z, Y, 64 - s ; \ | |
471 } ;; \ | |
472 { \ | |
473 .mmi ; \ | |
474 add a = Z, b ; \ | |
475 mov R = M ; \ | |
476 nop 0x0 ; \ | |
477 } ;; | |
478 | |
// LOOP(a, b, s, M, R, label): same work as COMPUTE (next TRound, rotate Z
// left by s, a = Z + b, R = M), but the final bundle ends with br.ctop to
// `label`, closing a four-step group. Clobbers Y.
479 #define LOOP(a, b, s, M, R, label) \ | |
480 { .mii ; \ | |
481 ld4 TRound = [TPtr], 4 ; \ | |
482 dep.z Y = Z, 32, 32 ;; \ | |
483 shrp Z = Z, Y, 64 - s ; \ | |
484 } ;; \ | |
485 { .mib ; \ | |
486 add a = Z, b ; \ | |
487 mov R = M ; \ | |
488 br.ctop.sptk.many label ; \ | |
489 } ;; | |
490 | |
491 // G(B, C, D) = (B & D) | (C & ~D) | |
492 | |
// G(a, b, c, d, M): leaves Z = M + TRound + a + ((b & d) | (c & ~d)).
// Clobbers X and Y. The rotate and final add are done by COMPUTE/LOOP.
493 #define G(a, b, c, d, M) \ | |
494 { .mmi ; \ | |
495 add Z = M, TRound ; \ | |
496 and Y = b, d ; \ | |
497 andcm X = c, d ; \ | |
498 } ;; \ | |
499 { .mii ; \ | |
500 add Z = Z, a ; \ | |
501 or Y = Y, X ;; \ | |
502 add Z = Z, Y ; \ | |
503 } ;; | |
504 | |
505 // H(B, C, D) = B ^ C ^ D | |
506 | |
// H(a, b, c, d, M): leaves Z = M + TRound + a + (b ^ c ^ d).
// Clobbers Y. The rotate and final add are done by COMPUTE/LOOP.
507 #define H(a, b, c, d, M) \ | |
508 { .mmi ; \ | |
509 add Z = M, TRound ; \ | |
510 xor Y = b, c ; \ | |
511 nop 0x0 ; \ | |
512 } ;; \ | |
513 { .mii ; \ | |
514 add Z = Z, a ; \ | |
515 xor Y = Y, d ;; \ | |
516 add Z = Z, Y ; \ | |
517 } ;; | |
518 | |
519 // I(B, C, D) = C ^ (B | ~D) | |
520 // | |
521 // However, since we have an andcm operator, we use the fact that | |
522 // | |
523 // Y ^ Z == ~Y ^ ~Z | |
524 // | |
525 // to rewrite the expression as | |
526 // | |
527 // I(B, C, D) = ~C ^ (~B & D) | |
528 | |
// I(a, b, c, d, M): leaves Z = M + TRound + a + (~c ^ (d & ~b)).
// "andcm X = -1, c" yields ~c. Clobbers X and Y. The rotate and final add
// are done by COMPUTE/LOOP.
529 #define I(a, b, c, d, M) \ | |
530 { .mmi ; \ | |
531 add Z = M, TRound ; \ | |
532 andcm Y = d, b ; \ | |
533 andcm X = -1, c ; \ | |
534 } ;; \ | |
535 { .mii ; \ | |
536 add Z = Z, a ; \ | |
537 xor Y = Y, X ;; \ | |
538 add Z = Z, Y ; \ | |
539 } ;; | |
540 | |
// GG4(label): four G steps (rotate amounts 5, 9, 14, 20) on M0..M3, cycling
// the roles of A, B, C, D; the last step uses LOOP to branch back to label.
541 #define GG4(label) \ | |
542 G(A, B, C, D, M0) \ | |
543 COMPUTE(A, B, 5, M0, RotateM0) \ | |
544 G(D, A, B, C, M1) \ | |
545 COMPUTE(D, A, 9, M1, RotateM1) \ | |
546 G(C, D, A, B, M2) \ | |
547 COMPUTE(C, D, 14, M2, RotateM2) \ | |
548 G(B, C, D, A, M3) \ | |
549 LOOP(B, C, 20, M3, RotateM3, label) | |
550 | |
// HH4(label): four H steps (rotate amounts 4, 11, 16, 23) on M0..M3, cycling
// the roles of A, B, C, D; the last step uses LOOP to branch back to label.
551 #define HH4(label) \ | |
552 H(A, B, C, D, M0) \ | |
553 COMPUTE(A, B, 4, M0, RotateM0) \ | |
554 H(D, A, B, C, M1) \ | |
555 COMPUTE(D, A, 11, M1, RotateM1) \ | |
556 H(C, D, A, B, M2) \ | |
557 COMPUTE(C, D, 16, M2, RotateM2) \ | |
558 H(B, C, D, A, M3) \ | |
559 LOOP(B, C, 23, M3, RotateM3, label) | |
560 | |
// II4(label): four I steps (rotate amounts 6, 10, 15, 21) on M0..M3, cycling
// the roles of A, B, C, D; the last step uses LOOP to branch back to label.
561 #define II4(label) \ | |
562 I(A, B, C, D, M0) \ | |
563 COMPUTE(A, B, 6, M0, RotateM0) \ | |
564 I(D, A, B, C, M1) \ | |
565 COMPUTE(D, A, 10, M1, RotateM1) \ | |
566 I(C, D, A, B, M2) \ | |
567 COMPUTE(C, D, 15, M2, RotateM2) \ | |
568 I(B, C, D, A, M3) \ | |
569 LOOP(B, C, 21, M3, RotateM3, label) | |
570 | |
// FFLOAD(a, b, c, d, M, N, s): one complete F step for aligned input.
//   - if pMore, loads the next input word into N and advances DPtr by 4
//   - Z = M + TRound + a + ((c & b) | (d & ~b))
//   - loads the next round constant into TRound (post-incrementing TPtr)
//   - rotates the low 32 bits of Z left by s (dep.z + shrp), then a = Z + b
// Clobbers X, Y and Z.
571 #define FFLOAD(a, b, c, d, M, N, s) \ | |
572 { .mii ; \ | |
573 (pMore) ld4 N = [DPtr], 4 ; \ | |
574 add Z = M, TRound ; \ | |
575 and Y = c, b ; \ | |
576 } \ | |
577 { .mmi ; \ | |
578 andcm X = d, b ;; \ | |
579 add Z = Z, a ; \ | |
580 or Y = Y, X ; \ | |
581 } ;; \ | |
582 { .mii ; \ | |
583 ld4 TRound = [TPtr], 4 ; \ | |
584 add Z = Z, Y ;; \ | |
585 dep.z Y = Z, 32, 32 ; \ | |
586 } ;; \ | |
587 { .mii ; \ | |
588 nop 0x0 ; \ | |
589 shrp Z = Z, Y, 64 - s ;; \ | |
590 add a = Z, b ; \ | |
591 } ;; | |
592 | |
// FFLOOP(a, b, c, d, M, N, s, dest): same F step as FFLOAD, followed by the
// loop control bundle: pMore = (LTrip != 0), LTrip = LTrip - 1, then br.ctop
// to dest. The cmp and the add sit in the same bundle with no stop between
// them. Clobbers X, Y and Z.
593 #define FFLOOP(a, b, c, d, M, N, s, dest) \ | |
594 { .mii ; \ | |
595 (pMore) ld4 N = [DPtr], 4 ; \ | |
596 add Z = M, TRound ; \ | |
597 and Y = c, b ; \ | |
598 } \ | |
599 { .mmi ; \ | |
600 andcm X = d, b ;; \ | |
601 add Z = Z, a ; \ | |
602 or Y = Y, X ; \ | |
603 } ;; \ | |
604 { .mii ; \ | |
605 ld4 TRound = [TPtr], 4 ; \ | |
606 add Z = Z, Y ;; \ | |
607 dep.z Y = Z, 32, 32 ; \ | |
608 } ;; \ | |
609 { .mii ; \ | |
610 nop 0x0 ; \ | |
611 shrp Z = Z, Y, 64 - s ;; \ | |
612 add a = Z, b ; \ | |
613 } \ | |
614 { .mib ; \ | |
615 cmp.ne pMore, p0 = 0, LTrip ; \ | |
616 add LTrip = -1, LTrip ; \ | |
617 br.ctop.dptk.many dest ; \ | |
618 } ;; | |
619 | |
// md5_digest_block0 -- F round for word-aligned input.
//
// Entered from the driver with the first four message words, A..D, TPtr and
// TRound already set up. Re-allocates the frame so the driver's outputs are
// this routine's inputs with _NROTATE rotating registers, then runs the
// four-step F group in a counted loop (ar.lc = 3). pMore starts true and is
// recomputed from LTrip (initially 2) at the end of each pass, which gates
// the loads of the remaining input words. Does not return itself: control
// falls through into md5_digest_GHI.
620 .type md5_digest_block0, @function | |
621 .align 32 | |
622 | |
623 .proc md5_digest_block0 | |
624 .prologue | |
625 md5_digest_block0: | |
626 .altrp QUICK_RTN | |
627 .body | |
628 { .mmi | |
629 alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE | |
630 mov LTrip = 2 | |
631 mov ar.lc = 3 | |
632 } ;; | |
// cmp.eq r0, r0 is always true, so pMore is set for the first pass.
633 { .mii | |
634 cmp.eq pMore, p0 = r0, r0 | |
635 mov ar.ec = 0 | |
636 nop 0x0 | |
637 } ;; | |
638 | |
639 .md5_FF_round0: | |
640 FFLOAD(A, B, C, D, M12, RotateM0, 7) | |
641 FFLOAD(D, A, B, C, M13, RotateM1, 12) | |
642 FFLOAD(C, D, A, B, M14, RotateM2, 17) | |
643 FFLOOP(B, C, D, A, M15, RotateM3, 22, .md5_FF_round0) | |
644 // | |
645 // !!! Fall through to md5_digest_GHI | |
646 // | |
647 .endp md5_digest_block0 | |
648 | |
// md5_digest_GHI -- G, H and I rounds, shared by all alignment variants.
//
// Reached by fall-through from md5_digest_block0 or by branch from the
// md5_digest_block1..3 routines once the F round is done. Before each round
// the sixteen message-word registers are permuted with plain register moves
// (Z, Y, X, W, V, U and, for the H round, T serve as temporaries) and
// ar.lc/ar.ec are reloaded with 3/1 for the round's counted loop over the
// four-step macro (GG4, HH4 or II4). Returns to the driver through QUICK_RTN.
649 .type md5_digest_GHI, @function | |
650 .align 32 | |
651 | |
652 .proc md5_digest_GHI | |
653 .prologue | |
654 .regstk _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE | |
655 md5_digest_GHI: | |
656 .altrp QUICK_RTN | |
657 .body | |
658 // | |
659 // The following sequence shuffles the block constants round for the | |
660 // next round: | |
661 // | |
662 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
663 // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12 | |
664 // | |
// Step 1: save the registers that are about to be overwritten.
665 { .mmi | |
666 mov Z = M0 | |
667 mov Y = M15 | |
668 mov ar.lc = 3 | |
669 } | |
670 { .mmi | |
671 mov X = M2 | |
672 mov W = M9 | |
673 mov V = M4 | |
674 } ;; | |
675 | |
676 { .mmi | |
677 mov M0 = M1 | |
678 mov M15 = M12 | |
679 mov ar.ec = 1 | |
680 } | |
681 { .mmi | |
682 mov M2 = M11 | |
683 mov M9 = M14 | |
684 mov M4 = M5 | |
685 } ;; | |
686 | |
687 { .mmi | |
688 mov M1 = M6 | |
689 mov M12 = M13 | |
690 mov U = M3 | |
691 } | |
692 { .mmi | |
693 mov M11 = M8 | |
694 mov M14 = M7 | |
695 mov M5 = M10 | |
696 } ;; | |
697 | |
// Final step: write the saved values into their new positions.
698 { .mmi | |
699 mov M6 = Y | |
700 mov M13 = X | |
701 mov M3 = Z | |
702 } | |
703 { .mmi | |
704 mov M8 = W | |
705 mov M7 = V | |
706 mov M10 = U | |
707 } ;; | |
708 | |
709 .md5_GG_round: | |
710 GG4(.md5_GG_round) | |
711 | |
712 // The following sequence shuffles the block constants round for the | |
713 // next round: | |
714 // | |
715 // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12 | |
716 // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2 | |
717 | |
718 { .mmi | |
719 mov Z = M0 | |
720 mov Y = M1 | |
721 mov ar.lc = 3 | |
722 } | |
723 { .mmi | |
724 mov X = M3 | |
725 mov W = M5 | |
726 mov V = M6 | |
727 } ;; | |
728 | |
729 { .mmi | |
730 mov M0 = M4 | |
731 mov M1 = M11 | |
732 mov ar.ec = 1 | |
733 } | |
734 { .mmi | |
735 mov M3 = M9 | |
736 mov U = M8 | |
737 mov T = M13 | |
738 } ;; | |
739 | |
740 { .mmi | |
741 mov M4 = Z | |
742 mov M11 = Y | |
743 mov M5 = M7 | |
744 } | |
745 { .mmi | |
746 mov M6 = M14 | |
747 mov M8 = M12 | |
748 mov M13 = M15 | |
749 } ;; | |
750 | |
751 { .mmi | |
752 mov M7 = W | |
753 mov M14 = V | |
754 nop 0x0 | |
755 } | |
756 { .mmi | |
757 mov M9 = X | |
758 mov M12 = U | |
759 mov M15 = T | |
760 } ;; | |
761 | |
762 .md5_HH_round: | |
763 HH4(.md5_HH_round) | |
764 | |
765 // The following sequence shuffles the block constants round for the | |
766 // next round: | |
767 // | |
768 // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2 | |
769 // 0 7 14 5 12 3 10 1 8 15 6 13 4 11 2 9 | |
770 | |
771 { .mmi | |
772 mov Z = M0 | |
773 mov Y = M15 | |
774 mov ar.lc = 3 | |
775 } | |
776 { .mmi | |
777 mov X = M10 | |
778 mov W = M1 | |
779 mov V = M4 | |
780 } ;; | |
781 | |
782 { .mmi | |
783 mov M0 = M9 | |
784 mov M15 = M12 | |
785 mov ar.ec = 1 | |
786 } | |
787 { .mmi | |
788 mov M10 = M11 | |
789 mov M1 = M6 | |
790 mov M4 = M13 | |
791 } ;; | |
792 | |
793 { .mmi | |
794 mov M9 = M14 | |
795 mov M12 = M5 | |
796 mov U = M3 | |
797 } | |
798 { .mmi | |
799 mov M11 = M8 | |
800 mov M6 = M7 | |
801 mov M13 = M2 | |
802 } ;; | |
803 | |
804 { .mmi | |
805 mov M14 = Y | |
806 mov M5 = X | |
807 mov M3 = Z | |
808 } | |
809 { .mmi | |
810 mov M8 = W | |
811 mov M7 = V | |
812 mov M2 = U | |
813 } ;; | |
814 | |
815 .md5_II_round: | |
816 II4(.md5_II_round) | |
817 | |
// All four rounds done: return to the driver, which adds A..D into its
// accumulators.
818 { .mib | |
819 nop 0x0 | |
820 nop 0x0 | |
821 br.ret.sptk.many QUICK_RTN | |
822 } ;; | |
823 | |
824 .endp md5_digest_GHI | |
825 | |
// FFLOADU(a, b, c, d, M, P, N, s, offset): F step for misaligned input.
// Does the same arithmetic as FFLOAD on M, and in parallel assembles the
// next message word in place:
//   - W = GETLW(P) | DTmp    (low bytes of raw word P joined with carry-over)
//   - DTmp = GETRW(P)        (new carry-over, taken from the raw P)
//   - P = W                  (P now holds the assembled word)
// If pMore, also loads the next raw input word into N.
// Clobbers W, X, Y, Z and DTmp.
826 #define FFLOADU(a, b, c, d, M, P, N, s, offset) \ | |
827 { .mii ; \ | |
828 (pMore) ld4 N = [DPtr], 4 ; \ | |
829 add Z = M, TRound ; \ | |
830 and Y = c, b ; \ | |
831 } \ | |
832 { .mmi ; \ | |
833 andcm X = d, b ;; \ | |
834 add Z = Z, a ; \ | |
835 or Y = Y, X ; \ | |
836 } ;; \ | |
837 { .mii ; \ | |
838 ld4 TRound = [TPtr], 4 ; \ | |
839 GETLW(W, P, offset) ; \ | |
840 add Z = Z, Y ; \ | |
841 } ;; \ | |
842 { .mii ; \ | |
843 or W = W, DTmp ; \ | |
844 dep.z Y = Z, 32, 32 ;; \ | |
845 shrp Z = Z, Y, 64 - s ; \ | |
846 } ;; \ | |
847 { .mii ; \ | |
848 add a = Z, b ; \ | |
849 GETRW(DTmp, P, offset) ; \ | |
850 mov P = W ; \ | |
851 } ;; | |
852 | |
// FFLOOPU(a, b, c, d, M, P, N, s, offset): last F step of a four-step group
// for misaligned input. Same as FFLOADU except that the word assembly
// (W, DTmp and P updates) is predicated on pMore, and the macro ends with the
// loop control bundle: pMore = (LTrip != 0), LTrip = LTrip - 1, br.ctop to
// .md5_FF_round##offset.
853 #define FFLOOPU(a, b, c, d, M, P, N, s, offset) \ | |
854 { .mii ; \ | |
855 (pMore) ld4 N = [DPtr], 4 ; \ | |
856 add Z = M, TRound ; \ | |
857 and Y = c, b ; \ | |
858 } \ | |
859 { .mmi ; \ | |
860 andcm X = d, b ;; \ | |
861 add Z = Z, a ; \ | |
862 or Y = Y, X ; \ | |
863 } ;; \ | |
864 { .mii ; \ | |
865 ld4 TRound = [TPtr], 4 ; \ | |
866 (pMore) GETLW(W, P, offset) ; \ | |
867 add Z = Z, Y ; \ | |
868 } ;; \ | |
869 { .mii ; \ | |
870 (pMore) or W = W, DTmp ; \ | |
871 dep.z Y = Z, 32, 32 ;; \ | |
872 shrp Z = Z, Y, 64 - s ; \ | |
873 } ;; \ | |
874 { .mii ; \ | |
875 add a = Z, b ; \ | |
876 (pMore) GETRW(DTmp, P, offset) ; \ | |
877 (pMore) mov P = W ; \ | |
878 } \ | |
879 { .mib ; \ | |
880 cmp.ne pMore, p0 = 0, LTrip ; \ | |
881 add LTrip = -1, LTrip ; \ | |
882 br.ctop.sptk.many .md5_FF_round##offset ; \ | |
883 } ;; | |
884 | |
// MD5FBLOCK(offset): emits md5_digest_block##offset, the F round for input
// at byte offset `offset` within a word. The setup matches
// md5_digest_block0 (alloc with _NROTATE rotating registers, LTrip = 2,
// ar.lc = 3, ar.ec = 0, pMore true); the loop body uses FFLOADU/FFLOOPU so
// each raw word is merged with DTmp as it is consumed. Ends with an explicit
// branch to md5_digest_GHI. Instantiated below for offsets 1, 2 and 3.
885 #define MD5FBLOCK(offset) \ | |
886 .type md5_digest_block##offset, @function ; \ | |
887 \ | |
888 .align 32 ; \ | |
889 .proc md5_digest_block##offset ; \ | |
890 .prologue ; \ | |
891 .altrp QUICK_RTN ; \ | |
892 .body ; \ | |
893 md5_digest_block##offset: \ | |
894 { .mmi ; \ | |
895 alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE ; \ | |
896 mov LTrip = 2 ; \ | |
897 mov ar.lc = 3 ; \ | |
898 } ;; \ | |
899 { .mii ; \ | |
900 cmp.eq pMore, p0 = r0, r0 ; \ | |
901 mov ar.ec = 0 ; \ | |
902 nop 0x0 ; \ | |
903 } ;; \ | |
904 \ | |
905 .pred.rel "mutex", pLoad, pSkip ; \ | |
906 .md5_FF_round##offset: \ | |
907 FFLOADU(A, B, C, D, M12, M13, RotateM0, 7, offset) \ | |
908 FFLOADU(D, A, B, C, M13, M14, RotateM1, 12, offset) \ | |
909 FFLOADU(C, D, A, B, M14, M15, RotateM2, 17, offset) \ | |
910 FFLOOPU(B, C, D, A, M15, RotateM0, RotateM3, 22, offset) \ | |
911 \ | |
912 { .mib ; \ | |
913 nop 0x0 ; \ | |
914 nop 0x0 ; \ | |
915 br.cond.sptk.many md5_digest_GHI ; \ | |
916 } ;; \ | |
917 .endp md5_digest_block##offset | |
918 | |
919 MD5FBLOCK(1) | |
920 MD5FBLOCK(2) | |
921 MD5FBLOCK(3) | |
922 | |
// Round-constant table: 64 four-byte entries (see .size below), each written
// least-significant byte first so a little-endian ld4 reads the intended
// value (entry 0 reads as 0xd76aa478). The driver locates the table
// ip-relatively through .md5_tbl_data_order, and the digest routines walk it
// sequentially through TPtr, one entry per step.
// NOTE(review): the values appear to be the T[1..64] constants of RFC 1321
// -- confirm against the specification before editing any entry.
923 .align 64 | |
924 .type md5_constants, @object | |
925 md5_constants: | |
926 .md5_tbl_data_order: // To ensure little-endian data | |
927 // order, code as bytes. | |
928 data1 0x78, 0xa4, 0x6a, 0xd7 // 0 | |
929 data1 0x56, 0xb7, 0xc7, 0xe8 // 1 | |
930 data1 0xdb, 0x70, 0x20, 0x24 // 2 | |
931 data1 0xee, 0xce, 0xbd, 0xc1 // 3 | |
932 data1 0xaf, 0x0f, 0x7c, 0xf5 // 4 | |
933 data1 0x2a, 0xc6, 0x87, 0x47 // 5 | |
934 data1 0x13, 0x46, 0x30, 0xa8 // 6 | |
935 data1 0x01, 0x95, 0x46, 0xfd // 7 | |
936 data1 0xd8, 0x98, 0x80, 0x69 // 8 | |
937 data1 0xaf, 0xf7, 0x44, 0x8b // 9 | |
938 data1 0xb1, 0x5b, 0xff, 0xff // 10 | |
939 data1 0xbe, 0xd7, 0x5c, 0x89 // 11 | |
940 data1 0x22, 0x11, 0x90, 0x6b // 12 | |
941 data1 0x93, 0x71, 0x98, 0xfd // 13 | |
942 data1 0x8e, 0x43, 0x79, 0xa6 // 14 | |
943 data1 0x21, 0x08, 0xb4, 0x49 // 15 | |
944 data1 0x62, 0x25, 0x1e, 0xf6 // 16 | |
945 data1 0x40, 0xb3, 0x40, 0xc0 // 17 | |
946 data1 0x51, 0x5a, 0x5e, 0x26 // 18 | |
947 data1 0xaa, 0xc7, 0xb6, 0xe9 // 19 | |
948 data1 0x5d, 0x10, 0x2f, 0xd6 // 20 | |
949 data1 0x53, 0x14, 0x44, 0x02 // 21 | |
950 data1 0x81, 0xe6, 0xa1, 0xd8 // 22 | |
951 data1 0xc8, 0xfb, 0xd3, 0xe7 // 23 | |
952 data1 0xe6, 0xcd, 0xe1, 0x21 // 24 | |
953 data1 0xd6, 0x07, 0x37, 0xc3 // 25 | |
954 data1 0x87, 0x0d, 0xd5, 0xf4 // 26 | |
955 data1 0xed, 0x14, 0x5a, 0x45 // 27 | |
956 data1 0x05, 0xe9, 0xe3, 0xa9 // 28 | |
957 data1 0xf8, 0xa3, 0xef, 0xfc // 29 | |
958 data1 0xd9, 0x02, 0x6f, 0x67 // 30 | |
959 data1 0x8a, 0x4c, 0x2a, 0x8d // 31 | |
960 data1 0x42, 0x39, 0xfa, 0xff // 32 | |
961 data1 0x81, 0xf6, 0x71, 0x87 // 33 | |
962 data1 0x22, 0x61, 0x9d, 0x6d // 34 | |
963 data1 0x0c, 0x38, 0xe5, 0xfd // 35 | |
964 data1 0x44, 0xea, 0xbe, 0xa4 // 36 | |
965 data1 0xa9, 0xcf, 0xde, 0x4b // 37 | |
966 data1 0x60, 0x4b, 0xbb, 0xf6 // 38 | |
967 data1 0x70, 0xbc, 0xbf, 0xbe // 39 | |
968 data1 0xc6, 0x7e, 0x9b, 0x28 // 40 | |
969 data1 0xfa, 0x27, 0xa1, 0xea // 41 | |
970 data1 0x85, 0x30, 0xef, 0xd4 // 42 | |
971 data1 0x05, 0x1d, 0x88, 0x04 // 43 | |
972 data1 0x39, 0xd0, 0xd4, 0xd9 // 44 | |
973 data1 0xe5, 0x99, 0xdb, 0xe6 // 45 | |
974 data1 0xf8, 0x7c, 0xa2, 0x1f // 46 | |
975 data1 0x65, 0x56, 0xac, 0xc4 // 47 | |
976 data1 0x44, 0x22, 0x29, 0xf4 // 48 | |
977 data1 0x97, 0xff, 0x2a, 0x43 // 49 | |
978 data1 0xa7, 0x23, 0x94, 0xab // 50 | |
979 data1 0x39, 0xa0, 0x93, 0xfc // 51 | |
980 data1 0xc3, 0x59, 0x5b, 0x65 // 52 | |
981 data1 0x92, 0xcc, 0x0c, 0x8f // 53 | |
982 data1 0x7d, 0xf4, 0xef, 0xff // 54 | |
983 data1 0xd1, 0x5d, 0x84, 0x85 // 55 | |
984 data1 0x4f, 0x7e, 0xa8, 0x6f // 56 | |
985 data1 0xe0, 0xe6, 0x2c, 0xfe // 57 | |
986 data1 0x14, 0x43, 0x01, 0xa3 // 58 | |
987 data1 0xa1, 0x11, 0x08, 0x4e // 59 | |
988 data1 0x82, 0x7e, 0x53, 0xf7 // 60 | |
989 data1 0x35, 0xf2, 0x3a, 0xbd // 61 | |
990 data1 0xbb, 0xd2, 0xd7, 0x2a // 62 | |
991 data1 0x91, 0xd3, 0x86, 0xeb // 63 | |
992 .size md5_constants#,64*4 | |
OLD | NEW |