OLD | NEW |
| (Empty) |
1 dnl Alpha ev67 mpn_gcd_1 -- Nx1 greatest common divisor. | |
2 | |
3 dnl Copyright 2003, 2004 Free Software Foundation, Inc. | |
4 | |
5 dnl This file is part of the GNU MP Library. | |
6 dnl | |
7 dnl The GNU MP Library is free software; you can redistribute it and/or | |
8 dnl modify it under the terms of the GNU Lesser General Public License as | |
9 dnl published by the Free Software Foundation; either version 3 of the | |
10 dnl License, or (at your option) any later version. | |
11 dnl | |
12 dnl The GNU MP Library is distributed in the hope that it will be useful, | |
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 dnl Lesser General Public License for more details. | |
16 dnl | |
17 dnl You should have received a copy of the GNU Lesser General Public License | |
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. | |
19 | |
20 include(`../config.m4') | |
21 | |
22 | |
23 C ev67: 3.4 cycles/bitpair for 1x1 part | |
24 | |
25 | |
26 C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y); | |
27 C | |
28 C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and | |
29 C strip trailing zeros from abs(x-y) to maintain x and y both odd. | |
30 C | |
31 C The trailing zeros are calculated from just x-y, since in twos-complement | |
32 C there's the same number of trailing zeros on d or -d. This means the cttz | |
33 C runs in parallel with abs(x-y). | |
34 C | |
35 C The loop takes 5 cycles, and at 0.68 iterations per bit for two N-bit | |
36 C operands with this algorithm gives the measured 3.4 c/l. | |
37 C | |
38 C The slottings shown are for SVR4 style systems, Unicos differs in the | |
39 C initial gp setup and the LEA. | |
40 C | |
41 C Enhancement: | |
42 C | |
43 C On the jsr, !lituse_jsr! (when available) would allow the linker to relax | |
44 C it to a bsr, but probably only in a static binary. Plain "jsr foo" gives | |
45 C the right object code for relaxation, and ought to be available | |
46 C everywhere, but we prefer to schedule the GOT ldq (LEA) back earlier, for | |
47 C the usual case of running in a shared library. | |
48 C | |
49 C bsr could perhaps be used explicitly anyway. We should be able to assume | |
50 C modexact is in the same module as us (ie. shared library or mainline). | |
51 C Would there be any worries about the size of the displacement? Could | |
52 C always put modexact and gcd_1 in the same .o to be certain. | |
53 | |
54 ASM_START() | |
55 PROLOGUE(mpn_gcd_1, gp) | |
56 | |
57 C r16 xp | |
58 C r17 size | |
59 C r18 y | |
60 | |
61 C ldah C l | |
62 C lda C u | |
63 | |
64 ldq r0, 0(r16) C L x = xp[0] | |
65 lda r30, -32(r30) C u alloc stack | |
66 | |
67 LEA( r27, mpn_modexact_1c_odd) C L modexact addr, ldq (gp) | |
68 stq r10, 16(r30) C L save r10 | |
69 cttz r18, r10 C U0 y twos | |
70 cmpeq r17, 1, r5 C u test size==1 | |
71 | |
72 stq r9, 8(r30) C L save r9 | |
73 clr r19 C u zero c for modexact | |
74 unop | |
75 unop | |
76 | |
77 cttz r0, r6 C U0 x twos | |
78 stq r26, 0(r30) C L save ra | |
79 | |
80 srl r18, r10, r18 C U y odd | |
81 | |
82 mov r18, r9 C l hold y across call | |
83 | |
84 cmpult r6, r10, r2 C u test x_twos < y_twos | |
85 | |
86 cmovne r2, r6, r10 C l common_twos = min(x_twos,y_twos) | |
87 bne r5, L(one) C U no modexact if size==1 | |
88 jsr r26, (r27), mpn_modexact_1c_odd C L0 | |
89 | |
90 LDGP( r29, 0(r26)) C u,l ldah,lda | |
91 cttz r0, r6 C U0 new x twos | |
92 ldq r26, 0(r30) C L restore ra | |
93 | |
94 L(one): | |
95 mov r9, r1 C u y | |
96 ldq r9, 8(r30) C L restore r9 | |
97 mov r10, r2 C u common twos | |
98 ldq r10, 16(r30) C L restore r10 | |
99 | |
100 lda r30, 32(r30) C l free stack | |
101 beq r0, L(done) C U return y if x%y==0 | |
102 | |
103 srl r0, r6, r0 C U x odd | |
104 unop | |
105 | |
106 ALIGN(16) | |
107 L(top): | |
108 C r0 x | |
109 C r1 y | |
110 C r2 common twos, for use at end | |
111 | |
112 subq r0, r1, r7 C l0 d = x - y | |
113 cmpult r0, r1, r16 C u0 test x >= y | |
114 | |
115 subq r1, r0, r4 C l0 new_x = y - x | |
116 cttz r7, r8 C U0 d twos | |
117 | |
118 cmoveq r16, r7, r4 C l0 new_x = d if x>=y | |
119 cmovne r16, r0, r1 C u0 y = x if x<y | |
120 unop C l \ force cmoveq into l0 | |
121 unop C u / | |
122 | |
123 C C cmoveq2 L0, cmovne2 U0 | |
124 | |
125 srl r4, r8, r0 C U0 x = new_x >> twos | |
126 bne r7, L(top) C U1 stop when d==0 | |
127 | |
128 | |
129 L(done): | |
130 sll r1, r2, r0 C U0 return y << common_twos | |
131 ret r31, (r26), 1 C L0 | |
132 | |
133 EPILOGUE() | |
134 ASM_END() | |
OLD | NEW |