OLD | NEW |
| (Empty) |
1 dnl x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number. | |
2 | |
3 dnl Copyright 2007, 2008 Free Software Foundation, Inc. | |
4 | |
5 dnl This file is part of the GNU MP Library. | |
6 | |
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify | |
8 dnl it under the terms of the GNU Lesser General Public License as published | |
9 dnl by the Free Software Foundation; either version 3 of the License, or (at | |
10 dnl your option) any later version. | |
11 | |
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but | |
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public | |
15 dnl License for more details. | |
16 | |
17 dnl You should have received a copy of the GNU Lesser General Public License | |
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. | |
19 | |
20 include(`../config.m4') | |
21 | |
22 | |
23 C norm frac | |
24 C K8 20 20 | |
25 C P4 73 73 | |
26 C P6-15 37 37 | |
27 | |
28 C TODO | |
29 C * Perhaps compute the inverse without relying on divq? Could either use | |
30 C Newton's method and mulq, or perhaps the faster fdiv. | |
31 C * The loop has not been carefully tuned, nor analysed for critical path | |
32 C length. It seems that 20 c/l is a bit long, compared to the 13 c/l for | |
33 C mpn_divrem_1. | |
34 C * Clean up. This code is really crude. | |
35 | |
36 | |
37 C INPUT PARAMETERS | |
C The names below record which register carries each argument on entry.
C The function body visible in this file uses the raw register names
C rather than these macros, so they serve as documentation.
C qp is copied to rbp and is where quotient limbs are stored.
38 define(`qp', `%rdi') | |
C fn is copied to r13 and compared against the loop counter to decide
C whether a numerator limb is loaded or a zero limb is used.
39 define(`fn', `%rsi') | |
C up_param and un_param are combined as up + 8*un to locate the top
C numerator limbs.
40 define(`up_param', `%rdx') | |
41 define(`un_param', `%rcx') | |
C dp points at the two divisor limbs, read from offsets 0 and 8.
42 define(`dp', `%r8') | |
43 | |
C Not an argument. r9 is where the NEW loop variant keeps the inverse,
C while the other variant keeps it in rdi.
44 define(`dinv', `%r9') | |
45 | |
46 | |
47 C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15 | |
48 C cnt qp d dinv | |
49 | |
50 ASM_START() | |
51 TEXT | |
52 ALIGN(16) | |
C mpn_divrem_2 -- divide the limb vector at up_param by the two limbs at dp.
C
C Registers on entry, as consumed by the code below:
C   rdi  quotient pointer      rsi  fn
C   rdx  numerator pointer     rcx  un
C   r8   divisor pointer, low limb at offset 0 and high limb at offset 8
C Result:
C   rax  copy of r15, which is zeroed on entry and incremented once at
C        label 23 when the two top numerator limbs are not below the divisor
C   un + fn - 2 quotient limbs are stored through the quotient pointer
C   the two final partial remainder limbs are stored at label 6
C rbx, rbp and r12 to r15 are pushed on entry and popped before ret.
C Two loop variants exist. Which one is assembled is decided at m4 time by
C whether NEW is defined.
C The file header states the divisor is normalized. Nothing here checks it,
C and the div instruction below depends on it.
53 PROLOGUE(mpn_divrem_2) | |
54 | |
C Register saves are interleaved with argument setup.
55 push %r15 | |
C rax = up + 8*un, one limb past the top of the numerator
56 lea (%rdx,%rcx,8), %rax | |
57 push %r14 | |
58 push %r13 | |
C r13 = fn
59 mov %rsi, %r13 | |
60 push %r12 | |
C r12 = address of numerator limb un-3, the next limb the loop will load
61 lea -24(%rax), %r12 | |
62 push %rbp | |
C rbp = quotient pointer
63 mov %rdi, %rbp | |
64 push %rbx | |
C r11 = d1, the high divisor limb
65 mov 8(%r8), %r11 | |
C r9 = top numerator limb
66 mov -8(%rax), %r9 | |
C r8 = d0, the low divisor limb. The divisor pointer is not needed afterwards.
67 mov (%r8), %r8 | |
C r10 = second numerator limb from the top
68 mov -16(%rax), %r10 | |
C r15 = 0, the value returned unless label 23 is visited
69 xor R32(%r15), R32(%r15) | |
C Two-limb unsigned comparison of r9:r10 against the divisor r11:r8.
C d1 above the top limb means the pair is smaller than the divisor.
70 cmp %r9, %r11 | |
71 ja L(2) | |
C dl = 1 when d1 is below the top limb. Flags still come from the cmp above.
72 setb %dl | |
73 cmp %r10, %r8 | |
C al = 1 when d0 is below or equal to the second limb
74 setbe %al | |
75 or %al, %dl | |
C Taken when r9:r10 is at least the divisor
76 jne L(23) | |
77 L(2): | |
78 lea -3(%rcx,%r13), %rbx C un + fn - 3 | |
C A negative count means there are no quotient limbs to develop.
79 test %rbx, %rbx | |
80 js L(6) | |
C Compute the inverse with a hardware divide of the 128-bit value
C rdx:rax = not d1 : all ones by d1. div raises a divide error when the
C quotient does not fit in 64 bits, which happens unless rdx is below r11,
C that is unless the top bit of d1 is set.
81 mov %r11, %rdx | |
82 mov $-1, %rax | |
83 not %rdx | |
84 div %r11 | |
85 mov %r11, %rdx | |
C rdi = candidate inverse
86 mov %rax, %rdi | |
C r14 = low 64 bits of d1 times the candidate
87 imul %rax, %rdx | |
88 mov %rdx, %r14 | |
C rdx:rax = candidate times d0. rax still holds the candidate here.
89 mul %r8 | |
90 mov %rdx, %rcx | |
C rdx starts at -1 and absorbs the carries of the two additions below.
91 mov $-1, %rdx | |
92 add %r8, %r14 | |
93 adc $0, %rdx | |
94 add %rcx, %r14 | |
95 adc $0, %rdx | |
C Still negative means neither addition carried, so no adjustment is needed.
96 js L(8) | |
C Each pass lowers the inverse by one and subtracts d1 from rdx:r14,
C repeating until rdx becomes negative.
97 L(18): | |
98 dec %rdi | |
99 sub %r11, %r14 | |
100 sbb $0, %rdx | |
101 jns L(18) | |
102 L(8): | |
103 | |
104 C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15 | |
105 C n2 un n1 dinv qp d0 d1 up fn msl | |
106 C n2 un -d1 n1 dinv XX XX | |
107 | |
108 ifdef(`NEW',` | |
C NEW variant. Registers inside the loop:
C   rbx high remainder limb, r14 low remainder limb, r9 inverse,
C   rsi minus d1, r8 d0, r11 d1, rcx limbs still to produce minus one,
C   rbp address of the next quotient limb, r12 next numerator limb to load.
C rbp = quotient pointer + 8*count, so limbs are stored from the top down.
109 lea (%rbp,%rbx,8), %rbp | |
110 mov %rbx, %rcx C un | |
111 mov %r9, %rbx | |
112 mov %rdi, %r9 C di | |
113 mov %r10, %r14 | |
114 mov %r11, %rsi | |
115 neg %rsi C -d1 | |
116 ALIGN(16) | |
117 L(loop): | |
118 mov %r9, %rax C di ncp | |
C rdx:rax = inverse times high remainder limb
119 mul %rbx C 0, 18 | |
120 add %r14, %rax C 4 | |
121 mov %rax, %r10 C q0 5 | |
122 adc %rbx, %rdx C 5 | |
123 mov %rdx, %rdi C q 6 | |
C rdx = low 64 bits of q times minus d1
124 imul %rsi, %rdx C 6 | |
125 mov %r8, %rax C ncp | |
C lea is used so that the addition leaves the flags untouched.
126 lea (%rdx, %r14), %rbx C n1 -= ... 7 | |
C rdx:rax = d0 times q
127 mul %rdi C 7 | |
C r14 = 0 is the incoming low limb unless a numerator limb is loaded below.
128 xor R32(%r14), R32(%r14) C | |
C Signed compare of fn against the counter. When fn is greater, the load
C is skipped and the zero limb is used.
129 cmp %rcx, %r13 C | |
130 jg L(19) C | |
131 mov (%r12), %r14 C | |
132 sub $8, %r12 C | |
C Subtract the divisor once, then the product d0 times q, from rbx:r14.
133 L(19): sub %r8, %r14 C ncp | |
134 sbb %r11, %rbx C 9 | |
135 sub %rax, %r14 C 11 | |
136 sbb %rdx, %rbx C 12 | |
137 inc %rdi C 7 | |
138 xor R32(%rdx), R32(%rdx) C | |
C Carry is set when rbx is below q0. The adc below turns that into
C rdx = 0, and the no-carry case into rdx = all ones.
139 cmp %r10, %rbx C 13 | |
C NOTE: the trailing comment on the next line says d1, but r8 holds d0,
C which agrees with the d0 or 0 comment on the and that follows.
140 mov %r8, %rax C d1 ncp | |
141 adc $-1, %rdx C mask 14 | |
142 add %rdx, %rdi C q-- 15 | |
143 and %rdx, %rax C d0 or 0 15 | |
144 and %r11, %rdx C d1 or 0 15 | |
C Add the masked divisor back into rbx:r14.
145 add %rax, %r14 C 16 | |
146 adc %rdx, %rbx C 16 | |
C High remainder limb not below d1: go check whether one more
C subtraction of the divisor is required.
147 cmp %r11, %rbx C 17 | |
148 jae L(fix) C | |
C Store the quotient limb and step down to the next position.
149 L(bck): mov %rdi, (%rbp) C | |
150 sub $8, %rbp C | |
151 dec %rcx | |
152 jns L(loop) | |
153 | |
C Move the final remainder limbs into r10 and r9 for the common exit.
154 mov %r14, %r10 | |
155 mov %rbx, %r9 | |
156 ',` | |
C Variant used when NEW is not defined. Registers inside the loop:
C   rax high remainder limb at loop entry, rsi low remainder limb,
C   rdi inverse, r8 d0, r11 d1, rcx limbs still to produce minus one,
C   rbp address of the next quotient limb, r12 next numerator limb to load.
C rbp = quotient pointer + 8*count, so limbs are stored from the top down.
157 lea (%rbp,%rbx,8), %rbp | |
158 mov %rbx, %rcx | |
159 mov %r9, %rax | |
160 mov %r10, %rsi | |
161 ALIGN(16) | |
162 L(loop): | |
C r14 keeps the high remainder limb while mul overwrites rax and rdx.
163 mov %rax, %r14 C 0, 19 | |
C rdx:rax = high remainder limb times inverse
164 mul %rdi C 0 | |
165 mov %r11, %r9 C 1 | |
166 add %rsi, %rax C 4 | |
167 mov %rax, %rbx C q0 5 | |
168 adc %r14, %rdx C q 5 | |
C r10 = q + 1, computed with lea so the flags are left alone
169 lea 1(%rdx), %r10 C 6 | |
170 mov %rdx, %rax C 6 | |
C r9 = low 64 bits of d1 times q, then removed from the low remainder limb
171 imul %rdx, %r9 C 6 | |
172 sub %r9, %rsi C 10 | |
C r9 = 0 is the incoming low limb unless a numerator limb is loaded below.
173 xor R32(%r9), R32(%r9) C | |
C rdx:rax = q times d0
174 mul %r8 C 7 | |
C Signed compare of fn against the counter. When fn is greater, the load
C is skipped and the zero limb is used.
175 cmp %rcx, %r13 C | |
176 jg L(13) C | |
177 mov (%r12), %r9 C | |
178 sub $8, %r12 C | |
C Subtract the divisor once, then the product q times d0, from rsi:r9.
179 L(13): sub %r8, %r9 C ncp | |
180 sbb %r11, %rsi C 11 | |
181 sub %rax, %r9 C 11 | |
182 sbb %rdx, %rsi C 12 | |
C sbb of a register with itself gives all ones when rsi is below q0 and
C zero otherwise. The not inverts that into the add-back mask.
183 cmp %rbx, %rsi C 13 | |
184 sbb %rax, %rax C 14 | |
185 not %rax C 15 | |
C Adding the mask lowers q + 1 back to q when the mask is all ones.
186 add %rax, %r10 C 16 | |
187 mov %r8, %rbx C ncp | |
188 and %rax, %rbx C 16 | |
189 and %r11, %rax C 16 | |
C Add the masked divisor back. The high remainder limb ends up in rax.
190 add %rbx, %r9 C 17 | |
191 adc %rsi, %rax C 18 | |
C d1 below or equal to the high remainder limb: go check whether one
C more subtraction of the divisor is required.
192 cmp %rax, %r11 C 19 | |
193 jbe L(fix) C | |
C Store the quotient limb and step down to the next position.
194 L(bck): mov %r10, (%rbp) C | |
195 sub $8, %rbp C | |
196 mov %r9, %rsi C 18 | |
197 dec %rcx | |
198 jns L(loop) | |
199 | |
C Move the final remainder limbs into r10 and r9 for the common exit.
200 mov %rsi, %r10 | |
201 mov %rax, %r9 | |
202 ') | |
C Common exit. Write r10 and r9 to 8(r12) and 16(r12). r12 started at
C up + 8*un - 24 and moved down by 8 for every limb a loop loaded.
203 L(6): | |
204 mov %r10, 8(%r12) | |
205 mov %r9, 16(%r12) | |
206 pop %rbx | |
207 pop %rbp | |
208 pop %r12 | |
209 pop %r13 | |
210 pop %r14 | |
C Return value is taken from r15 before r15 is restored.
211 mov %r15, %rax | |
212 pop %r15 | |
213 ret | |
214 | |
C Reached when the two top numerator limbs are not below the divisor:
C record a return value of 1 and subtract the divisor from r9:r10.
215 L(23): inc R32(%r15) | |
216 sub %r8, %r10 | |
217 sbb %r11, %r9 | |
218 jmp L(2) | |
219 | |
220 ifdef(`NEW',` | |
C Fix-up for the NEW loop, entered with the flags of cmp r11, rbx.
C dl = 1 when rbx is above d1, al = 1 when r14 is not below d0.
C With both clear the remainder is below the divisor and nothing is done.
221 L(fix): seta %dl | |
222 cmp %r8, %r14 | |
223 setae %al | |
224 orb %dl, %al | |
225 je L(bck) | |
C Otherwise raise the quotient limb by one and subtract the divisor.
226 inc %rdi | |
227 sub %r8, %r14 | |
228 sbb %r11, %rbx | |
229 jmp L(bck) | |
230 ',` | |
C Fix-up for the other loop, entered with the flags of cmp rax, r11.
C Below means d1 is strictly less than the high remainder limb, so the
C correction is always applied. On equality the low limbs decide.
231 L(fix): jb L(88) | |
232 cmp %r8, %r9 | |
233 jb L(bck) | |
C Raise the quotient limb by one and subtract the divisor from rax:r9.
234 L(88): inc %r10 | |
235 sub %r8, %r9 | |
236 sbb %r11, %rax | |
237 jmp L(bck) | |
238 ') | |
239 EPILOGUE() | |
OLD | NEW |