OLD | NEW |
| (Empty) |
1 dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1. | |
2 | |
3 dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007 Free Software Foundation, | |
4 dnl Inc. | |
5 dnl | |
6 dnl This file is part of the GNU MP Library. | |
7 dnl | |
8 dnl The GNU MP Library is free software; you can redistribute it and/or | |
9 dnl modify it under the terms of the GNU Lesser General Public License as | |
10 dnl published by the Free Software Foundation; either version 3 of the | |
11 dnl License, or (at your option) any later version. | |
12 dnl | |
13 dnl The GNU MP Library is distributed in the hope that it will be useful, | |
14 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 dnl Lesser General Public License for more details. | |
17 dnl | |
18 dnl You should have received a copy of the GNU Lesser General Public License | |
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. | |
20 | |
21 include(`../config.m4') | |
22 | |
23 | |
24 C cycles/limb | |
25 C K8,K9: 1.0 | |
26 C K10: 1.12 | |
27 C P4: 3.25 | |
28 C P6-15 (Core2): 1.5 | |
29 C P6-28 (Atom): 2.5 | |
30 | |
31 | |
32 C INPUT PARAMETERS | |
33 C up rdi | |
34 C n rsi | |
35 | |
36 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n) | |
37 | |
38 C TODO | |
39 C * Apply the movzwl tricks to the x86/k7 code | |
40 C * Review feed-in and wind-down code. In particular, try to avoid adcq and | |
41 C sbbq to placate Pentium4. | |
42 C * More unrolling and/or index addressing could bring time to under 1 c/l | |
43 C for Athlon64, approaching 0.67 c/l seems possible. | |
44 C * There are recurrences on the carry registers (r8, r9, r10) that might | |
45 C be the limiting factor for the Pentium4 speed. Splitting these into 6 | |
46 C registers would help. | |
47 C * For ultimate Athlon64 performance, a sequence like this might be best. | |
48 C It should reach 0.5 c/l (limited by L1 cache bandwidth). | |
49 C | |
50 C addq (%rdi), %rax | |
51 C adcq 8(%rdi), %rcx | |
52 C adcq 16(%rdi), %rdx | |
53 C adcq $0, %r8 | |
54 C addq 24(%rdi), %rax | |
55 C adcq 32(%rdi), %rcx | |
56 C adcq 40(%rdi), %rdx | |
57 C adcq $0, %r8 | |
58 C ... | |
59 | |
60 | |
61 ASM_START() | |
62 TEXT | |
63 ALIGN(32) | |
64 PROLOGUE(mpn_mod_34lsub1) | |
65 | |
66 mov $0x0000FFFFFFFFFFFF, %r11 C r11 = 2^48-1: the modulus, also the low-48-bit mask | |
67 | |
68 sub $2, %rsi C dispatch on n: n <= 2 is handled without the loop | |
69 ja L(gt2) C n > 2 | |
70 | |
71 mov (%rdi), %rax C n == 1 result is src[0] itself, unreduced | |
72 nop C padding; presumably branch-target alignment -- unverified | |
73 jb L(1) C n == 1 | |
74 | |
75 mov 8(%rdi), %rsi C n == 2: fold both limbs at their 2^48-1 weights | |
76 mov %rax, %rdx | |
77 shr $48, %rax C src[0] low | |
78 | |
79 and %r11, %rdx C src[0] high | |
80 add %rdx, %rax | |
81 mov %esi, %edx | |
82 | |
83 shr $32, %rsi C src[1] high | |
84 add %rsi, %rax | |
85 | |
86 shl $16, %rdx C src[1] low | |
87 add %rdx, %rax | |
88 | |
89 L(1): ret | |
90 | |
91 | |
92 ALIGN(16) | |
93 L(gt2): xor %eax, %eax C rax,rcx,rdx: accumulators for limbs with index 0,1,2 mod 3 | |
94 xor %ecx, %ecx | |
95 xor %edx, %edx | |
96 xor %r8, %r8 C r10,r8,r9: carry counters for rax,rcx,rdx respectively | |
97 xor %r9, %r9 | |
98 xor %r10, %r10 | |
99 | |
100 L(top): add (%rdi), %rax C three independent limb sums... | |
101 adc $0, %r10 C ...each followed by counting its carry-out | |
102 add 8(%rdi), %rcx | |
103 adc $0, %r8 | |
104 add 16(%rdi), %rdx | |
105 adc $0, %r9 | |
106 | |
107 sub $3,%rsi | |
108 jng L(end) C <= 0: at most 2 limbs remain | |
109 | |
110 add 24(%rdi), %rax C second unrolled half: 6 limbs per iteration | |
111 adc $0, %r10 | |
112 add 32(%rdi), %rcx | |
113 adc $0, %r8 | |
114 add 40(%rdi), %rdx | |
115 lea 48(%rdi), %rdi C advance src pointer; lea preserves CF for the adc below | |
116 adc $0, %r9 | |
117 | |
118 sub $3,%rsi | |
119 jg L(top) | |
120 | |
121 | |
122 add $-24, %rdi C after either exit path, 24(%rdi) is the next unsummed limb | |
123 L(end): add %r9, %rax C fold carry counters into the accumulators | |
124 adc %r10, %rcx | |
125 adc %r8, %rdx C CF from this adc is still pending below | |
126 | |
127 inc %rsi C preserves CF; negative => no tail limbs remain | |
128 mov $0x1, %r10d C pending carry out of rdx has weight 2^192 = 1 (mod 2^48-1) | |
129 js L(combine) C 0 tail limbs | |
130 | |
131 mov $0x10000, %r10d C carry out of rax has weight 2^64 = 2^16 (mod 2^48-1) | |
132 adc 24(%rdi), %rax | |
133 dec %rsi C preserves CF | |
134 js L(combine) C 1 tail limb | |
135 | |
136 adc 32(%rdi), %rcx C 2 tail limbs | |
137 mov $0x100000000, %r10 C carry out of rcx has weight 2^128 = 2^32 (mod 2^48-1) | |
138 | |
139 L(combine): | |
140 sbb %rsi, %rsi C carry | |
141 mov %rax, %rdi C 0mod3 | |
142 shr $48, %rax C 0mod3 high | |
143 | |
144 and %r10, %rsi C carry masked | |
145 and %r11, %rdi C 0mod3 low | |
146 mov %ecx, %r10d C 1mod3 | |
147 | |
148 add %rsi, %rax C apply carry | |
149 shr $32, %rcx C 1mod3 high | |
150 | |
151 add %rdi, %rax C apply 0mod3 low | |
152 movzwl %dx, %edi C 2mod3 | |
153 shl $16, %r10 C 1mod3 low | |
154 | |
155 add %rcx, %rax C apply 1mod3 high | |
156 shr $16, %rdx C 2mod3 high | |
157 | |
158 add %r10, %rax C apply 1mod3 low | |
159 shl $32, %rdi C 2mod3 low | |
160 | |
161 add %rdx, %rax C apply 2mod3 high | |
162 add %rdi, %rax C apply 2mod3 low | |
163 | |
164 ret | |
165 EPILOGUE() | |
OLD | NEW |