OLD | NEW |
| (Empty) |
1 dnl AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1. | |
2 | |
3 dnl Copyright 2000, 2001, 2002, 2004, 2005, 2008 Free Software Foundation, | |
4 dnl Inc. | |
5 dnl | |
6 dnl This file is part of the GNU MP Library. | |
7 dnl | |
8 dnl The GNU MP Library is free software; you can redistribute it and/or | |
9 dnl modify it under the terms of the GNU Lesser General Public License as | |
10 dnl published by the Free Software Foundation; either version 3 of the | |
11 dnl License, or (at your option) any later version. | |
12 dnl | |
13 dnl The GNU MP Library is distributed in the hope that it will be useful, | |
14 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 dnl Lesser General Public License for more details. | |
17 dnl | |
18 dnl You should have received a copy of the GNU Lesser General Public License | |
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. | |
20 | |
21 include(`../config.m4') | |
22 | |
23 | |
24 C cycles/limb | |
25 C Athlon: 1 | |
26 C Hammer: 1 | |
27 | |
28 | |
29 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size) | |
30 C | |
31 C The loop form below and the 64 byte code alignment seem necessary for the | |
32 C claimed speed. This is a bit strange, since normally k7 isn't very | |
33 C sensitive to such things. Perhaps there has to be 6 instructions in the | |
34 C first 16 bytes for the BTB entry or something. | |
35 | |
36 defframe(PARAM_SIZE, 8) | |
37 defframe(PARAM_SRC, 4) | |
38 | |
39 dnl re-use parameter space: SAVE_EDI names the same slot as PARAM_SIZE, so the size must be read before %edi is stored there | |
40 define(SAVE_EDI, `PARAM_SIZE') | |
41 | |
42 TEXT | |
43 ALIGN(64) | |
44 PROLOGUE(mpn_mod_34lsub1) | |
45 deflit(`FRAME',0) | |
46 C Fold {src,size} into one limb in %eax that is congruent to it mod 2^24-1 (the sum is not fully reduced) | |
47 movl PARAM_SIZE, %ecx C ecx = size | |
48 movl PARAM_SRC, %edx C edx = src | |
49 | |
50 subl $2, %ecx C ecx = size-2, flags pick the path below | |
51 ja L(three_or_more) C size > 2 | |
52 | |
53 movl (%edx), %eax C eax = src[0], mov keeps the subl flags | |
54 jb L(one) C borrow, size < 2: return src[0] unchanged | |
55 | |
56 movl 4(%edx), %ecx C size is 2 here, ecx = src[1] | |
57 movl %eax, %edx | |
58 shrl $24, %eax C src[0] high 8 bits | |
59 | |
60 andl $0xFFFFFF, %edx C src[0] low 24 bits | |
61 addl %edx, %eax | |
62 movl %ecx, %edx | |
63 | |
64 andl $0xFFFF, %ecx C src[1] low 16 bits | |
65 shrl $16, %edx C src[1] high | |
66 addl %edx, %eax | |
67 | |
68 shll $8, %ecx C src[1] low, moved up to weight 2^8 | |
69 addl %ecx, %eax | |
70 | |
71 L(one): | |
72 ret | |
73 | |
74 | |
75 L(three_or_more): | |
76 C eax | |
77 C ebx | |
78 C ecx size-2 | |
79 C edx src | |
80 C esi | |
81 C edi | |
82 C three accumulators, one per limb index 0, 1, 2 mod 3 (2^32, 2^64, 2^96 are 2^8, 2^16, 1 mod 2^24-1) | |
83 pushl %ebx FRAME_pushl() | |
84 xorl %eax, %eax | |
85 xorl %ebx, %ebx | |
86 | |
87 movl %edi, SAVE_EDI C size was already loaded from this slot | |
88 pushl %esi FRAME_pushl() | |
89 xorl %esi, %esi C and clear carry flag | |
90 | |
91 | |
92 C code offset 0x40 at this point | |
93 L(top): | |
94 C eax acc 0mod3 | |
95 C ebx acc 1mod3 | |
96 C ecx counter, limbs | |
97 C edx src | |
98 C esi acc 2mod3 | |
99 C edi | |
100 C the carry runs eax -> ebx -> esi -> eax through the adcl chain; leal and decl leave it untouched | |
101 leal 24(%edx), %edx C advance src by 6 limbs | |
102 leal -2(%ecx), %ecx C with the decl below: counter -= 3 | |
103 adcl -24(%edx), %eax | |
104 adcl -20(%edx), %ebx | |
105 adcl -16(%edx), %esi | |
106 | |
107 decl %ecx | |
108 jng L(done_loop) C at most 2 limbs remain | |
109 | |
110 leal -2(%ecx), %ecx C with the decl below: counter -= 3 | |
111 adcl -12(%edx), %eax | |
112 adcl -8(%edx), %ebx | |
113 adcl -4(%edx), %esi | |
114 | |
115 decl %ecx | |
116 jg L(top) C 3 or more limbs remain | |
117 | |
118 | |
119 leal 12(%edx), %edx C so -12(%edx) is the next unread limb, as on the jng exit | |
120 | |
121 | |
122 L(done_loop): | |
123 C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively | |
124 | |
125 incl %ecx | |
126 movl $0xFFFFFFFF, %edi C pending carry is out of acc 2mod3, worth 1 | |
127 js L(combine) C ecx was -2, no more limbs | |
128 | |
129 adcl -12(%edx), %eax | |
130 decl %ecx | |
131 movl $0xFFFFFF00, %edi C pending carry is out of acc 0mod3, worth 2^8 | |
132 js L(combine) C ecx was -1, that was the last limb | |
133 | |
134 adcl -8(%edx), %ebx | |
135 movl $0xFFFF0000, %edi C pending carry is out of acc 1mod3, worth 2^16 | |
136 | |
137 | |
138 L(combine): | |
139 C eax acc 0mod3 | |
140 C ebx acc 1mod3 | |
141 C ecx | |
142 C edx | |
143 C esi acc 2mod3 | |
144 C edi mask | |
145 | |
146 sbbl %ecx, %ecx C carry: ecx = -1 if set, else 0 | |
147 movl %eax, %edx C 0mod3 | |
148 shrl $24, %eax C 0mod3 high | |
149 | |
150 andl %edi, %ecx C carry masked: minus the carry's weight, or 0 | |
151 andl $0x00FFFFFF, %edx C 0mod3 low | |
152 movl %ebx, %edi C 1mod3 | |
153 | |
154 subl %ecx, %eax C apply carry | |
155 shrl $16, %ebx C 1mod3 high | |
156 andl $0xFFFF, %edi | |
157 | |
158 addl %edx, %eax C apply 0mod3 low | |
159 movl %esi, %edx C 2mod3 | |
160 shll $8, %edi C 1mod3 low | |
161 | |
162 addl %ebx, %eax C apply 1mod3 high | |
163 shrl $8, %esi C 2mod3 high | |
164 movzbl %dl, %edx C 2mod3 low | |
165 | |
166 addl %edi, %eax C apply 1mod3 low | |
167 shll $16, %edx C 2mod3 low | |
168 | |
169 addl %esi, %eax C apply 2mod3 high | |
170 popl %esi FRAME_popl() | |
171 | |
172 movl SAVE_EDI, %edi | |
173 addl %edx, %eax C apply 2mod3 low | |
174 popl %ebx FRAME_popl() | |
175 | |
176 ret | |
177 | |
178 EPILOGUE() | |
OLD | NEW |