OLD | NEW |
| (Empty) |
1 dnl x86 mpn_addmul_1 and mpn_submul_1 (for 386 and 486) -- Multiply a limb | |
2 dnl vector with a limb and add the result to, or subtract it from, a second limb vector. | |
3 | |
4 dnl Copyright 1992, 1994, 1997, 1999, 2000, 2001, 2002, 2005 Free Software | |
5 dnl Foundation, Inc. | |
6 dnl | |
7 dnl This file is part of the GNU MP Library. | |
8 dnl | |
9 dnl The GNU MP Library is free software; you can redistribute it and/or | |
10 dnl modify it under the terms of the GNU Lesser General Public License as | |
11 dnl published by the Free Software Foundation; either version 3 of the | |
12 dnl License, or (at your option) any later version. | |
13 dnl | |
14 dnl The GNU MP Library is distributed in the hope that it will be useful, | |
15 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 dnl Lesser General Public License for more details. | |
18 dnl | |
19 dnl You should have received a copy of the GNU Lesser General Public License | |
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. | |
21 | |
22 include(`../config.m4') | |
23 | |
24 | |
25 C cycles/limb | |
26 C P5: 14.75 | |
27 C P6 model 0-8,10-12 7.5 | |
28 C P6 model 9 (Banias) | |
29 C P6 model 13 (Dothan) 6.75 | |
30 C P4 model 0 (Willamette) 24.0 | |
31 C P4 model 1 (?) 24.0 | |
32 C P4 model 2 (Northwood) 24.0 | |
33 C P4 model 3 (Prescott) | |
34 C P4 model 4 (Nocona) | |
35 C K6: 12.5 | |
36 C K7: 5.25 | |
37 C K8: | |
38 | |
39 | |
C Select which operation this file is assembled as.  OPERATION_addmul_1 is
C tested first, then OPERATION_submul_1 (presumably one of them is supplied
C by the build; confirm against the configure machinery).  The chosen branch
C defines:
C   M4_inst        the instruction applied to each dst limb, addl or subl
C   M4_function_1  the name of the function defined below
C If neither symbol is defined, m4_error is invoked with the message below.
40 ifdef(`OPERATION_addmul_1',` | |
41 define(M4_inst, addl) | |
42 define(M4_function_1, mpn_addmul_1) | |
43 | |
44 ',`ifdef(`OPERATION_submul_1',` | |
45 define(M4_inst, subl) | |
46 define(M4_function_1, mpn_submul_1) | |
47 | |
48 ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 | |
49 ')')') | |
50 | |
C NOTE(review): MULFUNC_PROLOGUE is defined elsewhere; it presumably announces
C the set of functions this one source file can provide.  Confirm.
51 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) | |
52 | |
53 | |
54 C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, | |
55 C mp_limb_t mult); | |
56 | |
C Stack locations of the four arguments, in prototype order: dst at
C FRAME+4(%esp), then src, size and mult at 4-byte steps above it.
C FRAME is set with deflit inside the function (0 at entry, 16 after the four
C pushl instructions), so each name keeps addressing the same argument slot
C after the pushes have moved %esp.
57 define(PARAM_MULTIPLIER, `FRAME+16(%esp)') | |
58 define(PARAM_SIZE, `FRAME+12(%esp)') | |
59 define(PARAM_SRC, `FRAME+8(%esp)') | |
60 define(PARAM_DST, `FRAME+4(%esp)') | |
61 | |
62 TEXT | |
63 ALIGN(8) | |
64 | |
C M4_function_1 (mpn_addmul_1 or mpn_submul_1, per the OPERATION_ selection).
C
C For each of the size limbs at src: multiply it by mult and apply M4_inst
C (addl or subl) with the low product limb to the limb at the same index in
C dst, carrying the high product limb plus any carry/borrow into the next
C position.  The final carry limb is returned in %eax.
C
C Register use:
C   %edi        dst pointer, advanced as limbs are updated
C   %esi        src pointer, advanced as limbs are read
C   %ecx        loop counter
C   %ebx        carry limb (cylimb) between limbs; value returned at the end
C   %ebp        second carry limb, alternating with %ebx in the unrolled loop
C   %eax, %edx  low and high halves of each mull product
C
C %edi, %esi, %ebx and %ebp are saved on entry and restored before ret.
65 PROLOGUE(M4_function_1) | |
66 deflit(`FRAME',0) | |
67 | |
68 pushl %edi | |
69 pushl %esi | |
70 pushl %ebx | |
71 pushl %ebp | |
C Four 4-byte pushes so far, hence FRAME becomes 16 for the PARAM_ offsets.
72 deflit(`FRAME',16) | |
73 | |
74 movl PARAM_DST,%edi | |
75 movl PARAM_SRC,%esi | |
76 movl PARAM_SIZE,%ecx | |
77 | |
C Start with a zero carry limb.  The first size mod 4 limbs are handled one
C at a time in L(oop0), leaving a multiple of 4 for the unrolled L(oop).
78 xorl %ebx,%ebx | |
79 andl $3,%ecx | |
80 jz L(end0) | |
81 | |
C Single-limb loop, runs size mod 4 times.
82 L(oop0): | |
83 movl (%esi),%eax | |
C %edx:%eax = src limb * mult
84 mull PARAM_MULTIPLIER | |
85 leal 4(%esi),%esi | |
C Add the incoming carry limb to the low product limb.
86 addl %ebx,%eax | |
C movl, not xorl, to zero %ebx: the carry flag from the addl above must
C survive until the adcl below.
87 movl $0,%ebx | |
C %ebx is zero here, so this just adds the carry flag into the high limb.
88 adcl %ebx,%edx | |
C Apply addl or subl to the dst limb; its carry/borrow feeds the next adcl.
89 M4_inst %eax,(%edi) | |
90 adcl %edx,%ebx C propagate carry into cylimb | |
91 | |
92 leal 4(%edi),%edi | |
93 decl %ecx | |
94 jnz L(oop0) | |
95 | |
C Reload size and run the 4-way unrolled loop size/4 times, if any remain.
96 L(end0): | |
97 movl PARAM_SIZE,%ecx | |
98 shrl $2,%ecx | |
99 jz L(end) | |
100 | |
101 ALIGN(8) | |
C Unrolled loop: four limbs per pass.  %ebx and %ebp take turns holding the
C value about to be applied to dst and the carry limb being accumulated.
C Each M4_inst is placed after the following mull so that the carry/borrow
C it produces is consumed by the adcl immediately after it.
102 L(oop): movl (%esi),%eax | |
103 mull PARAM_MULTIPLIER | |
C %ebx = incoming carry limb + low product; %ebp = high product + carry flag.
104 addl %eax,%ebx | |
105 movl $0,%ebp | |
106 adcl %edx,%ebp | |
107 | |
108 movl 4(%esi),%eax | |
109 mull PARAM_MULTIPLIER | |
C Limb 0: apply the pending value to dst, then fold in the new product.
110 M4_inst %ebx,(%edi) | |
111 adcl %eax,%ebp C new lo + cylimb | |
112 movl $0,%ebx | |
113 adcl %edx,%ebx | |
114 | |
115 movl 8(%esi),%eax | |
116 mull PARAM_MULTIPLIER | |
C Limb 1.
117 M4_inst %ebp,4(%edi) | |
118 adcl %eax,%ebx C new lo + cylimb | |
119 movl $0,%ebp | |
120 adcl %edx,%ebp | |
121 | |
122 movl 12(%esi),%eax | |
123 mull PARAM_MULTIPLIER | |
C Limb 2.
124 M4_inst %ebx,8(%edi) | |
125 adcl %eax,%ebp C new lo + cylimb | |
126 movl $0,%ebx | |
127 adcl %edx,%ebx | |
128 | |
C Limb 3: no further product in this pass, so only the carry/borrow from
C the M4_inst is added into the carry limb.
129 M4_inst %ebp,12(%edi) | |
130 adcl $0,%ebx C propagate carry into cylimb | |
131 | |
C Advance both pointers by four limbs (16 bytes).
132 leal 16(%esi),%esi | |
133 leal 16(%edi),%edi | |
134 decl %ecx | |
135 jnz L(oop) | |
136 | |
C Return the final carry limb.
137 L(end): movl %ebx,%eax | |
138 | |
C Restore the saved registers in reverse order of the pushes above.
139 popl %ebp | |
140 popl %ebx | |
141 popl %esi | |
142 popl %edi | |
143 ret | |
144 | |
145 EPILOGUE() | |
OLD | NEW |