Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(29)

Side by Side Diff: gcc/gmp/mpn/x86_64/mod_34lsub1.asm

Issue 3050029: [gcc] GCC 4.5.0=>4.5.1 (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/nacl-toolchain.git
Patch Set: Created 10 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « gcc/gmp/mpn/x86_64/lshsub_n.asm ('k') | gcc/gmp/mpn/x86_64/mul_1.asm » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
2
3 dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007 Free Software Foundation,
4 dnl Inc.
5 dnl
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or
9 dnl modify it under the terms of the GNU Lesser General Public License as
10 dnl published by the Free Software Foundation; either version 3 of the
11 dnl License, or (at your option) any later version.
12 dnl
13 dnl The GNU MP Library is distributed in the hope that it will be useful,
14 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
15 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 dnl Lesser General Public License for more details.
17 dnl
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20
21 include(`../config.m4')
22
23
24 C cycles/limb
25 C K8,K9: 1.0
26 C K10: 1.12
27 C P4: 3.25
28 C P6-15 (Core2): 1.5
29 C P6-28 (Atom): 2.5
30
31
32 C INPUT PARAMETERS
33 C up rdi
34 C n rsi
35
36 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
37
38 C TODO
39 C * Apply the movzwl tricks to the x86/k7 code
40 C * Review feed-in and wind-down code. In particular, try to avoid adcq and
41 C sbbq to placate Pentium4.
42 C * More unrolling and/or index addressing could bring time to under 1 c/l
43 C for Athlon64, approaching 0.67 c/l seems possible.
44 C * There are recurrences on the carry registers (r8, r9, r10) that might
45 C be the limiting factor for the Pentium4 speed. Splitting these into 6
46 C registers would help.
47 C * For ultimate Athlon64 performance, a sequence like this might be best.
48 C It should reach 0.5 c/l (limited by L1 cache bandwidth).
49 C
50 C addq (%rdi), %rax
51 C adcq 8(%rdi), %rcx
52 C adcq 16(%rdi), %rdx
53 C adcq $0, %r8
54 C addq 24(%rdi), %rax
55 C adcq 32(%rdi), %rcx
56 C adcq 40(%rdi), %rdx
57 C adcq $0, %r8
58 C ...
59
60
61 ASM_START()
62 TEXT
63 ALIGN(32)
64 PROLOGUE(mpn_mod_34lsub1)
65 C Folds the n limbs at up into rax as a sum of pieces weighted modulo 2^48-1; no final reduction is done.
66 mov $0x0000FFFFFFFFFFFF, %r11 C r11 = 2^48-1, mask for the low 48 bits
67
68 sub $2, %rsi C rsi = n-2; the flags from this sub drive both branches below
69 ja L(gt2) C n > 2 (unsigned): use the general loop
70
71 mov (%rdi), %rax C rax = src[0]; mov leaves the flags untouched
72 nop
73 jb L(1) C borrow from n-2, so n < 2: return src[0] as is
74
75 mov 8(%rdi), %rsi C n == 2: rsi = src[1]
76 mov %rax, %rdx
77 shr $48, %rax C src[0] high (top 16 bits)
78
79 and %r11, %rdx C src[0] low (bottom 48 bits)
80 add %rdx, %rax
81 mov %esi, %edx C rdx = low 32 bits of src[1], zero-extended
82
83 shr $32, %rsi C src[1] high (bit 96 of the operand, 96 = 2*48)
84 add %rsi, %rax
85
86 shl $16, %rdx C src[1] low (bit 64 of the operand, 64 = 48+16)
87 add %rdx, %rax
88
89 L(1): ret
90
91
92 ALIGN(16)
93 L(gt2): xor %eax, %eax C rax: sum of limbs 0mod3 (32-bit xor clears the full register)
94 xor %ecx, %ecx C rcx: sum of limbs 1mod3
95 xor %edx, %edx C rdx: sum of limbs 2mod3
96 xor %r8, %r8 C r8: count of carries out of rcx
97 xor %r9, %r9 C r9: count of carries out of rdx
98 xor %r10, %r10 C r10: count of carries out of rax
99 C Loop invariant at each sub below: rsi = (limbs not yet added) - 2
100 L(top): add (%rdi), %rax
101 adc $0, %r10
102 add 8(%rdi), %rcx
103 adc $0, %r8
104 add 16(%rdi), %rdx
105 adc $0, %r9
106
107 sub $3,%rsi C three limbs consumed
108 jng L(end) C rsi <= 0 (signed): at most two limbs remain, at 24(%rdi)
109
110 add 24(%rdi), %rax
111 adc $0, %r10
112 add 32(%rdi), %rcx
113 adc $0, %r8
114 add 40(%rdi), %rdx
115 lea 48(%rdi), %rdi C advance up by 6 limbs; lea keeps CF for the adc below
116 adc $0, %r9
117
118 sub $3,%rsi
119 jg L(top)
120
121
122 add $-24, %rdi C step back 3 limbs so leftovers sit at 24(%rdi), as on the jng exit
123 L(end): add %r9, %rax C fold each carry count into the accumulator of matching weight
124 adc %r10, %rcx
125 adc %r8, %rdx C CF = carry out of rdx, still pending
126
127 inc %rsi C inc, dec and mov leave CF intact up to the sbb at L(combine)
128 mov $0x1, %r10d C weight of a carry out of rdx: bit 192 = 4*48
129 js L(combine) C no limb left
130
131 mov $0x10000, %r10d C weight of a carry out of rax: bit 64 = 48+16
132 adc 24(%rdi), %rax C leftover 0mod3 limb plus the pending carry
133 dec %rsi
134 js L(combine) C exactly one limb was left
135
136 adc 32(%rdi), %rcx C leftover 1mod3 limb plus the carry out of rax
137 mov $0x100000000, %r10 C weight of a carry out of rcx: bit 128 = 2*48+32
138
139 L(combine):
140 sbb %rsi, %rsi C carry: rsi = 0 or all ones
141 mov %rax, %rdi C 0mod3
142 shr $48, %rax C 0mod3 high
143
144 and %r10, %rsi C carry masked: 0 or the weight chosen above
145 and %r11, %rdi C 0mod3 low
146 mov %ecx, %r10d C 1mod3 (low 32 bits, zero-extended)
147
148 add %rsi, %rax C apply carry
149 shr $32, %rcx C 1mod3 high
150
151 add %rdi, %rax C apply 0mod3 low
152 movzwl %dx, %edi C 2mod3 (low 16 bits, zero-extended)
153 shl $16, %r10 C 1mod3 low
154
155 add %rcx, %rax C apply 1mod3 high
156 shr $16, %rdx C 2mod3 high
157
158 add %r10, %rax C apply 1mod3 low
159 shl $32, %rdi C 2mod3 low
160
161 add %rdx, %rax C apply 2mod3 high
162 add %rdi, %rax C apply 2mod3 low
163
164 ret
165 EPILOGUE()
OLDNEW
« no previous file with comments | « gcc/gmp/mpn/x86_64/lshsub_n.asm ('k') | gcc/gmp/mpn/x86_64/mul_1.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698