Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(110)

Side by Side Diff: gcc/gmp/mpn/x86/pentium4/mmx/popham.asm

Issue 3050029: [gcc] GCC 4.5.0=>4.5.1 (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/nacl-toolchain.git
Patch Set: Created 10 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « gcc/gmp/mpn/x86/pentium4/copyd.asm ('k') | gcc/gmp/mpn/x86/pentium4/sse2/add_n.asm » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 dnl Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
2 dnl hamming distance.
3
4 dnl Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc.
5 dnl
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or
9 dnl modify it under the terms of the GNU Lesser General Public License as
10 dnl published by the Free Software Foundation; either version 3 of the
11 dnl License, or (at your option) any later version.
12 dnl
13 dnl The GNU MP Library is distributed in the hope that it will be useful,
14 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
15 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 dnl Lesser General Public License for more details.
17 dnl
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20
21 include(`../config.m4')
22
23
24 C popcount hamdist
25 C P3 model 9 (Banias) ? ?
26 C P3 model 13 (Dothan) 6 6
27 C P4 model 0 (Willamette)
28 C P4 model 1 (?)
29 C P4 model 2 (Northwood) 8 9
30 C P4 model 3 (Prescott) 8 9
31 C P4 model 4 (Nocona)
32
33 C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
34 C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
35 C
36 C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
37 C Two movd's and a punpckldq seem to be the same speed as an aligned movq,
38 C and using them saves fiddling about with alignment testing on entry.
39 C
40 C For popcount there are 13 mmx instructions in the loop, so perhaps 6.5 c/l
41 C might be possible, but 8 c/l relying on out-of-order execution is already
42 C quite reasonable.
43 C Raise an m4_error unless OPERATION_popcount or OPERATION_hamdist is defined.
44 ifdef(`OPERATION_popcount',,
45 `ifdef(`OPERATION_hamdist',,
46 `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
47 ')')')
48 C HAM(text) expands to text when OPERATION_hamdist is defined and to nothing otherwise; m4_assert_numargs(1) presumably enforces the single argument.
49 define(HAM,
50 m4_assert_numargs(1)
51 `ifdef(`OPERATION_hamdist',`$1')')
52 C POP(text) expands to text when OPERATION_popcount is defined and to nothing otherwise; m4_assert_numargs(1) presumably enforces the single argument.
53 define(POP,
54 m4_assert_numargs(1)
55 `ifdef(`OPERATION_popcount',`$1')')
56 C Per-operation parameter offsets and function name; the offsets 4, 8, 12 follow the argument order of the C prototypes above.
57 HAM(`
58 defframe(PARAM_SIZE, 12) C size is the third argument of mpn_hamdist
59 defframe(PARAM_SRC2, 8) C src2 is the second argument
60 defframe(PARAM_SRC, 4) C src is the first argument
61 define(M4_function,mpn_hamdist)
62 ')
63 POP(`
64 defframe(PARAM_SIZE, 8) C size is the second argument of mpn_popcount
65 defframe(PARAM_SRC, 4) C src is the first argument
66 define(M4_function,mpn_popcount)
67 ')
68 C NOTE(review): MULFUNC_PROLOGUE presumably declares that this one file can provide both listed functions - confirm against the macro definition.
69 MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
70
71 C Non-PIC builds only: the three 64-bit bit masks in read-only data, each written as two identical 32-bit words.
72 ifdef(`PIC',,`
73 dnl non-PIC
74 RODATA
75 ALIGN(8) C the masks are fetched with 8-byte movq loads
76 L(rodata_AAAAAAAAAAAAAAAA): C loaded into mm7; selects the high bit of every bit pair
77 .long 0xAAAAAAAA
78 .long 0xAAAAAAAA
79 L(rodata_3333333333333333): C loaded into mm6; selects the low two bits of every nibble
80 .long 0x33333333
81 .long 0x33333333
82 L(rodata_0F0F0F0F0F0F0F0F): C loaded into mm5; selects the low nibble of every byte
83 .long 0x0F0F0F0F
84 .long 0x0F0F0F0F
85 ')
86 C Body shared by mpn_popcount and mpn_hamdist: M4_function supplies the name, HAM() lines add the src2 handling, and the bit count is returned in eax.
87 TEXT
88 ALIGN(16)
89
90 PROLOGUE(M4_function)
91 deflit(`FRAME',0)
92
93 movl PARAM_SIZE, %ecx C ecx = size in limbs
94 movl PARAM_SRC, %eax C eax = src pointer
95 C Set up the masks in mm7, mm6, mm5: PIC builds them from immediates, non-PIC loads them from the rodata table.
96 ifdef(`PIC',`
97 movl $0xAAAAAAAA, %edx
98 movd %edx, %mm7
99 punpckldq %mm7, %mm7 C copy the low dword into the high dword too
100
101 movl $0x33333333, %edx
102 movd %edx, %mm6
103 punpckldq %mm6, %mm6
104
105 movl $0x0F0F0F0F, %edx
106 movd %edx, %mm5
107 punpckldq %mm5, %mm5
108 C edx was scratch for the mask immediates, so src2 is only fetched into it now
109 HAM(` movl PARAM_SRC2, %edx')
110
111 ',`
112 dnl non-PIC
113 HAM(` movl PARAM_SRC2, %edx')
114 movq L(rodata_AAAAAAAAAAAAAAAA), %mm7
115 movq L(rodata_3333333333333333), %mm6
116 movq L(rodata_0F0F0F0F0F0F0F0F), %mm5
117 ')
118
119 pxor %mm4, %mm4 C zero
120 pxor %mm0, %mm0 C total
121 C ecx becomes size-1: two or more limbs enter the two-limb loop, a single limb falls into L(last)
122 subl $1, %ecx
123 ja L(top) C taken when size > 1 (unsigned, no borrow and nonzero)
124 C NOTE(review): size 0 also falls through here with ecx = -1, so the load below reads the limb at src-4; presumably callers guarantee size >= 1
125 L(last):
126 movd (%eax,%ecx,4), %mm1 C src high limb
127 HAM(` movd (%edx,%ecx,4), %mm2
128 pxor %mm2, %mm1
129 ')
130 jmp L(loaded)
131
132
133 L(top):
134 C eax src
135 C ebx
136 C ecx counter, size-1 to 2 or 1, inclusive
137 C edx [hamdist] src2
138 C
139 C mm0 total (low dword)
140 C mm1 (scratch)
141 C mm2 (scratch)
142 C mm3
143 C mm4 0x0000000000000000
144 C mm5 0x0F0F0F0F0F0F0F0F
145 C mm6 0x3333333333333333
146 C mm7 0xAAAAAAAAAAAAAAAA
147 C Fetch two 32-bit limbs with separate movd loads and merge them into one 64-bit value in mm1
148 movd (%eax), %mm1
149 movd 4(%eax), %mm2
150 punpckldq %mm2, %mm1
151 addl $8, %eax
152
153 HAM(` movd (%edx), %mm2
154 movd 4(%edx), %mm3
155 punpckldq %mm3, %mm2
156 pxor %mm2, %mm1
157 addl $8, %edx
158 ')
159 C mm1 now holds the bits to count; reduce them to per-pair, then per-nibble, then per-byte counts
160 L(loaded):
161 movq %mm7, %mm2
162 pand %mm1, %mm2
163 psrlq $1, %mm2
164 psubd %mm2, %mm1 C bit pairs
165
166 movq %mm6, %mm2
167 pand %mm1, %mm2
168 psrlq $2, %mm1
169 pand %mm6, %mm1
170 paddd %mm2, %mm1 C nibbles
171
172 movq %mm5, %mm2
173 pand %mm1, %mm2
174 psrlq $4, %mm1
175 pand %mm5, %mm1
176 paddd %mm2, %mm1 C bytes
177
178 psadbw( %mm4, %mm1) C psadbw against the zero in mm4 adds up the eight byte counts
179 paddd %mm1, %mm0 C to total
180
181 subl $2, %ecx C step the counter by the two-limb stride
182 jg L(top) C ecx > 0 means at least two limbs remain
183
184 C ecx is 0 or -1 representing respectively 1 or 0 further limbs (it is -2 when arriving here after L(last), which also means no further limbs)
185 jz L(last)
186
187
188 movd %mm0, %eax C return the low dword of the total
189 emms
190 ret
191
192 EPILOGUE()
OLDNEW
« no previous file with comments | « gcc/gmp/mpn/x86/pentium4/copyd.asm ('k') | gcc/gmp/mpn/x86/pentium4/sse2/add_n.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698