Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(244)

Side by Side Diff: gcc/gmp/mpn/x86/pentium/mmx/rshift.asm

Issue 3050029: [gcc] GCC 4.5.0=>4.5.1 (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/nacl-toolchain.git
Patch Set: Created 10 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « gcc/gmp/mpn/x86/pentium/mmx/mul_1.asm ('k') | gcc/gmp/mpn/x86/pentium/mode1o.asm » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 dnl Intel P5 mpn_rshift -- mpn right shift.
2
3 dnl Copyright 2000, 2002 Free Software Foundation, Inc.
4 dnl
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
11 dnl
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
16 dnl
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
19
20 include(`../config.m4')
21
22
23 C P5: 1.75 cycles/limb.
24
25
26 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
27 C unsigned shift);
28 C
29 C Shift src,size right by shift many bits and store the result in dst,size.
30 C Zeros are shifted in at the left. Return the bits shifted out at the
31 C right.
32 C
33 C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
34 C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
35 C
36 C Full speed depends on source and destination being aligned. Unaligned mmx
37 C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy
38 C setups and finish-ups are done to ensure alignment for the loop.
39 C
40 C MMX shifts work out a bit faster even for the simple loop.
41
42 defframe(PARAM_SHIFT,16) C 4th argument, shift (offsets presumably relative to the entry %esp - confirm against defframe in the x86 m4 defs)
43 defframe(PARAM_SIZE, 12) C 3rd argument, size
44 defframe(PARAM_SRC, 8) C 2nd argument, src
45 defframe(PARAM_DST, 4) C 1st argument, dst
46 deflit(`FRAME',0) C nothing pushed yet
47
48 dnl Minimum 5, because the unrolled loop can't handle less.
49 deflit(UNROLL_THRESHOLD, 5) C sizes at or above this go to L(unroll)
50
51 TEXT
52 ALIGN(8)
53
54 PROLOGUE(mpn_rshift)
55 C Entry: save ebx and edi, load the four arguments, choose a code path by size.
56 pushl %ebx C ebx and edi are popped again on every exit path
57 pushl %edi
58 deflit(`FRAME',8) C two dwords pushed since entry
59
60 movl PARAM_SIZE, %eax C eax = size
61 movl PARAM_DST, %edx C edx = dst
62
63 movl PARAM_SRC, %ebx C ebx = src
64 movl PARAM_SHIFT, %ecx C ecx = shift
65
66 cmp $UNROLL_THRESHOLD, %eax
67 jae L(unroll) C size >= UNROLL_THRESHOLD, unsigned compare
68
69 decl %eax C eax = size-1, zero flag set when size is 1
70 movl (%ebx), %edi C src low limb
71
72 jnz L(simple) C flags still from the decl, size-1 != 0 uses the mmx loop
73
74 shrdl( %cl, %edi, %eax) C eax was decremented to zero, so eax becomes the bits shifted out of src[0], at its high end (return value)
75
76 shrl %cl, %edi C the single result limb, zeros shifted in at the top
77
78 movl %edi, (%edx) C dst low limb
79 popl %edi C risk of data cache bank clash
80
81 popl %ebx
82
83 ret
84
85
86 C -----------------------------------------------------------------------------
87 ALIGN(8)
88 L(simple): C sizes below UNROLL_THRESHOLD other than 1, one dst limb stored per pass
89 C eax size-1
90 C ebx src
91 C ecx shift
92 C edx dst
93 C esi
94 C edi
95 C ebp
96 deflit(`FRAME',8)
97
98 movd (%ebx), %mm5 C src[0]
99 leal (%ebx,%eax,4), %ebx C &src[size-1]
100
101 movd %ecx, %mm6 C rshift
102 leal -4(%edx,%eax,4), %edx C &dst[size-2]
103
104 psllq $32, %mm5 C src[0] in the high dword, so the psrlq after the loop leaves the shifted out bits in the low dword
105 negl %eax C eax = -(size-1), counts up to zero
106
107
108 C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
109 C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4
110 C cycles and would be 8 in a simple loop. Using mmx helps the return value
111 C and last limb calculations too.
112
113 L(simple_top):
114 C eax counter, limbs, negative
115 C ebx &src[size-1]
116 C ecx shift (not used in the loop, the return value is built in mm5)
117 C edx &dst[size-2]
118 C
119 C mm0 scratch
120 C mm5 return value
121 C mm6 shift
122
123 movq (%ebx,%eax,4), %mm0 C two adjacent src limbs
124 incl %eax
125
126 psrlq %mm6, %mm0 C low dword is now the dst limb
127
128 movd %mm0, (%edx,%eax,4) C store it, eax has already been stepped
129 jnz L(simple_top) C flags still from the incl (the mmx instructions leave them alone)
130
131
132 movd (%ebx), %mm0 C src[size-1], zero extended to a qword
133 psrlq %mm6, %mm5 C return value
134
135 psrlq %mm6, %mm0 C top result limb, zeros shifted in
136 popl %edi
137
138 movd %mm5, %eax C return value to eax
139 popl %ebx
140
141 movd %mm0, 4(%edx) C dst[size-1]
142
143 emms C empty the mmx state before returning
144
145 ret
146
147
148 C -----------------------------------------------------------------------------
149 ALIGN(8)
150 L(unroll): C size >= UNROLL_THRESHOLD: bring src and dst to 8 byte alignment, then prime the registers for the unrolled loop
151 C eax size
152 C ebx src
153 C ecx shift
154 C edx dst
155 C esi
156 C edi
157 C ebp
158 deflit(`FRAME',8)
159
160 movd (%ebx), %mm5 C src[0]
161 movl $4, %edi C mask for the two alignment tests below
162
163 movd %ecx, %mm6 C rshift
164 testl %edi, %ebx C src & 4
165
166 psllq $32, %mm5 C src[0] in the high dword, ready for the return value shift
167 jz L(start_src_aligned) C flags still from the testl (the mmx instructions leave them alone)
168
169
170 C src isn't aligned, process low limb separately (marked xxx) and
171 C step src and dst by one limb, making src aligned.
172 C
173 C source ebx
174 C --+-------+-------+-------+
175 C | xxx |
176 C --+-------+-------+-------+
177 C 4mod8 0mod8 4mod8
178 C
179 C dest edx
180 C --+-------+-------+
181 C | | xxx |
182 C --+-------+-------+
183
184 movq (%ebx), %mm0 C unaligned load
185
186 psrlq %mm6, %mm0 C low dword is the first dst limb
187 addl $4, %ebx
188
189 decl %eax C one limb fewer left for the code below
190
191 movd %mm0, (%edx)
192 addl $4, %edx
193 L(start_src_aligned):
194
195
196 movq (%ebx), %mm1 C first src qword at the now aligned ebx
197 testl %edi, %edx C dst & 4
198
199 psrlq %mm6, %mm5 C retval
200 jz L(start_dst_aligned) C flags still from the testl
201
202 C dst isn't aligned, add 4 to make it so, and pretend the shift is
203 C 32 bits extra. Low limb of dst (marked xxx) handled here
204 C separately.
205 C
206 C source ebx
207 C --+-------+-------+
208 C | mm1 |
209 C --+-------+-------+
210 C 4mod8 0mod8
211 C
212 C dest edx
213 C --+-------+-------+-------+
214 C | xxx |
215 C --+-------+-------+-------+
216 C 4mod8 0mod8 4mod8
217
218 movq %mm1, %mm0
219 addl $32, %ecx C new shift
220
221 psrlq %mm6, %mm0 C mm6 is still the original shift here
222
223 movd %ecx, %mm6 C mm6 = shift+32 from now on
224
225 movd %mm0, (%edx)
226 addl $4, %edx
227 L(start_dst_aligned):
228
229
230 movq 8(%ebx), %mm3 C second src qword
231 negl %ecx
232
233 movq %mm3, %mm2 C mm2 src qword
234 addl $64, %ecx C ecx = 64 minus the shift held in mm6
235
236 movd %ecx, %mm7 C lshift
237 psrlq %mm6, %mm1 C low part of the first dst qword
238
239 leal -12(%ebx,%eax,4), %ebx C ebx = src + 4*size - 12 bytes
240 leal -20(%edx,%eax,4), %edx C edx = dst + 4*size - 20 bytes
241
242 psllq %mm7, %mm3 C high part of the first dst qword
243 subl $7, %eax C size-7
244
245 por %mm1, %mm3 C mm3 ready to store
246 negl %eax C -(size-7)
247
248 jns L(finish) C -(size-7) >= 0, skip the loop
249
250
251 C This loop is the important bit, the rest is just support. Careful
252 C instruction scheduling achieves the claimed 1.75 c/l. The
253 C relevant parts of the pairing rules are:
254 C
255 C - mmx loads and stores execute only in the U pipe
256 C - only one mmx shift in a pair
257 C - wait one cycle before storing an mmx register result
258 C - the usual address generation interlock
259 C
260 C Two qword calculations are slightly interleaved. The instructions
261 C marked "C" belong to the second qword, and the "C prev" one is for
262 C the second qword from the previous iteration.
263
264 ALIGN(8)
265 L(unroll_loop): C each pass loads two src qwords and stores two dst qwords, the first store being the qword left pending in mm3
266 C eax counter, limbs, negative, stepped by 4 each pass
267 C ebx src + 4*size - 12 bytes (src and size as left by the setup code)
268 C ecx 64 minus the mm6 shift count, same value as mm7 (not used in the loop)
269 C edx dst + 4*size - 20 bytes (dst and size as left by the setup code)
270 C esi
271 C edi
272 C
273 C mm0 scratch
274 C mm1 scratch
275 C mm2 src qword from -8(%ebx,%eax,4)
276 C mm3 dst qword ready to store to -8(%edx,%eax,4)
277 C
278 C mm5 return value
279 C mm6 rshift
280 C mm7 lshift
281
282 movq (%ebx,%eax,4), %mm0
283 psrlq %mm6, %mm2
284
285 movq %mm0, %mm1
286 psllq %mm7, %mm0
287
288 movq %mm3, -8(%edx,%eax,4) C prev
289 por %mm2, %mm0
290
291 movq 8(%ebx,%eax,4), %mm3 C
292 psrlq %mm6, %mm1 C
293
294 movq %mm0, (%edx,%eax,4)
295 movq %mm3, %mm2 C
296
297 psllq %mm7, %mm3 C
298 addl $4, %eax
299
300 por %mm1, %mm3 C
301 js L(unroll_loop)
302
303
304 L(finish): C deal with the limbs left over after the unrolled loop
305 C eax 0 to 3 representing respectively 3 to 0 limbs remaining
306
307 testb $2, %al C bit 1 set means fewer than two limbs remain
308
309 jnz L(finish_no_two)
310
311 movq (%ebx,%eax,4), %mm0 C two more src limbs
312 psrlq %mm6, %mm2
313
314 movq %mm0, %mm1
315 psllq %mm7, %mm0
316
317 movq %mm3, -8(%edx,%eax,4) C prev
318 por %mm2, %mm0
319
320 movq %mm1, %mm2 C the qword just loaded becomes the src prev qword
321 movq %mm0, %mm3 C the qword just formed becomes the pending dst qword
322
323 addl $2, %eax C two limbs done
324 L(finish_no_two):
325
326
327 C eax 2 or 3 representing respectively 1 or 0 limbs remaining
328 C
329 C mm2 src prev qword, from -8(%ebx,%eax,4)
330 C mm3 dst qword, for -8(%edx,%eax,4)
331
332 testb $1, %al C bit 0 set means no limbs remain
333 popl %edi
334
335 movd %mm5, %eax C retval
336 jnz L(finish_zero) C flags still from the testb
337
338
339 C One extra limb, destination was aligned.
340 C
341 C source ebx
342 C +-------+---------------+--
343 C | | mm2 |
344 C +-------+---------------+--
345 C
346 C dest edx
347 C +-------+---------------+---------------+--
348 C | | | mm3 |
349 C +-------+---------------+---------------+--
350 C
351 C mm6 = shift
352 C mm7 = ecx = 64-shift
353
354
355 C One extra limb, destination was unaligned.
356 C
357 C source ebx
358 C +-------+---------------+--
359 C | | mm2 |
360 C +-------+---------------+--
361 C
362 C dest edx
363 C +---------------+---------------+--
364 C | | mm3 |
365 C +---------------+---------------+--
366 C
367 C mm6 = shift+32
368 C mm7 = ecx = 64-(shift+32)
369
370
371 C In both cases there's one extra limb of src to fetch and combine
372 C with mm2 to make a qword at 8(%edx), and in the aligned case
373 C there's a further extra limb of dst to be formed.
374
375
376 movd 8(%ebx), %mm0 C the extra src limb, zero extended to a qword
377 psrlq %mm6, %mm2
378
379 movq %mm0, %mm1
380 psllq %mm7, %mm0
381
382 movq %mm3, (%edx) C store the pending dst qword
383 por %mm2, %mm0 C qword for 8(%edx)
384
385 psrlq %mm6, %mm1 C the further dst limb, only stored in the aligned case
386 andl $32, %ecx C nonzero for ecx = 64-shift (dst aligned), zero for ecx = 64-(shift+32), assuming shift is 1 to 31 - confirm
387
388 popl %ebx
389 jz L(finish_one_unaligned) C flags still from the andl
390
391 C dst was aligned, must store one extra limb
392 movd %mm1, 16(%edx)
393 L(finish_one_unaligned):
394
395 movq %mm0, 8(%edx)
396
397 emms C empty the mmx state before returning
398
399 ret
400
401
402 L(finish_zero): C no src limbs left to fetch: store the pending qword and the top of the result
403
404 C No extra limbs, destination was aligned.
405 C
406 C source ebx
407 C +---------------+--
408 C | mm2 |
409 C +---------------+--
410 C
411 C dest edx+4
412 C +---------------+---------------+--
413 C | | mm3 |
414 C +---------------+---------------+--
415 C
416 C mm6 = shift
417 C mm7 = ecx = 64-shift
418
419
420 C No extra limbs, destination was unaligned.
421 C
422 C source ebx
423 C +---------------+--
424 C | mm2 |
425 C +---------------+--
426 C
427 C dest edx+4
428 C +-------+---------------+--
429 C | | mm3 |
430 C +-------+---------------+--
431 C
432 C mm6 = shift+32
433 C mm7 = 64-(shift+32)
434
435
436 C The movd for the unaligned case is clearly the same data as the
437 C movq for the aligned case, it's just a choice between whether one
438 C or two limbs should be written.
439
440
441 movq %mm3, 4(%edx) C store the pending dst qword
442 psrlq %mm6, %mm2 C top of the result, zeros shifted in
443
444 movd %mm2, 12(%edx) C one limb, all that's wanted when dst was unaligned
445 andl $32, %ecx C nonzero for ecx = 64-shift (dst aligned), zero for ecx = 64-(shift+32), assuming shift is 1 to 31 - confirm
446
447 popl %ebx
448 jz L(finish_zero_unaligned) C flags still from the andl
449
450 movq %mm2, 12(%edx) C dst was aligned, write two limbs over the movd above
451 L(finish_zero_unaligned):
452
453 emms C empty the mmx state before returning
454
455 ret
456
457 EPILOGUE()
OLDNEW
« no previous file with comments | « gcc/gmp/mpn/x86/pentium/mmx/mul_1.asm ('k') | gcc/gmp/mpn/x86/pentium/mode1o.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698