OLD | NEW |
| (Empty) |
1 dnl Intel P5 mpn_rshift -- mpn right shift. | |
2 | |
3 dnl Copyright 2000, 2002 Free Software Foundation, Inc. | |
4 dnl | |
5 dnl This file is part of the GNU MP Library. | |
6 dnl | |
7 dnl The GNU MP Library is free software; you can redistribute it and/or | |
8 dnl modify it under the terms of the GNU Lesser General Public License as | |
9 dnl published by the Free Software Foundation; either version 3 of the | |
10 dnl License, or (at your option) any later version. | |
11 dnl | |
12 dnl The GNU MP Library is distributed in the hope that it will be useful, | |
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 dnl Lesser General Public License for more details. | |
16 dnl | |
17 dnl You should have received a copy of the GNU Lesser General Public License | |
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. | |
19 | |
20 include(`../config.m4') | |
21 | |
22 | |
23 C P5: 1.75 cycles/limb. | |
24 | |
25 | |
26 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, | |
27 C unsigned shift); | |
28 C | |
29 C Shift src,size right by shift many bits and store the result in dst,size. | |
30 C Zeros are shifted in at the left. Return the bits shifted out at the | |
31 C right. | |
32 C | |
33 C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb, | |
34 C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l. | |
35 C | |
36 C Full speed depends on source and destination being aligned. Unaligned mmx | |
37 C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy | |
38 C setups and finish-ups are done to ensure alignment for the loop. | |
39 C | |
40 C MMX shifts work out a bit faster even for the simple loop. | |
41 | |
42 defframe(PARAM_SHIFT,16) C shift count | |
43 defframe(PARAM_SIZE, 12) C number of limbs | |
44 defframe(PARAM_SRC, 8) C source pointer | |
45 defframe(PARAM_DST, 4) C destination pointer | |
46 deflit(`FRAME',0) | |
47 | |
48 dnl Minimum 5, because the unrolled loop can't handle less: it may use one limb to align src and then always loads four more. | |
49 deflit(UNROLL_THRESHOLD, 5) | |
50 | |
51 TEXT | |
52 ALIGN(8) | |
53 | |
54 PROLOGUE(mpn_rshift) | |
55 | |
56 pushl %ebx C ebx and edi are restored by a popl pair on every exit path | |
57 pushl %edi | |
58 deflit(`FRAME',8) C 8 bytes pushed since entry | |
59 | |
60 movl PARAM_SIZE, %eax C eax = size in limbs | |
61 movl PARAM_DST, %edx C edx = dst | |
62 | |
63 movl PARAM_SRC, %ebx C ebx = src | |
64 movl PARAM_SHIFT, %ecx C ecx = shift | |
65 | |
66 cmp $UNROLL_THRESHOLD, %eax | |
67 jae L(unroll) C size >= UNROLL_THRESHOLD (unsigned compare) | |
68 | |
69 decl %eax C eax = size-1, ZF set only when size was 1 | |
70 movl (%ebx), %edi C src low limb | |
71 | |
72 jnz L(simple) C flags still from the decl; size-1 nonzero goes to the mmx loop | |
73 | |
74 shrdl( %cl, %edi, %eax) C eax was decremented to zero | |
75 | |
76 shrl %cl, %edi C edi = limb >> shift, the single dst limb | |
77 | |
78 movl %edi, (%edx) C dst low limb | |
79 popl %edi C risk of data cache bank clash | |
80 | |
81 popl %ebx | |
82 | |
83 ret C eax = bits shifted out, placed at its high end by the shrdl | |
84 | |
85 | |
86 C ----------------------------------------------------------------------------- | |
87 ALIGN(8) | |
88 L(simple): | |
89 C eax size-1 | |
90 C ebx src | |
91 C ecx shift | |
92 C edx dst | |
93 C esi | |
94 C edi | |
95 C ebp | |
96 deflit(`FRAME',8) | |
97 | |
98 movd (%ebx), %mm5 C src[0] | |
99 leal (%ebx,%eax,4), %ebx C &src[size-1] | |
100 | |
101 movd %ecx, %mm6 C rshift | |
102 leal -4(%edx,%eax,4), %edx C &dst[size-2] | |
103 | |
104 psllq $32, %mm5 C src[0] in the high half, so the later psrlq drops the shifted-out bits into the low half | |
105 negl %eax C eax = -(size-1), counts up to zero | |
106 | |
107 | |
108 C This loop is 5 or 8 cycles, with every second load unaligned and a wasted | |
109 C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4 | |
110 C cycles and would be 8 in a simple loop. Using mmx helps the return value | |
111 C and last limb calculations too. | |
112 | |
113 L(simple_top): | |
114 C eax counter, limbs, negative | |
115 C ebx &src[size-1] | |
116 C ecx shift (already copied to mm6, not used in this loop) | |
117 C edx &dst[size-2] | |
118 C | |
119 C mm0 scratch | |
120 C mm5 return value | |
121 C mm6 shift | |
122 | |
123 movq (%ebx,%eax,4), %mm0 C two adjacent src limbs as one qword | |
124 incl %eax C sets ZF for the jnz below | |
125 | |
126 psrlq %mm6, %mm0 | |
127 | |
128 movd %mm0, (%edx,%eax,4) C low 32 bits are the finished dst limb | |
129 jnz L(simple_top) C flags still from the incl | |
130 | |
131 | |
132 movd (%ebx), %mm0 C src[size-1], high half of mm0 zero | |
133 psrlq %mm6, %mm5 C return value | |
134 | |
135 psrlq %mm6, %mm0 | |
136 popl %edi | |
137 | |
138 movd %mm5, %eax C low 32 bits of mm5 are the return value | |
139 popl %ebx | |
140 | |
141 movd %mm0, 4(%edx) C dst[size-1] | |
142 | |
143 emms | |
144 | |
145 ret | |
146 | |
147 | |
148 C ----------------------------------------------------------------------------- | |
149 ALIGN(8) | |
150 L(unroll): | |
151 C eax size | |
152 C ebx src | |
153 C ecx shift | |
154 C edx dst | |
155 C esi | |
156 C edi | |
157 C ebp | |
158 deflit(`FRAME',8) | |
159 | |
160 movd (%ebx), %mm5 C src[0] | |
161 movl $4, %edi C bit tested in the src and dst addresses below | |
162 | |
163 movd %ecx, %mm6 C rshift | |
164 testl %edi, %ebx | |
165 | |
166 psllq $32, %mm5 | |
167 jz L(start_src_aligned) C flags from the testl; bit 2 of src is clear | |
168 | |
169 | |
170 C src isn't aligned, process low limb separately (marked xxx) and | |
171 C step src and dst by one limb, making src aligned. | |
172 C | |
173 C source ebx | |
174 C --+-------+-------+-------+ | |
175 C | xxx | | |
176 C --+-------+-------+-------+ | |
177 C 4mod8 0mod8 4mod8 | |
178 C | |
179 C dest edx | |
180 C --+-------+-------+ | |
181 C | | xxx | | |
182 C --+-------+-------+ | |
183 | |
184 movq (%ebx), %mm0 C unaligned load | |
185 | |
186 psrlq %mm6, %mm0 | |
187 addl $4, %ebx | |
188 | |
189 decl %eax C one limb handled here, eax = limbs still to do | |
190 | |
191 movd %mm0, (%edx) C dst low limb (xxx) | |
192 addl $4, %edx | |
193 L(start_src_aligned): | |
194 | |
195 | |
196 movq (%ebx), %mm1 C first aligned src qword | |
197 testl %edi, %edx | |
198 | |
199 psrlq %mm6, %mm5 C retval | |
200 jz L(start_dst_aligned) C flags from the testl; bit 2 of dst is clear | |
201 | |
202 C dst isn't aligned, add 4 to make it so, and pretend the shift is | |
203 C 32 bits extra. Low limb of dst (marked xxx) handled here | |
204 C separately. | |
205 C | |
206 C source ebx | |
207 C --+-------+-------+ | |
208 C | mm1 | | |
209 C --+-------+-------+ | |
210 C 4mod8 0mod8 | |
211 C | |
212 C dest edx | |
213 C --+-------+-------+-------+ | |
214 C | xxx | | |
215 C --+-------+-------+-------+ | |
216 C 4mod8 0mod8 4mod8 | |
217 | |
218 movq %mm1, %mm0 | |
219 addl $32, %ecx C new shift | |
220 | |
221 psrlq %mm6, %mm0 C mm6 is still the original shift here | |
222 | |
223 movd %ecx, %mm6 C mm6 = shift+32 from now on | |
224 | |
225 movd %mm0, (%edx) C dst low limb (xxx) | |
226 addl $4, %edx | |
227 L(start_dst_aligned): | |
228 | |
229 | |
230 movq 8(%ebx), %mm3 | |
231 negl %ecx | |
232 | |
233 movq %mm3, %mm2 C mm2 src qword | |
234 addl $64, %ecx C ecx = 64 - rshift | |
235 | |
236 movd %ecx, %mm7 C lshift | |
237 psrlq %mm6, %mm1 | |
238 | |
239 leal -12(%ebx,%eax,4), %ebx C ebx = src + 4*size - 12 | |
240 leal -20(%edx,%eax,4), %edx C edx = dst + 4*size - 20 | |
241 | |
242 psllq %mm7, %mm3 | |
243 subl $7, %eax C size-7 | |
244 | |
245 por %mm1, %mm3 C mm3 ready to store | |
246 negl %eax C -(size-7) | |
247 | |
248 jns L(finish) C flags from the negl; size <= 7 skips the loop | |
249 | |
250 | |
251 C This loop is the important bit, the rest is just support. Careful | |
252 C instruction scheduling achieves the claimed 1.75 c/l. The | |
253 C relevant parts of the pairing rules are: | |
254 C | |
255 C - mmx loads and stores execute only in the U pipe | |
256 C - only one mmx shift in a pair | |
257 C - wait one cycle before storing an mmx register result | |
258 C - the usual address generation interlock | |
259 C | |
260 C Two qword calculations are slightly interleaved. The instructions | |
261 C marked "C" belong to the second qword, and the "C prev" one is for | |
262 C the second qword from the previous iteration. | |
263 | |
264 ALIGN(8) | |
265 L(unroll_loop): | |
266 C eax counter, limbs, negative | |
267 C ebx src + 4*size - 12 in bytes, with src and size as left by the alignment setup | |
268 C ecx lshift, same value as mm7 (tested with andl $32 at the finish) | |
269 C edx dst + 4*size - 20 in bytes, with dst and size as left by the alignment setup | |
270 C esi | |
271 C edi | |
272 C | |
273 C mm0 | |
274 C mm1 | |
275 C mm2 src qword from -8(%ebx,%eax,4) | |
276 C mm3 dst qword ready to store to -8(%edx,%eax,4) | |
277 C | |
278 C mm5 return value | |
279 C mm6 rshift | |
280 C mm7 lshift | |
281 | |
282 movq (%ebx,%eax,4), %mm0 C next src qword | |
283 psrlq %mm6, %mm2 | |
284 | |
285 movq %mm0, %mm1 C keep a copy for the right shift | |
286 psllq %mm7, %mm0 | |
287 | |
288 movq %mm3, -8(%edx,%eax,4) C prev | |
289 por %mm2, %mm0 C first dst qword of this iteration | |
290 | |
291 movq 8(%ebx,%eax,4), %mm3 C | |
292 psrlq %mm6, %mm1 C | |
293 | |
294 movq %mm0, (%edx,%eax,4) | |
295 movq %mm3, %mm2 C | |
296 | |
297 psllq %mm7, %mm3 C | |
298 addl $4, %eax C two qwords, four limbs, per iteration | |
299 | |
300 por %mm1, %mm3 C | |
301 js L(unroll_loop) C flags from the addl | |
302 | |
303 | |
304 L(finish): | |
305 C eax 0 to 3 representing respectively 3 to 0 limbs remaining | |
306 | |
307 testb $2, %al | |
308 | |
309 jnz L(finish_no_two) C eax is 2 or 3, no whole qword left to process | |
310 | |
311 movq (%ebx,%eax,4), %mm0 | |
312 psrlq %mm6, %mm2 | |
313 | |
314 movq %mm0, %mm1 | |
315 psllq %mm7, %mm0 | |
316 | |
317 movq %mm3, -8(%edx,%eax,4) C prev | |
318 por %mm2, %mm0 | |
319 | |
320 movq %mm1, %mm2 C becomes the src prev qword | |
321 movq %mm0, %mm3 C becomes the dst qword awaiting its store | |
322 | |
323 addl $2, %eax | |
324 L(finish_no_two): | |
325 | |
326 | |
327 C eax 2 or 3 representing respectively 1 or 0 limbs remaining | |
328 C | |
329 C mm2 src prev qword, from -8(%ebx,%eax,4) | |
330 C mm3 dst qword, for -8(%edx,%eax,4) | |
331 | |
332 testb $1, %al | |
333 popl %edi C the testb flags are used by the jnz two instructions on | |
334 | |
335 movd %mm5, %eax C retval | |
336 jnz L(finish_zero) | |
337 | |
338 | |
339 C One extra limb, destination was aligned. | |
340 C | |
341 C source ebx | |
342 C +-------+---------------+-- | |
343 C | | mm2 | | |
344 C +-------+---------------+-- | |
345 C | |
346 C dest edx | |
347 C +-------+---------------+---------------+-- | |
348 C | | | mm3 | | |
349 C +-------+---------------+---------------+-- | |
350 C | |
351 C mm6 = shift | |
352 C mm7 = ecx = 64-shift | |
353 | |
354 | |
355 C One extra limb, destination was unaligned. | |
356 C | |
357 C source ebx | |
358 C +-------+---------------+-- | |
359 C | | mm2 | | |
360 C +-------+---------------+-- | |
361 C | |
362 C dest edx | |
363 C +---------------+---------------+-- | |
364 C | | mm3 | | |
365 C +---------------+---------------+-- | |
366 C | |
367 C mm6 = shift+32 | |
368 C mm7 = ecx = 64-(shift+32) | |
369 | |
370 | |
371 C In both cases there's one extra limb of src to fetch and combine | |
372 C with mm2 to make a qword at 8(%edx), and in the aligned case | |
373 C there's a further extra limb of dst to be formed. | |
374 | |
375 | |
376 movd 8(%ebx), %mm0 C the one remaining src limb, high half of mm0 zero | |
377 psrlq %mm6, %mm2 | |
378 | |
379 movq %mm0, %mm1 | |
380 psllq %mm7, %mm0 | |
381 | |
382 movq %mm3, (%edx) C the pending dst qword, -8(%edx,%eax,4) with eax=2 | |
383 por %mm2, %mm0 | |
384 | |
385 psrlq %mm6, %mm1 | |
386 andl $32, %ecx C nonzero for ecx = 64-shift (dst aligned), zero for 64-(shift+32); presumes shift is 1 to 31 | |
387 | |
388 popl %ebx | |
389 jz L(finish_one_unaligned) C flags from the andl | |
390 | |
391 C dst was aligned, must store one extra limb | |
392 movd %mm1, 16(%edx) | |
393 L(finish_one_unaligned): | |
394 | |
395 movq %mm0, 8(%edx) | |
396 | |
397 emms | |
398 | |
399 ret | |
400 | |
401 | |
402 L(finish_zero): | |
403 | |
404 C No extra limbs, destination was aligned. | |
405 C | |
406 C source ebx | |
407 C +---------------+-- | |
408 C | mm2 | | |
409 C +---------------+-- | |
410 C | |
411 C dest edx+4 | |
412 C +---------------+---------------+-- | |
413 C | | mm3 | | |
414 C +---------------+---------------+-- | |
415 C | |
416 C mm6 = shift | |
417 C mm7 = ecx = 64-shift | |
418 | |
419 | |
420 C No extra limbs, destination was unaligned. | |
421 C | |
422 C source ebx | |
423 C +---------------+-- | |
424 C | mm2 | | |
425 C +---------------+-- | |
426 C | |
427 C dest edx+4 | |
428 C +-------+---------------+-- | |
429 C | | mm3 | | |
430 C +-------+---------------+-- | |
431 C | |
432 C mm6 = shift+32 | |
433 C mm7 = 64-(shift+32) | |
434 | |
435 | |
436 C The movd for the unaligned case is clearly the same data as the | |
437 C movq for the aligned case, it's just a choice between whether one | |
438 C or two limbs should be written. | |
439 | |
440 | |
441 movq %mm3, 4(%edx) C the pending dst qword, -8(%edx,%eax,4) with eax=3 | |
442 psrlq %mm6, %mm2 | |
443 | |
444 movd %mm2, 12(%edx) C one limb, all that the unaligned case needs | |
445 andl $32, %ecx C nonzero for ecx = 64-shift (dst aligned), zero for 64-(shift+32); presumes shift is 1 to 31 | |
446 | |
447 popl %ebx | |
448 jz L(finish_zero_unaligned) C flags from the andl | |
449 | |
450 movq %mm2, 12(%edx) C aligned dst: two limbs, the low one repeats the movd | |
451 L(finish_zero_unaligned): | |
452 | |
453 emms | |
454 | |
455 ret | |
456 | |
457 EPILOGUE() | |
OLD | NEW |