Index: gcc/gmp/mpn/x86/divrem_1.asm |
diff --git a/gcc/gmp/mpn/x86/divrem_1.asm b/gcc/gmp/mpn/x86/divrem_1.asm |
deleted file mode 100644 |
index a5fb88071d6b414095d546ec44f9630128c96ff5..0000000000000000000000000000000000000000 |
--- a/gcc/gmp/mpn/x86/divrem_1.asm |
+++ /dev/null |
@@ -1,223 +0,0 @@ |
-dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient. |
- |
-dnl Copyright 1999, 2000, 2001, 2002, 2003, 2007 Free Software Foundation, |
-dnl Inc. |
-dnl |
-dnl This file is part of the GNU MP Library. |
-dnl |
-dnl The GNU MP Library is free software; you can redistribute it and/or |
-dnl modify it under the terms of the GNU Lesser General Public License as |
-dnl published by the Free Software Foundation; either version 3 of the |
-dnl License, or (at your option) any later version. |
-dnl |
-dnl The GNU MP Library is distributed in the hope that it will be useful, |
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of |
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
-dnl Lesser General Public License for more details. |
-dnl |
-dnl You should have received a copy of the GNU Lesser General Public License |
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
- |
-include(`../config.m4') |
- |
- |
-C cycles/limb |
-C 486 approx 43 maybe |
-C P5 44 |
-C P6 39 |
-C P6MMX 39 |
-C K6 22 |
-C K7 42 |
-C P4 58 |
- |
- |
-C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, |
-C mp_srcptr src, mp_size_t size, mp_limb_t divisor); |
-C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, |
-C mp_srcptr src, mp_size_t size, mp_limb_t divisor, |
-C mp_limb_t carry); |
-C |
-C Divide src,size by divisor and store the quotient in dst+xsize,size. |
-C Extend the division to fractional quotient limbs in dst,xsize. Return the |
-C remainder. Either or both xsize and size can be 0. |
-C |
-C mpn_divrem_1c takes a carry parameter which is an initial high limb, |
-C effectively one extra limb at the top of src,size. Must have |
-C carry<divisor. |
-C |
-C |
-C Essentially the code is the same as the division based part of |
-C mpn/generic/divrem_1.c, but has the advantage that we get the desired divl |
-C instruction even when gcc is not being used (when longlong.h only has the |
-C rather slow generic C udiv_qrnnd(). |
-C |
-C A test is done to see if the high limb is less than the divisor, and if so |
-C one less div is done. A div is between 20 and 40 cycles on the various |
-C x86s, so assuming high<divisor about half the time, then this test saves |
-C half that amount. The branch misprediction penalty on each chip is less |
-C than half a div. |
-C |
-C |
-C Notes for P5: |
-C |
-C It might be thought that moving the load down to pair with the store would |
-C save 1 cycle, but that doesn't seem to happen in practice, and in any case |
-C would be a mere 2.2% saving, so it's hardly worth bothering about. |
-C |
-C A mul-by-inverse might be a possibility for P5, as done in |
-C mpn/x86/pentium/mod_1.asm. The number of auxiliary instructions required |
-C is a hinderance, but there could be a 10-15% speedup available. |
-C |
-C |
-C Notes for K6: |
-C |
-C K6 has its own version of this code, using loop and paying attention to |
-C cache line boundary crossings. The target 20 c/l can be had with the |
-C decl+jnz of the present code by pairing up the load and store in the |
-C loops. But it's considered easier not to introduce complexity just for |
-C that, but instead let k6 have its own code. |
-C |
- |
-defframe(PARAM_CARRY, 24) |
-defframe(PARAM_DIVISOR,20) |
-defframe(PARAM_SIZE, 16) |
-defframe(PARAM_SRC, 12) |
-defframe(PARAM_XSIZE, 8) |
-defframe(PARAM_DST, 4) |
- |
- TEXT |
- ALIGN(16) |
- |
-PROLOGUE(mpn_divrem_1c) |
-deflit(`FRAME',0) |
- |
- movl PARAM_SIZE, %ecx |
- pushl %edi FRAME_pushl() |
- |
- movl PARAM_SRC, %edi |
- pushl %esi FRAME_pushl() |
- |
- movl PARAM_DIVISOR, %esi |
- pushl %ebx FRAME_pushl() |
- |
- movl PARAM_DST, %ebx |
- pushl %ebp FRAME_pushl() |
- |
- movl PARAM_XSIZE, %ebp |
- orl %ecx, %ecx |
- |
- movl PARAM_CARRY, %edx |
- jz L(fraction) |
- |
- leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part |
- jmp L(integer_top) |
- |
-EPILOGUE() |
- |
- |
-PROLOGUE(mpn_divrem_1) |
-deflit(`FRAME',0) |
- |
- movl PARAM_SIZE, %ecx |
- pushl %edi FRAME_pushl() |
- |
- movl PARAM_SRC, %edi |
- pushl %esi FRAME_pushl() |
- |
- movl PARAM_DIVISOR, %esi |
- orl %ecx,%ecx |
- |
- jz L(size_zero) |
- pushl %ebx FRAME_pushl() |
- |
- movl -4(%edi,%ecx,4), %eax C src high limb |
- xorl %edx, %edx |
- |
- movl PARAM_DST, %ebx |
- pushl %ebp FRAME_pushl() |
- |
- movl PARAM_XSIZE, %ebp |
- cmpl %esi, %eax |
- |
- leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part |
- jae L(integer_entry) |
- |
- |
- C high<divisor, so high of dst is zero, and avoid one div |
- |
- movl %edx, (%ebx,%ecx,4) |
- decl %ecx |
- |
- movl %eax, %edx |
- jz L(fraction) |
- |
- |
-L(integer_top): |
- C eax scratch (quotient) |
- C ebx dst+4*xsize-4 |
- C ecx counter |
- C edx scratch (remainder) |
- C esi divisor |
- C edi src |
- C ebp xsize |
- |
- movl -4(%edi,%ecx,4), %eax |
-L(integer_entry): |
- |
- divl %esi |
- |
- movl %eax, (%ebx,%ecx,4) |
- decl %ecx |
- jnz L(integer_top) |
- |
- |
-L(fraction): |
- orl %ebp, %ecx |
- jz L(done) |
- |
- movl PARAM_DST, %ebx |
- |
- |
-L(fraction_top): |
- C eax scratch (quotient) |
- C ebx dst |
- C ecx counter |
- C edx scratch (remainder) |
- C esi divisor |
- C edi |
- C ebp |
- |
- xorl %eax, %eax |
- |
- divl %esi |
- |
- movl %eax, -4(%ebx,%ecx,4) |
- decl %ecx |
- jnz L(fraction_top) |
- |
- |
-L(done): |
- popl %ebp |
- movl %edx, %eax |
- popl %ebx |
- popl %esi |
- popl %edi |
- ret |
- |
- |
-L(size_zero): |
-deflit(`FRAME',8) |
- movl PARAM_XSIZE, %ecx |
- xorl %eax, %eax |
- |
- movl PARAM_DST, %edi |
- |
- cld C better safe than sorry, see mpn/x86/README |
- |
- rep |
- stosl |
- |
- popl %esi |
- popl %edi |
- ret |
-EPILOGUE() |