| Index: gcc/gmp/mpn/x86/mod_1.asm
|
| diff --git a/gcc/gmp/mpn/x86/mod_1.asm b/gcc/gmp/mpn/x86/mod_1.asm
|
| deleted file mode 100644
|
| index 0fa3ce0def466a0b888ad67a674baeb34583c180..0000000000000000000000000000000000000000
|
| --- a/gcc/gmp/mpn/x86/mod_1.asm
|
| +++ /dev/null
|
| @@ -1,163 +0,0 @@
|
| -dnl x86 mpn_mod_1 -- mpn by limb remainder.
|
| -
|
| -dnl Copyright 1999, 2000, 2001, 2002, 2003, 2007 Free Software Foundation,
|
| -dnl Inc.
|
| -dnl
|
| -dnl This file is part of the GNU MP Library.
|
| -dnl
|
| -dnl The GNU MP Library is free software; you can redistribute it and/or
|
| -dnl modify it under the terms of the GNU Lesser General Public License as
|
| -dnl published by the Free Software Foundation; either version 3 of the
|
| -dnl License, or (at your option) any later version.
|
| -dnl
|
| -dnl The GNU MP Library is distributed in the hope that it will be useful,
|
| -dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| -dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| -dnl Lesser General Public License for more details.
|
| -dnl
|
| -dnl You should have received a copy of the GNU Lesser General Public License
|
| -dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
|
| -
|
| -include(`../config.m4')
|
| -
|
| -
|
| -C cycles/limb
|
| -C 486 42 approx, maybe
|
| -C P5 44
|
| -C P6 39
|
| -C K6 20
|
| -C K7 41
|
| -C P4 58
|
| -
|
| -
|
| -C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
|
| -C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
|
| -C mp_limb_t carry);
|
| -C
|
| -C Essentially this code is the same as the division based part of
|
| -C mpn/generic/mod_1.c, but has the advantage that we get the desired divl
|
| -C instruction even when gcc is not being used (where longlong.h only has the
|
| -C rather slow generic C udiv_qrnnd()).
|
| -C
|
| -C A test is done to see if the high limb is less than the divisor, and if so
|
| -C one less div is done. A div is between 20 and 40 cycles on the various
|
| -C x86s, so assuming high<divisor about half the time, then this test saves
|
| -C half that amount. The branch misprediction penalty on each chip is less
|
| -C than half a div.
|
| -C
|
| -C
|
| -C Notes for K6:
|
| -C
|
| -C Back-to-back div instructions take 20 cycles, the same as the loop here,
|
| -C so it seems there's nothing to gain by rearranging. Pairing the mov and
|
| -C loop instructions was found to gain nothing. Normally we use a loop
|
| -C instruction rather than decl/jnz, but it gains nothing here.
|
| -C
|
| -C A multiply-by-inverse is used in mpn/x86/k6/pre_mod_1.asm, but it saves
|
| -C only 2 c/l so currently we haven't bothered with the same for mpn_mod_1.
|
| -C If an inverse takes about 40 cycles for normalized or perhaps 60 for
|
| -C unnormalized (due to bsfl being slow on k6) then the threshold would be at
|
| -C least 20 or 30 limbs.
|
| -C
|
| -
|
| -defframe(PARAM_CARRY, 16) C carry argument, read only by mpn_mod_1c
|
| -defframe(PARAM_DIVISOR,12) C divisor limb
|
| -defframe(PARAM_SIZE, 8) C number of limbs at src
|
| -defframe(PARAM_SRC, 4) C pointer to the source limbs
|
| -
|
| - TEXT
|
| -
|
| - ALIGN(16)
|
| -PROLOGUE(mpn_mod_1)
|
| -deflit(`FRAME',0)
|
| -C Returns in eax the remainder of the size-limb value at src divided by divisor.
|
| - movl PARAM_SIZE, %ecx C ecx = size
|
| - pushl %ebx FRAME_pushl() C ebx is restored before each ret
|
| -
|
| - movl PARAM_SRC, %ebx C ebx = src
|
| - pushl %esi FRAME_pushl() C esi is restored before each ret
|
| -
|
| - orl %ecx, %ecx C set ZF when size is 0
|
| - jz L(done_zero) C size==0: return 0
|
| -
|
| - movl PARAM_DIVISOR, %esi C esi = divisor
|
| - movl -4(%ebx,%ecx,4), %eax C src high limb
|
| -
|
| - cmpl %esi, %eax C CF set when high limb < divisor (unsigned)
|
| -
|
| - sbbl %edx, %edx C -1 if high<divisor
|
| -
|
| - addl %edx, %ecx C skip one division if high<divisor
|
| - jz L(done_eax) C size was 1 and high<divisor: eax is already the remainder
|
| -
|
| - andl %eax, %edx C carry = high limb if high<divisor, else 0
|
| -
|
| -
|
| -L(top):
|
| - C eax scratch (quotient)
|
| - C ebx src
|
| - C ecx counter
|
| - C edx carry (remainder)
|
| - C esi divisor
|
| - C edi (not used)
|
| - C ebp (not used)
|
| -
|
| - movl -4(%ebx,%ecx,4), %eax C eax = src[ecx-1], the next limb to divide
|
| -
|
| - divl %esi C divide edx:eax by esi: eax = quotient, edx = remainder
|
| -
|
| - decl %ecx
|
| - jnz L(top) C loop until the low limb has been divided
|
| -
|
| -
|
| - movl %edx, %eax C return the final remainder
|
| -L(done_eax):
|
| - popl %esi
|
| -
|
| - popl %ebx
|
| -
|
| - ret
|
| -
|
| -EPILOGUE()
|
| -
|
| -
|
| - C This code is located after mpn_mod_1, so the jump to L(top) here is
|
| - C backward and hence will be predicted as taken. (size==0 is considered
|
| - C unlikely.)
|
| -
|
| - ALIGN(16)
|
| -PROLOGUE(mpn_mod_1c)
|
| -deflit(`FRAME',0)
|
| -C Like mpn_mod_1, but the running remainder starts from the carry argument.
|
| - movl PARAM_SIZE, %ecx C ecx = size
|
| - pushl %ebx FRAME_pushl()
|
| -
|
| - movl PARAM_SRC, %ebx C ebx = src
|
| - pushl %esi FRAME_pushl()
|
| -C NOTE(review): divl faults unless edx < esi, so presumably callers must pass carry < divisor -- confirm.
|
| - movl PARAM_DIVISOR, %esi C esi = divisor
|
| - orl %ecx, %ecx C set ZF when size is 0
|
| -
|
| - movl PARAM_CARRY, %edx C edx = carry; movl leaves the flags from orl intact
|
| - jnz L(top) C size!=0: join the division loop in mpn_mod_1
|
| -C size==0: the carry is returned unchanged.
|
| - popl %esi
|
| - movl %edx, %eax C return value = carry
|
| -
|
| - popl %ebx
|
| -
|
| - ret
|
| -
|
| -
|
| - C This code is for mpn_mod_1, but is positioned here to save some
|
| - C space in the alignment padding.
|
| - C
|
| -L(done_zero):
|
| - popl %esi
|
| - xorl %eax, %eax C return 0 for size==0
|
| -
|
| - popl %ebx
|
| -
|
| - ret
|
| -
|
| -EPILOGUE()
|
|
|