| Index: gcc/gmp/mpn/x86/k6/gcd_1.asm
|
| diff --git a/gcc/gmp/mpn/x86/k6/gcd_1.asm b/gcc/gmp/mpn/x86/k6/gcd_1.asm
|
| deleted file mode 100644
|
| index 58aff08221028369ade94b920c471d9b351ff703..0000000000000000000000000000000000000000
|
| --- a/gcc/gmp/mpn/x86/k6/gcd_1.asm
|
| +++ /dev/null
|
| @@ -1,351 +0,0 @@
|
| -dnl AMD K6 mpn_gcd_1 -- mpn by 1 gcd.
|
| -
|
| -dnl Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
|
| -dnl
|
| -dnl This file is part of the GNU MP Library.
|
| -dnl
|
| -dnl The GNU MP Library is free software; you can redistribute it and/or
|
| -dnl modify it under the terms of the GNU Lesser General Public License as
|
| -dnl published by the Free Software Foundation; either version 3 of the
|
| -dnl License, or (at your option) any later version.
|
| -dnl
|
| -dnl The GNU MP Library is distributed in the hope that it will be useful,
|
| -dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| -dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| -dnl Lesser General Public License for more details.
|
| -dnl
|
| -dnl You should have received a copy of the GNU Lesser General Public License
|
| -dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
|
| -
|
| -include(`../config.m4')
|
| -
|
| -
|
| -C K6: 9.5 cycles/bit (approx) 1x1 gcd
|
| -C 11.0 cycles/limb Nx1 reduction (modexact_1_odd)
|
| -
|
| -
|
| -C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);
|
| -C
|
| -C This code is nothing very special, but offers a speedup over what gcc 2.95
|
| -C can do with mpn/generic/gcd_1.c.
|
| -C
|
| -C Future:
|
| -C
|
| -C Using a lookup table to count trailing zeros seems a touch quicker, but
|
| -C after a slightly longer startup. Might be worthwhile if an mpn_gcd_2 used
|
| -C it too.
|
| -
|
| -
|
| -dnl If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits
|
| -dnl bigger than y, then a division x%y is done to reduce it.
|
| -dnl
|
| -dnl A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so
|
| -dnl there should be an advantage in the divl at about 4 or 5 bits, which is
|
| -dnl what's found.
|
| -
|
| -deflit(DIV_THRESHOLD, 5)
|
| -
|
| -
|
| -defframe(PARAM_LIMB, 12)
|
| -defframe(PARAM_SIZE, 8)
|
| -defframe(PARAM_SRC, 4)
|
| -
|
| - TEXT
|
| - ALIGN(16)
|
| -
|
| -PROLOGUE(mpn_gcd_1)
|
| -deflit(`FRAME',0)
|
| -
|
| - ASSERT(ne, `cmpl $0, PARAM_LIMB')
|
| - ASSERT(ae, `cmpl $1, PARAM_SIZE')
|
| -
|
| -
|
| - movl PARAM_SRC, %eax
|
| - pushl %ebx FRAME_pushl()
|
| -
|
| - movl PARAM_LIMB, %edx
|
| - movl $-1, %ecx
|
| -
|
| - movl (%eax), %ebx C src low limb
|
| -
|
| - movl %ebx, %eax C src low limb
|
| - orl %edx, %ebx
|
| -
|
| -L(common_twos):
|
| - shrl %ebx
|
| - incl %ecx
|
| -
|
| - jnc L(common_twos) C 1/4 chance on random data
|
| - shrl %cl, %edx C y
|
| -
|
| - cmpl $1, PARAM_SIZE
|
| - ja L(size_two_or_more)
|
| -
|
| -
|
| - ASSERT(nz, `orl %eax, %eax') C should have src limb != 0
|
| -
|
| - shrl %cl, %eax C x
|
| -
|
| -
|
| - C Swap if necessary to make x>=y. Measures a touch quicker as a
|
| - C jump than a branch free calculation.
|
| - C
|
| - C eax x
|
| - C ebx
|
| - C ecx common twos
|
| - C edx y
|
| -
|
| - movl %eax, %ebx
|
| - cmpl %eax, %edx
|
| -
|
| - jb L(noswap)
|
| - movl %edx, %eax
|
| -
|
| - movl %ebx, %edx
|
| - movl %eax, %ebx
|
| -L(noswap):
|
| -
|
| -
|
| - C See if it's worth reducing x with a divl.
|
| - C
|
| - C eax x
|
| - C ebx x
|
| - C ecx common twos
|
| - C edx y
|
| -
|
| - shrl $DIV_THRESHOLD, %ebx
|
| -
|
| - cmpl %ebx, %edx
|
| - ja L(nodiv)
|
| -
|
| -
|
| - C Reduce x to x%y.
|
| - C
|
| - C eax x
|
| - C ebx
|
| - C ecx common twos
|
| - C edx y
|
| -
|
| - movl %edx, %ebx
|
| - xorl %edx, %edx
|
| -
|
| - divl %ebx
|
| -
|
| - orl %edx, %edx C y
|
| - nop C code alignment
|
| -
|
| - movl %ebx, %eax C x
|
| - jz L(done_shll)
|
| -L(nodiv):
|
| -
|
| -
|
| - C eax x
|
| - C ebx
|
| - C ecx common twos
|
| - C edx y
|
| - C esi
|
| - C edi
|
| - C ebp
|
| -
|
| -L(strip_y):
|
| - shrl %edx
|
| - jnc L(strip_y)
|
| -
|
| - leal 1(%edx,%edx), %edx
|
| - movl %ecx, %ebx C common twos
|
| -
|
| - leal 1(%eax), %ecx
|
| - jmp L(strip_x_and)
|
| -
|
| -
|
| -C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch
|
| -C on a 50/50 chance of 0 or 1. The chance of the next bit also being 0 is
|
| -C only 1/4.
|
| -C
|
| -C A second computed %cl shift was tried, but that measured a touch slower
|
| -C than branching back.
|
| -C
|
| -C A branch-free abs(x-y) and min(x,y) calculation was tried, but that
|
| -C measured about 1 cycle/bit slower.
|
| -
|
| - C eax x
|
| - C ebx common twos
|
| - C ecx scratch
|
| - C edx y
|
| -
|
| - ALIGN(4)
|
| -L(swap):
|
| - addl %eax, %edx C x-y+y = x
|
| - negl %eax C -(x-y) = y-x
|
| -
|
| -L(strip_x):
|
| - shrl %eax C odd-odd = even, so always one to strip
|
| - ASSERT(nz)
|
| -
|
| -L(strip_x_leal):
|
| - leal 1(%eax), %ecx
|
| -
|
| -L(strip_x_and):
|
| - andl $1, %ecx C (x^1)&1
|
| -
|
| - shrl %cl, %eax C shift if x even
|
| -
|
| - testb $1, %al
|
| - jz L(strip_x)
|
| -
|
| - ASSERT(nz,`testl $1, %eax') C x, y odd
|
| - ASSERT(nz,`testl $1, %edx')
|
| -
|
| - subl %edx, %eax
|
| - jb L(swap)
|
| - ja L(strip_x)
|
| -
|
| -
|
| - movl %edx, %eax
|
| - movl %ebx, %ecx
|
| -
|
| -L(done_shll):
|
| - shll %cl, %eax
|
| - popl %ebx
|
| -
|
| - ret
|
| -
|
| -
|
| -C -----------------------------------------------------------------------------
|
| -C Two or more limbs.
|
| -C
|
| -C x={src,size} is reduced modulo y using either a plain mod_1 style
|
| -C remainder, or a modexact_1 style exact division.
|
| -
|
| -deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))
|
| -
|
| - ALIGN(8)
|
| -L(size_two_or_more):
|
| - C eax
|
| - C ebx
|
| - C ecx common twos
|
| - C edx y, without common twos
|
| - C esi
|
| - C edi
|
| - C ebp
|
| -
|
| -deflit(FRAME_TWO_OR_MORE, FRAME)
|
| -
|
| - pushl %edi defframe_pushl(SAVE_EDI)
|
| - movl PARAM_SRC, %ebx
|
| -
|
| -L(y_twos):
|
| - shrl %edx
|
| - jnc L(y_twos)
|
| -
|
| - movl %ecx, %edi C common twos
|
| - movl PARAM_SIZE, %ecx
|
| -
|
| - pushl %esi defframe_pushl(SAVE_ESI)
|
| - leal 1(%edx,%edx), %esi C y (odd)
|
| -
|
| - movl -4(%ebx,%ecx,4), %eax C src high limb
|
| -
|
| - cmpl %edx, %eax C carry if high<divisor
|
| -
|
| - sbbl %edx, %edx C -1 if high<divisor
|
| -
|
| - addl %edx, %ecx C skip one limb if high<divisor
|
| - andl %eax, %edx
|
| -
|
| - cmpl $MODEXACT_THRESHOLD, %ecx
|
| - jae L(modexact)
|
| -
|
| -
|
| -L(divide_top):
|
| - C eax scratch (quotient)
|
| - C ebx src
|
| - C ecx counter, size-1 to 1
|
| - C edx carry (remainder)
|
| - C esi divisor (odd)
|
| - C edi
|
| - C ebp
|
| -
|
| - movl -4(%ebx,%ecx,4), %eax
|
| - divl %esi
|
| - loop L(divide_top)
|
| -
|
| -
|
| - movl %edx, %eax C x
|
| - movl %esi, %edx C y (odd)
|
| -
|
| - movl %edi, %ebx C common twos
|
| - popl %esi
|
| -
|
| - popl %edi
|
| - leal 1(%eax), %ecx
|
| -
|
| - orl %eax, %eax
|
| - jnz L(strip_x_and)
|
| -
|
| -
|
| - movl %ebx, %ecx
|
| - movl %edx, %eax
|
| -
|
| - shll %cl, %eax
|
| - popl %ebx
|
| -
|
| - ret
|
| -
|
| -
|
| - ALIGN(8)
|
| -L(modexact):
|
| - C eax
|
| - C ebx src ptr
|
| - C ecx size or size-1
|
| - C edx
|
| - C esi y odd
|
| - C edi common twos
|
| - C ebp
|
| -
|
| - movl PARAM_SIZE, %eax
|
| - pushl %esi FRAME_pushl()
|
| -
|
| - pushl %eax FRAME_pushl()
|
| -
|
| - pushl %ebx FRAME_pushl()
|
| -
|
| -ifdef(`PIC',`
|
| - nop C code alignment
|
| - call L(movl_eip_ebx)
|
| -L(here):
|
| - addl $_GLOBAL_OFFSET_TABLE_, %ebx
|
| - call GSYM_PREFIX`'mpn_modexact_1_odd@PLT
|
| -',`
|
| - call GSYM_PREFIX`'mpn_modexact_1_odd
|
| -')
|
| -
|
| - movl %esi, %edx C y odd
|
| - movl SAVE_ESI, %esi
|
| -
|
| - movl %edi, %ebx C common twos
|
| - movl SAVE_EDI, %edi
|
| -
|
| - addl $eval(FRAME - FRAME_TWO_OR_MORE), %esp
|
| - orl %eax, %eax
|
| -
|
| - leal 1(%eax), %ecx
|
| - jnz L(strip_x_and)
|
| -
|
| -
|
| - movl %ebx, %ecx
|
| - movl %edx, %eax
|
| -
|
| - shll %cl, %eax
|
| - popl %ebx
|
| -
|
| - ret
|
| -
|
| -
|
| -ifdef(`PIC',`
|
| -L(movl_eip_ebx):
|
| - movl (%esp), %ebx
|
| - ret_internal
|
| -')
|
| -
|
| -EPILOGUE()
|
|
|