Index: gcc/gmp/mpn/x86/k6/gcd_1.asm |
diff --git a/gcc/gmp/mpn/x86/k6/gcd_1.asm b/gcc/gmp/mpn/x86/k6/gcd_1.asm |
deleted file mode 100644 |
index 58aff08221028369ade94b920c471d9b351ff703..0000000000000000000000000000000000000000 |
--- a/gcc/gmp/mpn/x86/k6/gcd_1.asm |
+++ /dev/null |
@@ -1,351 +0,0 @@ |
-dnl AMD K6 mpn_gcd_1 -- mpn by 1 gcd. |
- |
-dnl Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc. |
-dnl |
-dnl This file is part of the GNU MP Library. |
-dnl |
-dnl The GNU MP Library is free software; you can redistribute it and/or |
-dnl modify it under the terms of the GNU Lesser General Public License as |
-dnl published by the Free Software Foundation; either version 3 of the |
-dnl License, or (at your option) any later version. |
-dnl |
-dnl The GNU MP Library is distributed in the hope that it will be useful, |
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of |
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
-dnl Lesser General Public License for more details. |
-dnl |
-dnl You should have received a copy of the GNU Lesser General Public License |
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
- |
-include(`../config.m4') |
- |
- |
-C K6: 9.5 cycles/bit (approx) 1x1 gcd |
-C 11.0 cycles/limb Nx1 reduction (modexact_1_odd) |
- |
- |
-C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y); |
-C |
-C This code is nothing very special, but offers a speedup over what gcc 2.95 |
-C can do with mpn/generic/gcd_1.c. |
-C |
-C Future: |
-C |
-C Using a lookup table to count trailing zeros seems a touch quicker, but |
-C after a slightly longer startup. Might be worthwhile if an mpn_gcd_2 used |
-C it too. |
- |
- |
-dnl If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits |
-dnl bigger than y, then a division x%y is done to reduce it. |
-dnl |
-dnl A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so |
-dnl there should be an advantage in the divl at about 4 or 5 bits, which is |
-dnl what's found. |
- |
-deflit(DIV_THRESHOLD, 5) |
- |
- |
-defframe(PARAM_LIMB, 12) |
-defframe(PARAM_SIZE, 8) |
-defframe(PARAM_SRC, 4) |
- |
- TEXT |
- ALIGN(16) |
- |
-PROLOGUE(mpn_gcd_1) |
-deflit(`FRAME',0) |
- |
- ASSERT(ne, `cmpl $0, PARAM_LIMB') |
- ASSERT(ae, `cmpl $1, PARAM_SIZE') |
- |
- |
- movl PARAM_SRC, %eax |
- pushl %ebx FRAME_pushl() |
- |
- movl PARAM_LIMB, %edx |
- movl $-1, %ecx |
- |
- movl (%eax), %ebx C src low limb |
- |
- movl %ebx, %eax C src low limb |
- orl %edx, %ebx |
- |
-L(common_twos): |
- shrl %ebx |
- incl %ecx |
- |
- jnc L(common_twos) C 1/4 chance on random data |
- shrl %cl, %edx C y |
- |
- cmpl $1, PARAM_SIZE |
- ja L(size_two_or_more) |
- |
- |
- ASSERT(nz, `orl %eax, %eax') C should have src limb != 0 |
- |
- shrl %cl, %eax C x |
- |
- |
- C Swap if necessary to make x>=y. Measures a touch quicker as a |
- C jump than a branch free calculation. |
- C |
- C eax x |
- C ebx |
- C ecx common twos |
- C edx y |
- |
- movl %eax, %ebx |
- cmpl %eax, %edx |
- |
- jb L(noswap) |
- movl %edx, %eax |
- |
- movl %ebx, %edx |
- movl %eax, %ebx |
-L(noswap): |
- |
- |
- C See if it's worth reducing x with a divl. |
- C |
- C eax x |
- C ebx x |
- C ecx common twos |
- C edx y |
- |
- shrl $DIV_THRESHOLD, %ebx |
- |
- cmpl %ebx, %edx |
- ja L(nodiv) |
- |
- |
- C Reduce x to x%y. |
- C |
- C eax x |
- C ebx |
- C ecx common twos |
- C edx y |
- |
- movl %edx, %ebx |
- xorl %edx, %edx |
- |
- divl %ebx |
- |
- orl %edx, %edx C y |
- nop C code alignment |
- |
- movl %ebx, %eax C x |
- jz L(done_shll) |
-L(nodiv): |
- |
- |
- C eax x |
- C ebx |
- C ecx common twos |
- C edx y |
- C esi |
- C edi |
- C ebp |
- |
-L(strip_y): |
- shrl %edx |
- jnc L(strip_y) |
- |
- leal 1(%edx,%edx), %edx |
- movl %ecx, %ebx C common twos |
- |
- leal 1(%eax), %ecx |
- jmp L(strip_x_and) |
- |
- |
-C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch |
-C on a 50/50 chance of 0 or 1. The chance of the next bit also being 0 is |
-C only 1/4. |
-C |
-C A second computed %cl shift was tried, but that measured a touch slower |
-C than branching back. |
-C |
-C A branch-free abs(x-y) and min(x,y) calculation was tried, but that |
-C measured about 1 cycle/bit slower. |
- |
- C eax x |
- C ebx common twos |
- C ecx scratch |
- C edx y |
- |
- ALIGN(4) |
-L(swap): |
- addl %eax, %edx C x-y+y = x |
- negl %eax C -(x-y) = y-x |
- |
-L(strip_x): |
- shrl %eax C odd-odd = even, so always one to strip |
- ASSERT(nz) |
- |
-L(strip_x_leal): |
- leal 1(%eax), %ecx |
- |
-L(strip_x_and): |
- andl $1, %ecx C (x^1)&1 |
- |
- shrl %cl, %eax C shift if x even |
- |
- testb $1, %al |
- jz L(strip_x) |
- |
- ASSERT(nz,`testl $1, %eax') C x, y odd |
- ASSERT(nz,`testl $1, %edx') |
- |
- subl %edx, %eax |
- jb L(swap) |
- ja L(strip_x) |
- |
- |
- movl %edx, %eax |
- movl %ebx, %ecx |
- |
-L(done_shll): |
- shll %cl, %eax |
- popl %ebx |
- |
- ret |
- |
- |
-C ----------------------------------------------------------------------------- |
-C Two or more limbs. |
-C |
-C x={src,size} is reduced modulo y using either a plain mod_1 style |
-C remainder, or a modexact_1 style exact division. |
- |
-deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4)) |
- |
- ALIGN(8) |
-L(size_two_or_more): |
- C eax |
- C ebx |
- C ecx common twos |
- C edx y, without common twos |
- C esi |
- C edi |
- C ebp |
- |
-deflit(FRAME_TWO_OR_MORE, FRAME) |
- |
- pushl %edi defframe_pushl(SAVE_EDI) |
- movl PARAM_SRC, %ebx |
- |
-L(y_twos): |
- shrl %edx |
- jnc L(y_twos) |
- |
- movl %ecx, %edi C common twos |
- movl PARAM_SIZE, %ecx |
- |
- pushl %esi defframe_pushl(SAVE_ESI) |
- leal 1(%edx,%edx), %esi C y (odd) |
- |
- movl -4(%ebx,%ecx,4), %eax C src high limb |
- |
- cmpl %edx, %eax C carry if high<divisor |
- |
- sbbl %edx, %edx C -1 if high<divisor |
- |
- addl %edx, %ecx C skip one limb if high<divisor |
- andl %eax, %edx |
- |
- cmpl $MODEXACT_THRESHOLD, %ecx |
- jae L(modexact) |
- |
- |
-L(divide_top): |
- C eax scratch (quotient) |
- C ebx src |
- C ecx counter, size-1 to 1 |
- C edx carry (remainder) |
- C esi divisor (odd) |
- C edi |
- C ebp |
- |
- movl -4(%ebx,%ecx,4), %eax |
- divl %esi |
- loop L(divide_top) |
- |
- |
- movl %edx, %eax C x |
- movl %esi, %edx C y (odd) |
- |
- movl %edi, %ebx C common twos |
- popl %esi |
- |
- popl %edi |
- leal 1(%eax), %ecx |
- |
- orl %eax, %eax |
- jnz L(strip_x_and) |
- |
- |
- movl %ebx, %ecx |
- movl %edx, %eax |
- |
- shll %cl, %eax |
- popl %ebx |
- |
- ret |
- |
- |
- ALIGN(8) |
-L(modexact): |
- C eax |
- C ebx src ptr |
- C ecx size or size-1 |
- C edx |
- C esi y odd |
- C edi common twos |
- C ebp |
- |
- movl PARAM_SIZE, %eax |
- pushl %esi FRAME_pushl() |
- |
- pushl %eax FRAME_pushl() |
- |
- pushl %ebx FRAME_pushl() |
- |
-ifdef(`PIC',` |
- nop C code alignment |
- call L(movl_eip_ebx) |
-L(here): |
- addl $_GLOBAL_OFFSET_TABLE_, %ebx |
- call GSYM_PREFIX`'mpn_modexact_1_odd@PLT |
-',` |
- call GSYM_PREFIX`'mpn_modexact_1_odd |
-') |
- |
- movl %esi, %edx C y odd |
- movl SAVE_ESI, %esi |
- |
- movl %edi, %ebx C common twos |
- movl SAVE_EDI, %edi |
- |
- addl $eval(FRAME - FRAME_TWO_OR_MORE), %esp |
- orl %eax, %eax |
- |
- leal 1(%eax), %ecx |
- jnz L(strip_x_and) |
- |
- |
- movl %ebx, %ecx |
- movl %edx, %eax |
- |
- shll %cl, %eax |
- popl %ebx |
- |
- ret |
- |
- |
-ifdef(`PIC',` |
-L(movl_eip_ebx): |
- movl (%esp), %ebx |
- ret_internal |
-') |
- |
-EPILOGUE() |