Index: gcc/gmp/mpn/x86/k7/gcd_1.asm |
diff --git a/gcc/gmp/mpn/x86/k7/gcd_1.asm b/gcc/gmp/mpn/x86/k7/gcd_1.asm |
deleted file mode 100644 |
index f912f43730411d68d3d1443fc26973b6285f0c23..0000000000000000000000000000000000000000 |
--- a/gcc/gmp/mpn/x86/k7/gcd_1.asm |
+++ /dev/null |
@@ -1,369 +0,0 @@ |
-dnl AMD K7 mpn_gcd_1 -- mpn by 1 gcd. |
- |
-dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc. |
-dnl |
-dnl This file is part of the GNU MP Library. |
-dnl |
-dnl The GNU MP Library is free software; you can redistribute it and/or |
-dnl modify it under the terms of the GNU Lesser General Public License as |
-dnl published by the Free Software Foundation; either version 3 of the |
-dnl License, or (at your option) any later version. |
-dnl |
-dnl The GNU MP Library is distributed in the hope that it will be useful, |
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of |
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
-dnl Lesser General Public License for more details. |
-dnl |
-dnl You should have received a copy of the GNU Lesser General Public License |
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
- |
-include(`../config.m4') |
- |
- |
-C K7: 6.75 cycles/bit (approx) 1x1 gcd |
-C 11.0 cycles/limb Nx1 reduction (modexact_1_odd) |
- |
- |
-dnl Reduce using x%y if x is more than DIV_THRESHOLD bits bigger than y, |
-dnl where x is the larger of the two. See tune/README for more. |
-dnl |
-dnl divl at 40 cycles compared to the gcd at about 7 cycles/bitpair |
-dnl suggests 40/7*2=11.4 but 7 seems to be about right. |
- |
-deflit(DIV_THRESHOLD, 7) |
- |
- |
-C table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. |
-C |
-C This is mixed in with the code, but as per the k7 optimization manual it's |
-C a full cache line and suitably aligned so it won't get swapped between |
-C code and data. Having it in TEXT rather than RODATA saves needing a GOT |
-C entry when PIC. |
-C |
-C Actually, there doesn't seem to be a measurable difference between this in |
-C it's own cache line or plonked in the middle of the code. Presumably |
-C since TEXT is read-only there's no worries about coherency. |
- |
-deflit(MASK, 63) |
-deflit(MAXSHIFT, 6) |
- |
- TEXT |
- ALIGN(64) |
-L(table): |
- .byte MAXSHIFT |
-forloop(i,1,MASK, |
-` .byte m4_count_trailing_zeros(i) |
-') |
- |
- |
-C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t limb); |
-C |
- |
-defframe(PARAM_LIMB, 12) |
-defframe(PARAM_SIZE, 8) |
-defframe(PARAM_SRC, 4) |
- |
-defframe(SAVE_EBX, -4) |
-defframe(SAVE_ESI, -8) |
-defframe(SAVE_EDI, -12) |
-defframe(SAVE_EBP, -16) |
-defframe(CALL_DIVISOR,-20) |
-defframe(CALL_SIZE, -24) |
-defframe(CALL_SRC, -28) |
- |
-deflit(STACK_SPACE, 28) |
- |
- TEXT |
- ALIGN(16) |
- |
-PROLOGUE(mpn_gcd_1) |
-deflit(`FRAME',0) |
- |
- ASSERT(ne, `cmpl $0, PARAM_LIMB') C y!=0 |
- ASSERT(ae, `cmpl $1, PARAM_SIZE') C size>=1 |
- |
- movl PARAM_SRC, %eax |
- movl PARAM_LIMB, %edx |
- subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) |
- |
- movl %esi, SAVE_ESI |
- movl %ebx, SAVE_EBX |
- |
- movl (%eax), %esi C src low limb |
- |
-ifdef(`PIC',` |
- movl %edi, SAVE_EDI |
- call L(movl_eip_to_edi) |
-L(here): |
- addl $L(table)-L(here), %edi |
-') |
- |
- movl %esi, %ebx |
- orl %edx, %esi C x|y |
- movl $-1, %ecx |
- |
-L(twos): |
- incl %ecx |
- shrl %esi |
- jnc L(twos) C 3/4 chance of x or y odd already |
- |
- shrl %cl, %ebx |
- shrl %cl, %edx |
- movl %ecx, %esi C common twos |
- |
- movl PARAM_SIZE, %ecx |
- cmpl $1, %ecx |
- ja L(divide) |
- |
- |
- C eax |
- C ebx x |
- C ecx |
- C edx y |
- C esi common twos |
- C edi [PIC] L(table) |
- C ebp |
- |
- movl %edx, %eax |
- cmpl %ebx, %edx |
- |
- cmovb( %ebx, %eax) C swap to make x bigger than y |
- cmovb( %edx, %ebx) |
- |
- |
-L(strip_y): |
- C eax x |
- C ebx y |
- C ecx |
- C edx |
- C esi common twos |
- C edi [PIC] L(table) |
- C ebp |
- |
- ASSERT(nz,`orl %ebx,%ebx') |
- shrl %ebx |
- jnc L(strip_y) |
- rcll %ebx |
- |
- |
- C eax x |
- C ebx y (odd) |
- C ecx |
- C edx |
- C esi common twos |
- C edi [PIC] L(table) |
- C ebp |
- |
- movl %eax, %ecx |
- movl %ebx, %edx |
- shrl $DIV_THRESHOLD, %eax |
- |
- cmpl %eax, %ebx |
- movl %ecx, %eax |
- ja L(strip_x_entry) C do x%y if x much bigger than y |
- |
- |
- xorl %edx, %edx |
- |
- divl %ebx |
- |
- orl %edx, %edx |
- movl %edx, %eax C remainder -> x |
- movl %ebx, %edx C y |
- |
- jz L(done_ebx) |
- jmp L(strip_x) |
- |
- |
- C Offset 0x9D here for non-PIC. About 0.4 cycles/bit is saved by |
- C ensuring the end of the jnz at the end of this loop doesn't cross |
- C into the next cache line at 0xC0. |
- C |
- C PIC on the other hand is offset 0xAC here and extends to 0xC9, so |
- C it crosses but doesn't suffer any measurable slowdown. |
- |
-L(top): |
- C eax x |
- C ebx y-x |
- C ecx x-y |
- C edx y |
- C esi twos, for use at end |
- C edi [PIC] L(table) |
- |
- cmovc( %ebx, %ecx) C if x-y gave carry, use x and y-x |
- cmovc( %eax, %edx) |
- |
-L(strip_x): |
- movl %ecx, %eax |
-L(strip_x_entry): |
- andl $MASK, %ecx |
- |
- ASSERT(nz, `orl %eax, %eax') |
- |
-ifdef(`PIC',` |
- movb (%ecx,%edi), %cl |
-',` |
- movb L(table) (%ecx), %cl |
-') |
- |
- shrl %cl, %eax |
- cmpb $MAXSHIFT, %cl |
- |
- movl %eax, %ecx |
- movl %edx, %ebx |
- je L(strip_x) |
- |
- ASSERT(nz, `testl $1, %eax') C both odd |
- ASSERT(nz, `testl $1, %edx') |
- |
- subl %eax, %ebx |
- subl %edx, %ecx |
- jnz L(top) |
- |
- |
-L(done): |
- movl %esi, %ecx |
- movl SAVE_ESI, %esi |
-ifdef(`PIC',` |
- movl SAVE_EDI, %edi |
-') |
- |
- shll %cl, %eax |
- movl SAVE_EBX, %ebx |
- addl $FRAME, %esp |
- |
- ret |
- |
- |
- |
-C ----------------------------------------------------------------------------- |
-C two or more limbs |
- |
-dnl MODEXACT_THRESHOLD is the size at which it's better to call |
-dnl mpn_modexact_1_odd than do an inline loop. |
- |
-deflit(MODEXACT_THRESHOLD, ifdef(`PIC',6,5)) |
- |
-L(divide): |
- C eax src |
- C ebx |
- C ecx size |
- C edx y |
- C esi common twos |
- C edi [PIC] L(table) |
- C ebp |
- |
-L(divide_strip_y): |
- ASSERT(nz,`orl %edx,%edx') |
- shrl %edx |
- jnc L(divide_strip_y) |
- leal 1(%edx,%edx), %ebx C y now odd |
- |
- movl %ebp, SAVE_EBP |
- movl %eax, %ebp |
- movl -4(%eax,%ecx,4), %eax C src high limb |
- |
- cmp $MODEXACT_THRESHOLD, %ecx |
- jae L(modexact) |
- |
- cmpl %ebx, %eax C high cmp divisor |
- movl $0, %edx |
- |
- cmovc( %eax, %edx) C skip a div if high<divisor |
- sbbl $0, %ecx |
- |
- |
-L(divide_top): |
- C eax scratch (quotient) |
- C ebx y |
- C ecx counter (size to 1, inclusive) |
- C edx carry (remainder) |
- C esi common twos |
- C edi [PIC] L(table) |
- C ebp src |
- |
- movl -4(%ebp,%ecx,4), %eax |
- |
- divl %ebx |
- |
- decl %ecx |
- jnz L(divide_top) |
- |
- |
- C eax |
- C ebx y (odd) |
- C ecx |
- C edx x |
- C esi common twos |
- C edi [PIC] L(table) |
- C ebp |
- |
- orl %edx, %edx |
- movl SAVE_EBP, %ebp |
- movl %edx, %eax |
- |
- movl %edx, %ecx |
- movl %ebx, %edx |
- jnz L(strip_x_entry) |
- |
- |
-L(done_ebx): |
- movl %ebx, %eax |
- jmp L(done) |
- |
- |
- |
-L(modexact): |
- C eax |
- C ebx y |
- C ecx size |
- C edx |
- C esi common twos |
- C edi [PIC] L(table) |
- C ebp src |
- |
-ifdef(`PIC',` |
- movl %ebp, CALL_SRC |
- movl %ebx, %ebp C y |
- movl %edi, %ebx C L(table) |
- |
- addl $_GLOBAL_OFFSET_TABLE_+[.-L(table)], %ebx |
- movl %ebp, CALL_DIVISOR |
- movl %ecx, CALL_SIZE |
- |
- call GSYM_PREFIX`'mpn_modexact_1_odd@PLT |
-',` |
-dnl non-PIC |
- movl %ebx, CALL_DIVISOR |
- movl %ebp, CALL_SRC |
- movl %ecx, CALL_SIZE |
- |
- call GSYM_PREFIX`'mpn_modexact_1_odd |
-') |
- |
- C eax x |
- C ebx [non-PIC] y |
- C ecx |
- C edx |
- C esi common twos |
- C edi [PIC] L(table) |
- C ebp [PIC] y |
- |
- orl %eax, %eax |
- movl ifdef(`PIC',`%ebp',`%ebx'), %edx |
- movl SAVE_EBP, %ebp |
- |
- movl %eax, %ecx |
- jnz L(strip_x_entry) |
- |
- movl %edx, %eax |
- jmp L(done) |
- |
- |
-ifdef(`PIC', ` |
-L(movl_eip_to_edi): |
- movl (%esp), %edi |
- ret_internal |
-') |
- |
-EPILOGUE() |