Index: gcc/gmp/mpn/x86/k7/dive_1.asm |
diff --git a/gcc/gmp/mpn/x86/k7/dive_1.asm b/gcc/gmp/mpn/x86/k7/dive_1.asm |
deleted file mode 100644 |
index c994e0fb06fd74872fdbde1005e9e3748120ed96..0000000000000000000000000000000000000000 |
--- a/gcc/gmp/mpn/x86/k7/dive_1.asm |
+++ /dev/null |
@@ -1,196 +0,0 @@ |
-dnl AMD K7 mpn_divexact_1 -- mpn by limb exact division. |
- |
-dnl Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc. |
-dnl |
-dnl This file is part of the GNU MP Library. |
-dnl |
-dnl The GNU MP Library is free software; you can redistribute it and/or |
-dnl modify it under the terms of the GNU Lesser General Public License as |
-dnl published by the Free Software Foundation; either version 3 of the |
-dnl License, or (at your option) any later version. |
-dnl |
-dnl The GNU MP Library is distributed in the hope that it will be useful, |
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of |
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
-dnl Lesser General Public License for more details. |
-dnl |
-dnl You should have received a copy of the GNU Lesser General Public License |
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
- |
-include(`../config.m4') |
- |
- |
-C cycles/limb |
-C Athlon: 11.0 |
-C Hammer: 9.0 |
- |
- |
-C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, |
-C mp_limb_t divisor); |
-C |
-C The dependent chain is mul+imul+sub for 11 cycles and that speed is |
-C achieved with no special effort. The load and shrld latencies are hidden |
-C by out of order execution. |
-C |
-C It's a touch faster on size==1 to use the mul-by-inverse than divl. |
- |
-defframe(PARAM_DIVISOR,16) |
-defframe(PARAM_SIZE, 12) |
-defframe(PARAM_SRC, 8) |
-defframe(PARAM_DST, 4) |
- |
-defframe(SAVE_EBX, -4) |
-defframe(SAVE_ESI, -8) |
-defframe(SAVE_EDI, -12) |
-defframe(SAVE_EBP, -16) |
-defframe(VAR_INVERSE, -20) |
-defframe(VAR_DST_END, -24) |
- |
-deflit(STACK_SPACE, 24) |
- |
- TEXT |
- |
- ALIGN(16) |
-PROLOGUE(mpn_divexact_1) |
-deflit(`FRAME',0) |
- |
- movl PARAM_DIVISOR, %eax |
- subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) |
- movl $-1, %ecx C shift count |
- |
- movl %ebp, SAVE_EBP |
- movl PARAM_SIZE, %ebp |
- |
- movl %esi, SAVE_ESI |
- movl %edi, SAVE_EDI |
- |
- C If there's usually only one or two trailing zero bits then this |
- C should be faster than bsfl. |
-L(strip_twos): |
- incl %ecx |
- shrl %eax |
- jnc L(strip_twos) |
- |
- movl %ebx, SAVE_EBX |
- leal 1(%eax,%eax), %ebx C d without twos |
- andl $127, %eax C d/2, 7 bits |
- |
-ifdef(`PIC',` |
- LEA( binvert_limb_table, %edx) |
- movzbl (%eax,%edx), %eax C inv 8 bits |
-',` |
- movzbl binvert_limb_table(%eax), %eax C inv 8 bits |
-') |
- |
- leal (%eax,%eax), %edx C 2*inv |
- movl %ebx, PARAM_DIVISOR C d without twos |
- |
- imull %eax, %eax C inv*inv |
- |
- movl PARAM_SRC, %esi |
- movl PARAM_DST, %edi |
- |
- imull %ebx, %eax C inv*inv*d |
- |
- subl %eax, %edx C inv = 2*inv - inv*inv*d |
- leal (%edx,%edx), %eax C 2*inv |
- |
- imull %edx, %edx C inv*inv |
- |
- leal (%esi,%ebp,4), %esi C src end |
- leal (%edi,%ebp,4), %edi C dst end |
- negl %ebp C -size |
- |
- imull %ebx, %edx C inv*inv*d |
- |
- subl %edx, %eax C inv = 2*inv - inv*inv*d |
- |
- ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB |
- pushl %eax FRAME_pushl() |
- imull PARAM_DIVISOR, %eax |
- cmpl $1, %eax |
- popl %eax FRAME_popl()') |
- |
- movl %eax, VAR_INVERSE |
- movl (%esi,%ebp,4), %eax C src[0] |
- |
- incl %ebp |
- jz L(one) |
- |
- movl (%esi,%ebp,4), %edx C src[1] |
- |
- shrdl( %cl, %edx, %eax) |
- |
- movl %edi, VAR_DST_END |
- xorl %ebx, %ebx |
- jmp L(entry) |
- |
- ALIGN(8) |
-L(top): |
- C eax q |
- C ebx carry bit, 0 or 1 |
- C ecx shift |
- C edx |
- C esi src end |
- C edi dst end |
- C ebp counter, limbs, negative |
- |
- mull PARAM_DIVISOR C carry limb in edx |
- |
- movl -4(%esi,%ebp,4), %eax |
- movl (%esi,%ebp,4), %edi |
- |
- shrdl( %cl, %edi, %eax) |
- |
- subl %ebx, %eax C apply carry bit |
- setc %bl |
- movl VAR_DST_END, %edi |
- |
- subl %edx, %eax C apply carry limb |
- adcl $0, %ebx |
- |
-L(entry): |
- imull VAR_INVERSE, %eax |
- |
- movl %eax, -4(%edi,%ebp,4) |
- incl %ebp |
- jnz L(top) |
- |
- |
- mull PARAM_DIVISOR C carry limb in edx |
- |
- movl -4(%esi), %eax C src high limb |
- shrl %cl, %eax |
- movl SAVE_ESI, %esi |
- |
- subl %ebx, %eax C apply carry bit |
- movl SAVE_EBX, %ebx |
- movl SAVE_EBP, %ebp |
- |
- subl %edx, %eax C apply carry limb |
- |
- imull VAR_INVERSE, %eax |
- |
- movl %eax, -4(%edi) |
- movl SAVE_EDI, %edi |
- addl $STACK_SPACE, %esp |
- |
- ret |
- |
- |
-L(one): |
- shrl %cl, %eax |
- movl SAVE_ESI, %esi |
- movl SAVE_EBX, %ebx |
- |
- imull VAR_INVERSE, %eax |
- |
- movl SAVE_EBP, %ebp |
- movl %eax, -4(%edi) |
- |
- movl SAVE_EDI, %edi |
- addl $STACK_SPACE, %esp |
- |
- ret |
- |
-EPILOGUE() |