Index: gcc/gmp/mpn/x86/p6/copyd.asm |
diff --git a/gcc/gmp/mpn/x86/p6/copyd.asm b/gcc/gmp/mpn/x86/p6/copyd.asm |
deleted file mode 100644 |
index 2946f51e7a36601599a2904660fefc7c4e8ea679..0000000000000000000000000000000000000000 |
--- a/gcc/gmp/mpn/x86/p6/copyd.asm |
+++ /dev/null |
@@ -1,167 +0,0 @@ |
-dnl Intel P6 mpn_copyd -- copy limb vector backwards. |
- |
-dnl Copyright 2001, 2002 Free Software Foundation, Inc. |
-dnl |
-dnl This file is part of the GNU MP Library. |
-dnl |
-dnl The GNU MP Library is free software; you can redistribute it and/or |
-dnl modify it under the terms of the GNU Lesser General Public License as |
-dnl published by the Free Software Foundation; either version 3 of the |
-dnl License, or (at your option) any later version. |
-dnl |
-dnl The GNU MP Library is distributed in the hope that it will be useful, |
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of |
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
-dnl Lesser General Public License for more details. |
-dnl |
-dnl You should have received a copy of the GNU Lesser General Public License |
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
- |
-include(`../config.m4') |
- |
- |
-C P6: 1.75 cycles/limb, or 0.75 if no overlap |
- |
- |
-C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); |
-C |
-C An explicit loop is used because a decrementing rep movsl is a bit slow at |
-C 2.4 c/l. That rep movsl also has about a 40 cycle startup time, and the |
-C code here stands a chance of being faster if the branches predict well. |
-C |
-C The slightly strange loop form seems necessary for the claimed speed. |
-C Maybe load/store ordering affects it. |
-C |
-C The source and destination are checked to see if they're actually |
-C overlapping, since it might be possible to use an incrementing rep movsl |
-C at 0.75 c/l. (It doesn't suffer the bad startup time of the decrementing |
-C version.) |
-C |
-C Enhancements: |
-C |
-C Top speed for an all-integer copy is probably 1.0 c/l, being one load and |
-C one store each cycle. Unrolling the loop below would approach 1.0, but |
-C it'd be good to know why something like store/load/subl + store/load/jnz |
-C doesn't already run at 1.0 c/l. It looks like it should decode in 2 |
-C cycles, but doesn't run that way. |
- |
-defframe(PARAM_SIZE,12) |
-defframe(PARAM_SRC, 8) |
-defframe(PARAM_DST, 4) |
- |
-dnl re-using parameter space |
-define(SAVE_ESI,`PARAM_SIZE') |
-define(SAVE_EDI,`PARAM_SRC') |
- |
- TEXT |
- ALIGN(16) |
- |
-PROLOGUE(mpn_copyd) |
-deflit(`FRAME',0) |
- |
- movl PARAM_SIZE, %ecx |
- |
- movl %esi, SAVE_ESI |
- movl PARAM_SRC, %esi |
- |
- movl %edi, SAVE_EDI |
- movl PARAM_DST, %edi |
- |
- subl $1, %ecx |
- jb L(zero) |
- |
- movl (%esi,%ecx,4), %eax C src[size-1] |
- jz L(one) |
- |
- movl -4(%esi,%ecx,4), %edx C src[size-2] |
- subl $2, %ecx |
- jbe L(done_loop) C 2 or 3 limbs only |
- |
- |
- C The usual overlap is |
- C |
- C high low |
- C +------------------+ |
- C | dst| |
- C +------------------+ |
- C +------------------+ |
- C | src| |
- C +------------------+ |
- C |
- C We can use an incrementing copy in the following circumstances. |
- C |
- C src+4*size<=dst, since then the regions are disjoint |
- C |
- C src==dst, clearly (though this shouldn't occur normally) |
- C |
- C src>dst, since in that case it's a requirement of the |
- C parameters that src>=dst+size*4, and hence the |
- C regions are disjoint |
- C |
- |
- leal (%edi,%ecx,4), %edx |
- cmpl %edi, %esi |
- jae L(use_movsl) C src >= dst |
- |
- cmpl %edi, %edx |
- movl 4(%esi,%ecx,4), %edx C src[size-2] again |
- jbe L(use_movsl) C src+4*size <= dst |
- |
- |
-L(top): |
- C eax prev high limb |
- C ebx |
- C ecx counter, size-3 down to 0 or -1, inclusive, by 2s |
- C edx prev low limb |
- C esi src |
- C edi dst |
- C ebp |
- |
- movl %eax, 8(%edi,%ecx,4) |
- movl (%esi,%ecx,4), %eax |
- |
- movl %edx, 4(%edi,%ecx,4) |
- movl -4(%esi,%ecx,4), %edx |
- |
- subl $2, %ecx |
- jnbe L(top) |
- |
- |
-L(done_loop): |
- movl %eax, 8(%edi,%ecx,4) |
- movl %edx, 4(%edi,%ecx,4) |
- |
- C copy low limb (needed if size was odd, but will already have been |
- C done in the loop if size was even) |
- movl (%esi), %eax |
-L(one): |
- movl %eax, (%edi) |
- movl SAVE_EDI, %edi |
- movl SAVE_ESI, %esi |
- |
- ret |
- |
- |
-L(use_movsl): |
- C eax |
- C ebx |
- C ecx size-3 |
- C edx |
- C esi src |
- C edi dst |
- C ebp |
- |
- addl $3, %ecx |
- |
- cld C better safe than sorry, see mpn/x86/README |
- |
- rep |
- movsl |
- |
-L(zero): |
- movl SAVE_ESI, %esi |
- movl SAVE_EDI, %edi |
- |
- ret |
- |
-EPILOGUE() |