Index: gcc/gmp/mpn/x86/k6/k62mmx/copyd.asm |
diff --git a/gcc/gmp/mpn/x86/k6/k62mmx/copyd.asm b/gcc/gmp/mpn/x86/k6/k62mmx/copyd.asm |
deleted file mode 100644 |
index 227ed7878367dbf6caed4071703ced01f0a4e5cf..0000000000000000000000000000000000000000 |
--- a/gcc/gmp/mpn/x86/k6/k62mmx/copyd.asm |
+++ /dev/null |
@@ -1,107 +0,0 @@ |
-dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing. |
- |
-dnl Copyright 2001, 2002 Free Software Foundation, Inc. |
-dnl |
-dnl This file is part of the GNU MP Library. |
-dnl |
-dnl The GNU MP Library is free software; you can redistribute it and/or |
-dnl modify it under the terms of the GNU Lesser General Public License as |
-dnl published by the Free Software Foundation; either version 3 of the |
-dnl License, or (at your option) any later version. |
-dnl |
-dnl The GNU MP Library is distributed in the hope that it will be useful, |
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of |
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
-dnl Lesser General Public License for more details. |
-dnl |
-dnl You should have received a copy of the GNU Lesser General Public License |
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
- |
-include(`../config.m4') |
- |
- |
-C K6-2: 1.0 cycles/limb |
- |
- |
-C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); |
-C |
-C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30 |
-C cycle startup time, which amounts for instance to a 2x speedup at 15 |
-C limbs. |
-C |
-C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by |
-C processing one limb separately to make it aligned. This and a final odd |
-C limb are handled in a branch-free fashion, ending up re-copying if the |
-C special case isn't needed. |
-C |
-C Alternatives: |
-C |
-C There used to be a big unrolled version of this, running at 0.56 c/l if |
-C the destination was aligned, but that seemed rather excessive for the |
-C relative importance of copyd. |
-C |
-C If the destination alignment is ignored and just left to run at 1.17 c/l |
-C some code size and a fixed few cycles can be saved. Considering how few |
-C uses copyd finds perhaps that should be favoured. The current code has |
-C the attraction of being no slower than a basic rep movsl though. |
- |
-defframe(PARAM_SIZE,12) |
-defframe(PARAM_SRC, 8) |
-defframe(PARAM_DST, 4) |
- |
-dnl re-using parameter space |
-define(SAVE_EBX,`PARAM_SIZE') |
- |
- TEXT |
- ALIGN(16) |
- |
-PROLOGUE(mpn_copyd) |
-deflit(`FRAME',0) |
- |
- movl PARAM_SIZE, %ecx |
- movl %ebx, SAVE_EBX |
- |
- movl PARAM_SRC, %eax |
- movl PARAM_DST, %edx |
- |
- subl $1, %ecx C better code alignment than decl |
- jb L(zero) |
- |
- jz L(one_more) |
- leal 4(%edx,%ecx,4), %ebx |
- |
-Zdisp( movd, 0,(%eax,%ecx,4), %mm0) C high limb |
-Zdisp( movd, %mm0, 0,(%edx,%ecx,4)) C Zdisp for good code alignment |
- |
- cmpl $1, %ecx |
- je L(one_more) |
- |
- shrl $2, %ebx |
- andl $1, %ebx C 1 if dst[size-2] unaligned |
- |
- subl %ebx, %ecx |
- nop C code alignment |
- |
-L(top): |
- C eax src |
- C ebx |
- C ecx counter |
- C edx dst |
- |
- movq -4(%eax,%ecx,4), %mm0 |
- subl $2, %ecx |
- |
- movq %mm0, 4(%edx,%ecx,4) |
- ja L(top) |
- |
- |
-L(one_more): |
- movd (%eax), %mm0 |
- movd %mm0, (%edx) |
- |
- movl SAVE_EBX, %ebx |
- emms_or_femms |
-L(zero): |
- ret |
- |
-EPILOGUE() |