Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(6)

Unified Diff: gcc/gmp/mpn/x86/p6/copyd.asm

Issue 3050029: [gcc] GCC 4.5.0=>4.5.1 (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/nacl-toolchain.git
Patch Set: Created 10 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « gcc/gmp/mpn/x86/p6/aorsmul_1.asm ('k') | gcc/gmp/mpn/x86/p6/lshsub_n.asm » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: gcc/gmp/mpn/x86/p6/copyd.asm
diff --git a/gcc/gmp/mpn/x86/p6/copyd.asm b/gcc/gmp/mpn/x86/p6/copyd.asm
deleted file mode 100644
index 2946f51e7a36601599a2904660fefc7c4e8ea679..0000000000000000000000000000000000000000
--- a/gcc/gmp/mpn/x86/p6/copyd.asm
+++ /dev/null
@@ -1,167 +0,0 @@
-dnl Intel P6 mpn_copyd -- copy limb vector backwards.
-
-dnl Copyright 2001, 2002 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 3 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C P6: 1.75 cycles/limb, or 0.75 if no overlap
-
-
-C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
-C
-C An explicit loop is used because a decrementing rep movsl is a bit slow at
-C 2.4 c/l. That rep movsl also has about a 40 cycle startup time, and the
-C code here stands a chance of being faster if the branches predict well.
-C
-C The slightly strange loop form seems necessary for the claimed speed.
-C Maybe load/store ordering affects it.
-C
-C The source and destination are checked to see if they're actually
-C overlapping, since it might be possible to use an incrementing rep movsl
-C at 0.75 c/l. (It doesn't suffer the bad startup time of the decrementing
-C version.)
-C
-C Enhancements:
-C
-C Top speed for an all-integer copy is probably 1.0 c/l, being one load and
-C one store each cycle. Unrolling the loop below would approach 1.0, but
-C it'd be good to know why something like store/load/subl + store/load/jnz
-C doesn't already run at 1.0 c/l. It looks like it should decode in 2
-C cycles, but doesn't run that way.
-
-defframe(PARAM_SIZE,12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-C NOTE(review): defframe comes from config.m4, not shown here; the offsets are presumably relative to the stack pointer at entry -- confirm.
-dnl Re-use the parameter slots as save areas for esi and edi; each parameter is loaded before its slot is overwritten below.
-define(SAVE_ESI,`PARAM_SIZE')
-define(SAVE_EDI,`PARAM_SRC')
-
- TEXT
- ALIGN(16)
-
-PROLOGUE(mpn_copyd)
-deflit(`FRAME',0)
- C Load size first: its slot doubles as SAVE_ESI and is overwritten next.
- movl PARAM_SIZE, %ecx
- C Save esi, then load src; the src slot doubles as SAVE_EDI.
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
- C Save edi into the already-consumed src slot, then load dst.
- movl %edi, SAVE_EDI
- movl PARAM_DST, %edi
- C ecx = size-1. A borrow means size was 0, so there is nothing to copy.
- subl $1, %ecx
- jb L(zero)
- C movl leaves the flags from the subl above intact, so jz tests size-1 == 0.
- movl (%esi,%ecx,4), %eax C src[size-1]
- jz L(one)
- C size >= 2 here. ecx becomes size-3; below-or-equal means size is 2 or 3.
- movl -4(%esi,%ecx,4), %edx C src[size-2]
- subl $2, %ecx
- jbe L(done_loop) C 2 or 3 limbs only
-
-
- C The usual overlap is
- C
- C high low
- C +------------------+
- C | dst|
- C +------------------+
- C +------------------+
- C | src|
- C +------------------+
- C
- C We can use an incrementing copy in the following circumstances.
- C
- C src+4*size<=dst, since then the regions are disjoint
- C
- C src==dst, clearly (though this shouldn't occur normally)
- C
- C src>dst, since in that case it's a requirement of the
- C parameters that src>=dst+size*4, and hence the
- C regions are disjoint
- C
- C NOTE(review): edx below is formed from edi (dst), giving dst+4*(size-3), so the second test compares that against dst rather than src+4*size against dst as commented -- confirm intent.
- leal (%edi,%ecx,4), %edx
- cmpl %edi, %esi
- jae L(use_movsl) C src >= dst
- C The movl between cmpl and jbe does not alter flags; it reloads edx, which the leal overwrote.
- cmpl %edi, %edx
- movl 4(%esi,%ecx,4), %edx C src[size-2] again
- jbe L(use_movsl) C src+4*size <= dst
-
-
-L(top):
- C eax prev high limb
- C ebx
- C ecx counter, size-3 down to 0 or -1, inclusive, by 2s
- C edx prev low limb
- C esi src
- C edi dst
- C ebp
- C Each pass stores the two limbs loaded previously and loads the next two lower limbs.
- movl %eax, 8(%edi,%ecx,4)
- movl (%esi,%ecx,4), %eax
-
- movl %edx, 4(%edi,%ecx,4)
- movl -4(%esi,%ecx,4), %edx
-
- subl $2, %ecx
- jnbe L(top) C loop while the subtraction neither borrowed nor reached zero
-
-
-L(done_loop):
- movl %eax, 8(%edi,%ecx,4) C store the pending high limb
- movl %edx, 4(%edi,%ecx,4) C store the pending low limb
-
- C copy low limb (needed if size was odd, but will already have been
- C done in the loop if size was even)
- movl (%esi), %eax
-L(one):
- movl %eax, (%edi) C dst[0]
- movl SAVE_EDI, %edi C restore the registers saved at entry
- movl SAVE_ESI, %esi
-
- ret
-
-
-L(use_movsl):
- C eax
- C ebx
- C ecx size-3
- C edx
- C esi src
- C edi dst
- C ebp
- C Undo the earlier subtractions so ecx holds the full limb count for rep movsl.
- addl $3, %ecx
-
- cld C better safe than sorry, see mpn/x86/README
-
- rep
- movsl C copy ecx limbs from (esi) to (edi), addresses ascending after cld
- C Falls through into L(zero) to restore esi and edi and return.
-L(zero):
- movl SAVE_ESI, %esi
- movl SAVE_EDI, %edi
-
- ret
-
-EPILOGUE()
« no previous file with comments | « gcc/gmp/mpn/x86/p6/aorsmul_1.asm ('k') | gcc/gmp/mpn/x86/p6/lshsub_n.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698