Index: gcc/gmp/mpn/x86_64/lshsub_n.asm |
diff --git a/gcc/gmp/mpn/x86_64/lshsub_n.asm b/gcc/gmp/mpn/x86_64/lshsub_n.asm |
deleted file mode 100644 |
index dc8576b2209287f798a27923c6e11fc176183ae5..0000000000000000000000000000000000000000 |
--- a/gcc/gmp/mpn/x86_64/lshsub_n.asm |
+++ /dev/null |
@@ -1,152 +0,0 @@ |
-dnl AMD64 mpn_lshsub_n. R = 2^k(U - V). |
- |
-dnl Copyright 2006 Free Software Foundation, Inc. |
- |
-dnl This file is part of the GNU MP Library. |
- |
-dnl The GNU MP Library is free software; you can redistribute it and/or modify |
-dnl it under the terms of the GNU Lesser General Public License as published |
-dnl by the Free Software Foundation; either version 3 of the License, or (at |
-dnl your option) any later version. |
- |
-dnl The GNU MP Library is distributed in the hope that it will be useful, but |
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
-dnl License for more details. |
- |
-dnl You should have received a copy of the GNU Lesser General Public License |
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
- |
-include(`../config.m4') |
- |
- |
-C cycles/limb |
-C K8,K9: 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l) |
-C K10: 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l) |
-C P4: 16.5 |
-C P6-15: 4.35 |
- |
-C This was written quickly and not optimized at all, but it runs very well on |
-C K8. But perhaps one could get under 3 c/l. Ideas: |
-C 1) Use indexing to save the 3 LEA |
-C 2) Write reasonable feed-in code |
-C 3) Be more clever about register usage |
-C 4) Unroll more; handling CL negation and carry save/restore costs a lot now |
-C 5) Reschedule |
- |
-C INPUT PARAMETERS (m4 names for the registers the arguments arrive in) |
-define(`rp', `%rdi') C result limbs are stored here |
-define(`up', `%rsi') C limbs loaded as the minuend |
-define(`vp', `%rdx') C limbs subtracted from the up limbs |
-define(`n', `%rcx') C limb count; moved to rax because cl holds the shift |
-define(`cnt', `%r8') C left shift count, k in R = 2^k(U - V) |
- |
-ASM_START() |
- TEXT |
- ALIGN(16) |
-PROLOGUE(mpn_lshsub_n) |
-C Subtracts the vp limbs from the up limbs, shifts left by cl, stores at rp. |
- push %r12 C r12-r15 and rbx are overwritten below and |
- push %r13 C popped again before the ret |
- push %r14 |
- push %r15 |
- push %rbx |
- |
- mov n, %rax C rax = limb count; rcx is reused for the shift count |
- xor %ebx, %ebx C clear carry save register |
- mov %r8d, %ecx C shift count |
- xor %r15d, %r15d C limb carry: bits shifted out of the previous limb |
- |
- mov %eax, %r11d |
- and $3, %r11d C r11 = n mod 4 |
- je L(4) C no odd limbs, go straight to the 4-limb loop |
- sub $1, %r11d C counter for the sub/jnc pair that ends the loop |
-C One limb per iteration, run n mod 4 times. |
-L(oopette): |
- add %ebx, %ebx C restore carry flag (ebx is 0 or -1) |
- mov 0(up), %r8 |
- lea 8(up), up C lea leaves the flags alone |
- sbb 0(vp), %r8 C r8 = up limb - vp limb - borrow |
- mov %r8, %r12 C keep the unshifted difference |
- sbb %ebx, %ebx C save carry flag as 0 or -1 |
- shl %cl, %r8 |
- or %r15, %r8 C bring in the bits shifted out of the previous limb |
- mov %r12, %r15 |
- lea 8(vp), vp |
- neg %cl C shifts use cl mod 64: 64-cnt when cnt is 1..63 |
- shr %cl, %r15 C r15 = bits this limb hands to the next one |
- neg %cl C back to the left shift count |
- mov %r8, 0(rp) |
- lea 8(rp), rp |
- sub $1, %r11d |
- jnc L(oopette) |
- |
-L(4): |
- sub $4, %rax C rax still holds n; borrow means fewer than 4 limbs |
- jc L(end) |
-C Four limbs per iteration. |
- ALIGN(16) |
-L(oop): |
- add %ebx, %ebx C restore carry flag |
- |
- mov 0(up), %r8 |
- mov 8(up), %r9 |
- mov 16(up), %r10 |
- mov 24(up), %r11 |
- |
- lea 32(up), up |
- |
- sbb 0(vp), %r8 C borrow runs through all four sbb; mov keeps flags |
- mov %r8, %r12 C r12-r14 = unshifted copies of limbs 0-2 |
- sbb 8(vp), %r9 |
- mov %r9, %r13 |
- sbb 16(vp), %r10 |
- mov %r10, %r14 |
- sbb 24(vp), %r11 |
- |
- sbb %ebx, %ebx C save carry flag |
- |
- shl %cl, %r8 |
- shl %cl, %r9 |
- shl %cl, %r10 |
- or %r15, %r8 C old r15 = bits carried in from the previous limb |
- mov %r11, %r15 C unshifted copy of limb 3 |
- shl %cl, %r11 |
- |
- lea 32(vp), vp |
- |
- neg %cl C right shift count, 64-cnt when cnt is 1..63 |
- |
- shr %cl, %r12 |
- shr %cl, %r13 |
- shr %cl, %r14 |
- shr %cl, %r15 C used next loop |
- |
- or %r12, %r9 C each limb takes the high bits of the limb below |
- or %r13, %r10 |
- or %r14, %r11 |
- |
- neg %cl C back to the left shift count |
- |
- mov %r8, 0(rp) |
- mov %r9, 8(rp) |
- mov %r10, 16(rp) |
- mov %r11, 24(rp) |
- |
- lea 32(rp), rp |
- |
- sub $4, %rax |
- jnc L(oop) C loop again while another 4 limbs remain |
-L(end): |
- neg %ebx C saved borrow, now 0 or 1 |
- shl %cl, %rbx |
- adc %r15, %rbx C add the bits shifted out of the top limb, plus CF |
- mov %rbx, %rax C value handed back in rax |
- pop %rbx |
- pop %r15 |
- pop %r14 |
- pop %r13 |
- pop %r12 |
- |
- ret |
-EPILOGUE() |