Index: gcc/gmp/mpn/x86_64/mod_34lsub1.asm
diff --git a/gcc/gmp/mpn/x86_64/mod_34lsub1.asm b/gcc/gmp/mpn/x86_64/mod_34lsub1.asm
deleted file mode 100644
index 34df5bb5b792274545b1f0897619b33712241c6d..0000000000000000000000000000000000000000
--- a/gcc/gmp/mpn/x86_64/mod_34lsub1.asm
+++ /dev/null
@@ -1,165 +0,0 @@
-dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
-
-dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007 Free Software Foundation,
-dnl Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 3 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C		  cycles/limb
-C K8,K9:	  1.0
-C K10:		  1.12
-C P4:		  3.25
-C P6-15 (Core2): 1.5
-C P6-28 (Atom):	  2.5
-
-
-C INPUT PARAMETERS
-C up	rdi
-C n	rsi
-
-C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
-
-C TODO
-C * Apply the movzwl tricks to the x86/k7 code
-C * Review feed-in and wind-down code.  In particular, try to avoid adcq and
-C   sbbq to placate Pentium4.
-C * More unrolling and/or index addressing could bring time to under 1 c/l
-C   for Athlon64; approaching 0.67 c/l seems possible.
-C * There are recurrences on the carry registers (r8, r9, r10) that might
-C   be the limiting factor for the Pentium4 speed.  Splitting these into 6
-C   registers would help.
-C * For ultimate Athlon64 performance, a sequence like this might be best.
-C   It should reach 0.5 c/l (limited by L1 cache bandwidth).
-C
-C	addq	(%rdi), %rax
-C	adcq	8(%rdi), %rcx
-C	adcq	16(%rdi), %rdx
-C	adcq	$0, %r8
-C	addq	24(%rdi), %rax
-C	adcq	32(%rdi), %rcx
-C	adcq	40(%rdi), %rdx
-C	adcq	$0, %r8
-C	...
-
-
-ASM_START()
-	TEXT
-	ALIGN(32)
-PROLOGUE(mpn_mod_34lsub1)
-
-	mov	$0x0000FFFFFFFFFFFF, %r11
-
-	sub	$2, %rsi
-	ja	L(gt2)
-
-	mov	(%rdi), %rax
-	nop
-	jb	L(1)
-
-	mov	8(%rdi), %rsi
-	mov	%rax, %rdx
-	shr	$48, %rax		C src[0] high
-
-	and	%r11, %rdx		C src[0] low
-	add	%rdx, %rax
-	mov	%esi, %edx
-
-	shr	$32, %rsi		C src[1] high
-	add	%rsi, %rax
-
-	shl	$16, %rdx		C src[1] low
-	add	%rdx, %rax
-
-L(1):	ret
-
-
-	ALIGN(16)
-L(gt2):	xor	%eax, %eax
-	xor	%ecx, %ecx
-	xor	%edx, %edx
-	xor	%r8, %r8
-	xor	%r9, %r9
-	xor	%r10, %r10
-
-L(top):	add	(%rdi), %rax
-	adc	$0, %r10
-	add	8(%rdi), %rcx
-	adc	$0, %r8
-	add	16(%rdi), %rdx
-	adc	$0, %r9
-
-	sub	$3, %rsi
-	jng	L(end)
-
-	add	24(%rdi), %rax
-	adc	$0, %r10
-	add	32(%rdi), %rcx
-	adc	$0, %r8
-	add	40(%rdi), %rdx
-	lea	48(%rdi), %rdi
-	adc	$0, %r9
-
-	sub	$3, %rsi
-	jg	L(top)
-
-
-	add	$-24, %rdi
-L(end):	add	%r9, %rax
-	adc	%r10, %rcx
-	adc	%r8, %rdx
-
-	inc	%rsi
-	mov	$0x1, %r10d
-	js	L(combine)
-
-	mov	$0x10000, %r10d
-	adc	24(%rdi), %rax
-	dec	%rsi
-	js	L(combine)
-
-	adc	32(%rdi), %rcx
-	mov	$0x100000000, %r10
-
-L(combine):
-	sbb	%rsi, %rsi		C carry
-	mov	%rax, %rdi		C 0mod3
-	shr	$48, %rax		C 0mod3 high
-
-	and	%r10, %rsi		C carry masked
-	and	%r11, %rdi		C 0mod3 low
-	mov	%ecx, %r10d		C 1mod3
-
-	add	%rsi, %rax		C apply carry
-	shr	$32, %rcx		C 1mod3 high
-
-	add	%rdi, %rax		C apply 0mod3 low
-	movzwl	%dx, %edi		C 2mod3
-	shl	$16, %r10		C 1mod3 low
-
-	add	%rcx, %rax		C apply 1mod3 high
-	shr	$16, %rdx		C 2mod3 high
-
-	add	%r10, %rax		C apply 1mod3 low
-	shl	$32, %rdi		C 2mod3 low
-
-	add	%rdx, %rax		C apply 2mod3 high
-	add	%rdi, %rax		C apply 2mod3 low
-
-	ret
-EPILOGUE()
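
For anyone reviewing this deletion, the following is a minimal portable C sketch of the reduction the file implemented, assuming 64-bit limbs (uint64_t standing in for mp_limb_t).  It is not GMP's code; the function name mod_34lsub1_ref and the informal bound on n are illustrative assumptions.  Like the assembly, it returns a value that is congruent to {up, n} modulo 2^48 - 1 rather than a fully reduced remainder: limbs are summed into three accumulators selected by index mod 3 (since 2^192 == 1 mod 2^48 - 1), carry-outs are counted separately, and the 48-bit fields are folded at the end with weights 1, 2^16 and 2^32, mirroring the 0mod3/1mod3/2mod3 combine above.

#include <stdint.h>
#include <stddef.h>

/* Illustrative reference only, not GMP code: return a value congruent to
   {up, n} modulo 2^48 - 1.  Assumes n >= 1 and n small enough that the
   folded sum below cannot overflow 64 bits.  */
uint64_t
mod_34lsub1_ref (const uint64_t *up, size_t n)
{
  const uint64_t m48 = 0x0000FFFFFFFFFFFFull;	/* 2^48 - 1 */
  uint64_t a0 = 0, a1 = 0, a2 = 0;	/* limbs with index 0, 1, 2 mod 3 */
  uint64_t c0 = 0, c1 = 0, c2 = 0;	/* carry-outs of each accumulator */
  size_t i;

  for (i = 0; i < n; i++)
    {
      uint64_t x = up[i];
      switch (i % 3)
	{
	case 0: c0 += (a0 += x) < x; break;	/* carries counted like r10 above */
	case 1: c1 += (a1 += x) < x; break;	/* ... like r8 */
	case 2: c2 += (a2 += x) < x; break;	/* ... like r9 */
	}
    }

  /* Weights modulo 2^48-1: a0 counts as 1, a1 as 2^64 == 2^16, a2 as
     2^128 == 2^32; a carry out of a0/a1/a2 counts as 2^16/2^32/1.  */
  return (a0 & m48) + (a0 >> 48)
       + ((a1 & 0xFFFFFFFFull) << 16) + (a1 >> 32)
       + ((a2 & 0xFFFFull) << 32) + (a2 >> 16)
       + (c0 << 16) + (c1 << 32) + c2;
}

Feeding a few operands through this sketch and through mpn_mod_34lsub1 and comparing the results modulo 2^48 - 1 is a quick consistency check.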