| Index: gcc/gmp/mpn/x86_64/mod_34lsub1.asm
|
| diff --git a/gcc/gmp/mpn/x86_64/mod_34lsub1.asm b/gcc/gmp/mpn/x86_64/mod_34lsub1.asm
|
| deleted file mode 100644
|
| index 34df5bb5b792274545b1f0897619b33712241c6d..0000000000000000000000000000000000000000
|
| --- a/gcc/gmp/mpn/x86_64/mod_34lsub1.asm
|
| +++ /dev/null
|
| @@ -1,165 +0,0 @@
|
| -dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
|
| -
|
| -dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007 Free Software Foundation,
|
| -dnl Inc.
|
| -dnl
|
| -dnl This file is part of the GNU MP Library.
|
| -dnl
|
| -dnl The GNU MP Library is free software; you can redistribute it and/or
|
| -dnl modify it under the terms of the GNU Lesser General Public License as
|
| -dnl published by the Free Software Foundation; either version 3 of the
|
| -dnl License, or (at your option) any later version.
|
| -dnl
|
| -dnl The GNU MP Library is distributed in the hope that it will be useful,
|
| -dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| -dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| -dnl Lesser General Public License for more details.
|
| -dnl
|
| -dnl You should have received a copy of the GNU Lesser General Public License
|
| -dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
|
| -
|
| -include(`../config.m4')
|
| -
|
| -
|
| -C cycles/limb
|
| -C K8,K9: 1.0
|
| -C K10: 1.12
|
| -C P4: 3.25
|
| -C P6-15 (Core2): 1.5
|
| -C P6-28 (Atom): 2.5
|
| -
|
| -
|
| -C INPUT PARAMETERS
|
| -C up rdi
|
| -C n rsi
|
| -
|
| -C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
|
| -
|
| -C TODO
|
| -C * Apply the movzwl tricks to the x86/k7 code
|
| -C * Review feed-in and wind-down code. In particular, try to avoid adcq and
|
| -C sbbq to placate Pentium4.
|
| -C * More unrolling and/or index addressing could bring time to under 1 c/l
|
| -C for Athlon64, approaching 0.67 c/l seems possible.
|
| -C * There are recurrencies on the carry registers (r8, r9, r10) that might
|
| -C be the limiting factor for the Pentium4 speed. Splitting these into 6
|
| -C registers would help.
|
| -C * For ultimate Athlon64 performance, a sequence like this might be best.
|
| -C It should reach 0.5 c/l (limited by L1 cache bandwidth).
|
| -C
|
| -C addq (%rdi), %rax
|
| -C adcq 8(%rdi), %rcx
|
| -C adcq 16(%rdi), %rdx
|
| -C adcq $0, %r8
|
| -C addq 24(%rdi), %rax
|
| -C adcq 32(%rdi), %rcx
|
| -C adcq 40(%rdi), %rdx
|
| -C adcq $0, %r8
|
| -C ...
|
| -
|
| -
|
| -ASM_START()
|
| - TEXT
|
| - ALIGN(32)
|
| -PROLOGUE(mpn_mod_34lsub1)
|
| -
|
| - mov $0x0000FFFFFFFFFFFF, %r11
|
| -
|
| - sub $2, %rsi
|
| - ja L(gt2)
|
| -
|
| - mov (%rdi), %rax
|
| - nop
|
| - jb L(1)
|
| -
|
| - mov 8(%rdi), %rsi
|
| - mov %rax, %rdx
|
| - shr $48, %rax C src[0] low
|
| -
|
| - and %r11, %rdx C src[0] high
|
| - add %rdx, %rax
|
| - mov %esi, %edx
|
| -
|
| - shr $32, %rsi C src[1] high
|
| - add %rsi, %rax
|
| -
|
| - shl $16, %rdx C src[1] low
|
| - add %rdx, %rax
|
| -
|
| -L(1): ret
|
| -
|
| -
|
| - ALIGN(16)
|
| -L(gt2): xor %eax, %eax
|
| - xor %ecx, %ecx
|
| - xor %edx, %edx
|
| - xor %r8, %r8
|
| - xor %r9, %r9
|
| - xor %r10, %r10
|
| -
|
| -L(top): add (%rdi), %rax
|
| - adc $0, %r10
|
| - add 8(%rdi), %rcx
|
| - adc $0, %r8
|
| - add 16(%rdi), %rdx
|
| - adc $0, %r9
|
| -
|
| - sub $3,%rsi
|
| - jng L(end)
|
| -
|
| - add 24(%rdi), %rax
|
| - adc $0, %r10
|
| - add 32(%rdi), %rcx
|
| - adc $0, %r8
|
| - add 40(%rdi), %rdx
|
| - lea 48(%rdi), %rdi
|
| - adc $0, %r9
|
| -
|
| - sub $3,%rsi
|
| - jg L(top)
|
| -
|
| -
|
| - add $-24, %rdi
|
| -L(end): add %r9, %rax
|
| - adc %r10, %rcx
|
| - adc %r8, %rdx
|
| -
|
| - inc %rsi
|
| - mov $0x1, %r10d
|
| - js L(combine)
|
| -
|
| - mov $0x10000, %r10d
|
| - adc 24(%rdi), %rax
|
| - dec %rsi
|
| - js L(combine)
|
| -
|
| - adc 32(%rdi), %rcx
|
| - mov $0x100000000, %r10
|
| -
|
| -L(combine):
|
| - sbb %rsi, %rsi C carry
|
| - mov %rax, %rdi C 0mod3
|
| - shr $48, %rax C 0mod3 high
|
| -
|
| - and %r10, %rsi C carry masked
|
| - and %r11, %rdi C 0mod3 low
|
| - mov %ecx, %r10d C 1mod3
|
| -
|
| - add %rsi, %rax C apply carry
|
| - shr $32, %rcx C 1mod3 high
|
| -
|
| - add %rdi, %rax C apply 0mod3 low
|
| - movzwl %dx, %edi C 2mod3
|
| - shl $16, %r10 C 1mod3 low
|
| -
|
| - add %rcx, %rax C apply 1mod3 high
|
| - shr $16, %rdx C 2mod3 high
|
| -
|
| - add %r10, %rax C apply 1mod3 low
|
| - shl $32, %rdi C 2mod3 low
|
| -
|
| - add %rdx, %rax C apply 2mod3 high
|
| - add %rdi, %rax C apply 2mod3 low
|
| -
|
| - ret
|
| -EPILOGUE()
|
|
|