Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(66)

Unified Diff: gcc/gmp/mpn/x86_64/mod_34lsub1.asm

Issue 3050029: [gcc] GCC 4.5.0=>4.5.1 (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/nacl-toolchain.git
Patch Set: Created 10 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « gcc/gmp/mpn/x86_64/lshsub_n.asm ('k') | gcc/gmp/mpn/x86_64/mul_1.asm » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: gcc/gmp/mpn/x86_64/mod_34lsub1.asm
diff --git a/gcc/gmp/mpn/x86_64/mod_34lsub1.asm b/gcc/gmp/mpn/x86_64/mod_34lsub1.asm
deleted file mode 100644
index 34df5bb5b792274545b1f0897619b33712241c6d..0000000000000000000000000000000000000000
--- a/gcc/gmp/mpn/x86_64/mod_34lsub1.asm
+++ /dev/null
@@ -1,165 +0,0 @@
-dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
-
-dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007 Free Software Foundation,
-dnl Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 3 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C K8,K9: 1.0
-C K10: 1.12
-C P4: 3.25
-C P6-15 (Core2): 1.5
-C P6-28 (Atom): 2.5
-
-
-C INPUT PARAMETERS
-C up rdi
-C n rsi
-
-C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
-
-C TODO
-C * Apply the movzwl tricks to the x86/k7 code
-C * Review feed-in and wind-down code. In particular, try to avoid adcq and
-C sbbq to placate Pentium4.
-C * More unrolling and/or index addressing could bring time to under 1 c/l
-C for Athlon64; approaching 0.67 c/l seems possible.
-C * There are recurrences on the carry registers (r8, r9, r10) that might
-C be the limiting factor for the Pentium4 speed. Splitting these into 6
-C registers would help.
-C * For ultimate Athlon64 performance, a sequence like this might be best.
-C It should reach 0.5 c/l (limited by L1 cache bandwidth).
-C
-C addq (%rdi), %rax
-C adcq 8(%rdi), %rcx
-C adcq 16(%rdi), %rdx
-C adcq $0, %r8
-C addq 24(%rdi), %rax
-C adcq 32(%rdi), %rcx
-C adcq 40(%rdi), %rdx
-C adcq $0, %r8
-C ...
-
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_mod_34lsub1)
-C Returns in rax a value congruent to {up,n} mod 2^48-1; the sum is not fully reduced.
- mov $0x0000FFFFFFFFFFFF, %r11 C r11 = 2^48-1, mask for the low 48 bits
-
- sub $2, %rsi C rsi = n-2; CF and ZF steer the two branches below
- ja L(gt2) C no borrow and nonzero: n > 2, use the general loop
-
- mov (%rdi), %rax C rax = up[0]; mov leaves the flags from sub intact
- nop C NOTE(review): presumably alignment/decode padding, confirm
- jb L(1) C borrow from sub means n < 2: return up[0] unchanged
-C n == 2: split up[0] and up[1] at 48-bit boundaries and add the pieces (2^48 == 1).
- mov 8(%rdi), %rsi C rsi = up[1]
- mov %rax, %rdx C rdx = copy of up[0]
- shr $48, %rax C src[0] high (bits 48-63)
-
- and %r11, %rdx C src[0] low (bits 0-47)
- add %rdx, %rax
- mov %esi, %edx C edx = low 32 bits of src[1], zero-extended into rdx
-
- shr $32, %rsi C src[1] high (weight 2^96 == 1)
- add %rsi, %rax
-
- shl $16, %rdx C src[1] low (weight 2^64 == 2^16)
- add %rdx, %rax
-
-L(1): ret
-C General case, n > 2: rax, rcx, rdx accumulate the limbs at index 0, 1, 2 mod 3;
-C r10, r8, r9 count the carries out of rax, rcx, rdx respectively.
- ALIGN(16)
-L(gt2): xor %eax, %eax C 32-bit xor clears the full 64-bit register
- xor %ecx, %ecx
- xor %edx, %edx
- xor %r8, %r8
- xor %r9, %r9
- xor %r10, %r10
-
-L(top): add (%rdi), %rax C limb 0mod3
- adc $0, %r10 C count carry out of rax
- add 8(%rdi), %rcx C limb 1mod3
- adc $0, %r8 C count carry out of rcx
- add 16(%rdi), %rdx C limb 2mod3
- adc $0, %r9 C count carry out of rdx
-
- sub $3,%rsi C three limbs consumed
- jng L(end) C rsi <= 0 (signed): fewer than three limbs remain
-
- add 24(%rdi), %rax C second group of three, same accumulators
- adc $0, %r10
- add 32(%rdi), %rcx
- adc $0, %r8
- add 40(%rdi), %rdx
- lea 48(%rdi), %rdi C advance six limbs; lea keeps CF for the adc below
- adc $0, %r9
-
- sub $3,%rsi
- jg L(top) C rsi > 0: at least three limbs remain
-C Exit after the second group: rdi was already advanced by 48, so back up 24 to
-C make 24(%rdi) the next unread limb, exactly as on the early exit path.
- add $-24, %rdi
-L(end): add %r9, %rax C carries out of rdx wrap to rax (2^192 == 1)
- adc %r10, %rcx C carries out of rax go to rcx (weight 2^64)
- adc %r8, %rdx C carries out of rcx go to rdx (weight 2^128)
-
- inc %rsi C sets SF, preserves CF; rsi is now -1, 0 or 1
- mov $0x1, %r10d C weight of a carry out of rdx (2^192 == 1)
- js L(combine) C negative: no leftover limbs
-
- mov $0x10000, %r10d C weight of a carry out of rax (2^64 == 2^16)
- adc 24(%rdi), %rax C first leftover limb plus the carry out of rdx
- dec %rsi C sets SF, preserves CF from the adc
- js L(combine) C negative: exactly one leftover limb
-
- adc 32(%rdi), %rcx C second leftover limb plus the carry out of rax
- mov $0x100000000, %r10 C weight of a carry out of rcx (2^128 == 2^32)
-
-L(combine):
- sbb %rsi, %rsi C carry: rsi = all ones if CF set, else 0
- mov %rax, %rdi C 0mod3
- shr $48, %rax C 0mod3 high
-
- and %r10, %rsi C carry masked: its weight if set, else 0
- and %r11, %rdi C 0mod3 low
- mov %ecx, %r10d C 1mod3
-
- add %rsi, %rax C apply carry
- shr $32, %rcx C 1mod3 high
-
- add %rdi, %rax C apply 0mod3 low
- movzwl %dx, %edi C 2mod3
- shl $16, %r10 C 1mod3 low
-
- add %rcx, %rax C apply 1mod3 high
- shr $16, %rdx C 2mod3 high
-
- add %r10, %rax C apply 1mod3 low
- shl $32, %rdi C 2mod3 low
-
- add %rdx, %rax C apply 2mod3 high
- add %rdi, %rax C apply 2mod3 low
-
- ret
-EPILOGUE()
« no previous file with comments | « gcc/gmp/mpn/x86_64/lshsub_n.asm ('k') | gcc/gmp/mpn/x86_64/mul_1.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698