gcc/gmp/mpn/x86_64/mod_34lsub1.asm - Issue 3050029: [gcc] GCC 4.5.0=>4.5.1

Unified Diff: gcc/gmp/mpn/x86_64/mod_34lsub1.asm

Issue 3050029: [gcc] GCC 4.5.0=>4.5.1 (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/nacl-toolchain.git

Patch Set: Created 10 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: gcc/gmp/mpn/x86_64/mod_34lsub1.asm

diff --git a/gcc/gmp/mpn/x86_64/mod_34lsub1.asm b/gcc/gmp/mpn/x86_64/mod_34lsub1.asm

deleted file mode 100644

index 34df5bb5b792274545b1f0897619b33712241c6d..0000000000000000000000000000000000000000

--- a/gcc/gmp/mpn/x86_64/mod_34lsub1.asm

+++ /dev/null

@@ -1,165 +0,0 @@

-dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.

-dnl Inc.

-dnl

-dnl This file is part of the GNU MP Library.

-dnl

-dnl The GNU MP Library is free software; you can redistribute it and/or

-dnl modify it under the terms of the GNU Lesser General Public License as

-dnl published by the Free Software Foundation; either version 3 of the

-dnl License, or (at your option) any later version.

-dnl

-dnl The GNU MP Library is distributed in the hope that it will be useful,

-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of

-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

-dnl Lesser General Public License for more details.

-dnl

-dnl You should have received a copy of the GNU Lesser General Public License

-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

-include(`../config.m4')

-C cycles/limb

-C K8,K9: 1.0

-C K10: 1.12

-C P4: 3.25

-C P6-15 (Core2): 1.5

-C P6-28 (Atom): 2.5

-C INPUT PARAMETERS

-C up rdi

-C n rsi

-C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)

-C TODO

-C * Apply the movzwl tricks to the x86/k7 code

-C * Review feed-in and wind-down code. In particular, try to avoid adcq and

-C sbbq to placate Pentium4.

-C * More unrolling and/or index addressing could bring time to under 1 c/l

-C for Athlon64, approaching 0.67 c/l seems possible.

-C * There are recurrences on the carry registers (r8, r9, r10) that might

-C be the limiting factor for the Pentium4 speed. Splitting these into 6

-C registers would help.

-C * For ultimate Athlon64 performance, a sequence like this might be best.

-C It should reach 0.5 c/l (limited by L1 cache bandwidth).

-C

-C addq (%rdi), %rax

-C adcq 8(%rdi), %rcx

-C adcq 16(%rdi), %rdx

-C adcq $0, %r8

-C addq 24(%rdi), %rax

-C adcq 32(%rdi), %rcx

-C adcq 40(%rdi), %rdx

-C adcq $0, %r8

-C ...

-ASM_START()

- TEXT

- ALIGN(32)

-C mpn_mod_34lsub1 (up in rdi, n in rsi), result in rax.

-C

-C Returns a value congruent to the n-limb number at up modulo 2^48-1.

-C The result is a plain sum of folded pieces; no final reduction to the

-C range 0..2^48-2 is performed here.

-C

-C A 64-bit limb at index i has weight 2^(64i), which modulo 2^48-1 is

-C 1, 2^16 or 2^32 for i = 0, 1, 2 (mod 3).  The bits of a limb that would

-C land at or above bit 48 after that shift wrap around to weight 1.

-C

-C Registers written: rax, rcx, rdx, rsi, rdi, r8, r9, r10, r11, flags.

-C No stack is used.

-C NOTE(review): n == 0 takes the same path as n == 1 and reads up[0];

-C presumably callers guarantee n >= 1 -- confirm.

-PROLOGUE(mpn_mod_34lsub1)

- mov $0x0000FFFFFFFFFFFF, %r11 C r11 = 2^48-1, low-48-bit mask

- sub $2, %rsi C rsi = n-2, flags reused by ja and jb below

- ja L(gt2) C n > 2: general summing loop

- mov (%rdi), %rax C mov does not alter the flags from the sub

- nop

- jb L(1) C n < 2: return up[0] as is

-C n == 2: fold up[0] and up[1] directly.

- mov 8(%rdi), %rsi

- mov %rax, %rdx

- shr $48, %rax C src[0] high

- and %r11, %rdx C src[0] low

- add %rdx, %rax

- mov %esi, %edx C 32-bit move zero-extends into rdx

- shr $32, %rsi C src[1] high

- add %rsi, %rax

- shl $16, %rdx C src[1] low

- add %rdx, %rax

-L(1): ret

- ALIGN(16)

-C n >= 3.  rax, rcx, rdx accumulate the limbs at index 0, 1, 2 (mod 3).

-C r10, r8, r9 count the carries out of rax, rcx, rdx respectively.

-L(gt2): xor %eax, %eax

- xor %ecx, %ecx

- xor %edx, %edx

- xor %r8, %r8

- xor %r9, %r9

- xor %r10, %r10

-C Each pass through L(top) consumes 3 limbs, then 3 more if any remain.

-C rsi is decremented by 3 per group; the loop ends once rsi <= 0.

-L(top): add (%rdi), %rax

- adc $0, %r10

- add 8(%rdi), %rcx

- adc $0, %r8

- add 16(%rdi), %rdx

- adc $0, %r9

- sub $3,%rsi

- jng L(end) C exit with rdi not yet advanced past this group

- add 24(%rdi), %rax

- adc $0, %r10

- add 32(%rdi), %rcx

- adc $0, %r8

- add 40(%rdi), %rdx

- lea 48(%rdi), %rdi C advance by 6 limbs; lea keeps CF for the adc

- adc $0, %r9

- sub $3,%rsi

- jg L(top)

- add $-24, %rdi C bias rdi so 24(%rdi) is the next unread limb

-C Here rsi is -2, -1 or 0, meaning 0, 1 or 2 limbs are still unread, and

-C 24(%rdi) addresses the first of them on either exit path.

-C Each carry count is added into the next accumulator up: a carry out of

-C rdx has weight 2^192, which is 1 modulo 2^48-1, so r9 goes into rax.

-C From here to L(combine) CF is carried along: inc, dec and mov leave it

-C unchanged.  r10 is set to the weight of whichever carry is pending.

-L(end): add %r9, %rax

- adc %r10, %rcx

- adc %r8, %rdx

- inc %rsi

- mov $0x1, %r10d C pending carry is out of rdx: weight 1

- js L(combine) C no limbs left

- mov $0x10000, %r10d C pending carry will be out of rax: weight 2^16

- adc 24(%rdi), %rax

- dec %rsi

- js L(combine) C one limb was left

- adc 32(%rdi), %rcx

- mov $0x100000000, %r10 C pending carry is out of rcx: weight 2^32

-C Fold the pending carry and the three accumulators into rax.

-L(combine):

- sbb %rsi, %rsi C carry

- mov %rax, %rdi C 0mod3

- shr $48, %rax C 0mod3 high

- and %r10, %rsi C carry masked

- and %r11, %rdi C 0mod3 low

- mov %ecx, %r10d C 1mod3

- add %rsi, %rax C apply carry

- shr $32, %rcx C 1mod3 high

- add %rdi, %rax C apply 0mod3 low

- movzwl %dx, %edi C 2mod3

- shl $16, %r10 C 1mod3 low

- add %rcx, %rax C apply 1mod3 high

- shr $16, %rdx C 2mod3 high

- add %r10, %rax C apply 1mod3 low

- shl $32, %rdi C 2mod3 low

- add %rdx, %rax C apply 2mod3 high

- add %rdi, %rax C apply 2mod3 low

- ret

-EPILOGUE()

« no previous file with comments | « gcc/gmp/mpn/x86_64/lshsub_n.asm ('k') | gcc/gmp/mpn/x86_64/mul_1.asm » ('j') | no next file with comments »