gcc/gmp/mpn/x86/k7/dive_1.asm - Issue 3050029: [gcc] GCC 4.5.0=>4.5.1

Unified Diff: gcc/gmp/mpn/x86/k7/dive_1.asm

Issue 3050029: [gcc] GCC 4.5.0=>4.5.1 (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/nacl-toolchain.git

Patch Set: Created 10 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: gcc/gmp/mpn/x86/k7/dive_1.asm

diff --git a/gcc/gmp/mpn/x86/k7/dive_1.asm b/gcc/gmp/mpn/x86/k7/dive_1.asm

deleted file mode 100644

index c994e0fb06fd74872fdbde1005e9e3748120ed96..0000000000000000000000000000000000000000

--- a/gcc/gmp/mpn/x86/k7/dive_1.asm

+++ /dev/null

@@ -1,196 +0,0 @@

-dnl AMD K7 mpn_divexact_1 -- mpn by limb exact division.

-dnl

-dnl This file is part of the GNU MP Library.

-dnl

-dnl The GNU MP Library is free software; you can redistribute it and/or

-dnl modify it under the terms of the GNU Lesser General Public License as

-dnl published by the Free Software Foundation; either version 3 of the

-dnl License, or (at your option) any later version.

-dnl

-dnl The GNU MP Library is distributed in the hope that it will be useful,

-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of

-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

-dnl Lesser General Public License for more details.

-dnl

-dnl You should have received a copy of the GNU Lesser General Public License

-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

-include(`../config.m4')

-C cycles/limb

-C Athlon: 11.0

-C Hammer: 9.0

-C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,

-C mp_limb_t divisor);

-C

-C The dependent chain is mul+imul+sub for 11 cycles and that speed is

-C achieved with no special effort. The load and shrld latencies are hidden

-C by out of order execution.

-C

-C It's a touch faster on size==1 to use the mul-by-inverse than divl.

-defframe(PARAM_DIVISOR,16)

-defframe(PARAM_SIZE, 12)

-defframe(PARAM_SRC, 8)

-defframe(PARAM_DST, 4)

-defframe(SAVE_EBX, -4)

-defframe(SAVE_ESI, -8)

-defframe(SAVE_EDI, -12)

-defframe(SAVE_EBP, -16)

-defframe(VAR_INVERSE, -20)

-defframe(VAR_DST_END, -24)

-deflit(STACK_SPACE, 24)

- TEXT

- ALIGN(16)

-PROLOGUE(mpn_divexact_1)

-deflit(`FRAME',0)

- movl PARAM_DIVISOR, %eax

- subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)

- movl $-1, %ecx C shift count

- movl %ebp, SAVE_EBP

- movl PARAM_SIZE, %ebp

- movl %esi, SAVE_ESI

- movl %edi, SAVE_EDI

- C If there's usually only one or two trailing zero bits then this

- C should be faster than bsfl.

-L(strip_twos):

- incl %ecx

- shrl %eax

- jnc L(strip_twos)

- movl %ebx, SAVE_EBX

- leal 1(%eax,%eax), %ebx C d without twos

- andl $127, %eax C d/2, 7 bits

-ifdef(`PIC',`

- LEA( binvert_limb_table, %edx)

- movzbl (%eax,%edx), %eax C inv 8 bits

-',`

- movzbl binvert_limb_table(%eax), %eax C inv 8 bits

-')

- leal (%eax,%eax), %edx C 2*inv

- movl %ebx, PARAM_DIVISOR C d without twos

- imull %eax, %eax C inv*inv

- movl PARAM_SRC, %esi

- movl PARAM_DST, %edi

- imull %ebx, %eax C inv*inv*d

- subl %eax, %edx C inv = 2*inv - inv*inv*d

- leal (%edx,%edx), %eax C 2*inv

- imull %edx, %edx C inv*inv

- leal (%esi,%ebp,4), %esi C src end

- leal (%edi,%ebp,4), %edi C dst end

- negl %ebp C -size

- imull %ebx, %edx C inv*inv*d

- subl %edx, %eax C inv = 2*inv - inv*inv*d

- ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB

- pushl %eax FRAME_pushl()

- imull PARAM_DIVISOR, %eax

- cmpl $1, %eax

- popl %eax FRAME_popl()')

- movl %eax, VAR_INVERSE

- movl (%esi,%ebp,4), %eax C src[0]

- incl %ebp

- jz L(one)

- movl (%esi,%ebp,4), %edx C src[1]

- shrdl( %cl, %edx, %eax)

- movl %edi, VAR_DST_END

- xorl %ebx, %ebx

- jmp L(entry)

- ALIGN(8)

-L(top):

- C eax q

- C ebx carry bit, 0 or 1

- C ecx shift

- C edx

- C esi src end

- C edi dst end

- C ebp counter, limbs, negative

- mull PARAM_DIVISOR C carry limb in edx

- movl -4(%esi,%ebp,4), %eax

- movl (%esi,%ebp,4), %edi

- shrdl( %cl, %edi, %eax)

- subl %ebx, %eax C apply carry bit

- setc %bl

- movl VAR_DST_END, %edi

- subl %edx, %eax C apply carry limb

- adcl $0, %ebx

-L(entry):

- imull VAR_INVERSE, %eax

- movl %eax, -4(%edi,%ebp,4)

- incl %ebp

- jnz L(top)

- mull PARAM_DIVISOR C carry limb in edx

- movl -4(%esi), %eax C src high limb

- shrl %cl, %eax

- movl SAVE_ESI, %esi

- subl %ebx, %eax C apply carry bit

- movl SAVE_EBX, %ebx

- movl SAVE_EBP, %ebp

- subl %edx, %eax C apply carry limb

- imull VAR_INVERSE, %eax

- movl %eax, -4(%edi)

- movl SAVE_EDI, %edi

- addl $STACK_SPACE, %esp

- ret

-L(one):

- shrl %cl, %eax

- movl SAVE_ESI, %esi

- movl SAVE_EBX, %ebx

- imull VAR_INVERSE, %eax

- movl SAVE_EBP, %ebp

- movl %eax, -4(%edi)

- movl SAVE_EDI, %edi

- addl $STACK_SPACE, %esp

- ret

-EPILOGUE()

« no previous file with comments | « gcc/gmp/mpn/x86/k6/sqr_basecase.asm ('k') | gcc/gmp/mpn/x86/k7/gcd_1.asm » ('j') | no next file with comments »