gcc/gmp/mpn/x86/k6/mmx/dive_1.asm - Issue 3050029: [gcc] GCC 4.5.0=>4.5.1

Unified Diff: gcc/gmp/mpn/x86/k6/mmx/dive_1.asm

Issue 3050029: [gcc] GCC 4.5.0=>4.5.1 (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/nacl-toolchain.git

Patch Set: Created 10 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: gcc/gmp/mpn/x86/k6/mmx/dive_1.asm

diff --git a/gcc/gmp/mpn/x86/k6/mmx/dive_1.asm b/gcc/gmp/mpn/x86/k6/mmx/dive_1.asm

deleted file mode 100644

index 9cc90d88a5b46683277de540d0b83613605aed85..0000000000000000000000000000000000000000

--- a/gcc/gmp/mpn/x86/k6/mmx/dive_1.asm

+++ /dev/null

@@ -1,270 +0,0 @@

-dnl AMD K6 mpn_divexact_1 -- mpn by limb exact division.

-dnl

-dnl This file is part of the GNU MP Library.

-dnl

-dnl The GNU MP Library is free software; you can redistribute it and/or

-dnl modify it under the terms of the GNU Lesser General Public License as

-dnl published by the Free Software Foundation; either version 3 of the

-dnl License, or (at your option) any later version.

-dnl

-dnl The GNU MP Library is distributed in the hope that it will be useful,

-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of

-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

-dnl Lesser General Public License for more details.

-dnl

-dnl You should have received a copy of the GNU Lesser General Public License

-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

-include(`../config.m4')

-C divisor

-C odd even

-C K6: 10.0 12.0 cycles/limb

-C K6-2: 10.0 11.5

-C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,

-C mp_limb_t divisor);

-C

-C A simple divl is used for size==1. This is about 10 cycles faster for an

-C odd divisor or 20 cycles for an even divisor.

-C

-C The loops are quite sensitive to code alignment, speeds should be

-C rechecked (odd and even divisor, pic and non-pic) if contemplating

-C changing anything.

-defframe(PARAM_DIVISOR,16)

-defframe(PARAM_SIZE, 12)

-defframe(PARAM_SRC, 8)

-defframe(PARAM_DST, 4)

-dnl re-use parameter space

-define(VAR_INVERSE,`PARAM_DST')

- TEXT

- ALIGN(32)

-PROLOGUE(mpn_divexact_1)

-deflit(`FRAME',0)

- movl PARAM_SIZE, %ecx

- movl PARAM_SRC, %eax

- xorl %edx, %edx

- cmpl $1, %ecx

- jnz L(two_or_more)

- movl (%eax), %eax

- divl PARAM_DIVISOR

- movl PARAM_DST, %ecx

- movl %eax, (%ecx)

- ret

-L(two_or_more):

- movl PARAM_DIVISOR, %eax

- pushl %ebx FRAME_pushl()

- movl PARAM_SRC, %ebx

- pushl %ebp FRAME_pushl()

-L(strip_twos):

- shrl %eax

- incl %edx C will get shift+1

- jnc L(strip_twos)

- pushl %esi FRAME_pushl()

- leal 1(%eax,%eax), %esi C d without twos

- andl $127, %eax C d/2, 7 bits

-ifdef(`PIC',`

- LEA( binvert_limb_table, %ebp)

-Zdisp( movzbl, 0,(%eax,%ebp), %eax)

-',`

- movzbl binvert_limb_table(%eax), %eax C inv 8 bits

-')

- pushl %edi FRAME_pushl()

- leal (%eax,%eax), %ebp C 2*inv

- imull %eax, %eax C inv*inv

- movl PARAM_DST, %edi

- imull %esi, %eax C inv*inv*d

- subl %eax, %ebp C inv = 2*inv - inv*inv*d

- leal (%ebp,%ebp), %eax C 2*inv

- imull %ebp, %ebp C inv*inv

- movl %esi, PARAM_DIVISOR C d without twos

- leal (%ebx,%ecx,4), %ebx C src end

- imull %esi, %ebp C inv*inv*d

- leal (%edi,%ecx,4), %edi C dst end

- negl %ecx C -size

- subl %ebp, %eax C inv = 2*inv - inv*inv*d

- subl $1, %edx C shift amount, and clear carry

- ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB

- pushl %eax FRAME_pushl()

- imull PARAM_DIVISOR, %eax

- cmpl $1, %eax

- popl %eax FRAME_popl()')

- movl %eax, VAR_INVERSE

- jnz L(even)

- movl (%ebx,%ecx,4), %esi C src low limb

- jmp L(odd_entry)

- ALIGN(16)

- nop C code alignment

-L(odd_top):

- C eax scratch

- C ebx src end

- C ecx counter, limbs, negative

- C edx inverse

- C esi next limb, adjusted for carry

- C edi dst end

- C ebp carry bit, 0 or -1

- imull %edx, %esi

- movl PARAM_DIVISOR, %eax

- movl %esi, -4(%edi,%ecx,4)

- mull %esi C carry limb in edx

- subl %ebp, %edx C apply carry bit

- movl (%ebx,%ecx,4), %esi

-L(odd_entry):

- subl %edx, %esi C apply carry limb

- movl VAR_INVERSE, %edx

- sbbl %ebp, %ebp C 0 or -1

- incl %ecx

- jnz L(odd_top)

- imull %edx, %esi

- movl %esi, -4(%edi,%ecx,4)

- popl %edi

- popl %esi

- popl %ebp

- popl %ebx

- ret

-L(even):

- C eax

- C ebx src end

- C ecx -size

- C edx twos

- C esi

- C edi dst end

- C ebp

- xorl %ebp, %ebp

-Zdisp( movq, 0,(%ebx,%ecx,4), %mm0) C src[0,1]

- movd %edx, %mm7

- movl VAR_INVERSE, %edx

- addl $2, %ecx

- psrlq %mm7, %mm0

- movd %mm0, %esi

- jz L(even_two) C if only two limbs

-C Out-of-order execution is good enough to hide the load/rshift/movd

-C latency. Having imul at the top of the loop gives 11.5 c/l instead of 12,

-C on K6-2. In fact there's only 11 of decode, but nothing running at 11 has

-C been found. Maybe the fact every second movq is unaligned costs the extra

-C 0.5.

-L(even_top):

- C eax scratch

- C ebx src end

- C ecx counter, limbs, negative

- C edx inverse

- C esi next limb, adjusted for carry

- C edi dst end

- C ebp carry bit, 0 or -1

- C

- C mm0 scratch, source limbs

- C mm7 twos

- imull %edx, %esi

- movl %esi, -8(%edi,%ecx,4)

- movl PARAM_DIVISOR, %eax

- mull %esi C carry limb in edx

- movq -4(%ebx,%ecx,4), %mm0

- psrlq %mm7, %mm0

- movd %mm0, %esi

- subl %ebp, %edx C apply carry bit

- subl %edx, %esi C apply carry limb

- movl VAR_INVERSE, %edx

- sbbl %ebp, %ebp C 0 or -1

- incl %ecx

- jnz L(even_top)

-L(even_two):

- movd -4(%ebx), %mm0 C src high limb

- psrlq %mm7, %mm0

- imull %edx, %esi

- movl %esi, -8(%edi)

- movl PARAM_DIVISOR, %eax

- mull %esi C carry limb in edx

- movd %mm0, %esi

- subl %ebp, %edx C apply carry bit

- movl VAR_INVERSE, %eax

- subl %edx, %esi C apply carry limb

- imull %eax, %esi

- movl %esi, -4(%edi)

- popl %edi

- popl %esi

- popl %ebp

- popl %ebx

- emms_or_femms

- ret

-EPILOGUE()

« no previous file with comments | « gcc/gmp/mpn/x86/k6/mmx/com_n.asm ('k') | gcc/gmp/mpn/x86/k6/mmx/logops_n.asm » ('j') | no next file with comments »