gcc/gmp/mpn/x86/pentium/mmx/mul_1.asm - Issue 3050029: [gcc] GCC 4.5.0=>4.5.1

Unified Diff: gcc/gmp/mpn/x86/pentium/mmx/mul_1.asm

Issue 3050029: [gcc] GCC 4.5.0=>4.5.1 (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/nacl-toolchain.git

Patch Set: Created 10 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: gcc/gmp/mpn/x86/pentium/mmx/mul_1.asm

diff --git a/gcc/gmp/mpn/x86/pentium/mmx/mul_1.asm b/gcc/gmp/mpn/x86/pentium/mmx/mul_1.asm

deleted file mode 100644

index b9fe77ed0743f6af2989aedba9720e56c74da3cc..0000000000000000000000000000000000000000

--- a/gcc/gmp/mpn/x86/pentium/mmx/mul_1.asm

+++ /dev/null

@@ -1,360 +0,0 @@

-dnl Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.

-dnl

-dnl This file is part of the GNU MP Library.

-dnl

-dnl The GNU MP Library is free software; you can redistribute it and/or

-dnl modify it under the terms of the GNU Lesser General Public License as

-dnl published by the Free Software Foundation; either version 3 of the

-dnl License, or (at your option) any later version.

-dnl

-dnl The GNU MP Library is distributed in the hope that it will be useful,

-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of

-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

-dnl Lesser General Public License for more details.

-dnl

-dnl You should have received a copy of the GNU Lesser General Public License

-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

-include(`../config.m4')

-C cycles/limb

-C P5: 12.0 for 32-bit multiplier

-C 7.0 for 16-bit multiplier

-C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,

-C mp_limb_t multiplier);

-C

-C When the multiplier is 16 bits some special case MMX code is used. Small

-C multipliers might arise reasonably often from mpz_mul_ui etc. If the size

-C is odd there's roughly a 5 cycle penalty, so times for say size==7 and

-C size==8 end up being quite close. If src isn't aligned to an 8 byte

-C boundary then one limb is processed separately with roughly a 5 cycle

-C penalty, so in that case it's say size==8 and size==9 which are close.

-C

-C Alternatives:

-C

-C MMX is not believed to be of any use for 32-bit multipliers, since for

-C instance the current method would just have to be more or less duplicated

-C for the high and low halves of the multiplier, and would probably

-C therefore run at about 14 cycles, which is slower than the plain integer

-C at 12.

-C

-C Adding the high and low MMX products using integer code seems best. An

-C attempt at using paddd and carry bit propagation with pcmpgtd didn't give

-C any joy. Perhaps something could be done keeping the values signed and

-C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or

-C perhaps not.

-C

-C Future:

-C

-C An mpn_mul_1c entrypoint would need a double carry out of the low result

-C limb in the 16-bit code, unless it could be assumed the carry fits in 16

-C bits, possibly as carry<multiplier, this being true of a big calculation

-C done piece by piece. But let's worry about that if/when mul_1c is

-C actually used.

-defframe(PARAM_MULTIPLIER,16)

-defframe(PARAM_SIZE, 12)

-defframe(PARAM_SRC, 8)

-defframe(PARAM_DST, 4)

- TEXT

- ALIGN(8)

-PROLOGUE(mpn_mul_1)

-deflit(`FRAME',0)

- movl PARAM_SIZE, %ecx

- movl PARAM_SRC, %edx

- cmpl $1, %ecx

- jne L(two_or_more)

- C one limb only

- movl PARAM_MULTIPLIER, %eax

- movl PARAM_DST, %ecx

- mull (%edx)

- movl %eax, (%ecx)

- movl %edx, %eax

- ret

-L(two_or_more):

- C eax size

- C ebx

- C ecx carry

- C edx

- C esi src

- C edi

- C ebp

- pushl %esi FRAME_pushl()

- pushl %edi FRAME_pushl()

- movl %edx, %esi C src

- movl PARAM_DST, %edi

- movl PARAM_MULTIPLIER, %eax

- pushl %ebx FRAME_pushl()

- leal (%esi,%ecx,4), %esi C src end

- leal (%edi,%ecx,4), %edi C dst end

- negl %ecx C -size

- pushl %ebp FRAME_pushl()

- cmpl $65536, %eax

- jb L(small)

-L(big):

- xorl %ebx, %ebx C carry limb

- sarl %ecx C -size/2

- jnc L(top) C with carry flag clear

- C size was odd, process one limb separately

- mull 4(%esi,%ecx,8) C m * src[0]

- movl %eax, 4(%edi,%ecx,8)

- incl %ecx

- orl %edx, %ebx C carry limb, and clear carry flag

-L(top):

- C eax

- C ebx carry

- C ecx counter, negative

- C edx

- C esi src end

- C edi dst end

- C ebp (scratch carry)

- adcl $0, %ebx

- movl (%esi,%ecx,8), %eax

- mull PARAM_MULTIPLIER

- movl %edx, %ebp

- addl %eax, %ebx

- adcl $0, %ebp

- movl 4(%esi,%ecx,8), %eax

- mull PARAM_MULTIPLIER

- movl %ebx, (%edi,%ecx,8)

- addl %ebp, %eax

- movl %eax, 4(%edi,%ecx,8)

- incl %ecx

- movl %edx, %ebx

- jnz L(top)

- adcl $0, %ebx

- popl %ebp

- movl %ebx, %eax

- popl %ebx

- popl %edi

- popl %esi

- ret

-L(small):

- C Special case for 16-bit multiplier.

- C

- C eax multiplier

- C ebx

- C ecx -size

- C edx src

- C esi src end

- C edi dst end

- C ebp multiplier

- C size<3 not supported here. At size==3 we're already a couple of

- C cycles faster, so there's no threshold as such, just use the MMX

- C as soon as possible.

- cmpl $-3, %ecx

- ja L(big)

- movd %eax, %mm7 C m

- pxor %mm6, %mm6 C initial carry word

- punpcklwd %mm7, %mm7 C m replicated 2 times

- addl $2, %ecx C -size+2

- punpckldq %mm7, %mm7 C m replicated 4 times

- andl $4, %edx C test alignment, clear carry flag

- movq %mm7, %mm0 C m

- jz L(small_entry)

- C Source is unaligned, process one limb separately.

- C

- C Plain integer code is used here, since it's smaller and is about

- C the same 13 cycles as an mmx block would be.

- C

- C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence

- C the use of separate incl and orl.

- mull -8(%esi,%ecx,4) C m * src[0]

- movl %eax, -8(%edi,%ecx,4) C dst[0]

- incl %ecx C one limb processed

- movd %edx, %mm6 C initial carry

- orl %eax, %eax C clear carry flag

- jmp L(small_entry)

-C The scheduling here is quite tricky, since so many instructions have

-C pairing restrictions. In particular the js won't pair with a movd, and

-C can't be paired with an adc since it wants flags from the inc, so

-C instructions are rotated to the top of the loop to find somewhere useful

-C for it.

-C

-C Trouble has been taken to avoid overlapping successive loop iterations,

-C since that would greatly increase the size of the startup and finishup

-C code. Actually there's probably not much advantage to be had from

-C overlapping anyway, since the difficulties are mostly with pairing, not

-C with latencies as such.

-C

-C In the comments x represents the src data and m the multiplier (16

-C bits, but replicated 4 times).

-C

-C The m signs calculated in %mm3 are a loop invariant and could be held in

-C say %mm5, but that would save only one instruction and hence be no faster.

-L(small_top):

- C eax l.low, then l.high

- C ebx (h.low)

- C ecx counter, -size+2 to 0 or 1

- C edx (h.high)

- C esi &src[size]

- C edi &dst[size]

- C ebp

- C

- C %mm0 (high products)

- C %mm1 (low products)

- C %mm2 (adjust for m using x signs)

- C %mm3 (adjust for x using m signs)

- C %mm4

- C %mm5

- C %mm6 h.low, then carry

- C %mm7 m replicated 4 times

- movd %mm6, %ebx C h.low

- psrlq $32, %mm1 C l.high

- movd %mm0, %edx C h.high

- movq %mm0, %mm6 C new c

- adcl %eax, %ebx

- incl %ecx

- movd %mm1, %eax C l.high

- movq %mm7, %mm0

- adcl %eax, %edx

- movl %ebx, -16(%edi,%ecx,4)

- movl %edx, -12(%edi,%ecx,4)

- psrlq $32, %mm6 C c

-L(small_entry):

- pmulhw -8(%esi,%ecx,4), %mm0 C h = (x*m).high

- movq %mm7, %mm1

- pmullw -8(%esi,%ecx,4), %mm1 C l = (x*m).low

- movq %mm7, %mm3

- movq -8(%esi,%ecx,4), %mm2 C x

- psraw $15, %mm3 C m signs

- pand -8(%esi,%ecx,4), %mm3 C x selected by m signs

- psraw $15, %mm2 C x signs

- paddw %mm3, %mm0 C add x to h if m neg

- pand %mm7, %mm2 C m selected by x signs

- paddw %mm2, %mm0 C add m to h if x neg

- incl %ecx

- movd %mm1, %eax C l.low

- punpcklwd %mm0, %mm6 C c + h.low << 16

- psrlq $16, %mm0 C h.high

- js L(small_top)

- movd %mm6, %ebx C h.low

- psrlq $32, %mm1 C l.high

- adcl %eax, %ebx

- popl %ebp FRAME_popl()

- movd %mm0, %edx C h.high

- psrlq $32, %mm0 C l.high

- movd %mm1, %eax C l.high

- adcl %eax, %edx

- movl %ebx, -12(%edi,%ecx,4)

- movd %mm0, %eax C c

- adcl $0, %eax

- movl %edx, -8(%edi,%ecx,4)

- orl %ecx, %ecx

- jnz L(small_done) C final %ecx==1 means even, ==0 odd

- C Size odd, one extra limb to process.

- C Plain integer code is used here, since it's smaller and is about

- C the same speed as another mmx block would be.

- movl %eax, %ecx

- movl PARAM_MULTIPLIER, %eax

- mull -4(%esi)

- addl %ecx, %eax

- adcl $0, %edx

- movl %eax, -4(%edi)

- movl %edx, %eax

-L(small_done):

- popl %ebx

- popl %edi

- popl %esi

- emms

- ret

-EPILOGUE()

« no previous file with comments | « gcc/gmp/mpn/x86/pentium/mmx/lshift.asm ('k') | gcc/gmp/mpn/x86/pentium/mmx/rshift.asm » ('j') | no next file with comments »