| Index: gcc/gmp/mpn/x86/pentium/sqr_basecase.asm
|
| diff --git a/gcc/gmp/mpn/x86/pentium/sqr_basecase.asm b/gcc/gmp/mpn/x86/pentium/sqr_basecase.asm
|
| deleted file mode 100644
|
| index e4fca7c54629b45dd356f8a90d9286afe7ec107b..0000000000000000000000000000000000000000
|
| --- a/gcc/gmp/mpn/x86/pentium/sqr_basecase.asm
|
| +++ /dev/null
|
| @@ -1,517 +0,0 @@
|
| -dnl Intel P5 mpn_sqr_basecase -- square an mpn number.
|
| -
|
| -dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
|
| -dnl
|
| -dnl This file is part of the GNU MP Library.
|
| -dnl
|
| -dnl The GNU MP Library is free software; you can redistribute it and/or
|
| -dnl modify it under the terms of the GNU Lesser General Public License as
|
| -dnl published by the Free Software Foundation; either version 3 of the
|
| -dnl License, or (at your option) any later version.
|
| -dnl
|
| -dnl The GNU MP Library is distributed in the hope that it will be useful,
|
| -dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| -dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| -dnl Lesser General Public License for more details.
|
| -dnl
|
| -dnl You should have received a copy of the GNU Lesser General Public License
|
| -dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
|
| -
|
| -include(`../config.m4')
|
| -
|
| -
|
| -C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
|
| -C product at around 20x20 limbs.
|
| -
|
| -
|
| -C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
|
| -C
|
| -C Calculate src,size squared, storing the result in dst,2*size.
|
| -C
|
| -C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
|
| -C lot of function call overheads are avoided, especially when the size is
|
| -C small.
|
| -
|
| -defframe(PARAM_SIZE,12)
|
| -defframe(PARAM_SRC, 8)
|
| -defframe(PARAM_DST, 4)
|
| -
|
| - TEXT
|
| - ALIGN(8)
|
| -PROLOGUE(mpn_sqr_basecase)
|
| -deflit(`FRAME',0)
|
| -
|
| - movl PARAM_SIZE, %edx
|
| - movl PARAM_SRC, %eax
|
| -
|
| - cmpl $2, %edx
|
| - movl PARAM_DST, %ecx
|
| -
|
| - je L(two_limbs)
|
| -
|
| - movl (%eax), %eax
|
| - ja L(three_or_more)
|
| -
|
| -C -----------------------------------------------------------------------------
|
| -C one limb only
|
| - C eax src
|
| - C ebx
|
| - C ecx dst
|
| - C edx
|
| -
|
| - mull %eax
|
| -
|
| - movl %eax, (%ecx)
|
| - movl %edx, 4(%ecx)
|
| -
|
| - ret
|
| -
|
| -C -----------------------------------------------------------------------------
|
| - ALIGN(8)
|
| -L(two_limbs):
|
| - C eax src
|
| - C ebx
|
| - C ecx dst
|
| - C edx size
|
| -
|
| - pushl %ebp
|
| - pushl %edi
|
| -
|
| - pushl %esi
|
| - pushl %ebx
|
| -
|
| - movl %eax, %ebx
|
| - movl (%eax), %eax
|
| -
|
| - mull %eax C src[0]^2
|
| -
|
| - movl %eax, (%ecx) C dst[0]
|
| - movl %edx, %esi C dst[1]
|
| -
|
| - movl 4(%ebx), %eax
|
| -
|
| - mull %eax C src[1]^2
|
| -
|
| - movl %eax, %edi C dst[2]
|
| - movl %edx, %ebp C dst[3]
|
| -
|
| - movl (%ebx), %eax
|
| -
|
| - mull 4(%ebx) C src[0]*src[1]
|
| -
|
| - addl %eax, %esi
|
| - popl %ebx
|
| -
|
| - adcl %edx, %edi
|
| -
|
| - adcl $0, %ebp
|
| - addl %esi, %eax
|
| -
|
| - adcl %edi, %edx
|
| - movl %eax, 4(%ecx)
|
| -
|
| - adcl $0, %ebp
|
| - popl %esi
|
| -
|
| - movl %edx, 8(%ecx)
|
| - movl %ebp, 12(%ecx)
|
| -
|
| - popl %edi
|
| - popl %ebp
|
| -
|
| - ret
|
| -
|
| -
|
| -C -----------------------------------------------------------------------------
|
| - ALIGN(8)
|
| -L(three_or_more):
|
| - C eax src low limb
|
| - C ebx
|
| - C ecx dst
|
| - C edx size
|
| -
|
| - cmpl $4, %edx
|
| - pushl %ebx
|
| -deflit(`FRAME',4)
|
| -
|
| - movl PARAM_SRC, %ebx
|
| - jae L(four_or_more)
|
| -
|
| -
|
| -C -----------------------------------------------------------------------------
|
| -C three limbs
|
| - C eax src low limb
|
| - C ebx src
|
| - C ecx dst
|
| - C edx size
|
| -
|
| - pushl %ebp
|
| - pushl %edi
|
| -
|
| - mull %eax C src[0] ^ 2
|
| -
|
| - movl %eax, (%ecx)
|
| - movl %edx, 4(%ecx)
|
| -
|
| - movl 4(%ebx), %eax
|
| - xorl %ebp, %ebp
|
| -
|
| - mull %eax C src[1] ^ 2
|
| -
|
| - movl %eax, 8(%ecx)
|
| - movl %edx, 12(%ecx)
|
| -
|
| - movl 8(%ebx), %eax
|
| - pushl %esi C risk of cache bank clash
|
| -
|
| - mull %eax C src[2] ^ 2
|
| -
|
| - movl %eax, 16(%ecx)
|
| - movl %edx, 20(%ecx)
|
| -
|
| - movl (%ebx), %eax
|
| -
|
| - mull 4(%ebx) C src[0] * src[1]
|
| -
|
| - movl %eax, %esi
|
| - movl %edx, %edi
|
| -
|
| - movl (%ebx), %eax
|
| -
|
| - mull 8(%ebx) C src[0] * src[2]
|
| -
|
| - addl %eax, %edi
|
| - movl %edx, %ebp
|
| -
|
| - adcl $0, %ebp
|
| - movl 4(%ebx), %eax
|
| -
|
| - mull 8(%ebx) C src[1] * src[2]
|
| -
|
| - xorl %ebx, %ebx
|
| - addl %eax, %ebp
|
| -
|
| - C eax
|
| - C ebx zero, will be dst[5]
|
| - C ecx dst
|
| - C edx dst[4]
|
| - C esi dst[1]
|
| - C edi dst[2]
|
| - C ebp dst[3]
|
| -
|
| - adcl $0, %edx
|
| - addl %esi, %esi
|
| -
|
| - adcl %edi, %edi
|
| -
|
| - adcl %ebp, %ebp
|
| -
|
| - adcl %edx, %edx
|
| - movl 4(%ecx), %eax
|
| -
|
| - adcl $0, %ebx
|
| - addl %esi, %eax
|
| -
|
| - movl %eax, 4(%ecx)
|
| - movl 8(%ecx), %eax
|
| -
|
| - adcl %edi, %eax
|
| - movl 12(%ecx), %esi
|
| -
|
| - adcl %ebp, %esi
|
| - movl 16(%ecx), %edi
|
| -
|
| - movl %eax, 8(%ecx)
|
| - movl %esi, 12(%ecx)
|
| -
|
| - adcl %edx, %edi
|
| - popl %esi
|
| -
|
| - movl 20(%ecx), %eax
|
| - movl %edi, 16(%ecx)
|
| -
|
| - popl %edi
|
| - popl %ebp
|
| -
|
| - adcl %ebx, %eax C no carry out of this
|
| - popl %ebx
|
| -
|
| - movl %eax, 20(%ecx)
|
| -
|
| - ret
|
| -
|
| -
|
| -C -----------------------------------------------------------------------------
|
| - ALIGN(8)
|
| -L(four_or_more):
|
| - C eax src low limb
|
| - C ebx src
|
| - C ecx dst
|
| - C edx size
|
| - C esi
|
| - C edi
|
| - C ebp
|
| - C
|
| - C First multiply src[0]*src[1..size-1] and store at dst[1..size].
|
| -
|
| -deflit(`FRAME',4)
|
| -
|
| - pushl %edi
|
| -FRAME_pushl()
|
| - pushl %esi
|
| -FRAME_pushl()
|
| -
|
| - pushl %ebp
|
| -FRAME_pushl()
|
| - leal (%ecx,%edx,4), %edi C dst end of this mul1
|
| -
|
| - leal (%ebx,%edx,4), %esi C src end
|
| - movl %ebx, %ebp C src
|
| -
|
| - negl %edx C -size
|
| - xorl %ebx, %ebx C clear carry limb and carry flag
|
| -
|
| - leal 1(%edx), %ecx C -(size-1)
|
| -
|
| -L(mul1):
|
| - C eax scratch
|
| - C ebx carry
|
| - C ecx counter, negative
|
| - C edx scratch
|
| - C esi &src[size]
|
| - C edi &dst[size]
|
| - C ebp src
|
| -
|
| - adcl $0, %ebx
|
| - movl (%esi,%ecx,4), %eax
|
| -
|
| - mull (%ebp)
|
| -
|
| - addl %eax, %ebx
|
| -
|
| - movl %ebx, (%edi,%ecx,4)
|
| - incl %ecx
|
| -
|
| - movl %edx, %ebx
|
| - jnz L(mul1)
|
| -
|
| -
|
| - C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
|
| - C n=1..size-2.
|
| - C
|
| - C The last two products, which are the end corner of the product
|
| - C triangle, are handled separately to save looping overhead. These
|
| - C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
|
| - C If size is 4 then it's only these that need to be done.
|
| - C
|
| - C In the outer loop %esi is a constant, and %edi just advances by 1
|
| - C limb each time. The size of the operation decreases by 1 limb
|
| - C each time.
|
| -
|
| - C eax
|
| - C ebx carry (needing carry flag added)
|
| - C ecx
|
| - C edx
|
| - C esi &src[size]
|
| - C edi &dst[size]
|
| - C ebp
|
| -
|
| - adcl $0, %ebx
|
| - movl PARAM_SIZE, %edx
|
| -
|
| - movl %ebx, (%edi)
|
| - subl $4, %edx
|
| -
|
| - negl %edx
|
| - jz L(corner)
|
| -
|
| -
|
| -L(outer):
|
| - C ebx previous carry limb to store
|
| - C edx outer loop counter (negative)
|
| - C esi &src[size]
|
| - C edi dst, pointing at stored carry limb of previous loop
|
| -
|
| - pushl %edx C new outer loop counter
|
| - leal -2(%edx), %ecx
|
| -
|
| - movl %ebx, (%edi)
|
| - addl $4, %edi
|
| -
|
| - addl $4, %ebp
|
| - xorl %ebx, %ebx C initial carry limb, clear carry flag
|
| -
|
| -L(inner):
|
| - C eax scratch
|
| - C ebx carry (needing carry flag added)
|
| - C ecx counter, negative
|
| - C edx scratch
|
| - C esi &src[size]
|
| - C edi dst end of this addmul
|
| - C ebp &src[j]
|
| -
|
| - adcl $0, %ebx
|
| - movl (%esi,%ecx,4), %eax
|
| -
|
| - mull (%ebp)
|
| -
|
| - addl %ebx, %eax
|
| - movl (%edi,%ecx,4), %ebx
|
| -
|
| - adcl $0, %edx
|
| - addl %eax, %ebx
|
| -
|
| - movl %ebx, (%edi,%ecx,4)
|
| - incl %ecx
|
| -
|
| - movl %edx, %ebx
|
| - jnz L(inner)
|
| -
|
| -
|
| - adcl $0, %ebx
|
| - popl %edx C outer loop counter
|
| -
|
| - incl %edx
|
| - jnz L(outer)
|
| -
|
| -
|
| - movl %ebx, (%edi)
|
| -
|
| -L(corner):
|
| - C esi &src[size]
|
| - C edi &dst[2*size-4]
|
| -
|
| - movl -8(%esi), %eax
|
| - movl -4(%edi), %ebx C risk of data cache bank clash here
|
| -
|
| - mull -12(%esi) C src[size-2]*src[size-3]
|
| -
|
| - addl %eax, %ebx
|
| - movl %edx, %ecx
|
| -
|
| - adcl $0, %ecx
|
| - movl -4(%esi), %eax
|
| -
|
| - mull -12(%esi) C src[size-1]*src[size-3]
|
| -
|
| - addl %ecx, %eax
|
| - movl (%edi), %ecx
|
| -
|
| - adcl $0, %edx
|
| - movl %ebx, -4(%edi)
|
| -
|
| - addl %eax, %ecx
|
| - movl %edx, %ebx
|
| -
|
| - adcl $0, %ebx
|
| - movl -4(%esi), %eax
|
| -
|
| - mull -8(%esi) C src[size-1]*src[size-2]
|
| -
|
| - movl %ecx, (%edi)
|
| - addl %eax, %ebx
|
| -
|
| - adcl $0, %edx
|
| - movl PARAM_SIZE, %eax
|
| -
|
| - negl %eax
|
| - movl %ebx, 4(%edi)
|
| -
|
| - addl $1, %eax C -(size-1) and clear carry
|
| - movl %edx, 8(%edi)
|
| -
|
| -
|
| -C -----------------------------------------------------------------------------
|
| -C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
|
| -
|
| -L(lshift):
|
| - C eax counter, negative
|
| - C ebx next limb
|
| - C ecx
|
| - C edx
|
| - C esi
|
| - C edi &dst[2*size-4]
|
| - C ebp
|
| -
|
| - movl 12(%edi,%eax,8), %ebx
|
| -
|
| - rcll %ebx
|
| - movl 16(%edi,%eax,8), %ecx
|
| -
|
| - rcll %ecx
|
| - movl %ebx, 12(%edi,%eax,8)
|
| -
|
| - movl %ecx, 16(%edi,%eax,8)
|
| - incl %eax
|
| -
|
| - jnz L(lshift)
|
| -
|
| -
|
| - adcl %eax, %eax C high bit out
|
| - movl PARAM_SRC, %esi
|
| -
|
| - movl PARAM_SIZE, %ecx C risk of cache bank clash
|
| - movl %eax, 12(%edi) C dst most significant limb
|
| -
|
| -
|
| -C -----------------------------------------------------------------------------
|
| -C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
|
| -C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
|
| -C low limb of src[0]^2.
|
| -
|
| - movl (%esi), %eax C src[0]
|
| - leal (%esi,%ecx,4), %esi C src end
|
| -
|
| - negl %ecx
|
| -
|
| - mull %eax
|
| -
|
| - movl %eax, 16(%edi,%ecx,8) C dst[0]
|
| - movl %edx, %ebx
|
| -
|
| - addl $1, %ecx C size-1 and clear carry
|
| -
|
| -L(diag):
|
| - C eax scratch (low product)
|
| - C ebx carry limb
|
| - C ecx counter, negative
|
| - C edx scratch (high product)
|
| - C esi &src[size]
|
| - C edi &dst[2*size-4]
|
| - C ebp scratch (fetched dst limbs)
|
| -
|
| - movl (%esi,%ecx,4), %eax
|
| - adcl $0, %ebx
|
| -
|
| - mull %eax
|
| -
|
| - movl 16-4(%edi,%ecx,8), %ebp
|
| -
|
| - addl %ebp, %ebx
|
| - movl 16(%edi,%ecx,8), %ebp
|
| -
|
| - adcl %eax, %ebp
|
| - movl %ebx, 16-4(%edi,%ecx,8)
|
| -
|
| - movl %ebp, 16(%edi,%ecx,8)
|
| - incl %ecx
|
| -
|
| - movl %edx, %ebx
|
| - jnz L(diag)
|
| -
|
| -
|
| - adcl $0, %edx
|
| - movl 16-4(%edi), %eax C dst most significant limb
|
| -
|
| - addl %eax, %edx
|
| - popl %ebp
|
| -
|
| - movl %edx, 16-4(%edi)
|
| - popl %esi C risk of cache bank clash
|
| -
|
| - popl %edi
|
| - popl %ebx
|
| -
|
| - ret
|
| -
|
| -EPILOGUE()
|
|
|