Index: gcc/gmp/mpn/x86/pentium/sqr_basecase.asm |
diff --git a/gcc/gmp/mpn/x86/pentium/sqr_basecase.asm b/gcc/gmp/mpn/x86/pentium/sqr_basecase.asm |
deleted file mode 100644 |
index e4fca7c54629b45dd356f8a90d9286afe7ec107b..0000000000000000000000000000000000000000 |
--- a/gcc/gmp/mpn/x86/pentium/sqr_basecase.asm |
+++ /dev/null |
@@ -1,517 +0,0 @@ |
-dnl Intel P5 mpn_sqr_basecase -- square an mpn number. |
- |
-dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. |
-dnl |
-dnl This file is part of the GNU MP Library. |
-dnl |
-dnl The GNU MP Library is free software; you can redistribute it and/or |
-dnl modify it under the terms of the GNU Lesser General Public License as |
-dnl published by the Free Software Foundation; either version 3 of the |
-dnl License, or (at your option) any later version. |
-dnl |
-dnl The GNU MP Library is distributed in the hope that it will be useful, |
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of |
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
-dnl Lesser General Public License for more details. |
-dnl |
-dnl You should have received a copy of the GNU Lesser General Public License |
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
- |
-include(`../config.m4') |
- |
- |
-C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular |
-C product at around 20x20 limbs. |
- |
- |
-C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); |
-C |
-C Calculate src,size squared, storing the result in dst,2*size. |
-C |
-C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a |
-C lot of function call overheads are avoided, especially when the size is |
-C small. |
- |
-defframe(PARAM_SIZE,12) |
-defframe(PARAM_SRC, 8) |
-defframe(PARAM_DST, 4) |
- |
- TEXT |
- ALIGN(8) |
-PROLOGUE(mpn_sqr_basecase) |
-deflit(`FRAME',0) |
- |
- movl PARAM_SIZE, %edx |
- movl PARAM_SRC, %eax |
- |
- cmpl $2, %edx C flags feed both the je and the ja below |
- movl PARAM_DST, %ecx |
- |
- je L(two_limbs) |
- |
- movl (%eax), %eax C src[0], movl leaves the cmpl flags intact |
- ja L(three_or_more) C size > 2 |
- |
-C ----------------------------------------------------------------------------- |
-C one limb only (fall-through when size is below 2) |
- C eax src low limb, src[0] |
- C ebx |
- C ecx dst |
- C edx |
- |
- mull %eax C src[0]^2 in edx:eax |
- |
- movl %eax, (%ecx) C dst[0] |
- movl %edx, 4(%ecx) C dst[1] |
- |
- ret |
- |
-C ----------------------------------------------------------------------------- |
- ALIGN(8) |
-L(two_limbs): |
- C eax src |
- C ebx |
- C ecx dst |
- C edx size |
- |
- pushl %ebp C ebp, edi, esi, ebx are all restored before ret |
- pushl %edi |
- |
- pushl %esi |
- pushl %ebx |
- |
- movl %eax, %ebx C keep src in ebx, mull overwrites eax |
- movl (%eax), %eax |
- |
- mull %eax C src[0]^2 |
- |
- movl %eax, (%ecx) C dst[0] |
- movl %edx, %esi C dst[1] |
- |
- movl 4(%ebx), %eax |
- |
- mull %eax C src[1]^2 |
- |
- movl %eax, %edi C dst[2] |
- movl %edx, %ebp C dst[3] |
- |
- movl (%ebx), %eax |
- |
- mull 4(%ebx) C src[0]*src[1] |
- |
- addl %eax, %esi C add the cross product a first time |
- popl %ebx C src no longer needed |
- |
- adcl %edx, %edi |
- |
- adcl $0, %ebp |
- addl %esi, %eax C and a second time, eax is the final dst[1] |
- |
- adcl %edi, %edx C edx is the final dst[2] |
- movl %eax, 4(%ecx) |
- |
- adcl $0, %ebp C ebp is the final dst[3] |
- popl %esi |
- |
- movl %edx, 8(%ecx) |
- movl %ebp, 12(%ecx) |
- |
- popl %edi |
- popl %ebp |
- |
- ret |
- |
- |
-C ----------------------------------------------------------------------------- |
- ALIGN(8) |
-L(three_or_more): |
- C eax src low limb |
- C ebx |
- C ecx dst |
- C edx size |
- |
- cmpl $4, %edx C flags are used by the jae below |
- pushl %ebx |
-deflit(`FRAME',4) |
- |
- movl PARAM_SRC, %ebx |
- jae L(four_or_more) |
- |
- |
-C ----------------------------------------------------------------------------- |
-C three limbs |
- C eax src low limb |
- C ebx src |
- C ecx dst |
- C edx size |
- |
- pushl %ebp |
- pushl %edi |
- |
- mull %eax C src[0] ^ 2 |
- |
- movl %eax, (%ecx) |
- movl %edx, 4(%ecx) |
- |
- movl 4(%ebx), %eax |
- xorl %ebp, %ebp C NOTE(review): ebp is rewritten by a movl before it is next read |
- |
- mull %eax C src[1] ^ 2 |
- |
- movl %eax, 8(%ecx) |
- movl %edx, 12(%ecx) |
- |
- movl 8(%ebx), %eax |
- pushl %esi C risk of cache bank clash |
- |
- mull %eax C src[2] ^ 2 |
- |
- movl %eax, 16(%ecx) |
- movl %edx, 20(%ecx) |
- |
- movl (%ebx), %eax |
- |
- mull 4(%ebx) C src[0] * src[1] |
- |
- movl %eax, %esi |
- movl %edx, %edi |
- |
- movl (%ebx), %eax |
- |
- mull 8(%ebx) C src[0] * src[2] |
- |
- addl %eax, %edi |
- movl %edx, %ebp |
- |
- adcl $0, %ebp |
- movl 4(%ebx), %eax |
- |
- mull 8(%ebx) C src[1] * src[2] |
- |
- xorl %ebx, %ebx C src pointer no longer needed |
- addl %eax, %ebp |
- |
- C eax |
- C ebx zero, will be dst[5] |
- C ecx dst |
- C edx dst[4] |
- C esi dst[1] |
- C edi dst[2] |
- C ebp dst[3] |
- |
- adcl $0, %edx |
- addl %esi, %esi C double the cross products, one limb at a time |
- |
- adcl %edi, %edi |
- |
- adcl %ebp, %ebp |
- |
- adcl %edx, %edx |
- movl 4(%ecx), %eax |
- |
- adcl $0, %ebx C bit doubled out of edx |
- addl %esi, %eax C add doubled cross products to the stored squares |
- |
- movl %eax, 4(%ecx) |
- movl 8(%ecx), %eax |
- |
- adcl %edi, %eax |
- movl 12(%ecx), %esi |
- |
- adcl %ebp, %esi |
- movl 16(%ecx), %edi |
- |
- movl %eax, 8(%ecx) |
- movl %esi, 12(%ecx) |
- |
- adcl %edx, %edi |
- popl %esi |
- |
- movl 20(%ecx), %eax |
- movl %edi, 16(%ecx) |
- |
- popl %edi |
- popl %ebp |
- |
- adcl %ebx, %eax C no carry out of this |
- popl %ebx |
- |
- movl %eax, 20(%ecx) |
- |
- ret |
- |
- |
-C ----------------------------------------------------------------------------- |
- ALIGN(8) |
-L(four_or_more): |
- C eax src low limb |
- C ebx src |
- C ecx dst |
- C edx size |
- C esi |
- C edi |
- C ebp |
- C |
- C First multiply src[0]*src[1..size-1] and store at dst[1..size]. |
- |
-deflit(`FRAME',4) |
- |
- pushl %edi |
-FRAME_pushl() |
- pushl %esi |
-FRAME_pushl() |
- |
- pushl %ebp |
-FRAME_pushl() |
- leal (%ecx,%edx,4), %edi C dst end of this mul1 |
- |
- leal (%ebx,%edx,4), %esi C src end |
- movl %ebx, %ebp C src |
- |
- negl %edx C -size |
- xorl %ebx, %ebx C clear carry limb and carry flag |
- |
- leal 1(%edx), %ecx C -(size-1) |
- |
-L(mul1): |
- C eax scratch |
- C ebx carry (needing carry flag added) |
- C ecx counter, negative |
- C edx scratch |
- C esi &src[size] |
- C edi &dst[size] |
- C ebp src |
- |
- adcl $0, %ebx C fold in the carry flag (clear on entry, else from the previous addl) |
- movl (%esi,%ecx,4), %eax |
- |
- mull (%ebp) C src[0] times the current limb |
- |
- addl %eax, %ebx |
- |
- movl %ebx, (%edi,%ecx,4) |
- incl %ecx C incl does not touch the carry flag |
- |
- movl %edx, %ebx C high limb becomes the next carry limb |
- jnz L(mul1) C tests the zero flag set by incl |
- |
- |
- C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for |
- C n=1..size-2. |
- C |
- C The last two products, which are the end corner of the product |
- C triangle, are handled separately to save looping overhead. These |
- C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1]. |
- C If size is 4 then it's only these that need to be done. |
- C |
- C In the outer loop %esi is a constant, and %edi just advances by 1 |
- C limb each time. The size of the operation decreases by 1 limb |
- C each time. |
- |
- C eax |
- C ebx carry (needing carry flag added) |
- C ecx |
- C edx |
- C esi &src[size] |
- C edi &dst[size] |
- C ebp src |
- |
- adcl $0, %ebx |
- movl PARAM_SIZE, %edx |
- |
- movl %ebx, (%edi) |
- subl $4, %edx |
- |
- negl %edx C -(size-4), zero when size is 4 |
- jz L(corner) |
- |
- |
-L(outer): |
- C ebx previous carry limb to store |
- C edx outer loop counter (negative) |
- C esi &src[size] |
- C edi dst, pointing at stored carry limb of previous loop |
- |
- pushl %edx C new outer loop counter |
- leal -2(%edx), %ecx C inner loop counter |
- |
- movl %ebx, (%edi) |
- addl $4, %edi |
- |
- addl $4, %ebp C advance to the next multiplier limb |
- xorl %ebx, %ebx C initial carry limb, clear carry flag |
- |
-L(inner): |
- C eax scratch |
- C ebx carry (needing carry flag added) |
- C ecx counter, negative |
- C edx scratch |
- C esi &src[size] |
- C edi dst end of this addmul |
- C ebp &src[j] |
- |
- adcl $0, %ebx |
- movl (%esi,%ecx,4), %eax |
- |
- mull (%ebp) |
- |
- addl %ebx, %eax C add the carry limb to the low product |
- movl (%edi,%ecx,4), %ebx C existing dst limb |
- |
- adcl $0, %edx |
- addl %eax, %ebx C carry out is picked up by the next adcl on ebx |
- |
- movl %ebx, (%edi,%ecx,4) |
- incl %ecx |
- |
- movl %edx, %ebx |
- jnz L(inner) |
- |
- |
- adcl $0, %ebx |
- popl %edx C outer loop counter |
- |
- incl %edx |
- jnz L(outer) |
- |
- |
- movl %ebx, (%edi) C carry limb of the last addmul |
- |
-L(corner): |
- C esi &src[size] |
- C edi &dst[2*size-4] |
- |
- movl -8(%esi), %eax |
- movl -4(%edi), %ebx C risk of data cache bank clash here |
- |
- mull -12(%esi) C src[size-2]*src[size-3] |
- |
- addl %eax, %ebx |
- movl %edx, %ecx |
- |
- adcl $0, %ecx |
- movl -4(%esi), %eax |
- |
- mull -12(%esi) C src[size-1]*src[size-3] |
- |
- addl %ecx, %eax |
- movl (%edi), %ecx |
- |
- adcl $0, %edx |
- movl %ebx, -4(%edi) C dst[2*size-5] |
- |
- addl %eax, %ecx |
- movl %edx, %ebx |
- |
- adcl $0, %ebx |
- movl -4(%esi), %eax |
- |
- mull -8(%esi) C src[size-1]*src[size-2] |
- |
- movl %ecx, (%edi) C dst[2*size-4] |
- addl %eax, %ebx |
- |
- adcl $0, %edx |
- movl PARAM_SIZE, %eax |
- |
- negl %eax |
- movl %ebx, 4(%edi) C dst[2*size-3] |
- |
- addl $1, %eax C -(size-1) and clear carry |
- movl %edx, 8(%edi) C dst[2*size-2] |
- |
- |
-C ----------------------------------------------------------------------------- |
-C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. |
- |
-L(lshift): |
- C eax counter, negative |
- C ebx next limb |
- C ecx scratch (second limb of each pair) |
- C edx |
- C esi |
- C edi &dst[2*size-4] |
- C ebp |
- |
- movl 12(%edi,%eax,8), %ebx |
- |
- rcll %ebx C shift left one bit through the carry flag |
- movl 16(%edi,%eax,8), %ecx |
- |
- rcll %ecx |
- movl %ebx, 12(%edi,%eax,8) |
- |
- movl %ecx, 16(%edi,%eax,8) |
- incl %eax C carry flag from rcll is kept for the next pass |
- |
- jnz L(lshift) |
- |
- |
- adcl %eax, %eax C high bit out, eax is zero here so it becomes the carry flag |
- movl PARAM_SRC, %esi |
- |
- movl PARAM_SIZE, %ecx C risk of cache bank clash |
- movl %eax, 12(%edi) C dst most significant limb |
- |
- |
-C ----------------------------------------------------------------------------- |
-C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ..., |
-C src[size-1]^2. dst[0] hasn't been set at all yet, and just gets the |
-C low limb of src[0]^2. |
- |
- movl (%esi), %eax C src[0] |
- leal (%esi,%ecx,4), %esi C src end |
- |
- negl %ecx |
- |
- mull %eax |
- |
- movl %eax, 16(%edi,%ecx,8) C dst[0] |
- movl %edx, %ebx C high limb of src[0]^2, added into dst[1] by the loop |
- |
- addl $1, %ecx C -(size-1) and clear carry |
- |
-L(diag): |
- C eax scratch (low product) |
- C ebx carry limb |
- C ecx counter, negative |
- C edx scratch (high product) |
- C esi &src[size] |
- C edi &dst[2*size-4] |
- C ebp scratch (fetched dst limbs) |
- |
- movl (%esi,%ecx,4), %eax |
- adcl $0, %ebx |
- |
- mull %eax |
- |
- movl 16-4(%edi,%ecx,8), %ebp |
- |
- addl %ebp, %ebx C high limb of the previous square into the odd dst limb |
- movl 16(%edi,%ecx,8), %ebp |
- |
- adcl %eax, %ebp C low limb of this square into the even dst limb |
- movl %ebx, 16-4(%edi,%ecx,8) |
- |
- movl %ebp, 16(%edi,%ecx,8) |
- incl %ecx |
- |
- movl %edx, %ebx |
- jnz L(diag) |
- |
- |
- adcl $0, %edx |
- movl 16-4(%edi), %eax C dst most significant limb |
- |
- addl %eax, %edx |
- popl %ebp |
- |
- movl %edx, 16-4(%edi) |
- popl %esi C risk of cache bank clash |
- |
- popl %edi |
- popl %ebx |
- |
- ret |
- |
-EPILOGUE() |