Index: gcc/gmp/mpn/x86/sqr_basecase.asm |
diff --git a/gcc/gmp/mpn/x86/sqr_basecase.asm b/gcc/gmp/mpn/x86/sqr_basecase.asm |
deleted file mode 100644 |
index 9a7e13327b8c41c91ec01d28bc0a89cad846b069..0000000000000000000000000000000000000000 |
--- a/gcc/gmp/mpn/x86/sqr_basecase.asm |
+++ /dev/null |
@@ -1,348 +0,0 @@ |
-dnl x86 generic mpn_sqr_basecase -- square an mpn number. |
- |
-dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc. |
-dnl |
-dnl This file is part of the GNU MP Library. |
-dnl |
-dnl The GNU MP Library is free software; you can redistribute it and/or modify |
-dnl it under the terms of the GNU Lesser General Public License as published |
-dnl by the Free Software Foundation; either version 3 of the License, or (at |
-dnl your option) any later version. |
-dnl |
-dnl The GNU MP Library is distributed in the hope that it will be useful, but |
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
-dnl License for more details. |
-dnl |
-dnl You should have received a copy of the GNU Lesser General Public License |
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
- |
- |
-include(`../config.m4') |
- |
- |
-C cycles/crossproduct cycles/triangleproduct |
-C P5: |
-C P6: |
-C K6: |
-C K7: |
-C P4: |
- |
- |
-C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); |
-C |
-C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a |
-C lot of function call overheads are avoided, especially when the size is |
-C small. |
-C |
-C The mul1 loop is not unrolled like mul_1.asm, it doesn't seem worth the |
-C code size to do so here. |
-C |
-C Enhancements: |
-C |
-C The addmul loop here is also not unrolled like aorsmul_1.asm and |
-C mul_basecase.asm are. Perhaps it should be done. It'd add to the |
-C complexity, but if it's worth doing in the other places then it should be |
-C worthwhile here. |
-C |
-C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6) |
-C might be worth considering. That'd add quite a bit to the code size, but |
-C only as much as is used would be dragged into L1 cache. |
- |
-defframe(PARAM_SIZE,12) |
-defframe(PARAM_SRC, 8) |
-defframe(PARAM_DST, 4) |
- |
- TEXT |
- ALIGN(8) |
-PROLOGUE(mpn_sqr_basecase) |
-deflit(`FRAME',0) |
- |
- movl PARAM_SIZE, %edx |
- |
- movl PARAM_SRC, %eax |
- |
- cmpl $2, %edx |
- movl PARAM_DST, %ecx |
- |
- je L(two_limbs) |
- ja L(three_or_more) |
- |
- |
-C ----------------------------------------------------------------------------- |
-C one limb only |
- C eax src |
- C ebx |
- C ecx dst |
- C edx |
- |
- movl (%eax), %eax |
- mull %eax |
- movl %eax, (%ecx) |
- movl %edx, 4(%ecx) |
- ret |
- |
- |
-C ----------------------------------------------------------------------------- |
- ALIGN(8) |
-L(two_limbs): |
- C eax src |
- C ebx |
- C ecx dst |
- C edx |
- |
- pushl %ebx |
- pushl %ebp |
- |
- movl %eax, %ebx |
- movl (%eax), %eax |
- |
- mull %eax C src[0]^2 |
- |
- pushl %esi |
- pushl %edi |
- |
- movl %edx, %esi C dst[1] |
- movl %eax, (%ecx) C dst[0] |
- |
- movl 4(%ebx), %eax |
- mull %eax C src[1]^2 |
- |
- movl %eax, %edi C dst[2] |
- movl %edx, %ebp C dst[3] |
- |
- movl (%ebx), %eax |
- mull 4(%ebx) C src[0]*src[1] |
- |
- addl %eax, %esi |
- |
- adcl %edx, %edi |
- |
- adcl $0, %ebp |
- addl %esi, %eax |
- |
- adcl %edi, %edx |
- movl %eax, 4(%ecx) |
- |
- adcl $0, %ebp |
- |
- movl %edx, 8(%ecx) |
- movl %ebp, 12(%ecx) |
- |
- popl %edi |
- popl %esi |
- |
- popl %ebp |
- popl %ebx |
- |
- ret |
- |
- |
-C ----------------------------------------------------------------------------- |
- ALIGN(8) |
-L(three_or_more): |
-deflit(`FRAME',0) |
- C eax src |
- C ebx |
- C ecx dst |
- C edx size |
- |
- pushl %ebx FRAME_pushl() |
- pushl %edi FRAME_pushl() |
- |
- pushl %esi FRAME_pushl() |
- pushl %ebp FRAME_pushl() |
- |
- leal (%ecx,%edx,4), %edi C &dst[size], end of this mul1 |
- leal (%eax,%edx,4), %esi C &src[size] |
- |
-C First multiply src[0]*src[1..size-1] and store at dst[1..size]. |
- |
- movl (%eax), %ebp C src[0], multiplier |
- movl %edx, %ecx |
- |
- negl %ecx C -size |
- xorl %ebx, %ebx C clear carry limb |
- |
- incl %ecx C -(size-1) |
- |
-L(mul1): |
- C eax scratch |
- C ebx carry |
- C ecx counter, limbs, negative |
- C edx scratch |
- C esi &src[size] |
- C edi &dst[size] |
- C ebp multiplier |
- |
- movl (%esi,%ecx,4), %eax |
- mull %ebp |
- addl %eax, %ebx |
- adcl $0, %edx |
- movl %ebx, (%edi,%ecx,4) |
- movl %edx, %ebx |
- incl %ecx |
- jnz L(mul1) |
- |
- movl %ebx, (%edi) |
- |
- |
- C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for |
- C n=1..size-2. |
- C |
- C The last products src[size-2]*src[size-1], which is the end corner |
- C of the product triangle, is handled separately at the end to save |
- C looping overhead. If size is 3 then it's only this that needs to |
- C be done. |
- C |
- C In the outer loop %esi is a constant, and %edi just advances by 1 |
- C limb each time. The size of the operation decreases by 1 limb |
- C each time. |
- |
- C eax |
- C ebx carry (needing carry flag added) |
- C ecx |
- C edx |
- C esi &src[size] |
- C edi &dst[size] |
- C ebp |
- |
- movl PARAM_SIZE, %ecx |
- subl $3, %ecx |
- jz L(corner) |
- |
- negl %ecx |
- |
-dnl re-use parameter space |
-define(VAR_OUTER,`PARAM_DST') |
- |
-L(outer): |
- C eax |
- C ebx |
- C ecx |
- C edx outer loop counter, -(size-3) to -1 |
- C esi &src[size] |
- C edi dst, pointing at stored carry limb of previous loop |
- C ebp |
- |
- movl %ecx, VAR_OUTER |
- addl $4, %edi C advance dst end |
- |
- movl -8(%esi,%ecx,4), %ebp C next multiplier |
- subl $1, %ecx |
- |
- xorl %ebx, %ebx C initial carry limb |
- |
-L(inner): |
- C eax scratch |
- C ebx carry (needing carry flag added) |
- C ecx counter, -n-1 to -1 |
- C edx scratch |
- C esi &src[size] |
- C edi dst end of this addmul |
- C ebp multiplier |
- |
- movl (%esi,%ecx,4), %eax |
- mull %ebp |
- addl %ebx, %eax |
- adcl $0, %edx |
- addl %eax, (%edi,%ecx,4) |
- adcl $0, %edx |
- movl %edx, %ebx |
- addl $1, %ecx |
- jl L(inner) |
- |
- |
- movl %ebx, (%edi) |
- movl VAR_OUTER, %ecx |
- incl %ecx |
- jnz L(outer) |
- |
- |
-L(corner): |
- C esi &src[size] |
- C edi &dst[2*size-3] |
- |
- movl -4(%esi), %eax |
- mull -8(%esi) C src[size-1]*src[size-2] |
- addl %eax, 0(%edi) |
- adcl $0, %edx |
- movl %edx, 4(%edi) C dst high limb |
- |
- |
-C ----------------------------------------------------------------------------- |
-C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. |
- |
- movl PARAM_SIZE, %eax |
- negl %eax |
- addl $1, %eax C -(size-1) and clear carry |
- |
-L(lshift): |
- C eax counter, negative |
- C ebx next limb |
- C ecx |
- C edx |
- C esi |
- C edi &dst[2*size-4] |
- C ebp |
- |
- rcll 8(%edi,%eax,8) |
- rcll 12(%edi,%eax,8) |
- incl %eax |
- jnz L(lshift) |
- |
- |
- adcl %eax, %eax C high bit out |
- movl %eax, 8(%edi) C dst most significant limb |
- |
- |
-C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ..., |
-C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the |
-C low limb of src[0]^2. |
- |
- movl PARAM_SRC, %esi |
- movl (%esi), %eax C src[0] |
- mull %eax C src[0]^2 |
- |
- movl PARAM_SIZE, %ecx |
- leal (%esi,%ecx,4), %esi C src end |
- |
- negl %ecx C -size |
- movl %edx, %ebx C initial carry |
- |
- movl %eax, 12(%edi,%ecx,8) C dst[0] |
- incl %ecx C -(size-1) |
- |
-L(diag): |
- C eax scratch (low product) |
- C ebx carry limb |
- C ecx counter, -(size-1) to -1 |
- C edx scratch (high product) |
- C esi &src[size] |
- C edi &dst[2*size-3] |
- C ebp scratch (fetched dst limbs) |
- |
- movl (%esi,%ecx,4), %eax |
- mull %eax |
- |
- addl %ebx, 8(%edi,%ecx,8) |
- movl %edx, %ebx |
- |
- adcl %eax, 12(%edi,%ecx,8) |
- adcl $0, %ebx |
- |
- incl %ecx |
- jnz L(diag) |
- |
- |
- addl %ebx, 8(%edi) C dst most significant limb |
- |
- popl %ebp |
- popl %esi |
- |
- popl %edi |
- popl %ebx |
- |
- ret |
- |
-EPILOGUE() |