Index: gcc/gmp/mpn/x86/mul_basecase.asm |
diff --git a/gcc/gmp/mpn/x86/mul_basecase.asm b/gcc/gmp/mpn/x86/mul_basecase.asm |
deleted file mode 100644 |
index 7918ea07f3430a585d3dbc5424e7b5a0d082689a..0000000000000000000000000000000000000000 |
--- a/gcc/gmp/mpn/x86/mul_basecase.asm |
+++ /dev/null |
@@ -1,213 +0,0 @@ |
-dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result |
-dnl in a third limb vector. |
- |
-dnl Copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002 Free Software |
-dnl Foundation, Inc. |
-dnl |
-dnl This file is part of the GNU MP Library. |
-dnl |
-dnl The GNU MP Library is free software; you can redistribute it and/or |
-dnl modify it under the terms of the GNU Lesser General Public License as |
-dnl published by the Free Software Foundation; either version 3 of the |
-dnl License, or (at your option) any later version. |
-dnl |
-dnl The GNU MP Library is distributed in the hope that it will be useful, |
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of |
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
-dnl Lesser General Public License for more details. |
-dnl |
-dnl You should have received a copy of the GNU Lesser General Public License |
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
- |
-include(`../config.m4') |
- |
- |
-C cycles/crossproduct |
-C P5: 15 |
-C P6: 7.5 |
-C K6: 12.5 |
-C K7: 5.5 |
-C P4: 24 |
- |
- |
-C void mpn_mul_basecase (mp_ptr wp, |
-C mp_srcptr xp, mp_size_t xsize, |
-C mp_srcptr yp, mp_size_t ysize); |
-C |
-C This was written in a haste since the Pentium optimized code that was used |
-C for all x86 machines was slow for the Pentium II. This code would benefit |
-C from some cleanup. |
-C |
-C To shave off some percentage of the run-time, one should make 4 variants |
-C of the Louter loop, for the four different outcomes of un mod 4. That |
-C would avoid Loop0 altogether. Code expansion would be > 4-fold for that |
-C part of the function, but since it is not very large, that would be |
-C acceptable. |
-C |
-C The mul loop (at L(oopM)) might need some tweaking. It's current speed is |
-C unknown. |
- |
-defframe(PARAM_YSIZE,20) |
-defframe(PARAM_YP, 16) |
-defframe(PARAM_XSIZE,12) |
-defframe(PARAM_XP, 8) |
-defframe(PARAM_WP, 4) |
- |
-defframe(VAR_MULTIPLIER, -4) |
-defframe(VAR_COUNTER, -8) |
-deflit(VAR_STACK_SPACE, 8) |
- |
- TEXT |
- ALIGN(8) |
- |
-PROLOGUE(mpn_mul_basecase) |
-deflit(`FRAME',0) |
- |
- subl $VAR_STACK_SPACE,%esp |
- pushl %esi |
- pushl %ebp |
- pushl %edi |
-deflit(`FRAME',eval(VAR_STACK_SPACE+12)) |
- |
- movl PARAM_XP,%esi |
- movl PARAM_WP,%edi |
- movl PARAM_YP,%ebp |
- |
- movl (%esi),%eax C load xp[0] |
- mull (%ebp) C multiply by yp[0] |
- movl %eax,(%edi) C store to wp[0] |
- movl PARAM_XSIZE,%ecx C xsize |
- decl %ecx C If xsize = 1, ysize = 1 too |
- jz L(done) |
- |
- pushl %ebx |
-FRAME_pushl() |
- movl %edx,%ebx |
- |
- leal 4(%esi),%esi |
- leal 4(%edi),%edi |
- |
-L(oopM): |
- movl (%esi),%eax C load next limb at xp[j] |
- leal 4(%esi),%esi |
- mull (%ebp) |
- addl %ebx,%eax |
- movl %edx,%ebx |
- adcl $0,%ebx |
- movl %eax,(%edi) |
- leal 4(%edi),%edi |
- decl %ecx |
- jnz L(oopM) |
- |
- movl %ebx,(%edi) C most significant limb of product |
- addl $4,%edi C increment wp |
- movl PARAM_XSIZE,%eax |
- shll $2,%eax |
- subl %eax,%edi |
- subl %eax,%esi |
- |
- movl PARAM_YSIZE,%eax C ysize |
- decl %eax |
- jz L(skip) |
- movl %eax,VAR_COUNTER C set index i to ysize |
- |
-L(outer): |
- movl PARAM_YP,%ebp C yp |
- addl $4,%ebp C make ebp point to next v limb |
- movl %ebp,PARAM_YP |
- movl (%ebp),%eax C copy y limb ... |
- movl %eax,VAR_MULTIPLIER C ... to stack slot |
- movl PARAM_XSIZE,%ecx |
- |
- xorl %ebx,%ebx |
- andl $3,%ecx |
- jz L(end0) |
- |
-L(oop0): |
- movl (%esi),%eax |
- mull VAR_MULTIPLIER |
- leal 4(%esi),%esi |
- addl %ebx,%eax |
- movl $0,%ebx |
- adcl %ebx,%edx |
- addl %eax,(%edi) |
- adcl %edx,%ebx C propagate carry into cylimb |
- |
- leal 4(%edi),%edi |
- decl %ecx |
- jnz L(oop0) |
- |
-L(end0): |
- movl PARAM_XSIZE,%ecx |
- shrl $2,%ecx |
- jz L(endX) |
- |
- ALIGN(8) |
-L(oopX): |
- movl (%esi),%eax |
- mull VAR_MULTIPLIER |
- addl %eax,%ebx |
- movl $0,%ebp |
- adcl %edx,%ebp |
- |
- movl 4(%esi),%eax |
- mull VAR_MULTIPLIER |
- addl %ebx,(%edi) |
- adcl %eax,%ebp C new lo + cylimb |
- movl $0,%ebx |
- adcl %edx,%ebx |
- |
- movl 8(%esi),%eax |
- mull VAR_MULTIPLIER |
- addl %ebp,4(%edi) |
- adcl %eax,%ebx C new lo + cylimb |
- movl $0,%ebp |
- adcl %edx,%ebp |
- |
- movl 12(%esi),%eax |
- mull VAR_MULTIPLIER |
- addl %ebx,8(%edi) |
- adcl %eax,%ebp C new lo + cylimb |
- movl $0,%ebx |
- adcl %edx,%ebx |
- |
- addl %ebp,12(%edi) |
- adcl $0,%ebx C propagate carry into cylimb |
- |
- leal 16(%esi),%esi |
- leal 16(%edi),%edi |
- decl %ecx |
- jnz L(oopX) |
- |
-L(endX): |
- movl %ebx,(%edi) |
- addl $4,%edi |
- |
- C we incremented wp and xp in the loop above; compensate |
- movl PARAM_XSIZE,%eax |
- shll $2,%eax |
- subl %eax,%edi |
- subl %eax,%esi |
- |
- movl VAR_COUNTER,%eax |
- decl %eax |
- movl %eax,VAR_COUNTER |
- jnz L(outer) |
- |
-L(skip): |
- popl %ebx |
- popl %edi |
- popl %ebp |
- popl %esi |
- addl $8,%esp |
- ret |
- |
-L(done): |
- movl %edx,4(%edi) C store to wp[1] |
- popl %edi |
- popl %ebp |
- popl %esi |
- addl $8,%esp |
- ret |
- |
-EPILOGUE() |