Index: gcc/gmp/mpn/x86/p6/mul_basecase.asm |
diff --git a/gcc/gmp/mpn/x86/p6/mul_basecase.asm b/gcc/gmp/mpn/x86/p6/mul_basecase.asm |
deleted file mode 100644 |
index fc1afbdf0eb660f15f752f70435bab49b2370519..0000000000000000000000000000000000000000 |
--- a/gcc/gmp/mpn/x86/p6/mul_basecase.asm |
+++ /dev/null |
@@ -1,596 +0,0 @@ |
-dnl Intel P6 mpn_mul_basecase -- multiply two mpn numbers. |
- |
-dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc. |
-dnl |
-dnl This file is part of the GNU MP Library. |
-dnl |
-dnl The GNU MP Library is free software; you can redistribute it and/or |
-dnl modify it under the terms of the GNU Lesser General Public License as |
-dnl published by the Free Software Foundation; either version 3 of the |
-dnl License, or (at your option) any later version. |
-dnl |
-dnl The GNU MP Library is distributed in the hope that it will be useful, |
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of |
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
-dnl Lesser General Public License for more details. |
-dnl |
-dnl You should have received a copy of the GNU Lesser General Public License |
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
- |
-include(`../config.m4') |
- |
- |
-C P6: approx 6.5 cycles per cross product (16 limbs/loop unrolling). |
- |
- |
-dnl P6 UNROLL_COUNT cycles/product (approx) |
-dnl 8 7 |
-dnl 16 6.5 |
-dnl 32 6.4 |
-dnl Maximum possible with the current code is 32. |
-dnl UNROLL_COUNT is the number of limbs emitted per pass of the unrolled loop. |
-deflit(UNROLL_COUNT, 16) |
- |
- |
-C void mpn_mul_basecase (mp_ptr wp, |
-C mp_srcptr xp, mp_size_t xsize, |
-C mp_srcptr yp, mp_size_t ysize); |
-C |
-C This routine is essentially the same as mpn/generic/mul_basecase.c, but |
-C it's faster because it does most of the mpn_addmul_1() startup |
-C calculations only once. |
-C UNROLL_THRESHOLD: xsize at or above this takes the unrolled path. Both the PIC and non-PIC branches select the same value. |
-ifdef(`PIC',` |
-deflit(UNROLL_THRESHOLD, 5) |
-',` |
-deflit(UNROLL_THRESHOLD, 5) |
-') |
-C Byte offsets of the five stack arguments, in signature order wp, xp, xsize, yp, ysize. NOTE(review): defframe is defined elsewhere and presumably adds FRAME -- confirm. |
-defframe(PARAM_YSIZE,20) |
-defframe(PARAM_YP, 16) |
-defframe(PARAM_XSIZE,12) |
-defframe(PARAM_XP, 8) |
-defframe(PARAM_WP, 4) |
- |
- TEXT |
- ALIGN(16) |
- |
-PROLOGUE(mpn_mul_basecase) |
-deflit(`FRAME',0) |
-C Entry: load xsize, xp and yp[0], then dispatch on an unsigned compare of xsize against 2. |
- movl PARAM_XSIZE, %ecx |
- |
- movl PARAM_YP, %eax |
- |
- movl PARAM_XP, %edx |
- |
- movl (%eax), %eax C yp[0] |
- cmpl $2, %ecx |
- ja L(xsize_more_than_two) C xsize > 2 |
- je L(two_by_something) C xsize == 2 |
- |
- |
- C one limb by one limb |
- C NOTE(review): reached when xsize < 2; ysize is not examined on this path, presumably callers guarantee ysize <= xsize -- confirm |
- mull (%edx) C edx:eax = yp[0] * xp[0] |
- |
- movl PARAM_WP, %ecx |
- movl %eax, (%ecx) C wp[0] = low half of the product |
- movl %edx, 4(%ecx) C wp[1] = high half of the product |
- ret |
- |
- |
-C ----------------------------------------------------------------------------- |
-L(two_by_something): |
-deflit(`FRAME',0) |
-C Reached when xsize == 2, with eax = yp[0], ecx = xsize and edx = xp. No stack frame is allocated on this path. |
-dnl re-use parameter space |
-define(SAVE_EBX, `PARAM_XSIZE') |
-define(SAVE_ESI, `PARAM_YSIZE') |
- |
- movl %ebx, SAVE_EBX C overwrites the xsize argument slot, xsize is no longer needed |
- cmpl $1, PARAM_YSIZE C must come before the SAVE_ESI store below; flags are used by the jne |
- movl %eax, %ecx C yp[0] |
- |
- movl %esi, SAVE_ESI C save esi, overwriting the ysize argument slot |
- movl PARAM_WP, %ebx |
- movl %edx, %esi C xp |
- |
- movl (%edx), %eax C xp[0] |
- jne L(two_by_two) C ysize != 1; the movl instructions in between leave the flags alone |
- |
- |
- C two limbs by one limb |
- C |
- C eax xp[0] |
- C ebx wp |
- C ecx yp[0] |
- C edx |
- C esi xp |
- |
- mull %ecx C xp[0] * yp[0] |
- |
- movl %eax, (%ebx) C wp[0] |
- movl 4(%esi), %eax C xp[1] |
- movl %edx, %esi C carry |
- |
- mull %ecx C xp[1] * yp[0] |
- |
- addl %eax, %esi C low half plus carry from the first product |
- |
- movl %esi, 4(%ebx) C wp[1] |
- movl SAVE_ESI, %esi C restore esi; movl does not disturb the carry flag |
- |
- adcl $0, %edx C propagate the carry from the addl into the high half |
- |
- movl %edx, 8(%ebx) C wp[2] |
- movl SAVE_EBX, %ebx C restore ebx |
- |
- ret |
- |
- |
- |
-C ----------------------------------------------------------------------------- |
- |
- ALIGN(16) |
-L(two_by_two): |
- C eax xp[0] |
- C ebx wp |
- C ecx yp[0] |
- C edx |
- C esi xp |
- C edi |
- C ebp |
-C Reached with xsize == 2 and ysize != 1; only yp[0] and yp[1] are read and four result limbs are written. |
-dnl more parameter space re-use |
-define(SAVE_EDI, `PARAM_WP') |
-C The wp argument slot is free for this because wp is already held in ebx. |
- mull %ecx C xp[0] * yp[0] |
- |
- movl %edi, SAVE_EDI |
- movl %edx, %edi C carry, for wp[1] |
- |
- movl %eax, (%ebx) C wp[0] |
- movl 4(%esi), %eax C xp[1] |
- |
- mull %ecx C xp[1] * yp[0] |
- |
- addl %eax, %edi |
- movl PARAM_YP, %ecx C movl keeps the carry flag for the adcl below |
- |
- adcl $0, %edx |
- movl 4(%ecx), %ecx C yp[1] |
- |
- movl %edi, 4(%ebx) C wp[1] so far; xp[0] * yp[1] is added in below |
- movl 4(%esi), %eax C xp[1] |
- movl %edx, %edi C carry, for wp[2] |
- |
- mull %ecx C xp[1] * yp[1] |
- |
- addl %eax, %edi |
- movl (%esi), %eax C xp[0] |
- |
- adcl $0, %edx |
- movl %edx, %esi C carry, for wp[3] |
- |
- mull %ecx C xp[0] * yp[1] |
- |
- addl %eax, 4(%ebx) C wp[1] += low half |
- movl %esi, %eax C wp[3] candidate moves to eax so esi can be restored |
- |
- adcl %edx, %edi C wp[2] += high half plus carry |
- movl SAVE_ESI, %esi C restore esi; carry flag is preserved |
- |
- movl %edi, 8(%ebx) C wp[2] |
- |
- adcl $0, %eax C final carry into wp[3] |
- movl SAVE_EDI, %edi C restore edi |
- |
- movl %eax, 12(%ebx) C wp[3] |
- movl SAVE_EBX, %ebx C restore ebx |
- |
- ret |
- |
- |
-C ----------------------------------------------------------------------------- |
- ALIGN(16) |
-L(xsize_more_than_two): |
- |
-C The first limb of yp is processed with a simple mpn_mul_1 loop running at |
-C about 6.2 c/l. Unrolling this doesn't seem worthwhile since it's only run |
-C once (whereas the addmul_1 below is run ysize-1 many times). A call to |
-C mpn_mul_1 would be slowed down by the parameter pushing and popping etc, |
-C and doesn't seem likely to be worthwhile on the typical sizes reaching |
-C here from the Karatsuba code. |
- |
- C eax yp[0] |
- C ebx |
- C ecx xsize |
- C edx xp |
- C esi |
- C edi |
- C ebp |
-C Local frame of nine 4-byte slots, 36 bytes in total. The SAVE_ names are quoted here because earlier define calls already gave them other meanings. |
-defframe(`SAVE_EBX', -4) |
-defframe(`SAVE_ESI', -8) |
-defframe(`SAVE_EDI', -12) |
-defframe(`SAVE_EBP', -16) |
-defframe(VAR_COUNTER, -20) dnl for use in the unroll case |
-defframe(VAR_ADJUST, -24) |
-defframe(VAR_JMP, -28) |
-defframe(VAR_SWAP, -32) |
-defframe(VAR_XP_LOW, -36) |
-deflit(STACK_SPACE, 36) |
- |
- subl $STACK_SPACE, %esp |
-deflit(`FRAME',STACK_SPACE) |
- |
- movl %edi, SAVE_EDI |
- movl PARAM_WP, %edi |
- |
- movl %ebx, SAVE_EBX |
- |
- movl %ebp, SAVE_EBP |
- movl %eax, %ebp C multiplier, yp[0] |
- |
- movl %esi, SAVE_ESI |
- xorl %ebx, %ebx C initial carry is zero |
- leal (%edx,%ecx,4), %esi C xp end |
- |
- leal (%edi,%ecx,4), %edi C wp end of mul1 |
- negl %ecx C -xsize, counted up towards zero |
- |
- |
-L(mul1): |
- C eax scratch |
- C ebx carry |
- C ecx counter, negative |
- C edx scratch |
- C esi xp end |
- C edi wp end of mul1 |
- C ebp multiplier |
- |
- movl (%esi,%ecx,4), %eax C next xp limb |
- |
- mull %ebp |
- |
- addl %ebx, %eax C low half plus carry-in |
- movl %eax, (%edi,%ecx,4) C store result limb |
- movl $0, %ebx C zero ebx without touching the carry flag needed by the adcl |
- |
- adcl %edx, %ebx C carry-out = high half plus carry flag |
- incl %ecx |
- jnz L(mul1) C loop until the counter reaches zero |
- |
- |
- movl PARAM_YSIZE, %edx |
- |
- movl %ebx, (%edi) C final carry |
- movl PARAM_XSIZE, %ecx |
- decl %edx C ysize-1, the number of addmul rows still to do |
- |
- jz L(done) C if ysize==1 |
- |
- cmpl $UNROLL_THRESHOLD, %ecx |
- movl PARAM_YP, %eax |
- jae L(unroll) C xsize >= UNROLL_THRESHOLD, otherwise fall into the simple loop |
- |
- |
-C ----------------------------------------------------------------------------- |
- C simple addmul looping |
- C |
- C eax yp |
- C ebx |
- C ecx xsize |
- C edx ysize-1 |
- C esi xp end |
- C edi wp end of mul1 |
- C ebp |
- C The argument slots are reused below as loop state: PARAM_YSIZE, PARAM_XSIZE and PARAM_YP. |
- leal 4(%eax,%edx,4), %ebp C yp end |
- negl %ecx C -xsize |
- negl %edx |
- |
- movl %edx, PARAM_YSIZE C -(ysize-1) |
- movl (%esi,%ecx,4), %eax C xp low limb |
- incl %ecx |
- |
- movl %ecx, PARAM_XSIZE C -(xsize-1) |
- xorl %ebx, %ebx C initial carry |
- |
- movl %ebp, PARAM_YP C yp end, base for the negative ysize index |
- movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier |
- jmp L(simple_outer_entry) |
- |
- |
-L(simple_outer_top): |
- C ebp ysize counter, negative |
- |
- movl PARAM_YP, %edx |
- |
- movl PARAM_XSIZE, %ecx C -(xsize-1) |
- xorl %ebx, %ebx C carry |
- |
- movl %ebp, PARAM_YSIZE C store the incremented counter |
- addl $4, %edi C next position in wp |
- |
- movl (%edx,%ebp,4), %ebp C yp limb - multiplier |
- |
- movl -4(%esi,%ecx,4), %eax C xp low limb |
- |
- |
-L(simple_outer_entry): |
- |
-L(simple_inner_top): |
- C eax xp limb |
- C ebx carry limb |
- C ecx loop counter (negative) |
- C edx scratch |
- C esi xp end |
- C edi wp end |
- C ebp multiplier |
- |
- mull %ebp |
- |
- addl %eax, %ebx C low half plus carry-in |
- adcl $0, %edx |
- |
- addl %ebx, (%edi,%ecx,4) C accumulate into wp |
- movl (%esi,%ecx,4), %eax C fetch the next xp limb; movl keeps the carry flag |
- adcl $0, %edx |
- |
- incl %ecx |
- movl %edx, %ebx C carry-out; movl keeps ZF from the incl for the jnz |
- jnz L(simple_inner_top) |
- |
- |
- C separate code for last limb so outer loop counter handling can be |
- C interleaved |
- |
- mull %ebp |
- |
- movl PARAM_YSIZE, %ebp C outer counter, negative |
- addl %eax, %ebx |
- |
- adcl $0, %edx |
- |
- addl %ebx, (%edi) |
- |
- adcl $0, %edx |
- incl %ebp |
- |
- movl %edx, 4(%edi) C carry-out stored one limb above; movl keeps ZF from the incl |
- jnz L(simple_outer_top) C more yp limbs remain while the counter is nonzero |
- |
- |
-L(done): |
- movl SAVE_EBX, %ebx C restore the registers saved at xsize_more_than_two |
- |
- movl SAVE_ESI, %esi |
- |
- movl SAVE_EDI, %edi |
- |
- movl SAVE_EBP, %ebp |
- addl $FRAME, %esp C FRAME was set to STACK_SPACE, so this releases the local frame |
- |
- ret |
- |
- |
- |
-C ----------------------------------------------------------------------------- |
-C |
-C The unrolled loop is the same as in mpn_addmul_1, see that code for some |
-C comments. |
-C |
-C VAR_ADJUST is the negative of how many limbs the leals in the inner loop |
-C increment xp and wp. This is used to adjust xp and wp, and is rshifted to |
-C give an initial VAR_COUNTER at the top of the outer loop. |
-C |
-C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT |
-C up to -1, inclusive. |
-C |
-C VAR_JMP is the computed jump into the unrolled loop. |
-C |
-C VAR_SWAP is 0 if xsize odd or 0xFFFFFFFF if xsize even, used to swap the |
-C initial ebx and ecx on entry to the unrolling. |
-C |
-C VAR_XP_LOW is the least significant limb of xp, which is needed at the |
-C start of the unrolled loop. |
-C |
-C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1, |
-C inclusive. |
-C |
-C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be |
-C added to give the location of the next limb of yp, which is the multiplier |
-C in the unrolled loop. |
-C |
-C The trick with the VAR_ADJUST value means it's only necessary to do one |
-C fetch in the outer loop to take care of xp, wp and the inner loop counter. |
- |
- |
-L(unroll): |
- C eax yp |
- C ebx |
- C ecx xsize |
- C edx ysize-1 |
- C esi xp end |
- C edi wp end of mul1 |
- C ebp |
- C One-time setup for the unrolled addmul rows; esi and edi are reloaded from the original arguments. |
- movl PARAM_XP, %esi |
- |
- movl 4(%eax), %ebp C multiplier (yp second limb) |
- leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing |
- |
- movl %eax, PARAM_YP |
- movl PARAM_WP, %edi |
- negl %edx |
- |
- movl %edx, PARAM_YSIZE C -(ysize-1), the outer loop counter |
- leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1 |
- decl %ecx C xsize-1 |
- |
- movl (%esi), %eax C xp low limb |
- andl $-UNROLL_MASK-1, %ebx C clear the low bits selected by UNROLL_MASK |
- negl %ecx C -(xsize-1) |
- |
- negl %ebx |
- andl $UNROLL_MASK, %ecx C n = (-(xsize-1)) & UNROLL_MASK |
- |
- movl %ebx, VAR_ADJUST |
- movl %ecx, %edx |
- shll $4, %ecx C 16*n |
- |
- movl %eax, VAR_XP_LOW |
- sarl $UNROLL_LOG2, %ebx C initial VAR_COUNTER value, stored at unroll_outer_entry |
- negl %edx C -n |
- C jump target = unroll_inner_entry + 16*n - n, that is 15 bytes for each of the n limb slots |
- C 15 code bytes per limb |
-ifdef(`PIC',` |
- call L(pic_calc) |
-L(unroll_here): |
-',` |
- leal L(unroll_inner_entry) (%ecx,%edx,1), %ecx |
-') |
- |
- movl %ecx, VAR_JMP |
- movl %edx, %ecx C -n, used to offset wp and xp below |
- shll $31, %edx C move the low bit of -n to the sign position |
- |
- sarl $31, %edx C 0 or -1 as xsize odd or even |
- leal 4(%edi,%ecx,4), %edi C wp and xp, adjust for unrolling, |
- leal 4(%esi,%ecx,4), %esi C and start at second limb |
- |
- movl %edx, VAR_SWAP |
- jmp L(unroll_outer_entry) |
- |
- |
-ifdef(`PIC',` |
-L(pic_calc): |
- C See mpn/x86/README about old gas bugs |
- leal (%ecx,%edx,1), %ecx |
- addl $L(unroll_inner_entry)-L(unroll_here), %ecx |
- addl (%esp), %ecx C add the return address pushed by the call which is the address of unroll_here |
- ret_internal |
-') |
- |
- |
-C -------------------------------------------------------------------------- |
- ALIGN(16) |
-L(unroll_outer_top): |
- C eax |
- C ebx |
- C ecx |
- C edx |
- C esi xp + offset |
- C edi wp + offset |
- C ebp ysize counter, negative |
- C Start of each row after the first: reset xp and wp, fetch the next multiplier. |
- movl VAR_ADJUST, %ebx |
- movl PARAM_YP, %edx |
- |
- movl VAR_XP_LOW, %eax |
- movl %ebp, PARAM_YSIZE C store incremented ysize counter |
- |
- leal eval(UNROLL_BYTES + 4) (%edi,%ebx,4), %edi C wp: undo the inner loop advance and move up one limb |
- leal (%esi,%ebx,4), %esi C xp: undo the inner loop advance |
- sarl $UNROLL_LOG2, %ebx C initial VAR_COUNTER for this row |
- |
- movl (%edx,%ebp,4), %ebp C yp next multiplier |
- |
-L(unroll_outer_entry): |
- mull %ebp C xp low limb times the multiplier |
- |
- movl %ebx, VAR_COUNTER |
- movl %edx, %ebx C carry high |
- movl %eax, %ecx C carry low |
- |
- xorl %edx, %eax C low xor high |
- movl VAR_JMP, %edx |
- |
- andl VAR_SWAP, %eax C zero when VAR_SWAP is 0, leaving ebx and ecx as they are |
- |
- xorl %eax, %ebx C carries other way for odd index |
- xorl %eax, %ecx |
- |
- jmp *%edx C enter the unrolled code at the address computed into VAR_JMP |
- |
- |
-C ----------------------------------------------------------------------------- |
- |
-L(unroll_inner_top): |
- C eax xp limb |
- C ebx carry high |
- C ecx carry low |
- C edx scratch |
- C esi xp+8 |
- C edi wp |
- C ebp yp multiplier limb |
- C |
- C VAR_COUNTER loop counter, negative |
- C |
- C 15 bytes each limb |
- |
- addl $UNROLL_BYTES, %edi C not executed on the first pass of a row, which enters at or after unroll_inner_entry |
- |
-L(unroll_inner_entry): |
-C Each forloop iteration emits code for CHUNK_COUNT limbs, one at disp0 and one at disp1, with ebx and ecx trading carry roles. |
-deflit(CHUNK_COUNT,2) |
-forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` |
- deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) |
- deflit(`disp1', eval(disp0 + 4)) |
- |
-Zdisp( movl, disp0,(%esi), %eax) |
- mull %ebp |
-Zdisp( addl, %ecx, disp0,(%edi)) |
- adcl %eax, %ebx C new carry low |
- movl %edx, %ecx |
- adcl $0, %ecx C new carry high |
- |
- movl disp1(%esi), %eax |
- mull %ebp |
- addl %ebx, disp1(%edi) |
- adcl %eax, %ecx C new carry low |
- movl %edx, %ebx |
- adcl $0, %ebx C new carry high |
-') |
- |
- |
- incl VAR_COUNTER |
- leal UNROLL_BYTES(%esi), %esi C leal does not alter flags, so the jnz tests the incl result |
- jnz L(unroll_inner_top) |
- |
- |
- C eax |
- C ebx carry high |
- C ecx carry low |
- C edx |
- C esi |
- C edi wp, pointing at second last limb |
- C ebp |
- |
-deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))) |
-deflit(`disp1', eval(disp0 + 4)) |
- |
- movl PARAM_YSIZE, %ebp C outer counter, negative |
- addl %ecx, disp0(%edi) C carry low |
- |
- adcl $0, %ebx |
- incl %ebp C step the outer counter; ZF is tested by the jnz below |
- |
- movl %ebx, disp1(%edi) C carry high |
- jnz L(unroll_outer_top) |
- |
- C All rows done: restore the saved registers and release the local frame. |
- movl SAVE_ESI, %esi |
- |
- movl SAVE_EBP, %ebp |
- |
- movl SAVE_EDI, %edi |
- |
- movl SAVE_EBX, %ebx |
- addl $FRAME, %esp |
- |
- ret |
- |
-EPILOGUE() |