| Index: gcc/gmp/mpn/sparc32/v9/submul_1.asm
|
| diff --git a/gcc/gmp/mpn/sparc32/v9/submul_1.asm b/gcc/gmp/mpn/sparc32/v9/submul_1.asm
|
| deleted file mode 100644
|
| index e5823b1e4b178c6329df064e2a65149755668740..0000000000000000000000000000000000000000
|
| --- a/gcc/gmp/mpn/sparc32/v9/submul_1.asm
|
| +++ /dev/null
|
| @@ -1,305 +0,0 @@
|
| -dnl SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
|
| -dnl subtract the result from a second limb vector.
|
| -
|
| -dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
|
| -
|
| -dnl This file is part of the GNU MP Library.
|
| -
|
| -dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
| -dnl it under the terms of the GNU Lesser General Public License as published
|
| -dnl by the Free Software Foundation; either version 3 of the License, or (at
|
| -dnl your option) any later version.
|
| -
|
| -dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
| -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
| -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
| -dnl License for more details.
|
| -
|
| -dnl You should have received a copy of the GNU Lesser General Public License
|
| -dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
|
| -
|
| -include(`../config.m4')
|
| -
|
| -C Algorithm: We use two floating-point multiplies per limb product, with the
|
| -C invariant v operand split into two 16-bit pieces, and the u operand split
|
| -C into 32-bit pieces. We convert the two 48-bit products and transfer them to
|
| -C the integer unit.
|
| -
|
| -C cycles/limb
|
| -C UltraSPARC 1&2: 6.5
|
| -C UltraSPARC 3: ?
|
| -
|
| -C Possible optimizations:
|
| -C 1. Combine 32-bit memory operations into 64-bit operations. Since we're
|
| -C memory bandwidth limited, this could save 1.5 cycles/limb.
|
| -C 2. Unroll the inner loop. Since we already use alternate temporary areas,
|
| -C it is very straightforward to unroll, using an exit branch midway.
|
| -C Unrolling would allow deeper scheduling which could improve speed for L2
|
| -C cache case.
|
| -C 3. For mpn_mul_1: Use more alternating temp areas. The std'es and ldx'es
|
| -C aren't sufficiently apart-scheduled with just two temp areas.
|
| -C 4. Specialize for particular v values. If its upper 16 bits are zero, we
|
| -C could save many operations.
|
| -
|
| -C INPUT PARAMETERS
|
| -C rp o0
|
| -C up o1
|
| -C n o2
|
| -C v o3
|
| -
|
| -define(`FSIZE',224) C bytes of stack reserved below for the FP/integer transfer slots
|
| -
|
| -ASM_START()
|
| -PROLOGUE(mpn_submul_1)
|
| - add %sp, -FSIZE, %sp C reserve FSIZE bytes of stack; no save is done, args stay in o0-o3
|
| - sethi %hi(0xffff), %g1
|
| - srl %o3, 16, %g2 C g2 = v >> 16
|
| - or %g1, %lo(0xffff), %g1 C g1 = 0xffff
|
| - and %o3, %g1, %g1 C g1 = v & 0xffff
|
| - stx %g1, [%sp+104] C move both 16-bit pieces of v to the FPU through memory
|
| - stx %g2, [%sp+112]
|
| - ldd [%sp+104], %f6
|
| - ldd [%sp+112], %f8
|
| - fxtod %f6, %f6 C f6 = (double) (v & 0xffff)
|
| - fxtod %f8, %f8 C f8 = (double) (v >> 16)
|
| - ld [%sp+104], %f10 C zero f10 (high word of the 64-bit value stored above), so f10:f11 is a zero-extended up[i]
|
| -
|
| - mov 0, %g3 C cy = 0
|
| -
|
| -define(`fanop', `fitod %f18, %f0') C A quasi nop running in the FA pipe
|
| -
|
| - add %sp, 160, %o5 C point in scratch area
|
| - and %o5, -32, %o5 C align at 0 (mod 32) in scratch area
|
| -
|
| - subcc %o2, 1, %o2 C n--, condition codes feed the bne below
|
| - ld [%o1], %f11 C read up[i]
|
| - add %o1, 4, %o1 C up++
|
| - bne,pt %icc, .L_two_or_more
|
| - fxtod %f10, %f2 C (delay slot) f2 = (double) up[i]
|
| -
|
| - fmuld %f2, %f8, %f16 C f16 = up[i] * (v >> 16)
|
| - fmuld %f2, %f6, %f4 C f4 = up[i] * (v & 0xffff)
|
| - fdtox %f16, %f14 C convert both products to 64-bit integers
|
| - fdtox %f4, %f12
|
| - std %f14, [%o5+16] C hand the products to the integer unit through scratch memory
|
| - std %f12, [%o5+24]
|
| - ldx [%o5+16], %g2 C p16
|
| - ldx [%o5+24], %g1 C p0
|
| - lduw [%o0], %g5 C read rp[i]
|
| - b .L1
|
| - add %o0, -16, %o0 C (delay slot) bias rp to match the [%o0+16] store at .L1
|
| -
|
| - .align 16
|
| -.L_two_or_more:
|
| - subcc %o2, 1, %o2
|
| - ld [%o1], %f11 C read up[i]
|
| - fmuld %f2, %f8, %f16
|
| - fmuld %f2, %f6, %f4
|
| - add %o1, 4, %o1 C up++
|
| - bne,pt %icc, .L_three_or_more
|
| - fxtod %f10, %f2 C (delay slot) f2 = (double) up[i]
|
| -
|
| - fdtox %f16, %f14
|
| - fdtox %f4, %f12
|
| - std %f14, [%o5+16]
|
| - fmuld %f2, %f8, %f16
|
| - std %f12, [%o5+24]
|
| - fmuld %f2, %f6, %f4
|
| - fdtox %f16, %f14
|
| - fdtox %f4, %f12
|
| - std %f14, [%o5+0]
|
| - std %f12, [%o5+8]
|
| - lduw [%o0], %g5 C read rp[i]
|
| - ldx [%o5+16], %g2 C p16
|
| - ldx [%o5+24], %g1 C p0
|
| - b .L2
|
| - add %o0, -12, %o0 C (delay slot) bias rp to match the [%o0+12] store at .L2
|
| -
|
| - .align 16
|
| -.L_three_or_more:
|
| - subcc %o2, 1, %o2
|
| - ld [%o1], %f11 C read up[i]
|
| - fdtox %f16, %f14
|
| - fdtox %f4, %f12
|
| - std %f14, [%o5+16]
|
| - fmuld %f2, %f8, %f16
|
| - std %f12, [%o5+24]
|
| - fmuld %f2, %f6, %f4
|
| - add %o1, 4, %o1 C up++
|
| - bne,pt %icc, .L_four_or_more
|
| - fxtod %f10, %f2 C (delay slot) f2 = (double) up[i]
|
| -
|
| - fdtox %f16, %f14
|
| - fdtox %f4, %f12
|
| - std %f14, [%o5+0]
|
| - fmuld %f2, %f8, %f16
|
| - std %f12, [%o5+8]
|
| - fmuld %f2, %f6, %f4
|
| - fdtox %f16, %f14
|
| - ldx [%o5+16], %g2 C p16
|
| - fdtox %f4, %f12
|
| - ldx [%o5+24], %g1 C p0
|
| - std %f14, [%o5+16]
|
| - std %f12, [%o5+24]
|
| - lduw [%o0], %g5 C read rp[i]
|
| - b .L3
|
| - add %o0, -8, %o0 C (delay slot) bias rp to match the [%o0+8] store at .L3
|
| -
|
| - .align 16
|
| -.L_four_or_more:
|
| - subcc %o2, 1, %o2
|
| - ld [%o1], %f11 C read up[i]
|
| - fdtox %f16, %f14
|
| - fdtox %f4, %f12
|
| - std %f14, [%o5+0]
|
| - fmuld %f2, %f8, %f16
|
| - std %f12, [%o5+8]
|
| - fmuld %f2, %f6, %f4
|
| - add %o1, 4, %o1 C up++
|
| - bne,pt %icc, .L_five_or_more
|
| - fxtod %f10, %f2 C (delay slot) f2 = (double) up[i]
|
| -
|
| - fdtox %f16, %f14
|
| - ldx [%o5+16], %g2 C p16
|
| - fdtox %f4, %f12
|
| - ldx [%o5+24], %g1 C p0
|
| - std %f14, [%o5+16]
|
| - fmuld %f2, %f8, %f16
|
| - std %f12, [%o5+24]
|
| - fmuld %f2, %f6, %f4
|
| - add %o1, 4, %o1 C up++
|
| - lduw [%o0], %g5 C read rp[i]
|
| - b .L4
|
| - add %o0, -4, %o0 C (delay slot) bias rp to match the [%o0+4] store at .L4
|
| -
|
| - .align 16
|
| -.L_five_or_more:
|
| - subcc %o2, 1, %o2
|
| - ld [%o1], %f11 C read up[i]
|
| - fdtox %f16, %f14
|
| - ldx [%o5+16], %g2 C p16
|
| - fdtox %f4, %f12
|
| - ldx [%o5+24], %g1 C p0
|
| - std %f14, [%o5+16]
|
| - fmuld %f2, %f8, %f16
|
| - std %f12, [%o5+24]
|
| - fmuld %f2, %f6, %f4
|
| - add %o1, 4, %o1 C up++
|
| - lduw [%o0], %g5 C read rp[i]
|
| - bne,pt %icc, .Loop
|
| - fxtod %f10, %f2 C (delay slot) f2 = (double) up[i]
|
| - b,a .L5 C annulled branch, so no delay slot instruction is executed
|
| -
|
| -C BEGIN MAIN LOOP
|
| - .align 16
|
| -C -- 0
|
| -.Loop: sub %g0, %g3, %g3 C cy = -cy (g3 holds the high word of the previous difference)
|
| - subcc %o2, 1, %o2
|
| - ld [%o1], %f11 C read up[i]
|
| - fdtox %f16, %f14
|
| -C -- 1
|
| - sllx %g2, 16, %g4 C (p16 << 16)
|
| - add %o0, 4, %o0 C rp++
|
| - ldx [%o5+0], %g2 C p16
|
| - fdtox %f4, %f12
|
| -C -- 2
|
| - srl %g3, 0, %g3 C zero most significant 32 bits
|
| - add %g1, %g4, %g4 C p = p0 + (p16 << 16)
|
| - ldx [%o5+8], %g1 C p0
|
| - fanop
|
| -C -- 3
|
| - nop
|
| - add %g3, %g4, %g4 C p += cy
|
| - std %f14, [%o5+0]
|
| - fmuld %f2, %f8, %f16
|
| -C -- 4
|
| - nop
|
| - sub %g5, %g4, %g4 C p = rp[i] - p
|
| - std %f12, [%o5+8]
|
| - fmuld %f2, %f6, %f4
|
| -C -- 5
|
| - xor %o5, 16, %o5 C alternate scratch variables
|
| - add %o1, 4, %o1 C up++
|
| - stw %g4, [%o0-4] C store low 32 bits of p (rp was already advanced above)
|
| - fanop
|
| -C -- 6
|
| - srlx %g4, 32, %g3 C new cy
|
| - lduw [%o0], %g5 C read rp[i]
|
| - bne,pt %icc, .Loop
|
| - fxtod %f10, %f2 C (delay slot) f2 = (double) up[i]
|
| -C END MAIN LOOP
|
| -
|
| -.L5: sub %g0, %g3, %g3 C cy = -cy
|
| - fdtox %f16, %f14
|
| - sllx %g2, 16, %g4 C (p16 << 16)
|
| - ldx [%o5+0], %g2 C p16
|
| - fdtox %f4, %f12
|
| - srl %g3, 0, %g3 C zero most significant 32 bits
|
| - add %g1, %g4, %g4 C p = p0 + (p16 << 16)
|
| - ldx [%o5+8], %g1 C p0
|
| - add %g4, %g3, %g4 C p += cy
|
| - std %f14, [%o5+0]
|
| - fmuld %f2, %f8, %f16
|
| - sub %g5, %g4, %g4 C p = rp[i] - p
|
| - std %f12, [%o5+8]
|
| - fmuld %f2, %f6, %f4
|
| - xor %o5, 16, %o5 C alternate scratch variables
|
| - stw %g4, [%o0+0]
|
| - srlx %g4, 32, %g3 C new cy
|
| - lduw [%o0+4], %g5 C read rp[i]
|
| -
|
| - sub %g0, %g3, %g3 C cy = -cy
|
| -.L4: fdtox %f16, %f14
|
| - sllx %g2, 16, %g4 C (p16 << 16)
|
| - ldx [%o5+0], %g2 C p16
|
| - fdtox %f4, %f12
|
| - srl %g3, 0, %g3 C zero most significant 32 bits
|
| - add %g1, %g4, %g4 C p = p0 + (p16 << 16)
|
| - ldx [%o5+8], %g1 C p0
|
| - add %g3, %g4, %g4 C p += cy
|
| - std %f14, [%o5+0]
|
| - sub %g5, %g4, %g4 C p = rp[i] - p
|
| - std %f12, [%o5+8]
|
| - xor %o5, 16, %o5 C alternate scratch variables
|
| - stw %g4, [%o0+4]
|
| - srlx %g4, 32, %g3 C new cy
|
| - lduw [%o0+8], %g5 C read rp[i]
|
| -
|
| - sub %g0, %g3, %g3 C cy = -cy
|
| -.L3: sllx %g2, 16, %g4 C (p16 << 16)
|
| - ldx [%o5+0], %g2 C p16
|
| - srl %g3, 0, %g3 C zero most significant 32 bits
|
| - add %g1, %g4, %g4 C p = p0 + (p16 << 16)
|
| - ldx [%o5+8], %g1 C p0
|
| - add %g3, %g4, %g4 C p += cy
|
| - sub %g5, %g4, %g4 C p = rp[i] - p
|
| - xor %o5, 16, %o5 C alternate scratch variables
|
| - stw %g4, [%o0+8]
|
| - srlx %g4, 32, %g3 C new cy
|
| - lduw [%o0+12], %g5 C read rp[i]
|
| -
|
| - sub %g0, %g3, %g3 C cy = -cy
|
| -.L2: sllx %g2, 16, %g4 C (p16 << 16)
|
| - ldx [%o5+0], %g2 C p16
|
| - srl %g3, 0, %g3 C zero most significant 32 bits
|
| - add %g1, %g4, %g4 C p = p0 + (p16 << 16)
|
| - ldx [%o5+8], %g1 C p0
|
| - add %g3, %g4, %g4 C p += cy
|
| - sub %g5, %g4, %g4 C p = rp[i] - p
|
| - stw %g4, [%o0+12]
|
| - srlx %g4, 32, %g3 C new cy
|
| - lduw [%o0+16], %g5 C read rp[i]
|
| -
|
| - sub %g0, %g3, %g3 C cy = -cy
|
| -.L1: sllx %g2, 16, %g4 C (p16 << 16)
|
| - srl %g3, 0, %g3 C zero most significant 32 bits
|
| - add %g1, %g4, %g4 C p = p0 + (p16 << 16)
|
| - add %g3, %g4, %g4 C p += cy
|
| - sub %g5, %g4, %g4 C p = rp[i] - p
|
| - stw %g4, [%o0+16]
|
| - srlx %g4, 32, %g3 C new cy
|
| -
|
| - sub %g0, %g3, %o0 C return value = negated high word of the last difference
|
| - retl
|
| - sub %sp, -FSIZE, %sp C (delay slot) give back the FSIZE bytes reserved on entry
|
| -EPILOGUE(mpn_submul_1)
|
|
|