Index: gcc/gmp/mpn/powerpc32/vmx/copyd.asm
diff --git a/gcc/gmp/mpn/powerpc32/vmx/copyd.asm b/gcc/gmp/mpn/powerpc32/vmx/copyd.asm
deleted file mode 100644
index e345eef01feba3fbd62e802ad173dafcb51aa04a..0000000000000000000000000000000000000000
--- a/gcc/gmp/mpn/powerpc32/vmx/copyd.asm
+++ /dev/null
@@ -1,192 +0,0 @@
-dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.
-
-dnl Copyright 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU Lesser General Public License as published
-dnl by the Free Software Foundation; either version 3 of the License, or (at
-dnl your option) any later version.
-
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
-dnl License for more details.
-
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C                  16-byte coaligned      unaligned
-C                     cycles/limb        cycles/limb
-C 7400,7410 (G4):         0.5                0.64
-C 744x,745x (G4+):        0.75               0.82
-C 970 (G5):               0.78               1.02     (64-bit limbs)
-
-C STATUS
-C  * Works for all sizes and alignments.
-
-C TODO
-C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
-C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
-C    c/l for 970.
-C  * Consider using VMX instructions also for head and tail, by using some
-C    read-modify-write tricks.
-C  * The VMX code is used from the smallest sizes it handles, but measurements
-C    show a large speed bump at the cutoff points.  Small copying (perhaps
-C    using some read-modify-write technique) should be optimized.
-C  * Make an mpn_com_n based on this code.
-
-define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
-define(`LIMBS_PER_VR',   eval(16/GMP_LIMB_BYTES))
-define(`LIMBS_PER_2VR',  eval(32/GMP_LIMB_BYTES))
-
-
-ifelse(GMP_LIMB_BITS,32,`
-  define(`LIMB32',` $1')
-  define(`LIMB64',`')
-',`
-  define(`LIMB32',`')
-  define(`LIMB64',` $1')
-')
-
-C INPUT PARAMETERS
-define(`rp', `r3')
-define(`up', `r4')
-define(`n',  `r5')
-
-define(`us', `v4')
-
-
-ASM_START()
-PROLOGUE(mpn_copyd)
-
-LIMB32(`slwi.   r0, n, 2        ')
-LIMB64(`sldi.   r0, n, 3        ')
-        add     rp, rp, r0
-        add     up, up, r0
-
-LIMB32(`cmpi    cr7, n, 11      ')
-LIMB64(`cmpdi   cr7, n, 5       ')
-        bge     cr7, L(big)
-
-        beqlr   cr0
-
-C Handle small cases with plain operations
-        mtctr   n
-L(topS):
-LIMB32(`lwz     r0, -4(up)      ')
-LIMB64(`ld      r0, -8(up)      ')
-        addi    up, up, -GMP_LIMB_BYTES
-LIMB32(`stw     r0, -4(rp)      ')
-LIMB64(`std     r0, -8(rp)      ')
-        addi    rp, rp, -GMP_LIMB_BYTES
-        bdnz    L(topS)
-        blr
-
-C Handle large cases with VMX operations
-L(big):
-        addi    rp, rp, -16
-        addi    up, up, -16
-        mfspr   r12, 256
-        oris    r0, r12, 0xf800        C Set VRSAVE bits 0-4
-        mtspr   256, r0
-
-LIMB32(`rlwinm. r7, rp, 30,30,31')     C (rp >> 2) mod 4
-LIMB64(`rlwinm. r7, rp, 29,31,31')     C (rp >> 3) mod 2
-        beq     L(rp_aligned)
-
-        subf    n, r7, n
-L(top0):
-LIMB32(`lwz     r0, 12(up)      ')
-LIMB64(`ld      r0, 8(up)       ')
-        addi    up, up, -GMP_LIMB_BYTES
-LIMB32(`addic.  r7, r7, -1      ')
-LIMB32(`stw     r0, 12(rp)      ')
-LIMB64(`std     r0, 8(rp)       ')
-        addi    rp, rp, -GMP_LIMB_BYTES
-LIMB32(`bne     L(top0)         ')
-
-L(rp_aligned):
-
-LIMB32(`rlwinm. r0, up, 30,30,31')     C (up >> 2) mod 4
-LIMB64(`rlwinm. r0, up, 29,31,31')     C (up >> 3) mod 2
-
-LIMB64(`srdi    r7, n, 2        ')     C loop count (n / 4 limbs per iteration)
-LIMB32(`srwi    r7, n, 3        ')     C loop count (n / 8 limbs per iteration)
-        mtctr   r7                     C copy loop count to count register
-
-        li      r10, -16
-
-        beq     L(up_aligned)
-
-        lvsl    us, 0, up
-
-        addi    up, up, 16
-LIMB32(`andi.   r0, n, 0x4      ')
-LIMB64(`andi.   r0, n, 0x2      ')
-        beq     L(1)
-        lvx     v0, 0, up
-        lvx     v2, r10, up
-        vperm   v3, v2, v0, us
-        stvx    v3, 0, rp
-        addi    up, up, -32
-        addi    rp, rp, -16
-        b       L(lpu)
-L(1):   lvx     v2, 0, up
-        addi    up, up, -16
-        b       L(lpu)
-
-        ALIGN(32)
-L(lpu): lvx     v0, 0, up
-        vperm   v3, v0, v2, us
-        stvx    v3, 0, rp
-        lvx     v2, r10, up
-        addi    up, up, -32
-        vperm   v3, v2, v0, us
-        stvx    v3, r10, rp
-        addi    rp, rp, -32
-        bdnz    L(lpu)
-
-        b       L(tail)
-
-L(up_aligned):
-
-LIMB32(`andi.   r0, n, 0x4      ')
-LIMB64(`andi.   r0, n, 0x2      ')
-        beq     L(lpa)
-        lvx     v0, 0, up
-        stvx    v0, 0, rp
-        addi    up, up, -16
-        addi    rp, rp, -16
-        b       L(lpa)
-
-        ALIGN(32)
-L(lpa): lvx     v0, 0, up
-        lvx     v1, r10, up
-        addi    up, up, -32
-        nop
-        stvx    v0, 0, rp
-        stvx    v1, r10, rp
-        addi    rp, rp, -32
-        bdnz    L(lpa)
-
-L(tail):
-LIMB32(`rlwinm. r7, n, 0,30,31  ')     C r7 = n mod 4
-LIMB64(`rlwinm. r7, n, 0,31,31  ')     C r7 = n mod 2
-        beq     L(ret)
-LIMB32(`li      r10, 12         ')
-L(top2):
-LIMB32(`lwzx    r0, r10, up     ')
-LIMB64(`ld      r0, 8(up)       ')
-LIMB32(`addic.  r7, r7, -1      ')
-LIMB32(`stwx    r0, r10, rp     ')
-LIMB64(`std     r0, 8(rp)       ')
-LIMB32(`addi    r10, r10, -GMP_LIMB_BYTES')
-LIMB32(`bne     L(top2)         ')
-
-L(ret): mtspr   256, r12
-        blr
-EPILOGUE()
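
Note for reviewers: mpn_copyd is GMP's decreasing-order limb copy. Copying from the highest limb downward is what makes it safe for overlapping operands with rp >= up, and it is the contract both the scalar path (L(topS)) and the VMX paths above implement. A minimal C sketch of that contract, assuming word-sized limbs (the name ref_mpn_copyd is ours, for illustration; it is not GMP's generic implementation):

#include <stddef.h>

typedef unsigned long mp_limb_t;  /* assumption: one limb per machine word */
typedef long mp_size_t;

/* Reference behavior of mpn_copyd: copy n limbs from up to rp,
   highest limb first.  Every read of up[i] happens before any write
   to rp[j] with j <= i, so overlap with rp >= up is safe.  */
void
ref_mpn_copyd (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
{
  mp_size_t i;
  for (i = n - 1; i >= 0; i--)
    rp[i] = up[i];
}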
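The unaligned path (L(lpu)) uses the classic AltiVec software-alignment idiom: lvx ignores the low four address bits, so the source stream is read as aligned quadwords and realigned in registers with a vperm mask produced by lvsl. A sketch of the same idiom with C intrinsics, assuming big-endian AltiVec semantics and a 16-byte-aligned destination (the helper name and the ascending direction are ours; the deleted asm runs the pattern downward, with head/tail fixups that also keep all loads inside the buffer):

#include <altivec.h>
#include <stddef.h>

/* Copy nbytes (a multiple of 16) from an arbitrarily aligned src to a
   16-byte-aligned dst.  vec_ld loads from (addr & ~15), so each output
   quadword is built from the two aligned quadwords that straddle it,
   merged via the vec_lvsl-derived permute mask.  Caveat: unlike the
   deleted asm, this sketch may read one quadword past the source.  */
static void
copy_from_unaligned (unsigned char *dst, const unsigned char *src,
                     size_t nbytes)
{
  vector unsigned char perm = vec_lvsl (0, src);
  vector unsigned char prev = vec_ld (0, src);
  size_t i;

  for (i = 0; i < nbytes; i += 16)
    {
      vector unsigned char next = vec_ld (i + 16, src);
      vec_st (vec_perm (prev, next, perm), i, dst);
      prev = next;
    }
}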