Index: gcc/gmp/mpn/powerpc32/vmx/copyd.asm
diff --git a/gcc/gmp/mpn/powerpc32/vmx/copyd.asm b/gcc/gmp/mpn/powerpc32/vmx/copyd.asm
deleted file mode 100644
index e345eef01feba3fbd62e802ad173dafcb51aa04a..0000000000000000000000000000000000000000
--- a/gcc/gmp/mpn/powerpc32/vmx/copyd.asm
+++ /dev/null
@@ -1,192 +0,0 @@
-dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.
-
-dnl Copyright 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU Lesser General Public License as published
-dnl by the Free Software Foundation; either version 3 of the License, or (at
-dnl your option) any later version.
-
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
-dnl License for more details.
-
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C                  16-byte coaligned      unaligned
-C                     cycles/limb        cycles/limb
-C 7400,7410 (G4):         0.5                0.64
-C 744x,745x (G4+):        0.75               0.82
-C 970 (G5):               0.78               1.02     (64-bit limbs)
-
-C STATUS
-C  * Works for all sizes and alignments.
-
-C TODO
-C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
-C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
-C    c/l for 970.
-C  * Consider using VMX instructions also for head and tail, by using some
-C    read-modify-write tricks.
-C  * The VMX code is used from the smallest sizes it handles, but measurements
-C    show a large speed bump at the cutoff points.  Small copying (perhaps
-C    using some read-modify-write technique) should be optimized.
-C  * Make an mpn_com_n based on this code.
-
-define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
-define(`LIMBS_PER_VR',   eval(16/GMP_LIMB_BYTES))
-define(`LIMBS_PER_2VR',  eval(32/GMP_LIMB_BYTES))
-
-
-ifelse(GMP_LIMB_BITS,32,`
-  define(`LIMB32',` $1')
-  define(`LIMB64',`')
-',`
-  define(`LIMB32',`')
-  define(`LIMB64',` $1')
-')
-
-C INPUT PARAMETERS
-define(`rp', `r3')
-define(`up', `r4')
-define(`n',  `r5')
-
-define(`us', `v4')
-
-
-ASM_START()
-PROLOGUE(mpn_copyd)
-
-LIMB32(`slwi.   r0, n, 2        ')
-LIMB64(`sldi.   r0, n, 3        ')
-        add     rp, rp, r0
-        add     up, up, r0
-
-LIMB32(`cmpi    cr7, n, 11      ')
-LIMB64(`cmpdi   cr7, n, 5       ')
-        bge     cr7, L(big)
-
-        beqlr   cr0
-
-C Handle small cases with plain operations
-        mtctr   n
-L(topS):
-LIMB32(`lwz     r0, -4(up)      ')
-LIMB64(`ld      r0, -8(up)      ')
-        addi    up, up, -GMP_LIMB_BYTES
-LIMB32(`stw     r0, -4(rp)      ')
-LIMB64(`std     r0, -8(rp)      ')
-        addi    rp, rp, -GMP_LIMB_BYTES
-        bdnz    L(topS)
-        blr
-
-C Handle large cases with VMX operations
-L(big):
-        addi    rp, rp, -16
-        addi    up, up, -16
-        mfspr   r12, 256
-        oris    r0, r12, 0xf800        C Set VRSAVE bits 0-4
-        mtspr   256, r0
-
-LIMB32(`rlwinm. r7, rp, 30,30,31')     C (rp >> 2) mod 4
-LIMB64(`rlwinm. r7, rp, 29,31,31')     C (rp >> 3) mod 2
-        beq     L(rp_aligned)
-
-        subf    n, r7, n
-L(top0):
-LIMB32(`lwz     r0, 12(up)      ')
-LIMB64(`ld      r0, 8(up)       ')
-        addi    up, up, -GMP_LIMB_BYTES
-LIMB32(`addic.  r7, r7, -1      ')
-LIMB32(`stw     r0, 12(rp)      ')
-LIMB64(`std     r0, 8(rp)       ')
-        addi    rp, rp, -GMP_LIMB_BYTES
-LIMB32(`bne     L(top0)         ')
-
-L(rp_aligned):
-
-LIMB32(`rlwinm. r0, up, 30,30,31')     C (up >> 2) mod 4
-LIMB64(`rlwinm. r0, up, 29,31,31')     C (up >> 3) mod 2
-
-LIMB64(`srdi    r7, n, 2        ')     C loop count (n / 4 limbs per iteration)
-LIMB32(`srwi    r7, n, 3        ')     C loop count (n / 8 limbs per iteration)
-        mtctr   r7                     C copy loop count to count register
-
-        li      r10, -16
-
-        beq     L(up_aligned)
-
-        lvsl    us, 0, up
-
-        addi    up, up, 16
-LIMB32(`andi.   r0, n, 0x4      ')
-LIMB64(`andi.   r0, n, 0x2      ')
-        beq     L(1)
-        lvx     v0, 0, up
-        lvx     v2, r10, up
-        vperm   v3, v2, v0, us
-        stvx    v3, 0, rp
-        addi    up, up, -32
-        addi    rp, rp, -16
-        b       L(lpu)
-L(1):   lvx     v2, 0, up
-        addi    up, up, -16
-        b       L(lpu)
-
-        ALIGN(32)
-L(lpu): lvx     v0, 0, up
-        vperm   v3, v0, v2, us
-        stvx    v3, 0, rp
-        lvx     v2, r10, up
-        addi    up, up, -32
-        vperm   v3, v2, v0, us
-        stvx    v3, r10, rp
-        addi    rp, rp, -32
-        bdnz    L(lpu)
-
-        b       L(tail)
-
-L(up_aligned):
-
-LIMB32(`andi.   r0, n, 0x4      ')
-LIMB64(`andi.   r0, n, 0x2      ')
-        beq     L(lpa)
-        lvx     v0, 0, up
-        stvx    v0, 0, rp
-        addi    up, up, -16
-        addi    rp, rp, -16
-        b       L(lpa)
-
-        ALIGN(32)
-L(lpa): lvx     v0, 0, up
-        lvx     v1, r10, up
-        addi    up, up, -32
-        nop
-        stvx    v0, 0, rp
-        stvx    v1, r10, rp
-        addi    rp, rp, -32
-        bdnz    L(lpa)
-
-L(tail):
-LIMB32(`rlwinm. r7, n, 0,30,31  ')     C r7 = n mod 4
-LIMB64(`rlwinm. r7, n, 0,31,31  ')     C r7 = n mod 2
-        beq     L(ret)
-LIMB32(`li      r10, 12         ')
-L(top2):
-LIMB32(`lwzx    r0, r10, up     ')
-LIMB64(`ld      r0, 8(up)       ')
-LIMB32(`addic.  r7, r7, -1      ')
-LIMB32(`stwx    r0, r10, rp     ')
-LIMB64(`std     r0, 8(rp)       ')
-LIMB32(`addi    r10, r10, -GMP_LIMB_BYTES')
-LIMB32(`bne     L(top2)         ')
-
-L(ret): mtspr   256, r12
-        blr
-EPILOGUE()
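
Note for reviewers: mpn_copyd is GMP's decreasing-order limb copy. Copying from the highest limb downward is what makes it safe for overlapping operands with rp >= up, and it is the contract both the scalar path (L(topS)) and the VMX paths above implement. A minimal C sketch of that contract, assuming word-sized limbs (the name ref_mpn_copyd is ours, for illustration; it is not GMP's generic implementation):

#include <stddef.h>

typedef unsigned long mp_limb_t;  /* assumption: one limb per machine word */
typedef long mp_size_t;

/* Reference behavior of mpn_copyd: copy n limbs from up to rp,
   highest limb first.  Every read of up[i] happens before any write
   to rp[j] with j <= i, so overlap with rp >= up is safe.  */
void
ref_mpn_copyd (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
{
  mp_size_t i;
  for (i = n - 1; i >= 0; i--)
    rp[i] = up[i];
}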
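The unaligned path (L(lpu)) uses the classic AltiVec software-alignment idiom: lvx ignores the low four address bits, so the source stream is read as aligned quadwords and realigned in registers with a vperm mask produced by lvsl. A sketch of the same idiom with C intrinsics, assuming big-endian AltiVec semantics and a 16-byte-aligned destination (the helper name and the ascending direction are ours; the deleted asm runs the pattern downward, with head/tail fixups that also keep all loads inside the buffer):

#include <altivec.h>
#include <stddef.h>

/* Copy nbytes (a multiple of 16) from an arbitrarily aligned src to a
   16-byte-aligned dst.  vec_ld loads from (addr & ~15), so each output
   quadword is built from the two aligned quadwords that straddle it,
   merged via the vec_lvsl-derived permute mask.  Caveat: unlike the
   deleted asm, this sketch may read one quadword past the source.  */
static void
copy_from_unaligned (unsigned char *dst, const unsigned char *src,
                     size_t nbytes)
{
  vector unsigned char perm = vec_lvsl (0, src);
  vector unsigned char prev = vec_ld (0, src);
  size_t i;

  for (i = 0; i < nbytes; i += 16)
    {
      vector unsigned char next = vec_ld (i + 16, src);
      vec_st (vec_perm (prev, next, perm), i, dst);
      prev = next;
    }
}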