Index: gcc/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm |
diff --git a/gcc/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm b/gcc/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm |
deleted file mode 100644 |
index 2628e5eb72dfe894ffc203d710c6be823b0a6e56..0000000000000000000000000000000000000000 |
--- a/gcc/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm |
+++ /dev/null |
@@ -1,651 +0,0 @@ |
-dnl mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). |
- |
-dnl Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc. |
-dnl |
-dnl This file is part of the GNU MP Library. |
-dnl |
-dnl The GNU MP Library is free software; you can redistribute it and/or modify |
-dnl it under the terms of the GNU Lesser General Public License as published |
-dnl by the Free Software Foundation; either version 3 of the License, or (at |
-dnl your option) any later version. |
-dnl |
-dnl The GNU MP Library is distributed in the hope that it will be useful, but |
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
-dnl License for more details. |
-dnl |
-dnl You should have received a copy of the GNU Lesser General Public License |
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
- |
-include(`../config.m4') |
- |
-C TODO: |
-C * Improve ad-hoc outer loop code and register handling. Some feed-in |
-C scheduling could improve things by several cycles per outer iteration. |
-C * In code for un <= 3, try keeping accumulation operands in registers, |
-C without storing intermediates to rp. |
-C * We might want to keep 32 in a free mm register, since the register form is |
-C 3 bytes and the immediate form is 4 bytes. About 70 bytes to save. |
-C * Look into different loop alignment, we now expand the code about 50 bytes |
-C with possibly needless alignment. |
-C * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry. |
-C * Use OSP, should solve feed-in latency problems. |
-C * Save a few tens of bytes by doing cross-jumping for Loel0, etc. |
-C * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers |
-C so that they can share feed-in code, and changing the branch targets from |
-C L<n> to Lm<nn>. |
- |
-C cycles/limb |
-C P6 model 9 (Banias) ? |
-C P6 model 13 (Dothan) 5.24 |
-C P6 model 14 (Yonah) ? |
-C P4 model 0-1 (Willamette): 5 |
-C P4 model 2 (Northwood): 4.60 at 32 limbs |
-C P4 model 3-4 (Prescott): 4.94 at 32 limbs |
- |
-C INPUT PARAMETERS |
-C rp sp + 4 |
-C up sp + 8 |
-C un sp + 12 |
-C vp sp + 16 |
-C vn sp + 20 |
- |
-C mpn_mul_basecase: schoolbook multiply of the un-limb operand at up by the |
-C vn-limb operand at vp, storing the result at rp. Limbs are 32 bits wide |
-C (movd loads/stores, psrlq $32 to move the carry down). The first v limb |
-C produces a plain product row; each further v limb is multiplied in and |
-C added to the partial result one limb higher up. |
-C |
-C The arguments are read from the stack (see INPUT PARAMETERS above; the |
-C offsets used below are 8 larger because %esi and %ebx are pushed first). |
-C After the loads: %edx = rp, %eax = up, %ecx = un, %esi = vp, %ebx = vn, |
-C %mm7 = vp[0]. |
- TEXT |
- ALIGN(16) |
-PROLOGUE(mpn_mul_basecase) |
- push %esi |
- push %ebx |
- mov 12(%esp), %edx C rp |
- mov 16(%esp), %eax C up |
- mov 20(%esp), %ecx C un |
- mov 24(%esp), %esi C vp |
- mov 28(%esp), %ebx C vn |
- movd (%esi), %mm7 C mm7 = vp[0] |
-C un > 3 goes to the unrolled-loop code at L(big); un <= 3 is handled by the |
-C straight-line code below. |
-L(ent): cmp $3, %ecx |
- ja L(big) |
- movd (%eax), %mm6 |
- pmuludq %mm7, %mm6 |
-C The flags tested here are still those of the cmp $3 above (the movd and |
-C pmuludq in between do not write EFLAGS), so this jz means un == 3. |
- jz L(un3) |
- cmp $2, %ecx |
- jz L(un2) |
- |
-C un == 1: store the 64-bit product up[0] * vp[0] as two limbs and return. |
-C vn is not examined on this path. |
-L(un1): movd %mm6, (%edx) C un=1 |
- psrlq $32, %mm6 C un=1 |
- movd %mm6, 4(%edx) C un=1 |
- jmp L(rtr) C un=1 |
- |
-C un == 2, first row: rp[0..2] = {up[0],up[1]} * vp[0]. %mm6 already holds |
-C up[0] * vp[0] on entry and serves as the carry chain. |
-L(un2): movd 4(%eax), %mm1 C un=2 |
- pmuludq %mm7, %mm1 C un=2 |
- movd %mm6, (%edx) C un=2 |
- psrlq $32, %mm6 C un=2 |
- paddq %mm1, %mm6 C un=2 |
- movd %mm6, 4(%edx) C un=2 |
- psrlq $32, %mm6 C un=2 |
- movd %mm6, 8(%edx) C un=2 |
-C Return now if vn was 1. Otherwise do the second row: add |
-C {up[0],up[1]} * vp[1] into rp[1..2] and write the carry-out limb to rp[3]. |
-C No further v limbs are processed on this path. |
- dec %ebx C un=2 |
- jz L(rtr) C un=2 |
- movd 4(%esi), %mm7 C un=2 |
- movd (%eax), %mm6 C un=2 |
- pmuludq %mm7, %mm6 C un=2 |
- movd 4(%eax), %mm1 C un=2 |
- movd 4(%edx), %mm4 C un=2 |
- pmuludq %mm7, %mm1 C un=2 |
- movd 8(%edx), %mm5 C un=2 |
- paddq %mm4, %mm6 C un=2 |
- paddq %mm1, %mm5 C un=2 |
- movd %mm6, 4(%edx) C un=2 |
- psrlq $32, %mm6 C un=2 |
- paddq %mm5, %mm6 C un=2 |
- movd %mm6, 8(%edx) C un=2 |
- psrlq $32, %mm6 C un=2 |
- movd %mm6, 12(%edx) C un=2 |
-C Common exit for the un <= 3 paths: leave MMX state and restore the two |
-C registers pushed at entry. |
-L(rtr): emms |
- pop %ebx |
- pop %esi |
- ret |
- |
-C un == 3, first row: rp[0..3] = {up[0],up[1],up[2]} * vp[0]. %mm6 already |
-C holds up[0] * vp[0] on entry and serves as the carry chain. |
-L(un3): movd 4(%eax), %mm1 C un=3 |
- pmuludq %mm7, %mm1 C un=3 |
- movd 8(%eax), %mm2 C un=3 |
- pmuludq %mm7, %mm2 C un=3 |
- movd %mm6, (%edx) C un=3 |
- psrlq $32, %mm6 C un=3 |
- paddq %mm1, %mm6 C un=3 |
- movd %mm6, 4(%edx) C un=3 |
- psrlq $32, %mm6 C un=3 |
- paddq %mm2, %mm6 C un=3 |
- movd %mm6, 8(%edx) C un=3 |
- psrlq $32, %mm6 C un=3 |
- movd %mm6, 12(%edx) C un=3 |
-C Return now if vn was 1. Otherwise do the second row: add up[0..2] * vp[1] |
-C into rp[1..3] and write the carry-out limb to rp[4]. |
- dec %ebx C un=3 |
- jz L(rtr) C un=3 |
- movd 4(%esi), %mm7 C un=3 |
- movd (%eax), %mm6 C un=3 |
- pmuludq %mm7, %mm6 C un=3 |
- movd 4(%eax), %mm1 C un=3 |
- movd 4(%edx), %mm4 C un=3 |
- pmuludq %mm7, %mm1 C un=3 |
- movd 8(%eax), %mm2 C un=3 |
- movd 8(%edx), %mm5 C un=3 |
- pmuludq %mm7, %mm2 C un=3 |
- paddq %mm4, %mm6 C un=3 |
- paddq %mm1, %mm5 C un=3 |
- movd 12(%edx), %mm4 C un=3 |
- movd %mm6, 4(%edx) C un=3 |
- psrlq $32, %mm6 C un=3 |
- paddq %mm5, %mm6 C un=3 |
- paddq %mm2, %mm4 C un=3 |
- movd %mm6, 8(%edx) C un=3 |
- psrlq $32, %mm6 C un=3 |
- paddq %mm4, %mm6 C un=3 |
- movd %mm6, 12(%edx) C un=3 |
- psrlq $32, %mm6 C un=3 |
- movd %mm6, 16(%edx) C un=3 |
-C Return now if vn was 2. Otherwise do the third row: add up[0..2] * vp[2] |
-C into rp[2..4] and write the carry-out limb to rp[5]. No further v limbs |
-C are processed on this path. |
- dec %ebx C un=3 |
- jz L(rtr) C un=3 |
- movd 8(%esi), %mm7 C un=3 |
- movd (%eax), %mm6 C un=3 |
- pmuludq %mm7, %mm6 C un=3 |
- movd 4(%eax), %mm1 C un=3 |
- movd 8(%edx), %mm4 C un=3 |
- pmuludq %mm7, %mm1 C un=3 |
- movd 8(%eax), %mm2 C un=3 |
- movd 12(%edx), %mm5 C un=3 |
- pmuludq %mm7, %mm2 C un=3 |
- paddq %mm4, %mm6 C un=3 |
- paddq %mm1, %mm5 C un=3 |
- movd 16(%edx), %mm4 C un=3 |
- movd %mm6, 8(%edx) C un=3 |
- psrlq $32, %mm6 C un=3 |
- paddq %mm5, %mm6 C un=3 |
- paddq %mm2, %mm4 C un=3 |
- movd %mm6, 12(%edx) C un=3 |
- psrlq $32, %mm6 C un=3 |
- paddq %mm4, %mm6 C un=3 |
- movd %mm6, 16(%edx) C un=3 |
- psrlq $32, %mm6 C un=3 |
- movd %mm6, 20(%edx) C un=3 |
- jmp L(rtr) |
- |
- |
-C un > 3. %edi is pushed as a third saved register, so from here on the |
-C stack arguments sit 4 bytes further up: 16(%esp) = rp, 20(%esp) = up, |
-C 24(%esp) = un. %mm6 (the running carry) is cleared and %esi is stepped |
-C past vp[0], which is already in %mm7. %ecx is then reduced to un mod 4 |
-C and selects one of four unrolled variants: 0 -> L(0), 1 -> L(1) (below 2 |
-C and not zero), 2 -> L(2), 3 -> L(3). |
-L(big): push %edi |
- pxor %mm6, %mm6 |
- lea 4(%esi), %esi |
- and $3, %ecx |
- jz L(0) |
- cmp $2, %ecx |
- jc L(1) |
- jz L(2) |
- jmp L(3) C FIXME: one case should fall through |
- |
- |
-C un mod 4 == 0, first row: the products up[] * vp[0] are stored to rp |
-C without reading rp first. %ecx is 0 on entry, so the sub leaves |
-C %ecx = -un; that negative count is written back over the un stack slot |
-C for the later rows. The loop handles four limbs per iteration and adds 4 |
-C to %ecx until it reaches zero. It is entered at L(m00), part-way into the |
-C loop body, with the first products already started. |
-L(0): movd (%eax), %mm3 C m 0 |
- sub 24(%esp), %ecx C inner loop count m 0 |
- mov %ecx, 24(%esp) C update loop count for later m 0 |
- pmuludq %mm7, %mm3 C m 0 |
- movd 4(%eax), %mm0 C m 0 |
- pmuludq %mm7, %mm0 C m 0 |
- movd 8(%eax), %mm1 C m 0 |
- jmp L(m00) C m 0 |
- ALIGN(16) C m 0 |
-L(lpm0): |
- pmuludq %mm7, %mm4 C m 0 |
- paddq %mm0, %mm6 C m 0 |
- movd (%eax), %mm3 C m 0 |
- movd %mm6, -12(%edx) C m 0 |
- psrlq $32, %mm6 C m 0 |
- pmuludq %mm7, %mm3 C m 0 |
- paddq %mm1, %mm6 C m 0 |
- movd 4(%eax), %mm0 C m 0 |
- movd %mm6, -8(%edx) C m 0 |
- psrlq $32, %mm6 C m 0 |
- pmuludq %mm7, %mm0 C m 0 |
- paddq %mm4, %mm6 C m 0 |
- movd 8(%eax), %mm1 C m 0 |
- movd %mm6, -4(%edx) C m 0 |
- psrlq $32, %mm6 C m 0 |
-L(m00): pmuludq %mm7, %mm1 C m 0 |
- paddq %mm3, %mm6 C m 0 |
- movd 12(%eax), %mm4 C m 0 |
- movd %mm6, (%edx) C m 0 |
- psrlq $32, %mm6 C m 0 |
- lea 16(%eax), %eax C m 0 |
- lea 16(%edx), %edx C m 0 |
- add $4, %ecx C m 0 |
- ja L(lpm0) C m 0 |
-C Loop wind-down: finish the products still in flight, reload rp into %edi |
-C (row base used by the outer loop) and join the shared tail at L(x0). |
- pmuludq %mm7, %mm4 C m 0 |
- paddq %mm0, %mm6 C m 0 |
- movd %mm6, -12(%edx) C m 0 |
- psrlq $32, %mm6 C m 0 |
- paddq %mm1, %mm6 C m 0 |
- mov 16(%esp), %edi C rp 0 |
- jmp L(x0) |
- |
-C Outer loop for un mod 4 == 0: one pass per remaining v limb. Each pass |
-C moves the row base %edi up one limb, loads the next v limb into %mm7, |
-C reloads up and the negative loop count from the stack, clears the carry in |
-C %mm6, and then adds up[] * (that v limb) into the partial result (the |
-C lines tagged "am"). |
-L(olp0): |
- lea 4(%edi), %edi C am 0 |
- movd (%esi), %mm7 C am 0 |
- lea 4(%esi), %esi C am 0 |
- mov %edi, %edx C rp am 0 |
- mov 20(%esp), %eax C up am 0 |
- movd (%eax), %mm3 C am 0 |
- mov 24(%esp), %ecx C inner loop count am 0 |
- pxor %mm6, %mm6 C am 0 |
- pmuludq %mm7, %mm3 C am 0 |
- movd 4(%eax), %mm0 C am 0 |
- movd (%edx), %mm5 C am 0 |
- pmuludq %mm7, %mm0 C am 0 |
- movd 8(%eax), %mm1 C am 0 |
- paddq %mm3, %mm5 C am 0 |
- movd 4(%edx), %mm4 C am 0 |
- jmp L(am00) C am 0 |
- ALIGN(16) C am 0 |
-C Inner loop, four limbs per iteration, entered at L(am00): each product is |
-C added to the limb loaded from rp before the carry chain in %mm6 stores the |
-C sum back. |
-L(lam0): |
- pmuludq %mm7, %mm2 C am 0 |
- paddq %mm4, %mm6 C am 0 |
- movd (%eax), %mm3 C am 0 |
- paddq %mm1, %mm5 C am 0 |
- movd -4(%edx), %mm4 C am 0 |
- movd %mm6, -12(%edx) C am 0 |
- psrlq $32, %mm6 C am 0 |
- pmuludq %mm7, %mm3 C am 0 |
- paddq %mm5, %mm6 C am 0 |
- movd 4(%eax), %mm0 C am 0 |
- paddq %mm2, %mm4 C am 0 |
- movd (%edx), %mm5 C am 0 |
- movd %mm6, -8(%edx) C am 0 |
- psrlq $32, %mm6 C am 0 |
- pmuludq %mm7, %mm0 C am 0 |
- paddq %mm4, %mm6 C am 0 |
- movd 8(%eax), %mm1 C am 0 |
- paddq %mm3, %mm5 C am 0 |
- movd 4(%edx), %mm4 C am 0 |
- movd %mm6, -4(%edx) C am 0 |
- psrlq $32, %mm6 C am 0 |
-L(am00): |
- pmuludq %mm7, %mm1 C am 0 |
- paddq %mm5, %mm6 C am 0 |
- movd 12(%eax), %mm2 C am 0 |
- paddq %mm0, %mm4 C am 0 |
- movd 8(%edx), %mm5 C am 0 |
- movd %mm6, (%edx) C am 0 |
- psrlq $32, %mm6 C am 0 |
- lea 16(%eax), %eax C am 0 |
- lea 16(%edx), %edx C am 0 |
- add $4, %ecx C am 0 |
- jnz L(lam0) C am 0 |
- pmuludq %mm7, %mm2 C am 0 |
- paddq %mm4, %mm6 C am 0 |
- paddq %mm1, %mm5 C am 0 |
- movd -4(%edx), %mm4 C am 0 |
- movd %mm6, -12(%edx) C am 0 |
- psrlq $32, %mm6 C am 0 |
- paddq %mm5, %mm6 C am 0 |
- paddq %mm2, %mm4 C am 0 |
-C Shared tail (also reached from the first-row code via jmp L(x0)): store the |
-C last two result limbs and the final carry limb, then run another pass |
-C while the v limb count in %ebx has not reached zero. |
-L(x0): movd %mm6, -8(%edx) C am 0 |
- psrlq $32, %mm6 C am 0 |
- paddq %mm4, %mm6 C am 0 |
- movd %mm6, -4(%edx) C am 0 |
- psrlq $32, %mm6 C am 0 |
- movd %mm6, (%edx) C am 0 |
- dec %ebx C am 0 |
- jnz L(olp0) C am 0 |
-C Exit: leave MMX state and restore the three saved registers. |
-L(oel0): |
- emms C 0 |
- pop %edi C 0 |
- pop %ebx C 0 |
- pop %esi C 0 |
- ret C 0 |
- |
- |
-C un mod 4 == 1, first row: the products up[] * vp[0] are stored to rp |
-C without reading rp first. %ecx is 1 on entry, so the sub leaves |
-C %ecx = 1 - un; that negative count is written back over the un stack slot |
-C for the later rows. The loop handles four limbs per iteration and adds 4 |
-C to %ecx until it reaches zero. It is entered at L(m01), part-way into the |
-C loop body. |
-L(1): movd (%eax), %mm4 C m 1 |
- sub 24(%esp), %ecx C m 1 |
- mov %ecx, 24(%esp) C update loop count for later m 1 |
- pmuludq %mm7, %mm4 C m 1 |
- movd 4(%eax), %mm3 C m 1 |
- pmuludq %mm7, %mm3 C m 1 |
- movd 8(%eax), %mm0 C m 1 |
- jmp L(m01) C m 1 |
- ALIGN(16) C m 1 |
-L(lpm1): |
- pmuludq %mm7, %mm4 C m 1 |
- paddq %mm0, %mm6 C m 1 |
- movd 4(%eax), %mm3 C m 1 |
- movd %mm6, -8(%edx) C m 1 |
- psrlq $32, %mm6 C m 1 |
- pmuludq %mm7, %mm3 C m 1 |
- paddq %mm1, %mm6 C m 1 |
- movd 8(%eax), %mm0 C m 1 |
- movd %mm6, -4(%edx) C m 1 |
- psrlq $32, %mm6 C m 1 |
-L(m01): pmuludq %mm7, %mm0 C m 1 |
- paddq %mm4, %mm6 C m 1 |
- movd 12(%eax), %mm1 C m 1 |
- movd %mm6, (%edx) C m 1 |
- psrlq $32, %mm6 C m 1 |
- pmuludq %mm7, %mm1 C m 1 |
- paddq %mm3, %mm6 C m 1 |
- movd 16(%eax), %mm4 C m 1 |
- movd %mm6, 4(%edx) C m 1 |
- psrlq $32, %mm6 C m 1 |
- lea 16(%eax), %eax C m 1 |
- lea 16(%edx), %edx C m 1 |
- add $4, %ecx C m 1 |
- ja L(lpm1) C m 1 |
-C Loop wind-down: finish the products still in flight, reload rp into %edi |
-C (row base used by the outer loop) and join the shared tail at L(x1). |
- pmuludq %mm7, %mm4 C m 1 |
- paddq %mm0, %mm6 C m 1 |
- movd %mm6, -8(%edx) C m 1 |
- psrlq $32, %mm6 C m 1 |
- paddq %mm1, %mm6 C m 1 |
- mov 16(%esp), %edi C rp 1 |
- jmp L(x1) |
- |
-C Outer loop for un mod 4 == 1: one pass per remaining v limb. Each pass |
-C moves the row base %edi up one limb, loads the next v limb into %mm7, |
-C reloads up and the negative loop count from the stack, clears the carry in |
-C %mm6, and then adds up[] * (that v limb) into the partial result (the |
-C lines tagged "am"). |
-L(olp1): |
- lea 4(%edi), %edi C am 1 |
- movd (%esi), %mm7 C am 1 |
- lea 4(%esi), %esi C am 1 |
- mov %edi, %edx C rp am 1 |
- mov 20(%esp), %eax C up am 1 |
- movd (%eax), %mm2 C am 1 |
- mov 24(%esp), %ecx C inner loop count am 1 |
- pxor %mm6, %mm6 C am 1 |
- pmuludq %mm7, %mm2 C am 1 |
- movd 4(%eax), %mm3 C am 1 |
- movd (%edx), %mm4 C am 1 |
- pmuludq %mm7, %mm3 C am 1 |
- movd 8(%eax), %mm0 C am 1 |
- paddq %mm2, %mm4 C am 1 |
- movd 4(%edx), %mm5 C am 1 |
- jmp L(am01) C am 1 |
- ALIGN(16) C am 1 |
-C Inner loop, four limbs per iteration, entered at L(am01): each product is |
-C added to the limb loaded from rp before the carry chain in %mm6 stores the |
-C sum back. |
-L(lam1): |
- pmuludq %mm7, %mm2 C am 1 |
- paddq %mm4, %mm6 C am 1 |
- movd 4(%eax), %mm3 C am 1 |
- paddq %mm1, %mm5 C am 1 |
- movd (%edx), %mm4 C am 1 |
- movd %mm6, -8(%edx) C am 1 |
- psrlq $32, %mm6 C am 1 |
- pmuludq %mm7, %mm3 C am 1 |
- paddq %mm5, %mm6 C am 1 |
- movd 8(%eax), %mm0 C am 1 |
- paddq %mm2, %mm4 C am 1 |
- movd 4(%edx), %mm5 C am 1 |
- movd %mm6, -4(%edx) C am 1 |
- psrlq $32, %mm6 C am 1 |
-L(am01): |
- pmuludq %mm7, %mm0 C am 1 |
- paddq %mm4, %mm6 C am 1 |
- movd 12(%eax), %mm1 C am 1 |
- paddq %mm3, %mm5 C am 1 |
- movd 8(%edx), %mm4 C am 1 |
- movd %mm6, (%edx) C am 1 |
- psrlq $32, %mm6 C am 1 |
- pmuludq %mm7, %mm1 C am 1 |
- paddq %mm5, %mm6 C am 1 |
- movd 16(%eax), %mm2 C am 1 |
- paddq %mm0, %mm4 C am 1 |
- movd 12(%edx), %mm5 C am 1 |
- movd %mm6, 4(%edx) C am 1 |
- psrlq $32, %mm6 C am 1 |
- lea 16(%eax), %eax C am 1 |
- lea 16(%edx), %edx C am 1 |
- add $4, %ecx C am 1 |
- jnz L(lam1) C am 1 |
- pmuludq %mm7, %mm2 C am 1 |
- paddq %mm4, %mm6 C am 1 |
- paddq %mm1, %mm5 C am 1 |
- movd (%edx), %mm4 C am 1 |
- movd %mm6, -8(%edx) C am 1 |
- psrlq $32, %mm6 C am 1 |
- paddq %mm5, %mm6 C am 1 |
- paddq %mm2, %mm4 C am 1 |
-C Shared tail (also reached from the first-row code via jmp L(x1)): store the |
-C last two result limbs and the final carry limb, then run another pass |
-C while the v limb count in %ebx has not reached zero. |
-L(x1): movd %mm6, -4(%edx) C am 1 |
- psrlq $32, %mm6 C am 1 |
- paddq %mm4, %mm6 C am 1 |
- movd %mm6, (%edx) C am 1 |
- psrlq $32, %mm6 C am 1 |
- movd %mm6, 4(%edx) C am 1 |
- dec %ebx C am 1 |
- jnz L(olp1) C am 1 |
-C Exit: leave MMX state and restore the three saved registers. |
-L(oel1): |
- emms C 1 |
- pop %edi C 1 |
- pop %ebx C 1 |
- pop %esi C 1 |
- ret C 1 |
- |
- |
-C un mod 4 == 2, first row: the products up[] * vp[0] are stored to rp |
-C without reading rp first. %ecx is 2 on entry, so the sub leaves |
-C %ecx = 2 - un; that negative count is written back over the un stack slot |
-C for the later rows. The loop handles four limbs per iteration and adds 4 |
-C to %ecx until it reaches zero. It is entered at L(m10), part-way into the |
-C loop body. |
-L(2): movd (%eax), %mm1 C m 2 |
- sub 24(%esp), %ecx C m 2 |
- mov %ecx, 24(%esp) C update loop count for later m 2 |
- pmuludq %mm7, %mm1 C m 2 |
- movd 4(%eax), %mm4 C m 2 |
- pmuludq %mm7, %mm4 C m 2 |
- movd 8(%eax), %mm3 C m 2 |
- jmp L(m10) C m 2 |
- ALIGN(16) C m 2 |
-L(lpm2): |
- pmuludq %mm7, %mm4 C m 2 |
- paddq %mm0, %mm6 C m 2 |
- movd 8(%eax), %mm3 C m 2 |
- movd %mm6, -4(%edx) C m 2 |
- psrlq $32, %mm6 C m 2 |
-L(m10): pmuludq %mm7, %mm3 C m 2 |
- paddq %mm1, %mm6 C m 2 |
- movd 12(%eax), %mm0 C m 2 |
- movd %mm6, (%edx) C m 2 |
- psrlq $32, %mm6 C m 2 |
- pmuludq %mm7, %mm0 C m 2 |
- paddq %mm4, %mm6 C m 2 |
- movd 16(%eax), %mm1 C m 2 |
- movd %mm6, 4(%edx) C m 2 |
- psrlq $32, %mm6 C m 2 |
- pmuludq %mm7, %mm1 C m 2 |
- paddq %mm3, %mm6 C m 2 |
- movd 20(%eax), %mm4 C m 2 |
- movd %mm6, 8(%edx) C m 2 |
- psrlq $32, %mm6 C m 2 |
- lea 16(%eax), %eax C m 2 |
- lea 16(%edx), %edx C m 2 |
- add $4, %ecx C m 2 |
- ja L(lpm2) C m 2 |
-C Loop wind-down: finish the products still in flight, reload rp into %edi |
-C (row base used by the outer loop) and join the shared tail at L(x2). |
- pmuludq %mm7, %mm4 C m 2 |
- paddq %mm0, %mm6 C m 2 |
- movd %mm6, -4(%edx) C m 2 |
- psrlq $32, %mm6 C m 2 |
- paddq %mm1, %mm6 C m 2 |
- mov 16(%esp), %edi C rp 2 |
- jmp L(x2) |
- |
-C Outer loop for un mod 4 == 2: one pass per remaining v limb. Each pass |
-C moves the row base %edi up one limb, loads the next v limb into %mm7, |
-C reloads up and the negative loop count from the stack, clears the carry in |
-C %mm6, and then adds up[] * (that v limb) into the partial result (the |
-C lines tagged "am"). |
-L(olp2): |
- lea 4(%edi), %edi C am 2 |
- movd (%esi), %mm7 C am 2 |
- lea 4(%esi), %esi C am 2 |
- mov %edi, %edx C rp am 2 |
- mov 20(%esp), %eax C up am 2 |
- movd (%eax), %mm1 C am 2 |
- mov 24(%esp), %ecx C inner loop count am 2 |
- pxor %mm6, %mm6 C am 2 |
- pmuludq %mm7, %mm1 C am 2 |
- movd 4(%eax), %mm2 C am 2 |
- movd (%edx), %mm5 C am 2 |
- pmuludq %mm7, %mm2 C am 2 |
- movd 8(%eax), %mm3 C am 2 |
- paddq %mm1, %mm5 C am 2 |
- movd 4(%edx), %mm4 C am 2 |
- jmp L(am10) C am 2 |
- ALIGN(16) C am 2 |
-C Inner loop, four limbs per iteration, entered at L(am10): each product is |
-C added to the limb loaded from rp before the carry chain in %mm6 stores the |
-C sum back. |
-L(lam2): |
- pmuludq %mm7, %mm2 C am 2 |
- paddq %mm4, %mm6 C am 2 |
- movd 8(%eax), %mm3 C am 2 |
- paddq %mm1, %mm5 C am 2 |
- movd 4(%edx), %mm4 C am 2 |
- movd %mm6, -4(%edx) C am 2 |
- psrlq $32, %mm6 C am 2 |
-L(am10): |
- pmuludq %mm7, %mm3 C am 2 |
- paddq %mm5, %mm6 C am 2 |
- movd 12(%eax), %mm0 C am 2 |
- paddq %mm2, %mm4 C am 2 |
- movd 8(%edx), %mm5 C am 2 |
- movd %mm6, (%edx) C am 2 |
- psrlq $32, %mm6 C am 2 |
- pmuludq %mm7, %mm0 C am 2 |
- paddq %mm4, %mm6 C am 2 |
- movd 16(%eax), %mm1 C am 2 |
- paddq %mm3, %mm5 C am 2 |
- movd 12(%edx), %mm4 C am 2 |
- movd %mm6, 4(%edx) C am 2 |
- psrlq $32, %mm6 C am 2 |
- pmuludq %mm7, %mm1 C am 2 |
- paddq %mm5, %mm6 C am 2 |
- movd 20(%eax), %mm2 C am 2 |
- paddq %mm0, %mm4 C am 2 |
- movd 16(%edx), %mm5 C am 2 |
- movd %mm6, 8(%edx) C am 2 |
- psrlq $32, %mm6 C am 2 |
- lea 16(%eax), %eax C am 2 |
- lea 16(%edx), %edx C am 2 |
- add $4, %ecx C am 2 |
- jnz L(lam2) C am 2 |
- pmuludq %mm7, %mm2 C am 2 |
- paddq %mm4, %mm6 C am 2 |
- paddq %mm1, %mm5 C am 2 |
- movd 4(%edx), %mm4 C am 2 |
- movd %mm6, -4(%edx) C am 2 |
- psrlq $32, %mm6 C am 2 |
- paddq %mm5, %mm6 C am 2 |
- paddq %mm2, %mm4 C am 2 |
-C Shared tail (also reached from the first-row code via jmp L(x2)): store the |
-C last two result limbs and the final carry limb, then run another pass |
-C while the v limb count in %ebx has not reached zero. |
-L(x2): movd %mm6, (%edx) C am 2 |
- psrlq $32, %mm6 C am 2 |
- paddq %mm4, %mm6 C am 2 |
- movd %mm6, 4(%edx) C am 2 |
- psrlq $32, %mm6 C am 2 |
- movd %mm6, 8(%edx) C am 2 |
- dec %ebx C am 2 |
- jnz L(olp2) C am 2 |
-C Exit: leave MMX state and restore the three saved registers. |
-L(oel2): |
- emms C 2 |
- pop %edi C 2 |
- pop %ebx C 2 |
- pop %esi C 2 |
- ret C 2 |
- |
- |
-C un mod 4 == 3, first row: the products up[] * vp[0] are stored to rp |
-C without reading rp first. %ecx is 3 on entry, so the sub leaves |
-C %ecx = 3 - un; that negative count is written back over the un stack slot |
-C for the later rows. The loop handles four limbs per iteration and adds 4 |
-C to %ecx until it reaches zero. Unlike the other three variants, this one |
-C enters the loop at its top: the jmp only steps over the ALIGN directive |
-C that precedes L(lpm3). |
-L(3): movd (%eax), %mm0 C m 3 |
- sub 24(%esp), %ecx C m 3 |
- mov %ecx, 24(%esp) C update loop count for later m 3 |
- pmuludq %mm7, %mm0 C m 3 |
- movd 4(%eax), %mm1 C m 3 |
- pmuludq %mm7, %mm1 C m 3 |
- movd 8(%eax), %mm4 C m 3 |
- jmp L(lpm3) C m 3 |
- ALIGN(16) C m 3 |
-L(lpm3): |
- pmuludq %mm7, %mm4 C m 3 |
- paddq %mm0, %mm6 C m 3 |
- movd 12(%eax), %mm3 C m 3 |
- movd %mm6, (%edx) C m 3 |
- psrlq $32, %mm6 C m 3 |
- pmuludq %mm7, %mm3 C m 3 |
- paddq %mm1, %mm6 C m 3 |
- movd 16(%eax), %mm0 C m 3 |
- movd %mm6, 4(%edx) C m 3 |
- psrlq $32, %mm6 C m 3 |
- pmuludq %mm7, %mm0 C m 3 |
- paddq %mm4, %mm6 C m 3 |
- movd 20(%eax), %mm1 C m 3 |
- movd %mm6, 8(%edx) C m 3 |
- psrlq $32, %mm6 C m 3 |
- pmuludq %mm7, %mm1 C m 3 |
- paddq %mm3, %mm6 C m 3 |
- movd 24(%eax), %mm4 C m 3 |
- movd %mm6, 12(%edx) C m 3 |
- psrlq $32, %mm6 C m 3 |
- lea 16(%eax), %eax C m 3 |
- lea 16(%edx), %edx C m 3 |
- add $4, %ecx C m 3 |
- ja L(lpm3) C m 3 |
-C Loop wind-down: finish the products still in flight, reload rp into %edi |
-C (row base used by the outer loop) and join the shared tail at L(x3). |
- pmuludq %mm7, %mm4 C m 3 |
- paddq %mm0, %mm6 C m 3 |
- movd %mm6, (%edx) C m 3 |
- psrlq $32, %mm6 C m 3 |
- paddq %mm1, %mm6 C m 3 |
- mov 16(%esp), %edi C rp 3 |
- jmp L(x3) |
- |
-C Outer loop for un mod 4 == 3: one pass per remaining v limb. Each pass |
-C moves the row base %edi up one limb, loads the next v limb into %mm7, |
-C reloads up and the negative loop count from the stack, clears the carry in |
-C %mm6, and then adds up[] * (that v limb) into the partial result (the |
-C lines tagged "am"). |
-L(olp3): |
- lea 4(%edi), %edi C am 3 |
- movd (%esi), %mm7 C am 3 |
- lea 4(%esi), %esi C am 3 |
- mov %edi, %edx C rp am 3 |
- mov 20(%esp), %eax C up am 3 |
- movd (%eax), %mm0 C am 3 |
- mov 24(%esp), %ecx C inner loop count am 3 |
- pxor %mm6, %mm6 C am 3 |
- pmuludq %mm7, %mm0 C am 3 |
- movd 4(%eax), %mm1 C am 3 |
- movd (%edx), %mm4 C am 3 |
- pmuludq %mm7, %mm1 C am 3 |
- movd 8(%eax), %mm2 C am 3 |
- paddq %mm0, %mm4 C am 3 |
- movd 4(%edx), %mm5 C am 3 |
- jmp L(lam3) C am 3 |
- ALIGN(16) C am 3 |
-C Inner loop, four limbs per iteration, entered at its top (the jmp above |
-C only steps over the ALIGN directive): each product is added to the limb |
-C loaded from rp before the carry chain in %mm6 stores the sum back. |
-L(lam3): |
- pmuludq %mm7, %mm2 C am 3 |
- paddq %mm4, %mm6 C am 3 |
- movd 12(%eax), %mm3 C am 3 |
- paddq %mm1, %mm5 C am 3 |
- movd 8(%edx), %mm4 C am 3 |
- movd %mm6, (%edx) C am 3 |
- psrlq $32, %mm6 C am 3 |
- pmuludq %mm7, %mm3 C am 3 |
- paddq %mm5, %mm6 C am 3 |
- movd 16(%eax), %mm0 C am 3 |
- paddq %mm2, %mm4 C am 3 |
- movd 12(%edx), %mm5 C am 3 |
- movd %mm6, 4(%edx) C am 3 |
- psrlq $32, %mm6 C am 3 |
- pmuludq %mm7, %mm0 C am 3 |
- paddq %mm4, %mm6 C am 3 |
- movd 20(%eax), %mm1 C am 3 |
- paddq %mm3, %mm5 C am 3 |
- movd 16(%edx), %mm4 C am 3 |
- movd %mm6, 8(%edx) C am 3 |
- psrlq $32, %mm6 C am 3 |
- pmuludq %mm7, %mm1 C am 3 |
- paddq %mm5, %mm6 C am 3 |
- movd 24(%eax), %mm2 C am 3 |
- paddq %mm0, %mm4 C am 3 |
- movd 20(%edx), %mm5 C am 3 |
- movd %mm6, 12(%edx) C am 3 |
- psrlq $32, %mm6 C am 3 |
- lea 16(%eax), %eax C am 3 |
- lea 16(%edx), %edx C am 3 |
- add $4, %ecx C am 3 |
- jnz L(lam3) C am 3 |
- pmuludq %mm7, %mm2 C am 3 |
- paddq %mm4, %mm6 C am 3 |
- paddq %mm1, %mm5 C am 3 |
- movd 8(%edx), %mm4 C am 3 |
- movd %mm6, (%edx) C am 3 |
- psrlq $32, %mm6 C am 3 |
- paddq %mm5, %mm6 C am 3 |
- paddq %mm2, %mm4 C am 3 |
-C Shared tail (also reached from the first-row code via jmp L(x3)): store the |
-C last two result limbs and the final carry limb, then run another pass |
-C while the v limb count in %ebx has not reached zero. |
-L(x3): movd %mm6, 4(%edx) C am 3 |
- psrlq $32, %mm6 C am 3 |
- paddq %mm4, %mm6 C am 3 |
- movd %mm6, 8(%edx) C am 3 |
- psrlq $32, %mm6 C am 3 |
- movd %mm6, 12(%edx) C am 3 |
- dec %ebx C am 3 |
- jnz L(olp3) C am 3 |
-C Exit: leave MMX state and restore the three saved registers. |
-L(oel3): |
- emms C 3 |
- pop %edi C 3 |
- pop %ebx C 3 |
- pop %esi C 3 |
- ret C 3 |
-EPILOGUE() |