| Index: gcc/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
|
| diff --git a/gcc/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm b/gcc/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
|
| deleted file mode 100644
|
| index 2628e5eb72dfe894ffc203d710c6be823b0a6e56..0000000000000000000000000000000000000000
|
| --- a/gcc/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
|
| +++ /dev/null
|
| @@ -1,651 +0,0 @@
|
| -dnl mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
|
| -
|
| -dnl Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
|
| -dnl
|
| -dnl This file is part of the GNU MP Library.
|
| -dnl
|
| -dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
| -dnl it under the terms of the GNU Lesser General Public License as published
|
| -dnl by the Free Software Foundation; either version 3 of the License, or (at
|
| -dnl your option) any later version.
|
| -dnl
|
| -dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
| -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
| -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
| -dnl License for more details.
|
| -dnl
|
| -dnl You should have received a copy of the GNU Lesser General Public License
|
| -dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
|
| -
|
| -include(`../config.m4')
|
| -
|
| -C TODO:
|
| -C * Improve ad-hoc outer loop code and register handling. Some feed-in
|
| -C scheduling could improve things by several cycles per outer iteration.
|
| -C * In code for un <= 3, try keeping accumulation operands in registers,
|
| -C without storing intermediates to rp.
|
| -C * We might want to keep 32 in a free mm register, since the register form is
|
| -C 3 bytes and the immediate form is 4 bytes. About 70 bytes to save.
|
| -C * Look into different loop alignment, we now expand the code about 50 bytes
|
| -C with possibly needless alignment.
|
| -C * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry.
|
| -C * Use OSP, should solve feed-in latency problems.
|
| -C * Save a few tens of bytes by doing cross-jumping for Loel0, etc.
|
| -C * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers
|
| -C so that they can share feed-in code, and changing the branch targets from
|
| -C L<n> to Lm<nn>.
|
| -
|
| -C cycles/limb
|
| -C P6 model 9 (Banias) ?
|
| -C P6 model 13 (Dothan) 5.24
|
| -C P6 model 14 (Yonah) ?
|
| -C P4 model 0-1 (Willamette): 5
|
| -C P4 model 2 (Northwood): 4.60 at 32 limbs
|
| -C P4 model 3-4 (Prescott): 4.94 at 32 limbs
|
| -
|
| -C INPUT PARAMETERS
|
| -C rp sp + 4
|
| -C up sp + 8
|
| -C un sp + 12
|
| -C vp sp + 16
|
| -C vn sp + 20
|
| -
|
| - TEXT
|
| - ALIGN(16)
|
| -PROLOGUE(mpn_mul_basecase)
|
| - push %esi
|
| - push %ebx
|
| - mov 12(%esp), %edx C rp
|
| - mov 16(%esp), %eax C up
|
| - mov 20(%esp), %ecx C un
|
| - mov 24(%esp), %esi C vp
|
| - mov 28(%esp), %ebx C vn
|
| - movd (%esi), %mm7 C
|
| -L(ent): cmp $3, %ecx
|
| - ja L(big)
|
| - movd (%eax), %mm6
|
| - pmuludq %mm7, %mm6
|
| - jz L(un3)
|
| - cmp $2, %ecx
|
| - jz L(un2)
|
| -
|
| -L(un1): movd %mm6, (%edx) C un=1
|
| - psrlq $32, %mm6 C un=1
|
| - movd %mm6, 4(%edx) C un=1
|
| - jmp L(rtr) C un=1
|
| -
|
| -L(un2): movd 4(%eax), %mm1 C un=2
|
| - pmuludq %mm7, %mm1 C un=2
|
| - movd %mm6, (%edx) C un=2
|
| - psrlq $32, %mm6 C un=2
|
| - paddq %mm1, %mm6 C un=2
|
| - movd %mm6, 4(%edx) C un=2
|
| - psrlq $32, %mm6 C un=2
|
| - movd %mm6, 8(%edx) C un=2
|
| - dec %ebx C un=2
|
| - jz L(rtr) C un=2
|
| - movd 4(%esi), %mm7 C un=2
|
| - movd (%eax), %mm6 C un=2
|
| - pmuludq %mm7, %mm6 C un=2
|
| - movd 4(%eax), %mm1 C un=2
|
| - movd 4(%edx), %mm4 C un=2
|
| - pmuludq %mm7, %mm1 C un=2
|
| - movd 8(%edx), %mm5 C un=2
|
| - paddq %mm4, %mm6 C un=2
|
| - paddq %mm1, %mm5 C un=2
|
| - movd %mm6, 4(%edx) C un=2
|
| - psrlq $32, %mm6 C un=2
|
| - paddq %mm5, %mm6 C un=2
|
| - movd %mm6, 8(%edx) C un=2
|
| - psrlq $32, %mm6 C un=2
|
| - movd %mm6, 12(%edx) C un=2
|
| -L(rtr): emms
|
| - pop %ebx
|
| - pop %esi
|
| - ret
|
| -
|
| -L(un3): movd 4(%eax), %mm1 C un=3
|
| - pmuludq %mm7, %mm1 C un=3
|
| - movd 8(%eax), %mm2 C un=3
|
| - pmuludq %mm7, %mm2 C un=3
|
| - movd %mm6, (%edx) C un=3
|
| - psrlq $32, %mm6 C un=3
|
| - paddq %mm1, %mm6 C un=3
|
| - movd %mm6, 4(%edx) C un=3
|
| - psrlq $32, %mm6 C un=3
|
| - paddq %mm2, %mm6 C un=3
|
| - movd %mm6, 8(%edx) C un=3
|
| - psrlq $32, %mm6 C un=3
|
| - movd %mm6, 12(%edx) C un=3
|
| - dec %ebx C un=3
|
| - jz L(rtr) C un=3
|
| - movd 4(%esi), %mm7 C un=3
|
| - movd (%eax), %mm6 C un=3
|
| - pmuludq %mm7, %mm6 C un=3
|
| - movd 4(%eax), %mm1 C un=3
|
| - movd 4(%edx), %mm4 C un=3
|
| - pmuludq %mm7, %mm1 C un=3
|
| - movd 8(%eax), %mm2 C un=3
|
| - movd 8(%edx), %mm5 C un=3
|
| - pmuludq %mm7, %mm2 C un=3
|
| - paddq %mm4, %mm6 C un=3
|
| - paddq %mm1, %mm5 C un=3
|
| - movd 12(%edx), %mm4 C un=3
|
| - movd %mm6, 4(%edx) C un=3
|
| - psrlq $32, %mm6 C un=3
|
| - paddq %mm5, %mm6 C un=3
|
| - paddq %mm2, %mm4 C un=3
|
| - movd %mm6, 8(%edx) C un=3
|
| - psrlq $32, %mm6 C un=3
|
| - paddq %mm4, %mm6 C un=3
|
| - movd %mm6, 12(%edx) C un=3
|
| - psrlq $32, %mm6 C un=3
|
| - movd %mm6, 16(%edx) C un=3
|
| - dec %ebx C un=3
|
| - jz L(rtr) C un=3
|
| - movd 8(%esi), %mm7 C un=3
|
| - movd (%eax), %mm6 C un=3
|
| - pmuludq %mm7, %mm6 C un=3
|
| - movd 4(%eax), %mm1 C un=3
|
| - movd 8(%edx), %mm4 C un=3
|
| - pmuludq %mm7, %mm1 C un=3
|
| - movd 8(%eax), %mm2 C un=3
|
| - movd 12(%edx), %mm5 C un=3
|
| - pmuludq %mm7, %mm2 C un=3
|
| - paddq %mm4, %mm6 C un=3
|
| - paddq %mm1, %mm5 C un=3
|
| - movd 16(%edx), %mm4 C un=3
|
| - movd %mm6, 8(%edx) C un=3
|
| - psrlq $32, %mm6 C un=3
|
| - paddq %mm5, %mm6 C un=3
|
| - paddq %mm2, %mm4 C un=3
|
| - movd %mm6, 12(%edx) C un=3
|
| - psrlq $32, %mm6 C un=3
|
| - paddq %mm4, %mm6 C un=3
|
| - movd %mm6, 16(%edx) C un=3
|
| - psrlq $32, %mm6 C un=3
|
| - movd %mm6, 20(%edx) C un=3
|
| - jmp L(rtr)
|
| -
|
| -
|
| -L(big): push %edi
|
| - pxor %mm6, %mm6
|
| - lea 4(%esi), %esi
|
| - and $3, %ecx
|
| - jz L(0)
|
| - cmp $2, %ecx
|
| - jc L(1)
|
| - jz L(2)
|
| - jmp L(3) C FIXME: one case should fall through
|
| -
|
| -
|
| -L(0): movd (%eax), %mm3 C m 0
|
| - sub 24(%esp), %ecx C inner loop count m 0
|
| - mov %ecx, 24(%esp) C update loop count for later m 0
|
| - pmuludq %mm7, %mm3 C m 0
|
| - movd 4(%eax), %mm0 C m 0
|
| - pmuludq %mm7, %mm0 C m 0
|
| - movd 8(%eax), %mm1 C m 0
|
| - jmp L(m00) C m 0
|
| - ALIGN(16) C m 0
|
| -L(lpm0):
|
| - pmuludq %mm7, %mm4 C m 0
|
| - paddq %mm0, %mm6 C m 0
|
| - movd (%eax), %mm3 C m 0
|
| - movd %mm6, -12(%edx) C m 0
|
| - psrlq $32, %mm6 C m 0
|
| - pmuludq %mm7, %mm3 C m 0
|
| - paddq %mm1, %mm6 C m 0
|
| - movd 4(%eax), %mm0 C m 0
|
| - movd %mm6, -8(%edx) C m 0
|
| - psrlq $32, %mm6 C m 0
|
| - pmuludq %mm7, %mm0 C m 0
|
| - paddq %mm4, %mm6 C m 0
|
| - movd 8(%eax), %mm1 C m 0
|
| - movd %mm6, -4(%edx) C m 0
|
| - psrlq $32, %mm6 C m 0
|
| -L(m00): pmuludq %mm7, %mm1 C m 0
|
| - paddq %mm3, %mm6 C m 0
|
| - movd 12(%eax), %mm4 C m 0
|
| - movd %mm6, (%edx) C m 0
|
| - psrlq $32, %mm6 C m 0
|
| - lea 16(%eax), %eax C m 0
|
| - lea 16(%edx), %edx C m 0
|
| - add $4, %ecx C m 0
|
| - ja L(lpm0) C m 0
|
| - pmuludq %mm7, %mm4 C m 0
|
| - paddq %mm0, %mm6 C m 0
|
| - movd %mm6, -12(%edx) C m 0
|
| - psrlq $32, %mm6 C m 0
|
| - paddq %mm1, %mm6 C m 0
|
| - mov 16(%esp), %edi C rp 0
|
| - jmp L(x0)
|
| -
|
| -L(olp0):
|
| - lea 4(%edi), %edi C am 0
|
| - movd (%esi), %mm7 C am 0
|
| - lea 4(%esi), %esi C am 0
|
| - mov %edi, %edx C rp am 0
|
| - mov 20(%esp), %eax C up am 0
|
| - movd (%eax), %mm3 C am 0
|
| - mov 24(%esp), %ecx C inner loop count am 0
|
| - pxor %mm6, %mm6 C am 0
|
| - pmuludq %mm7, %mm3 C am 0
|
| - movd 4(%eax), %mm0 C am 0
|
| - movd (%edx), %mm5 C am 0
|
| - pmuludq %mm7, %mm0 C am 0
|
| - movd 8(%eax), %mm1 C am 0
|
| - paddq %mm3, %mm5 C am 0
|
| - movd 4(%edx), %mm4 C am 0
|
| - jmp L(am00) C am 0
|
| - ALIGN(16) C mm 0
|
| -L(lam0):
|
| - pmuludq %mm7, %mm2 C am 0
|
| - paddq %mm4, %mm6 C am 0
|
| - movd (%eax), %mm3 C am 0
|
| - paddq %mm1, %mm5 C am 0
|
| - movd -4(%edx), %mm4 C am 0
|
| - movd %mm6, -12(%edx) C am 0
|
| - psrlq $32, %mm6 C am 0
|
| - pmuludq %mm7, %mm3 C am 0
|
| - paddq %mm5, %mm6 C am 0
|
| - movd 4(%eax), %mm0 C am 0
|
| - paddq %mm2, %mm4 C am 0
|
| - movd (%edx), %mm5 C am 0
|
| - movd %mm6, -8(%edx) C am 0
|
| - psrlq $32, %mm6 C am 0
|
| - pmuludq %mm7, %mm0 C am 0
|
| - paddq %mm4, %mm6 C am 0
|
| - movd 8(%eax), %mm1 C am 0
|
| - paddq %mm3, %mm5 C am 0
|
| - movd 4(%edx), %mm4 C am 0
|
| - movd %mm6, -4(%edx) C am 0
|
| - psrlq $32, %mm6 C am 0
|
| -L(am00):
|
| - pmuludq %mm7, %mm1 C am 0
|
| - paddq %mm5, %mm6 C am 0
|
| - movd 12(%eax), %mm2 C am 0
|
| - paddq %mm0, %mm4 C am 0
|
| - movd 8(%edx), %mm5 C am 0
|
| - movd %mm6, (%edx) C am 0
|
| - psrlq $32, %mm6 C am 0
|
| - lea 16(%eax), %eax C am 0
|
| - lea 16(%edx), %edx C am 0
|
| - add $4, %ecx C am 0
|
| - jnz L(lam0) C am 0
|
| - pmuludq %mm7, %mm2 C am 0
|
| - paddq %mm4, %mm6 C am 0
|
| - paddq %mm1, %mm5 C am 0
|
| - movd -4(%edx), %mm4 C am 0
|
| - movd %mm6, -12(%edx) C am 0
|
| - psrlq $32, %mm6 C am 0
|
| - paddq %mm5, %mm6 C am 0
|
| - paddq %mm2, %mm4 C am 0
|
| -L(x0): movd %mm6, -8(%edx) C am 0
|
| - psrlq $32, %mm6 C am 0
|
| - paddq %mm4, %mm6 C am 0
|
| - movd %mm6, -4(%edx) C am 0
|
| - psrlq $32, %mm6 C am 0
|
| - movd %mm6, (%edx) C am 0
|
| - dec %ebx C am 0
|
| - jnz L(olp0) C am 0
|
| -L(oel0):
|
| - emms C 0
|
| - pop %edi C 0
|
| - pop %ebx C 0
|
| - pop %esi C 0
|
| - ret C 0
|
| -
|
| -
|
| -L(1): movd (%eax), %mm4 C m 1
|
| - sub 24(%esp), %ecx C m 1
|
| - mov %ecx, 24(%esp) C update loop count for later m 1
|
| - pmuludq %mm7, %mm4 C m 1
|
| - movd 4(%eax), %mm3 C m 1
|
| - pmuludq %mm7, %mm3 C m 1
|
| - movd 8(%eax), %mm0 C m 1
|
| - jmp L(m01) C m 1
|
| - ALIGN(16) C m 1
|
| -L(lpm1):
|
| - pmuludq %mm7, %mm4 C m 1
|
| - paddq %mm0, %mm6 C m 1
|
| - movd 4(%eax), %mm3 C m 1
|
| - movd %mm6, -8(%edx) C m 1
|
| - psrlq $32, %mm6 C m 1
|
| - pmuludq %mm7, %mm3 C m 1
|
| - paddq %mm1, %mm6 C m 1
|
| - movd 8(%eax), %mm0 C m 1
|
| - movd %mm6, -4(%edx) C m 1
|
| - psrlq $32, %mm6 C m 1
|
| -L(m01): pmuludq %mm7, %mm0 C m 1
|
| - paddq %mm4, %mm6 C m 1
|
| - movd 12(%eax), %mm1 C m 1
|
| - movd %mm6, (%edx) C m 1
|
| - psrlq $32, %mm6 C m 1
|
| - pmuludq %mm7, %mm1 C m 1
|
| - paddq %mm3, %mm6 C m 1
|
| - movd 16(%eax), %mm4 C m 1
|
| - movd %mm6, 4(%edx) C m 1
|
| - psrlq $32, %mm6 C m 1
|
| - lea 16(%eax), %eax C m 1
|
| - lea 16(%edx), %edx C m 1
|
| - add $4, %ecx C m 1
|
| - ja L(lpm1) C m 1
|
| - pmuludq %mm7, %mm4 C m 1
|
| - paddq %mm0, %mm6 C m 1
|
| - movd %mm6, -8(%edx) C m 1
|
| - psrlq $32, %mm6 C m 1
|
| - paddq %mm1, %mm6 C m 1
|
| - mov 16(%esp), %edi C rp 1
|
| - jmp L(x1)
|
| -
|
| -L(olp1):
|
| - lea 4(%edi), %edi C am 1
|
| - movd (%esi), %mm7 C am 1
|
| - lea 4(%esi), %esi C am 1
|
| - mov %edi, %edx C rp am 1
|
| - mov 20(%esp), %eax C up am 1
|
| - movd (%eax), %mm2 C am 1
|
| - mov 24(%esp), %ecx C inner loop count am 1
|
| - pxor %mm6, %mm6 C am 1
|
| - pmuludq %mm7, %mm2 C am 1
|
| - movd 4(%eax), %mm3 C am 1
|
| - movd (%edx), %mm4 C am 1
|
| - pmuludq %mm7, %mm3 C am 1
|
| - movd 8(%eax), %mm0 C am 1
|
| - paddq %mm2, %mm4 C am 1
|
| - movd 4(%edx), %mm5 C am 1
|
| - jmp L(am01) C am 1
|
| - ALIGN(16) C am 1
|
| -L(lam1):
|
| - pmuludq %mm7, %mm2 C am 1
|
| - paddq %mm4, %mm6 C am 1
|
| - movd 4(%eax), %mm3 C am 1
|
| - paddq %mm1, %mm5 C am 1
|
| - movd (%edx), %mm4 C am 1
|
| - movd %mm6, -8(%edx) C am 1
|
| - psrlq $32, %mm6 C am 1
|
| - pmuludq %mm7, %mm3 C am 1
|
| - paddq %mm5, %mm6 C am 1
|
| - movd 8(%eax), %mm0 C am 1
|
| - paddq %mm2, %mm4 C am 1
|
| - movd 4(%edx), %mm5 C am 1
|
| - movd %mm6, -4(%edx) C am 1
|
| - psrlq $32, %mm6 C am 1
|
| -L(am01):
|
| - pmuludq %mm7, %mm0 C am 1
|
| - paddq %mm4, %mm6 C am 1
|
| - movd 12(%eax), %mm1 C am 1
|
| - paddq %mm3, %mm5 C am 1
|
| - movd 8(%edx), %mm4 C am 1
|
| - movd %mm6, (%edx) C am 1
|
| - psrlq $32, %mm6 C am 1
|
| - pmuludq %mm7, %mm1 C am 1
|
| - paddq %mm5, %mm6 C am 1
|
| - movd 16(%eax), %mm2 C am 1
|
| - paddq %mm0, %mm4 C am 1
|
| - movd 12(%edx), %mm5 C am 1
|
| - movd %mm6, 4(%edx) C am 1
|
| - psrlq $32, %mm6 C am 1
|
| - lea 16(%eax), %eax C am 1
|
| - lea 16(%edx), %edx C am 1
|
| - add $4, %ecx C am 1
|
| - jnz L(lam1) C am 1
|
| - pmuludq %mm7, %mm2 C am 1
|
| - paddq %mm4, %mm6 C am 1
|
| - paddq %mm1, %mm5 C am 1
|
| - movd (%edx), %mm4 C am 1
|
| - movd %mm6, -8(%edx) C am 1
|
| - psrlq $32, %mm6 C am 1
|
| - paddq %mm5, %mm6 C am 1
|
| - paddq %mm2, %mm4 C am 1
|
| -L(x1): movd %mm6, -4(%edx) C am 1
|
| - psrlq $32, %mm6 C am 1
|
| - paddq %mm4, %mm6 C am 1
|
| - movd %mm6, (%edx) C am 1
|
| - psrlq $32, %mm6 C am 1
|
| - movd %mm6, 4(%edx) C am 1
|
| - dec %ebx C am 1
|
| - jnz L(olp1) C am 1
|
| -L(oel1):
|
| - emms C 1
|
| - pop %edi C 1
|
| - pop %ebx C 1
|
| - pop %esi C 1
|
| - ret C 1
|
| -
|
| -
|
| -L(2): movd (%eax), %mm1 C m 2
|
| - sub 24(%esp), %ecx C m 2
|
| - mov %ecx, 24(%esp) C update loop count for later m 2
|
| - pmuludq %mm7, %mm1 C m 2
|
| - movd 4(%eax), %mm4 C m 2
|
| - pmuludq %mm7, %mm4 C m 2
|
| - movd 8(%eax), %mm3 C m 2
|
| - jmp L(m10) C m 2
|
| - ALIGN(16) C m 2
|
| -L(lpm2):
|
| - pmuludq %mm7, %mm4 C m 2
|
| - paddq %mm0, %mm6 C m 2
|
| - movd 8(%eax), %mm3 C m 2
|
| - movd %mm6, -4(%edx) C m 2
|
| - psrlq $32, %mm6 C m 2
|
| -L(m10): pmuludq %mm7, %mm3 C m 2
|
| - paddq %mm1, %mm6 C m 2
|
| - movd 12(%eax), %mm0 C m 2
|
| - movd %mm6, (%edx) C m 2
|
| - psrlq $32, %mm6 C m 2
|
| - pmuludq %mm7, %mm0 C m 2
|
| - paddq %mm4, %mm6 C m 2
|
| - movd 16(%eax), %mm1 C m 2
|
| - movd %mm6, 4(%edx) C m 2
|
| - psrlq $32, %mm6 C m 2
|
| - pmuludq %mm7, %mm1 C m 2
|
| - paddq %mm3, %mm6 C m 2
|
| - movd 20(%eax), %mm4 C m 2
|
| - movd %mm6, 8(%edx) C m 2
|
| - psrlq $32, %mm6 C m 2
|
| - lea 16(%eax), %eax C m 2
|
| - lea 16(%edx), %edx C m 2
|
| - add $4, %ecx C m 2
|
| - ja L(lpm2) C m 2
|
| - pmuludq %mm7, %mm4 C m 2
|
| - paddq %mm0, %mm6 C m 2
|
| - movd %mm6, -4(%edx) C m 2
|
| - psrlq $32, %mm6 C m 2
|
| - paddq %mm1, %mm6 C m 2
|
| - mov 16(%esp), %edi C rp 2
|
| - jmp L(x2)
|
| -
|
| -L(olp2):
|
| - lea 4(%edi), %edi C am 2
|
| - movd (%esi), %mm7 C am 2
|
| - lea 4(%esi), %esi C am 2
|
| - mov %edi, %edx C rp am 2
|
| - mov 20(%esp), %eax C up am 2
|
| - movd (%eax), %mm1 C am 2
|
| - mov 24(%esp), %ecx C inner loop count am 2
|
| - pxor %mm6, %mm6 C am 2
|
| - pmuludq %mm7, %mm1 C am 2
|
| - movd 4(%eax), %mm2 C am 2
|
| - movd (%edx), %mm5 C am 2
|
| - pmuludq %mm7, %mm2 C am 2
|
| - movd 8(%eax), %mm3 C am 2
|
| - paddq %mm1, %mm5 C am 2
|
| - movd 4(%edx), %mm4 C am 2
|
| - jmp L(am10) C am 2
|
| - ALIGN(16) C am 2
|
| -L(lam2):
|
| - pmuludq %mm7, %mm2 C am 2
|
| - paddq %mm4, %mm6 C am 2
|
| - movd 8(%eax), %mm3 C am 2
|
| - paddq %mm1, %mm5 C am 2
|
| - movd 4(%edx), %mm4 C am 2
|
| - movd %mm6, -4(%edx) C am 2
|
| - psrlq $32, %mm6 C am 2
|
| -L(am10):
|
| - pmuludq %mm7, %mm3 C am 2
|
| - paddq %mm5, %mm6 C am 2
|
| - movd 12(%eax), %mm0 C am 2
|
| - paddq %mm2, %mm4 C am 2
|
| - movd 8(%edx), %mm5 C am 2
|
| - movd %mm6, (%edx) C am 2
|
| - psrlq $32, %mm6 C am 2
|
| - pmuludq %mm7, %mm0 C am 2
|
| - paddq %mm4, %mm6 C am 2
|
| - movd 16(%eax), %mm1 C am 2
|
| - paddq %mm3, %mm5 C am 2
|
| - movd 12(%edx), %mm4 C am 2
|
| - movd %mm6, 4(%edx) C am 2
|
| - psrlq $32, %mm6 C am 2
|
| - pmuludq %mm7, %mm1 C am 2
|
| - paddq %mm5, %mm6 C am 2
|
| - movd 20(%eax), %mm2 C am 2
|
| - paddq %mm0, %mm4 C am 2
|
| - movd 16(%edx), %mm5 C am 2
|
| - movd %mm6, 8(%edx) C am 2
|
| - psrlq $32, %mm6 C am 2
|
| - lea 16(%eax), %eax C am 2
|
| - lea 16(%edx), %edx C am 2
|
| - add $4, %ecx C am 2
|
| - jnz L(lam2) C am 2
|
| - pmuludq %mm7, %mm2 C am 2
|
| - paddq %mm4, %mm6 C am 2
|
| - paddq %mm1, %mm5 C am 2
|
| - movd 4(%edx), %mm4 C am 2
|
| - movd %mm6, -4(%edx) C am 2
|
| - psrlq $32, %mm6 C am 2
|
| - paddq %mm5, %mm6 C am 2
|
| - paddq %mm2, %mm4 C am 2
|
| -L(x2): movd %mm6, (%edx) C am 2
|
| - psrlq $32, %mm6 C am 2
|
| - paddq %mm4, %mm6 C am 2
|
| - movd %mm6, 4(%edx) C am 2
|
| - psrlq $32, %mm6 C am 2
|
| - movd %mm6, 8(%edx) C am 2
|
| - dec %ebx C am 2
|
| - jnz L(olp2) C am 2
|
| -L(oel2):
|
| - emms C 2
|
| - pop %edi C 2
|
| - pop %ebx C 2
|
| - pop %esi C 2
|
| - ret C 2
|
| -
|
| -
|
| -L(3): movd (%eax), %mm0 C m 3
|
| - sub 24(%esp), %ecx C m 3
|
| - mov %ecx, 24(%esp) C update loop count for later m 3
|
| - pmuludq %mm7, %mm0 C m 3
|
| - movd 4(%eax), %mm1 C m 3
|
| - pmuludq %mm7, %mm1 C m 3
|
| - movd 8(%eax), %mm4 C m 3
|
| - jmp L(lpm3) C m 3
|
| - ALIGN(16) C m 3
|
| -L(lpm3):
|
| - pmuludq %mm7, %mm4 C m 3
|
| - paddq %mm0, %mm6 C m 3
|
| - movd 12(%eax), %mm3 C m 3
|
| - movd %mm6, (%edx) C m 3
|
| - psrlq $32, %mm6 C m 3
|
| - pmuludq %mm7, %mm3 C m 3
|
| - paddq %mm1, %mm6 C m 3
|
| - movd 16(%eax), %mm0 C m 3
|
| - movd %mm6, 4(%edx) C m 3
|
| - psrlq $32, %mm6 C m 3
|
| - pmuludq %mm7, %mm0 C m 3
|
| - paddq %mm4, %mm6 C m 3
|
| - movd 20(%eax), %mm1 C m 3
|
| - movd %mm6, 8(%edx) C m 3
|
| - psrlq $32, %mm6 C m 3
|
| - pmuludq %mm7, %mm1 C m 3
|
| - paddq %mm3, %mm6 C m 3
|
| - movd 24(%eax), %mm4 C m 3
|
| - movd %mm6, 12(%edx) C m 3
|
| - psrlq $32, %mm6 C m 3
|
| - lea 16(%eax), %eax C m 3
|
| - lea 16(%edx), %edx C m 3
|
| - add $4, %ecx C m 3
|
| - ja L(lpm3) C m 3
|
| - pmuludq %mm7, %mm4 C m 3
|
| - paddq %mm0, %mm6 C m 3
|
| - movd %mm6, (%edx) C m 3
|
| - psrlq $32, %mm6 C m 3
|
| - paddq %mm1, %mm6 C m 3
|
| - mov 16(%esp), %edi C rp 3
|
| - jmp L(x3)
|
| -
|
| -L(olp3):
|
| - lea 4(%edi), %edi C am 3
|
| - movd (%esi), %mm7 C am 3
|
| - lea 4(%esi), %esi C am 3
|
| - mov %edi, %edx C rp am 3
|
| - mov 20(%esp), %eax C up am 3
|
| - movd (%eax), %mm0 C am 3
|
| - mov 24(%esp), %ecx C inner loop count am 3
|
| - pxor %mm6, %mm6 C am 3
|
| - pmuludq %mm7, %mm0 C am 3
|
| - movd 4(%eax), %mm1 C am 3
|
| - movd (%edx), %mm4 C am 3
|
| - pmuludq %mm7, %mm1 C am 3
|
| - movd 8(%eax), %mm2 C am 3
|
| - paddq %mm0, %mm4 C am 3
|
| - movd 4(%edx), %mm5 C am 3
|
| - jmp L(lam3) C am 3
|
| - ALIGN(16) C am 3
|
| -L(lam3):
|
| - pmuludq %mm7, %mm2 C am 3
|
| - paddq %mm4, %mm6 C am 3
|
| - movd 12(%eax), %mm3 C am 3
|
| - paddq %mm1, %mm5 C am 3
|
| - movd 8(%edx), %mm4 C am 3
|
| - movd %mm6, (%edx) C am 3
|
| - psrlq $32, %mm6 C am 3
|
| - pmuludq %mm7, %mm3 C am 3
|
| - paddq %mm5, %mm6 C am 3
|
| - movd 16(%eax), %mm0 C am 3
|
| - paddq %mm2, %mm4 C am 3
|
| - movd 12(%edx), %mm5 C am 3
|
| - movd %mm6, 4(%edx) C am 3
|
| - psrlq $32, %mm6 C am 3
|
| - pmuludq %mm7, %mm0 C am 3
|
| - paddq %mm4, %mm6 C am 3
|
| - movd 20(%eax), %mm1 C am 3
|
| - paddq %mm3, %mm5 C am 3
|
| - movd 16(%edx), %mm4 C am 3
|
| - movd %mm6, 8(%edx) C am 3
|
| - psrlq $32, %mm6 C am 3
|
| - pmuludq %mm7, %mm1 C am 3
|
| - paddq %mm5, %mm6 C am 3
|
| - movd 24(%eax), %mm2 C am 3
|
| - paddq %mm0, %mm4 C am 3
|
| - movd 20(%edx), %mm5 C am 3
|
| - movd %mm6, 12(%edx) C am 3
|
| - psrlq $32, %mm6 C am 3
|
| - lea 16(%eax), %eax C am 3
|
| - lea 16(%edx), %edx C am 3
|
| - add $4, %ecx C am 3
|
| - jnz L(lam3) C am 3
|
| - pmuludq %mm7, %mm2 C am 3
|
| - paddq %mm4, %mm6 C am 3
|
| - paddq %mm1, %mm5 C am 3
|
| - movd 8(%edx), %mm4 C am 3
|
| - movd %mm6, (%edx) C am 3
|
| - psrlq $32, %mm6 C am 3
|
| - paddq %mm5, %mm6 C am 3
|
| - paddq %mm2, %mm4 C am 3
|
| -L(x3): movd %mm6, 4(%edx) C am 3
|
| - psrlq $32, %mm6 C am 3
|
| - paddq %mm4, %mm6 C am 3
|
| - movd %mm6, 8(%edx) C am 3
|
| - psrlq $32, %mm6 C am 3
|
| - movd %mm6, 12(%edx) C am 3
|
| - dec %ebx C am 3
|
| - jnz L(olp3) C am 3
|
| -L(oel3):
|
| - emms C 3
|
| - pop %edi C 3
|
| - pop %ebx C 3
|
| - pop %esi C 3
|
| - ret C 3
|
| -EPILOGUE()
|
|
|