Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(6)

Unified Diff: gcc/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm

Issue 3050029: [gcc] GCC 4.5.0=>4.5.1 (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/nacl-toolchain.git
Patch Set: Created 10 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « gcc/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm ('k') | gcc/gmp/mpn/x86/pentium4/sse2/submul_1.asm » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: gcc/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
diff --git a/gcc/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm b/gcc/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
deleted file mode 100644
index 2628e5eb72dfe894ffc203d710c6be823b0a6e56..0000000000000000000000000000000000000000
--- a/gcc/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
+++ /dev/null
@@ -1,651 +0,0 @@
-dnl mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
-
-dnl Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU Lesser General Public License as published
-dnl by the Free Software Foundation; either version 3 of the License, or (at
-dnl your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-dnl License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C TODO:
-C * Improve ad-hoc outer loop code and register handling. Some feed-in
-C scheduling could improve things by several cycles per outer iteration.
-C * In code for un <= 3, try keeping accumulation operands in registers,
-C without storing intermediates to rp.
-C * We might want to keep 32 in a free mm register, since the register form is
-C 3 bytes and the immediate form is 4 bytes. About 70 bytes to save.
-C * Look into different loop alignment, we now expand the code about 50 bytes
-C with possibly needless alignment.
-C * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry.
-C * Use OSP, should solve feed-in latency problems.
-C * Save a few tens of bytes by doing cross-jumping for Loel0, etc.
-C * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers
-C so that they can share feed-in code, and changing the branch targets from
-C L<n> to Lm<nn>.
-
-C cycles/limb
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) 5.24
-C P6 model 14 (Yonah) ?
-C P4 model 0-1 (Willamette): 5
-C P4 model 2 (Northwood): 4.60 at 32 limbs
-C P4 model 3-4 (Prescott): 4.94 at 32 limbs
-
-C INPUT PARAMETERS
-C rp sp + 4
-C up sp + 8
-C un sp + 12
-C vp sp + 16
-C vn sp + 20
-
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mul_basecase)
- push %esi
- push %ebx
- mov 12(%esp), %edx C rp
- mov 16(%esp), %eax C up
- mov 20(%esp), %ecx C un
- mov 24(%esp), %esi C vp
- mov 28(%esp), %ebx C vn
- movd (%esi), %mm7 C
-L(ent): cmp $3, %ecx
- ja L(big)
- movd (%eax), %mm6
- pmuludq %mm7, %mm6
- jz L(un3)
- cmp $2, %ecx
- jz L(un2)
-
-L(un1): movd %mm6, (%edx) C un=1
- psrlq $32, %mm6 C un=1
- movd %mm6, 4(%edx) C un=1
- jmp L(rtr) C un=1
-
-L(un2): movd 4(%eax), %mm1 C un=2
- pmuludq %mm7, %mm1 C un=2
- movd %mm6, (%edx) C un=2
- psrlq $32, %mm6 C un=2
- paddq %mm1, %mm6 C un=2
- movd %mm6, 4(%edx) C un=2
- psrlq $32, %mm6 C un=2
- movd %mm6, 8(%edx) C un=2
- dec %ebx C un=2
- jz L(rtr) C un=2
- movd 4(%esi), %mm7 C un=2
- movd (%eax), %mm6 C un=2
- pmuludq %mm7, %mm6 C un=2
- movd 4(%eax), %mm1 C un=2
- movd 4(%edx), %mm4 C un=2
- pmuludq %mm7, %mm1 C un=2
- movd 8(%edx), %mm5 C un=2
- paddq %mm4, %mm6 C un=2
- paddq %mm1, %mm5 C un=2
- movd %mm6, 4(%edx) C un=2
- psrlq $32, %mm6 C un=2
- paddq %mm5, %mm6 C un=2
- movd %mm6, 8(%edx) C un=2
- psrlq $32, %mm6 C un=2
- movd %mm6, 12(%edx) C un=2
-L(rtr): emms
- pop %ebx
- pop %esi
- ret
-
-L(un3): movd 4(%eax), %mm1 C un=3
- pmuludq %mm7, %mm1 C un=3
- movd 8(%eax), %mm2 C un=3
- pmuludq %mm7, %mm2 C un=3
- movd %mm6, (%edx) C un=3
- psrlq $32, %mm6 C un=3
- paddq %mm1, %mm6 C un=3
- movd %mm6, 4(%edx) C un=3
- psrlq $32, %mm6 C un=3
- paddq %mm2, %mm6 C un=3
- movd %mm6, 8(%edx) C un=3
- psrlq $32, %mm6 C un=3
- movd %mm6, 12(%edx) C un=3
- dec %ebx C un=3
- jz L(rtr) C un=3
- movd 4(%esi), %mm7 C un=3
- movd (%eax), %mm6 C un=3
- pmuludq %mm7, %mm6 C un=3
- movd 4(%eax), %mm1 C un=3
- movd 4(%edx), %mm4 C un=3
- pmuludq %mm7, %mm1 C un=3
- movd 8(%eax), %mm2 C un=3
- movd 8(%edx), %mm5 C un=3
- pmuludq %mm7, %mm2 C un=3
- paddq %mm4, %mm6 C un=3
- paddq %mm1, %mm5 C un=3
- movd 12(%edx), %mm4 C un=3
- movd %mm6, 4(%edx) C un=3
- psrlq $32, %mm6 C un=3
- paddq %mm5, %mm6 C un=3
- paddq %mm2, %mm4 C un=3
- movd %mm6, 8(%edx) C un=3
- psrlq $32, %mm6 C un=3
- paddq %mm4, %mm6 C un=3
- movd %mm6, 12(%edx) C un=3
- psrlq $32, %mm6 C un=3
- movd %mm6, 16(%edx) C un=3
- dec %ebx C un=3
- jz L(rtr) C un=3
- movd 8(%esi), %mm7 C un=3
- movd (%eax), %mm6 C un=3
- pmuludq %mm7, %mm6 C un=3
- movd 4(%eax), %mm1 C un=3
- movd 8(%edx), %mm4 C un=3
- pmuludq %mm7, %mm1 C un=3
- movd 8(%eax), %mm2 C un=3
- movd 12(%edx), %mm5 C un=3
- pmuludq %mm7, %mm2 C un=3
- paddq %mm4, %mm6 C un=3
- paddq %mm1, %mm5 C un=3
- movd 16(%edx), %mm4 C un=3
- movd %mm6, 8(%edx) C un=3
- psrlq $32, %mm6 C un=3
- paddq %mm5, %mm6 C un=3
- paddq %mm2, %mm4 C un=3
- movd %mm6, 12(%edx) C un=3
- psrlq $32, %mm6 C un=3
- paddq %mm4, %mm6 C un=3
- movd %mm6, 16(%edx) C un=3
- psrlq $32, %mm6 C un=3
- movd %mm6, 20(%edx) C un=3
- jmp L(rtr)
-
-
-L(big): push %edi
- pxor %mm6, %mm6
- lea 4(%esi), %esi
- and $3, %ecx
- jz L(0)
- cmp $2, %ecx
- jc L(1)
- jz L(2)
- jmp L(3) C FIXME: one case should fall through
-
-
-L(0): movd (%eax), %mm3 C m 0
- sub 24(%esp), %ecx C inner loop count m 0
- mov %ecx, 24(%esp) C update loop count for later m 0
- pmuludq %mm7, %mm3 C m 0
- movd 4(%eax), %mm0 C m 0
- pmuludq %mm7, %mm0 C m 0
- movd 8(%eax), %mm1 C m 0
- jmp L(m00) C m 0
- ALIGN(16) C m 0
-L(lpm0):
- pmuludq %mm7, %mm4 C m 0
- paddq %mm0, %mm6 C m 0
- movd (%eax), %mm3 C m 0
- movd %mm6, -12(%edx) C m 0
- psrlq $32, %mm6 C m 0
- pmuludq %mm7, %mm3 C m 0
- paddq %mm1, %mm6 C m 0
- movd 4(%eax), %mm0 C m 0
- movd %mm6, -8(%edx) C m 0
- psrlq $32, %mm6 C m 0
- pmuludq %mm7, %mm0 C m 0
- paddq %mm4, %mm6 C m 0
- movd 8(%eax), %mm1 C m 0
- movd %mm6, -4(%edx) C m 0
- psrlq $32, %mm6 C m 0
-L(m00): pmuludq %mm7, %mm1 C m 0
- paddq %mm3, %mm6 C m 0
- movd 12(%eax), %mm4 C m 0
- movd %mm6, (%edx) C m 0
- psrlq $32, %mm6 C m 0
- lea 16(%eax), %eax C m 0
- lea 16(%edx), %edx C m 0
- add $4, %ecx C m 0
- ja L(lpm0) C m 0
- pmuludq %mm7, %mm4 C m 0
- paddq %mm0, %mm6 C m 0
- movd %mm6, -12(%edx) C m 0
- psrlq $32, %mm6 C m 0
- paddq %mm1, %mm6 C m 0
- mov 16(%esp), %edi C rp 0
- jmp L(x0)
-
-L(olp0):
- lea 4(%edi), %edi C am 0
- movd (%esi), %mm7 C am 0
- lea 4(%esi), %esi C am 0
- mov %edi, %edx C rp am 0
- mov 20(%esp), %eax C up am 0
- movd (%eax), %mm3 C am 0
- mov 24(%esp), %ecx C inner loop count am 0
- pxor %mm6, %mm6 C am 0
- pmuludq %mm7, %mm3 C am 0
- movd 4(%eax), %mm0 C am 0
- movd (%edx), %mm5 C am 0
- pmuludq %mm7, %mm0 C am 0
- movd 8(%eax), %mm1 C am 0
- paddq %mm3, %mm5 C am 0
- movd 4(%edx), %mm4 C am 0
- jmp L(am00) C am 0
- ALIGN(16) C mm 0
-L(lam0):
- pmuludq %mm7, %mm2 C am 0
- paddq %mm4, %mm6 C am 0
- movd (%eax), %mm3 C am 0
- paddq %mm1, %mm5 C am 0
- movd -4(%edx), %mm4 C am 0
- movd %mm6, -12(%edx) C am 0
- psrlq $32, %mm6 C am 0
- pmuludq %mm7, %mm3 C am 0
- paddq %mm5, %mm6 C am 0
- movd 4(%eax), %mm0 C am 0
- paddq %mm2, %mm4 C am 0
- movd (%edx), %mm5 C am 0
- movd %mm6, -8(%edx) C am 0
- psrlq $32, %mm6 C am 0
- pmuludq %mm7, %mm0 C am 0
- paddq %mm4, %mm6 C am 0
- movd 8(%eax), %mm1 C am 0
- paddq %mm3, %mm5 C am 0
- movd 4(%edx), %mm4 C am 0
- movd %mm6, -4(%edx) C am 0
- psrlq $32, %mm6 C am 0
-L(am00):
- pmuludq %mm7, %mm1 C am 0
- paddq %mm5, %mm6 C am 0
- movd 12(%eax), %mm2 C am 0
- paddq %mm0, %mm4 C am 0
- movd 8(%edx), %mm5 C am 0
- movd %mm6, (%edx) C am 0
- psrlq $32, %mm6 C am 0
- lea 16(%eax), %eax C am 0
- lea 16(%edx), %edx C am 0
- add $4, %ecx C am 0
- jnz L(lam0) C am 0
- pmuludq %mm7, %mm2 C am 0
- paddq %mm4, %mm6 C am 0
- paddq %mm1, %mm5 C am 0
- movd -4(%edx), %mm4 C am 0
- movd %mm6, -12(%edx) C am 0
- psrlq $32, %mm6 C am 0
- paddq %mm5, %mm6 C am 0
- paddq %mm2, %mm4 C am 0
-L(x0): movd %mm6, -8(%edx) C am 0
- psrlq $32, %mm6 C am 0
- paddq %mm4, %mm6 C am 0
- movd %mm6, -4(%edx) C am 0
- psrlq $32, %mm6 C am 0
- movd %mm6, (%edx) C am 0
- dec %ebx C am 0
- jnz L(olp0) C am 0
-L(oel0):
- emms C 0
- pop %edi C 0
- pop %ebx C 0
- pop %esi C 0
- ret C 0
-
-
-L(1): movd (%eax), %mm4 C m 1
- sub 24(%esp), %ecx C m 1
- mov %ecx, 24(%esp) C update loop count for later m 1
- pmuludq %mm7, %mm4 C m 1
- movd 4(%eax), %mm3 C m 1
- pmuludq %mm7, %mm3 C m 1
- movd 8(%eax), %mm0 C m 1
- jmp L(m01) C m 1
- ALIGN(16) C m 1
-L(lpm1):
- pmuludq %mm7, %mm4 C m 1
- paddq %mm0, %mm6 C m 1
- movd 4(%eax), %mm3 C m 1
- movd %mm6, -8(%edx) C m 1
- psrlq $32, %mm6 C m 1
- pmuludq %mm7, %mm3 C m 1
- paddq %mm1, %mm6 C m 1
- movd 8(%eax), %mm0 C m 1
- movd %mm6, -4(%edx) C m 1
- psrlq $32, %mm6 C m 1
-L(m01): pmuludq %mm7, %mm0 C m 1
- paddq %mm4, %mm6 C m 1
- movd 12(%eax), %mm1 C m 1
- movd %mm6, (%edx) C m 1
- psrlq $32, %mm6 C m 1
- pmuludq %mm7, %mm1 C m 1
- paddq %mm3, %mm6 C m 1
- movd 16(%eax), %mm4 C m 1
- movd %mm6, 4(%edx) C m 1
- psrlq $32, %mm6 C m 1
- lea 16(%eax), %eax C m 1
- lea 16(%edx), %edx C m 1
- add $4, %ecx C m 1
- ja L(lpm1) C m 1
- pmuludq %mm7, %mm4 C m 1
- paddq %mm0, %mm6 C m 1
- movd %mm6, -8(%edx) C m 1
- psrlq $32, %mm6 C m 1
- paddq %mm1, %mm6 C m 1
- mov 16(%esp), %edi C rp 1
- jmp L(x1)
-
-L(olp1):
- lea 4(%edi), %edi C am 1
- movd (%esi), %mm7 C am 1
- lea 4(%esi), %esi C am 1
- mov %edi, %edx C rp am 1
- mov 20(%esp), %eax C up am 1
- movd (%eax), %mm2 C am 1
- mov 24(%esp), %ecx C inner loop count am 1
- pxor %mm6, %mm6 C am 1
- pmuludq %mm7, %mm2 C am 1
- movd 4(%eax), %mm3 C am 1
- movd (%edx), %mm4 C am 1
- pmuludq %mm7, %mm3 C am 1
- movd 8(%eax), %mm0 C am 1
- paddq %mm2, %mm4 C am 1
- movd 4(%edx), %mm5 C am 1
- jmp L(am01) C am 1
- ALIGN(16) C am 1
-L(lam1):
- pmuludq %mm7, %mm2 C am 1
- paddq %mm4, %mm6 C am 1
- movd 4(%eax), %mm3 C am 1
- paddq %mm1, %mm5 C am 1
- movd (%edx), %mm4 C am 1
- movd %mm6, -8(%edx) C am 1
- psrlq $32, %mm6 C am 1
- pmuludq %mm7, %mm3 C am 1
- paddq %mm5, %mm6 C am 1
- movd 8(%eax), %mm0 C am 1
- paddq %mm2, %mm4 C am 1
- movd 4(%edx), %mm5 C am 1
- movd %mm6, -4(%edx) C am 1
- psrlq $32, %mm6 C am 1
-L(am01):
- pmuludq %mm7, %mm0 C am 1
- paddq %mm4, %mm6 C am 1
- movd 12(%eax), %mm1 C am 1
- paddq %mm3, %mm5 C am 1
- movd 8(%edx), %mm4 C am 1
- movd %mm6, (%edx) C am 1
- psrlq $32, %mm6 C am 1
- pmuludq %mm7, %mm1 C am 1
- paddq %mm5, %mm6 C am 1
- movd 16(%eax), %mm2 C am 1
- paddq %mm0, %mm4 C am 1
- movd 12(%edx), %mm5 C am 1
- movd %mm6, 4(%edx) C am 1
- psrlq $32, %mm6 C am 1
- lea 16(%eax), %eax C am 1
- lea 16(%edx), %edx C am 1
- add $4, %ecx C am 1
- jnz L(lam1) C am 1
- pmuludq %mm7, %mm2 C am 1
- paddq %mm4, %mm6 C am 1
- paddq %mm1, %mm5 C am 1
- movd (%edx), %mm4 C am 1
- movd %mm6, -8(%edx) C am 1
- psrlq $32, %mm6 C am 1
- paddq %mm5, %mm6 C am 1
- paddq %mm2, %mm4 C am 1
-L(x1): movd %mm6, -4(%edx) C am 1
- psrlq $32, %mm6 C am 1
- paddq %mm4, %mm6 C am 1
- movd %mm6, (%edx) C am 1
- psrlq $32, %mm6 C am 1
- movd %mm6, 4(%edx) C am 1
- dec %ebx C am 1
- jnz L(olp1) C am 1
-L(oel1):
- emms C 1
- pop %edi C 1
- pop %ebx C 1
- pop %esi C 1
- ret C 1
-
-
-L(2): movd (%eax), %mm1 C m 2
- sub 24(%esp), %ecx C m 2
- mov %ecx, 24(%esp) C update loop count for later m 2
- pmuludq %mm7, %mm1 C m 2
- movd 4(%eax), %mm4 C m 2
- pmuludq %mm7, %mm4 C m 2
- movd 8(%eax), %mm3 C m 2
- jmp L(m10) C m 2
- ALIGN(16) C m 2
-L(lpm2):
- pmuludq %mm7, %mm4 C m 2
- paddq %mm0, %mm6 C m 2
- movd 8(%eax), %mm3 C m 2
- movd %mm6, -4(%edx) C m 2
- psrlq $32, %mm6 C m 2
-L(m10): pmuludq %mm7, %mm3 C m 2
- paddq %mm1, %mm6 C m 2
- movd 12(%eax), %mm0 C m 2
- movd %mm6, (%edx) C m 2
- psrlq $32, %mm6 C m 2
- pmuludq %mm7, %mm0 C m 2
- paddq %mm4, %mm6 C m 2
- movd 16(%eax), %mm1 C m 2
- movd %mm6, 4(%edx) C m 2
- psrlq $32, %mm6 C m 2
- pmuludq %mm7, %mm1 C m 2
- paddq %mm3, %mm6 C m 2
- movd 20(%eax), %mm4 C m 2
- movd %mm6, 8(%edx) C m 2
- psrlq $32, %mm6 C m 2
- lea 16(%eax), %eax C m 2
- lea 16(%edx), %edx C m 2
- add $4, %ecx C m 2
- ja L(lpm2) C m 2
- pmuludq %mm7, %mm4 C m 2
- paddq %mm0, %mm6 C m 2
- movd %mm6, -4(%edx) C m 2
- psrlq $32, %mm6 C m 2
- paddq %mm1, %mm6 C m 2
- mov 16(%esp), %edi C rp 2
- jmp L(x2)
-
-L(olp2):
- lea 4(%edi), %edi C am 2
- movd (%esi), %mm7 C am 2
- lea 4(%esi), %esi C am 2
- mov %edi, %edx C rp am 2
- mov 20(%esp), %eax C up am 2
- movd (%eax), %mm1 C am 2
- mov 24(%esp), %ecx C inner loop count am 2
- pxor %mm6, %mm6 C am 2
- pmuludq %mm7, %mm1 C am 2
- movd 4(%eax), %mm2 C am 2
- movd (%edx), %mm5 C am 2
- pmuludq %mm7, %mm2 C am 2
- movd 8(%eax), %mm3 C am 2
- paddq %mm1, %mm5 C am 2
- movd 4(%edx), %mm4 C am 2
- jmp L(am10) C am 2
- ALIGN(16) C am 2
-L(lam2):
- pmuludq %mm7, %mm2 C am 2
- paddq %mm4, %mm6 C am 2
- movd 8(%eax), %mm3 C am 2
- paddq %mm1, %mm5 C am 2
- movd 4(%edx), %mm4 C am 2
- movd %mm6, -4(%edx) C am 2
- psrlq $32, %mm6 C am 2
-L(am10):
- pmuludq %mm7, %mm3 C am 2
- paddq %mm5, %mm6 C am 2
- movd 12(%eax), %mm0 C am 2
- paddq %mm2, %mm4 C am 2
- movd 8(%edx), %mm5 C am 2
- movd %mm6, (%edx) C am 2
- psrlq $32, %mm6 C am 2
- pmuludq %mm7, %mm0 C am 2
- paddq %mm4, %mm6 C am 2
- movd 16(%eax), %mm1 C am 2
- paddq %mm3, %mm5 C am 2
- movd 12(%edx), %mm4 C am 2
- movd %mm6, 4(%edx) C am 2
- psrlq $32, %mm6 C am 2
- pmuludq %mm7, %mm1 C am 2
- paddq %mm5, %mm6 C am 2
- movd 20(%eax), %mm2 C am 2
- paddq %mm0, %mm4 C am 2
- movd 16(%edx), %mm5 C am 2
- movd %mm6, 8(%edx) C am 2
- psrlq $32, %mm6 C am 2
- lea 16(%eax), %eax C am 2
- lea 16(%edx), %edx C am 2
- add $4, %ecx C am 2
- jnz L(lam2) C am 2
- pmuludq %mm7, %mm2 C am 2
- paddq %mm4, %mm6 C am 2
- paddq %mm1, %mm5 C am 2
- movd 4(%edx), %mm4 C am 2
- movd %mm6, -4(%edx) C am 2
- psrlq $32, %mm6 C am 2
- paddq %mm5, %mm6 C am 2
- paddq %mm2, %mm4 C am 2
-L(x2): movd %mm6, (%edx) C am 2
- psrlq $32, %mm6 C am 2
- paddq %mm4, %mm6 C am 2
- movd %mm6, 4(%edx) C am 2
- psrlq $32, %mm6 C am 2
- movd %mm6, 8(%edx) C am 2
- dec %ebx C am 2
- jnz L(olp2) C am 2
-L(oel2):
- emms C 2
- pop %edi C 2
- pop %ebx C 2
- pop %esi C 2
- ret C 2
-
-
-L(3): movd (%eax), %mm0 C m 3
- sub 24(%esp), %ecx C m 3
- mov %ecx, 24(%esp) C update loop count for later m 3
- pmuludq %mm7, %mm0 C m 3
- movd 4(%eax), %mm1 C m 3
- pmuludq %mm7, %mm1 C m 3
- movd 8(%eax), %mm4 C m 3
- jmp L(lpm3) C m 3
- ALIGN(16) C m 3
-L(lpm3):
- pmuludq %mm7, %mm4 C m 3
- paddq %mm0, %mm6 C m 3
- movd 12(%eax), %mm3 C m 3
- movd %mm6, (%edx) C m 3
- psrlq $32, %mm6 C m 3
- pmuludq %mm7, %mm3 C m 3
- paddq %mm1, %mm6 C m 3
- movd 16(%eax), %mm0 C m 3
- movd %mm6, 4(%edx) C m 3
- psrlq $32, %mm6 C m 3
- pmuludq %mm7, %mm0 C m 3
- paddq %mm4, %mm6 C m 3
- movd 20(%eax), %mm1 C m 3
- movd %mm6, 8(%edx) C m 3
- psrlq $32, %mm6 C m 3
- pmuludq %mm7, %mm1 C m 3
- paddq %mm3, %mm6 C m 3
- movd 24(%eax), %mm4 C m 3
- movd %mm6, 12(%edx) C m 3
- psrlq $32, %mm6 C m 3
- lea 16(%eax), %eax C m 3
- lea 16(%edx), %edx C m 3
- add $4, %ecx C m 3
- ja L(lpm3) C m 3
- pmuludq %mm7, %mm4 C m 3
- paddq %mm0, %mm6 C m 3
- movd %mm6, (%edx) C m 3
- psrlq $32, %mm6 C m 3
- paddq %mm1, %mm6 C m 3
- mov 16(%esp), %edi C rp 3
- jmp L(x3)
-
-L(olp3):
- lea 4(%edi), %edi C am 3
- movd (%esi), %mm7 C am 3
- lea 4(%esi), %esi C am 3
- mov %edi, %edx C rp am 3
- mov 20(%esp), %eax C up am 3
- movd (%eax), %mm0 C am 3
- mov 24(%esp), %ecx C inner loop count am 3
- pxor %mm6, %mm6 C am 3
- pmuludq %mm7, %mm0 C am 3
- movd 4(%eax), %mm1 C am 3
- movd (%edx), %mm4 C am 3
- pmuludq %mm7, %mm1 C am 3
- movd 8(%eax), %mm2 C am 3
- paddq %mm0, %mm4 C am 3
- movd 4(%edx), %mm5 C am 3
- jmp L(lam3) C am 3
- ALIGN(16) C am 3
-L(lam3):
- pmuludq %mm7, %mm2 C am 3
- paddq %mm4, %mm6 C am 3
- movd 12(%eax), %mm3 C am 3
- paddq %mm1, %mm5 C am 3
- movd 8(%edx), %mm4 C am 3
- movd %mm6, (%edx) C am 3
- psrlq $32, %mm6 C am 3
- pmuludq %mm7, %mm3 C am 3
- paddq %mm5, %mm6 C am 3
- movd 16(%eax), %mm0 C am 3
- paddq %mm2, %mm4 C am 3
- movd 12(%edx), %mm5 C am 3
- movd %mm6, 4(%edx) C am 3
- psrlq $32, %mm6 C am 3
- pmuludq %mm7, %mm0 C am 3
- paddq %mm4, %mm6 C am 3
- movd 20(%eax), %mm1 C am 3
- paddq %mm3, %mm5 C am 3
- movd 16(%edx), %mm4 C am 3
- movd %mm6, 8(%edx) C am 3
- psrlq $32, %mm6 C am 3
- pmuludq %mm7, %mm1 C am 3
- paddq %mm5, %mm6 C am 3
- movd 24(%eax), %mm2 C am 3
- paddq %mm0, %mm4 C am 3
- movd 20(%edx), %mm5 C am 3
- movd %mm6, 12(%edx) C am 3
- psrlq $32, %mm6 C am 3
- lea 16(%eax), %eax C am 3
- lea 16(%edx), %edx C am 3
- add $4, %ecx C am 3
- jnz L(lam3) C am 3
- pmuludq %mm7, %mm2 C am 3
- paddq %mm4, %mm6 C am 3
- paddq %mm1, %mm5 C am 3
- movd 8(%edx), %mm4 C am 3
- movd %mm6, (%edx) C am 3
- psrlq $32, %mm6 C am 3
- paddq %mm5, %mm6 C am 3
- paddq %mm2, %mm4 C am 3
-L(x3): movd %mm6, 4(%edx) C am 3
- psrlq $32, %mm6 C am 3
- paddq %mm4, %mm6 C am 3
- movd %mm6, 8(%edx) C am 3
- psrlq $32, %mm6 C am 3
- movd %mm6, 12(%edx) C am 3
- dec %ebx C am 3
- jnz L(olp3) C am 3
-L(oel3):
- emms C 3
- pop %edi C 3
- pop %ebx C 3
- pop %esi C 3
- ret C 3
-EPILOGUE()
« no previous file with comments | « gcc/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm ('k') | gcc/gmp/mpn/x86/pentium4/sse2/submul_1.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698