Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1058)

Unified Diff: gcc/gmp/mpn/x86/pentium4/sse2/divrem_1.asm

Issue 3050029: [gcc] GCC 4.5.0=>4.5.1 (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/nacl-toolchain.git
Patch Set: Created 10 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « gcc/gmp/mpn/x86/pentium4/sse2/add_n.asm ('k') | gcc/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: gcc/gmp/mpn/x86/pentium4/sse2/divrem_1.asm
diff --git a/gcc/gmp/mpn/x86/pentium4/sse2/divrem_1.asm b/gcc/gmp/mpn/x86/pentium4/sse2/divrem_1.asm
deleted file mode 100644
index 7f973dbf983861af7115994fbc43284888d72757..0000000000000000000000000000000000000000
--- a/gcc/gmp/mpn/x86/pentium4/sse2/divrem_1.asm
+++ /dev/null
@@ -1,635 +0,0 @@
-dnl Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.
-
-dnl Copyright 1999, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,
-dnl Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 3 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.
-
-
-C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
-C mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor);
-C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
-C mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor, mp_limb_t carry);
-C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
-C mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor, mp_limb_t inverse,
-C unsigned shift);
-C
-C Algorithm:
-C
-C The method and nomenclature follow part 8 of "Division by Invariant
-C Integers using Multiplication" by Granlund and Montgomery, reference in
-C gmp.texi.
-C
-C "m" is written for what is m' in the paper, and "d" for d_norm, which
-C won't cause any confusion since it's only the normalized divisor that's of
-C any use in the code. "b" is written for 2^N, the size of a limb, N being
-C 32 here.
-C
-C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
-C "n-d - q1*d". This rearrangement gives the same two-limb answer but lets
-C us have just a psubq on the dependent chain.
-C
-C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,
-C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.
-C
-C Notes:
-C
-C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
-C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero
-C carry, since in normal circumstances that will be a very rare event.
-C
-C The test for skipping a division is branch free (once size>=1 is tested).
-C The store to the destination high limb is 0 when a divide is skipped, or
-C if it's not skipped then a copy of the src high limb is stored. The
-C latter is in case src==dst.
-C
-C There's a small bias towards expecting xsize==0, by having code for
-C xsize==0 in a straight line and xsize!=0 under forward jumps.
-C
-C Enhancements:
-C
-C The loop measures 32 cycles, but the dependent chain would suggest it
-C could be done with 30. Not sure where to start looking for the extras.
-C
-C Alternatives:
-C
-C If the divisor is normalized (high bit set) then a division step can
-C always be skipped, since the high destination limb is always 0 or 1 in
-C that case. It doesn't seem worth checking for this though, since it
-C probably occurs infrequently.
-
-
-dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
-dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
-dnl
-dnl The inverse takes about 80-90 cycles to calculate, but after that the
-dnl multiply is 32 c/l versus division at about 58 c/l.
-dnl
-dnl At 4 limbs the div is a touch faster than the mul (and of course
-dnl simpler), so start the mul from 5 limbs.
-
-deflit(MUL_THRESHOLD, 5)
-
-
-defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1
-defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
-defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c
-defframe(PARAM_DIVISOR,20)
-defframe(PARAM_SIZE, 16)
-defframe(PARAM_SRC, 12)
-defframe(PARAM_XSIZE, 8)
-defframe(PARAM_DST, 4)
-
-dnl re-use parameter space
-define(SAVE_ESI,`PARAM_SIZE')
-define(SAVE_EBP,`PARAM_SRC')
-define(SAVE_EDI,`PARAM_DIVISOR')
-define(SAVE_EBX,`PARAM_DST')
-
- TEXT
-
- ALIGN(16)
-PROLOGUE(mpn_preinv_divrem_1)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %ecx
- xorl %edx, %edx C carry if can't skip a div
-
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
-
- movl %ebp, SAVE_EBP
- movl PARAM_DIVISOR, %ebp
-
- movl %edi, SAVE_EDI
- movl PARAM_DST, %edi
-
- movl -4(%esi,%ecx,4), %eax C src high limb
-
- movl %ebx, SAVE_EBX
- movl PARAM_XSIZE, %ebx
-
- movd PARAM_PREINV_INVERSE, %mm4
-
- movd PARAM_PREINV_SHIFT, %mm7 C l
- cmpl %ebp, %eax C high cmp divisor
-
- cmovc( %eax, %edx) C high is carry if high<divisor
- movd %edx, %mm0 C carry
-
- movd %edx, %mm1 C carry
- movl $0, %edx
-
- movd %ebp, %mm5 C d
- cmovnc( %eax, %edx) C 0 if skip div, src high if not
- C (the latter in case src==dst)
- leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
-
- movl %edx, (%edi,%ecx,4) C dst high limb
- sbbl $0, %ecx C skip one division if high<divisor
- movl $32, %eax
-
- subl PARAM_PREINV_SHIFT, %eax
- psllq %mm7, %mm5 C d normalized
- leal (%edi,%ecx,4), %edi C &dst[xsize+size-1]
- leal -4(%esi,%ecx,4), %esi C &src[size-1]
-
- movd %eax, %mm6 C 32-l
- jmp L(start_preinv)
-
-EPILOGUE()
-
-
- ALIGN(16)
-PROLOGUE(mpn_divrem_1c)
-deflit(`FRAME',0)
-
- movl PARAM_CARRY, %edx
-
- movl PARAM_SIZE, %ecx
-
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
-
- movl %ebp, SAVE_EBP
- movl PARAM_DIVISOR, %ebp
-
- movl %edi, SAVE_EDI
- movl PARAM_DST, %edi
-
- movl %ebx, SAVE_EBX
- movl PARAM_XSIZE, %ebx
-
- leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
- jmp L(start_1c)
-
-EPILOGUE()
-
-
- ALIGN(16)
-PROLOGUE(mpn_divrem_1)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %ecx
- xorl %edx, %edx C initial carry (if can't skip a div)
-
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
-
- movl %ebp, SAVE_EBP
- movl PARAM_DIVISOR, %ebp
-
- movl %edi, SAVE_EDI
- movl PARAM_DST, %edi
-
- movl %ebx, SAVE_EBX
- movl PARAM_XSIZE, %ebx
- leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
-
- orl %ecx, %ecx C size
- jz L(no_skip_div) C if size==0
- movl -4(%esi,%ecx,4), %eax C src high limb
-
- cmpl %ebp, %eax C high cmp divisor
-
- cmovnc( %eax, %edx) C 0 if skip div, src high if not
- movl %edx, (%edi,%ecx,4) C dst high limb
-
- movl $0, %edx
- cmovc( %eax, %edx) C high is carry if high<divisor
-
- sbbl $0, %ecx C size-1 if high<divisor
-L(no_skip_div):
-
-
-L(start_1c):
- C eax
- C ebx xsize
- C ecx size
- C edx carry
- C esi src
- C edi &dst[xsize-1]
- C ebp divisor
-
- leal (%ebx,%ecx), %eax C size+xsize
- leal -4(%esi,%ecx,4), %esi C &src[size-1]
- leal (%edi,%ecx,4), %edi C &dst[size+xsize-1]
-
- cmpl $MUL_THRESHOLD, %eax
- jae L(mul_by_inverse)
-
-
- orl %ecx, %ecx
- jz L(divide_no_integer) C if size==0
-
-L(divide_integer):
- C eax scratch (quotient)
- C ebx xsize
- C ecx counter
- C edx carry
- C esi src, decrementing
- C edi dst, decrementing
- C ebp divisor
-
- movl (%esi), %eax
- subl $4, %esi
-
- divl %ebp
-
- movl %eax, (%edi)
- subl $4, %edi
-
- subl $1, %ecx
- jnz L(divide_integer)
-
-
-L(divide_no_integer):
- orl %ebx, %ebx
- jnz L(divide_fraction) C if xsize!=0
-
-L(divide_done):
- movl SAVE_ESI, %esi
- movl SAVE_EDI, %edi
- movl SAVE_EBX, %ebx
- movl SAVE_EBP, %ebp
- movl %edx, %eax
- ret
-
-
-L(divide_fraction):
- C eax scratch (quotient)
- C ebx counter
- C ecx
- C edx carry
- C esi
- C edi dst, decrementing
- C ebp divisor
-
- movl $0, %eax
-
- divl %ebp
-
- movl %eax, (%edi)
- subl $4, %edi
-
- subl $1, %ebx
- jnz L(divide_fraction)
-
- jmp L(divide_done)
-
-
-
-C -----------------------------------------------------------------------------
-
-L(mul_by_inverse):
- C eax
- C ebx xsize
- C ecx size
- C edx carry
- C esi &src[size-1]
- C edi &dst[size+xsize-1]
- C ebp divisor
-
- bsrl %ebp, %eax C 31-l
- movd %edx, %mm0 C carry
- movd %edx, %mm1 C carry
- movl %ecx, %edx C size
- movl $31, %ecx
-
- C
-
- xorl %eax, %ecx C l = leading zeros on d
- addl $1, %eax
-
- shll %cl, %ebp C d normalized
- movd %ecx, %mm7 C l
- movl %edx, %ecx C size
-
- movd %eax, %mm6 C 32-l
- movl $-1, %edx
- movl $-1, %eax
-
- C
-
- subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
-
- divl %ebp C floor (b*(b-d)-1 / d)
- movd %ebp, %mm5 C d
-
- C
-
- movd %eax, %mm4 C m
-
-
-L(start_preinv):
- C eax inverse
- C ebx xsize
- C ecx size
- C edx
- C esi &src[size-1]
- C edi &dst[size+xsize-1]
- C ebp
- C
- C mm0 carry
- C mm1 carry
- C mm2
- C mm4 m
- C mm5 d
- C mm6 31-l
- C mm7 l
-
- psllq %mm7, %mm0 C n2 = carry << l, for size==0
-
- subl $1, %ecx
- jb L(integer_none)
-
- movd (%esi), %mm0 C src high limb
- punpckldq %mm1, %mm0
- psrlq %mm6, %mm0 C n2 = high (carry:srchigh << l)
- jz L(integer_last)
-
-
-C The dependent chain here consists of
-C
-C 2 paddd n1+n2
-C 8 pmuludq m*(n1+n2)
-C 2 paddq n2:nadj + m*(n1+n2)
-C 2 psrlq q1
-C 8 pmuludq d*q1
-C 2 psubq (n-d)-q1*d
-C 2 psrlq high n-(q1+1)*d mask
-C 2 pand d masked
-C 2 paddd n2+d addback
-C --
-C 30
-C
-C But it seems to run at 32 cycles, so presumably there's something else
-C going on.
-
- ALIGN(16)
-L(integer_top):
- C eax
- C ebx
- C ecx counter, size-1 to 0
- C edx
- C esi src, decrementing
- C edi dst, decrementing
- C
- C mm0 n2
- C mm4 m
- C mm5 d
- C mm6 32-l
- C mm7 l
-
- ASSERT(b,`C n2<d
- movd %mm0, %eax
- movd %mm5, %edx
- cmpl %edx, %eax')
-
- movd -4(%esi), %mm1 C next src limbs
- movd (%esi), %mm2
- leal -4(%esi), %esi
-
- punpckldq %mm2, %mm1
- psrlq %mm6, %mm1 C n10
-
- movq %mm1, %mm2 C n10
- movq %mm1, %mm3 C n10
- psrad $31, %mm1 C -n1
- pand %mm5, %mm1 C -n1 & d
- paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
-
- psrld $31, %mm2 C n1
- paddd %mm0, %mm2 C n2+n1
- punpckldq %mm0, %mm1 C n2:nadj
-
- pmuludq %mm4, %mm2 C m*(n2+n1)
-
- C
-
- paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
- pxor %mm2, %mm2 C break dependency, saves 4 cycles
- pcmpeqd %mm2, %mm2 C FF...FF
- psrlq $63, %mm2 C 1
-
- psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
-
- paddd %mm1, %mm2 C q1+1
- pmuludq %mm5, %mm1 C q1*d
-
- punpckldq %mm0, %mm3 C n = n2:n10
- pxor %mm0, %mm0
-
- psubq %mm5, %mm3 C n - d
-
- C
-
- psubq %mm1, %mm3 C n - (q1+1)*d
-
- por %mm3, %mm0 C copy remainder -> new n2
- psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
-
- ASSERT(be,`C 0 or -1
- movd %mm3, %eax
- addl $1, %eax
- cmpl $1, %eax')
-
- paddd %mm3, %mm2 C q
- pand %mm5, %mm3 C mask & d
-
- paddd %mm3, %mm0 C addback if necessary
- movd %mm2, (%edi)
- leal -4(%edi), %edi
-
- subl $1, %ecx
- ja L(integer_top)
-
-
-L(integer_last):
- C eax
- C ebx xsize
- C ecx
- C edx
- C esi &src[0]
- C edi &dst[xsize]
- C
- C mm0 n2
- C mm4 m
- C mm5 d
- C mm6
- C mm7 l
-
- ASSERT(b,`C n2<d
- movd %mm0, %eax
- movd %mm5, %edx
- cmpl %edx, %eax')
-
- movd (%esi), %mm1 C src[0]
- psllq %mm7, %mm1 C n10
-
- movq %mm1, %mm2 C n10
- movq %mm1, %mm3 C n10
- psrad $31, %mm1 C -n1
- pand %mm5, %mm1 C -n1 & d
- paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
-
- psrld $31, %mm2 C n1
- paddd %mm0, %mm2 C n2+n1
- punpckldq %mm0, %mm1 C n2:nadj
-
- pmuludq %mm4, %mm2 C m*(n2+n1)
-
- C
-
- paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
- pcmpeqd %mm2, %mm2 C FF...FF
- psrlq $63, %mm2 C 1
-
- psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
- paddd %mm1, %mm2 C q1
-
- pmuludq %mm5, %mm1 C q1*d
- punpckldq %mm0, %mm3 C n
- psubq %mm5, %mm3 C n - d
- pxor %mm0, %mm0
-
- C
-
- psubq %mm1, %mm3 C n - (q1+1)*d
-
- por %mm3, %mm0 C remainder -> n2
- psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
-
- ASSERT(be,`C 0 or -1
- movd %mm3, %eax
- addl $1, %eax
- cmpl $1, %eax')
-
- paddd %mm3, %mm2 C q
- pand %mm5, %mm3 C mask & d
-
- paddd %mm3, %mm0 C addback if necessary
- movd %mm2, (%edi)
- leal -4(%edi), %edi
-
-
-L(integer_none):
- C eax
- C ebx xsize
-
- orl %ebx, %ebx
- jnz L(fraction_some) C if xsize!=0
-
-
-L(fraction_done):
- movl SAVE_EBP, %ebp
- psrld %mm7, %mm0 C remainder
-
- movl SAVE_EDI, %edi
- movd %mm0, %eax
-
- movl SAVE_ESI, %esi
- movl SAVE_EBX, %ebx
- emms
- ret
-
-
-
-C -----------------------------------------------------------------------------
-C
-
-L(fraction_some):
- C eax
- C ebx xsize
- C ecx
- C edx
- C esi
- C edi &dst[xsize-1]
- C ebp
-
-
-L(fraction_top):
- C eax
- C ebx counter, xsize iterations
- C ecx
- C edx
- C esi src, decrementing
- C edi dst, decrementing
- C
- C mm0 n2
- C mm4 m
- C mm5 d
- C mm6 32-l
- C mm7 l
-
- ASSERT(b,`C n2<d
- movd %mm0, %eax
- movd %mm5, %edx
- cmpl %edx, %eax')
-
- movq %mm0, %mm1 C n2
- pmuludq %mm4, %mm0 C m*n2
-
- pcmpeqd %mm2, %mm2
- psrlq $63, %mm2
-
- C
-
- psrlq $32, %mm0 C high(m*n2)
-
- paddd %mm1, %mm0 C q1 = high(n2:0 + m*n2)
-
- paddd %mm0, %mm2 C q1+1
- pmuludq %mm5, %mm0 C q1*d
-
- psllq $32, %mm1 C n = n2:0
- psubq %mm5, %mm1 C n - d
-
- C
-
- psubq %mm0, %mm1 C r = n - (q1+1)*d
- pxor %mm0, %mm0
-
- por %mm1, %mm0 C r -> n2
- psrlq $32, %mm1 C high n - (q1+1)*d, 0 or -1
-
- ASSERT(be,`C 0 or -1
- movd %mm1, %eax
- addl $1, %eax
- cmpl $1, %eax')
-
- paddd %mm1, %mm2 C q
- pand %mm5, %mm1 C mask & d
-
- paddd %mm1, %mm0 C addback if necessary
- movd %mm2, (%edi)
- leal -4(%edi), %edi
-
- subl $1, %ebx
- jne L(fraction_top)
-
-
- jmp L(fraction_done)
-
-EPILOGUE()
« no previous file with comments | « gcc/gmp/mpn/x86/pentium4/sse2/add_n.asm ('k') | gcc/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698