gcc/gmp/mpn/x86/k6/gcd_1.asm - Issue 3050029: [gcc] GCC 4.5.0=>4.5.1

Unified Diff: gcc/gmp/mpn/x86/k6/gcd_1.asm

Issue 3050029: [gcc] GCC 4.5.0=>4.5.1 (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/nacl-toolchain.git

Patch Set: Created 10 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: gcc/gmp/mpn/x86/k6/gcd_1.asm

diff --git a/gcc/gmp/mpn/x86/k6/gcd_1.asm b/gcc/gmp/mpn/x86/k6/gcd_1.asm

deleted file mode 100644

index 58aff08221028369ade94b920c471d9b351ff703..0000000000000000000000000000000000000000

--- a/gcc/gmp/mpn/x86/k6/gcd_1.asm

+++ /dev/null

@@ -1,351 +0,0 @@

-dnl AMD K6 mpn_gcd_1 -- mpn by 1 gcd.

-dnl

-dnl This file is part of the GNU MP Library.

-dnl

-dnl The GNU MP Library is free software; you can redistribute it and/or

-dnl modify it under the terms of the GNU Lesser General Public License as

-dnl published by the Free Software Foundation; either version 3 of the

-dnl License, or (at your option) any later version.

-dnl

-dnl The GNU MP Library is distributed in the hope that it will be useful,

-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of

-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

-dnl Lesser General Public License for more details.

-dnl

-dnl You should have received a copy of the GNU Lesser General Public License

-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

-include(`../config.m4')

-C K6:  9.5 cycles/bit (approx)   1x1 gcd

-C     11.0 cycles/limb           Nx1 reduction (modexact_1_odd)

-C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);

-C

-C This code is nothing very special, but offers a speedup over what gcc 2.95

-C can do with mpn/generic/gcd_1.c.

-C

-C Future:

-C

-C Using a lookup table to count trailing zeros seems a touch quicker, but

-C after a slightly longer startup. Might be worthwhile if an mpn_gcd_2 used

-C it too.

-dnl If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits

-dnl bigger than y, then a division x%y is done to reduce it.

-dnl

-dnl A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so

-dnl there should be an advantage in the divl at about 4 or 5 bits, which is

-dnl what's found.

-deflit(DIV_THRESHOLD, 5)

-defframe(PARAM_LIMB, 12)

-defframe(PARAM_SIZE, 8)

-defframe(PARAM_SRC, 4)

- TEXT

- ALIGN(16)

-PROLOGUE(mpn_gcd_1)

-deflit(`FRAME',0)

- ASSERT(ne, `cmpl $0, PARAM_LIMB')

- ASSERT(ae, `cmpl $1, PARAM_SIZE')

- movl PARAM_SRC, %eax

- pushl %ebx FRAME_pushl()

- movl PARAM_LIMB, %edx

- movl $-1, %ecx

- movl (%eax), %ebx C src low limb

- movl %ebx, %eax C src low limb

- orl %edx, %ebx

-L(common_twos):

- shrl %ebx

- incl %ecx

- jnc L(common_twos) C 1/4 chance on random data

- shrl %cl, %edx C y

- cmpl $1, PARAM_SIZE

- ja L(size_two_or_more)

- ASSERT(nz, `orl %eax, %eax') C should have src limb != 0

- shrl %cl, %eax C x

- C Swap if necessary to make x>=y. Measures a touch quicker as a

- C jump than a branch free calculation.

- C

- C eax x

- C ebx

- C ecx common twos

- C edx y

- movl %eax, %ebx

- cmpl %eax, %edx

- jb L(noswap)

- movl %edx, %eax

- movl %ebx, %edx

- movl %eax, %ebx

-L(noswap):

- C See if it's worth reducing x with a divl.

- C

- C eax x

- C ebx x

- C ecx common twos

- C edx y

- shrl $DIV_THRESHOLD, %ebx

- cmpl %ebx, %edx

- ja L(nodiv)

- C Reduce x to x%y.

- C

- C eax x

- C ebx

- C ecx common twos

- C edx y

- movl %edx, %ebx

- xorl %edx, %edx

- divl %ebx

- orl %edx, %edx C y

- nop C code alignment

- movl %ebx, %eax C x

- jz L(done_shll)

-L(nodiv):

- C eax x

- C ebx

- C ecx common twos

- C edx y

- C esi

- C edi

- C ebp

-L(strip_y):

- shrl %edx

- jnc L(strip_y)

- leal 1(%edx,%edx), %edx

- movl %ecx, %ebx C common twos

- leal 1(%eax), %ecx

- jmp L(strip_x_and)

-C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch

-C on a 50/50 chance of 0 or 1. The chance of the next bit also being 0 is

-C only 1/4.

-C

-C A second computed %cl shift was tried, but that measured a touch slower

-C than branching back.

-C

-C A branch-free abs(x-y) and min(x,y) calculation was tried, but that

-C measured about 1 cycle/bit slower.

- C eax x

- C ebx common twos

- C ecx scratch

- C edx y

- ALIGN(4)

-L(swap):

- addl %eax, %edx C x-y+y = x

- negl %eax C -(x-y) = y-x

-L(strip_x):

- shrl %eax C odd-odd = even, so always one to strip

- ASSERT(nz)

-L(strip_x_leal):

- leal 1(%eax), %ecx

-L(strip_x_and):

- andl $1, %ecx C (x^1)&1

- shrl %cl, %eax C shift if x even

- testb $1, %al

- jz L(strip_x)

- ASSERT(nz,`testl $1, %eax') C x, y odd

- ASSERT(nz,`testl $1, %edx')

- subl %edx, %eax

- jb L(swap)

- ja L(strip_x)

- movl %edx, %eax

- movl %ebx, %ecx

-L(done_shll):

- shll %cl, %eax

- popl %ebx

- ret

-C -----------------------------------------------------------------------------

-C Two or more limbs.

-C

-C x={src,size} is reduced modulo y using either a plain mod_1 style

-C remainder, or a modexact_1 style exact division.

-deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))

- ALIGN(8)

-L(size_two_or_more):

- C eax

- C ebx

- C ecx common twos

- C edx y, without common twos

- C esi

- C edi

- C ebp

-deflit(FRAME_TWO_OR_MORE, FRAME)

- pushl %edi defframe_pushl(SAVE_EDI)

- movl PARAM_SRC, %ebx

-L(y_twos):

- shrl %edx

- jnc L(y_twos)

- movl %ecx, %edi C common twos

- movl PARAM_SIZE, %ecx

- pushl %esi defframe_pushl(SAVE_ESI)

- leal 1(%edx,%edx), %esi C y (odd)

- movl -4(%ebx,%ecx,4), %eax C src high limb

- cmpl %edx, %eax C carry if high<divisor

- sbbl %edx, %edx C -1 if high<divisor

- addl %edx, %ecx C skip one limb if high<divisor

- andl %eax, %edx

- cmpl $MODEXACT_THRESHOLD, %ecx

- jae L(modexact)

-L(divide_top):

- C eax scratch (quotient)

- C ebx src

- C ecx counter, size-1 to 1

- C edx carry (remainder)

- C esi divisor (odd)

- C edi

- C ebp

- movl -4(%ebx,%ecx,4), %eax

- divl %esi

- loop L(divide_top)

- movl %edx, %eax C x

- movl %esi, %edx C y (odd)

- movl %edi, %ebx C common twos

- popl %esi

- popl %edi

- leal 1(%eax), %ecx

- orl %eax, %eax

- jnz L(strip_x_and)

- movl %ebx, %ecx

- movl %edx, %eax

- shll %cl, %eax

- popl %ebx

- ret

- ALIGN(8)

-L(modexact):

- C eax

- C ebx src ptr

- C ecx size or size-1

- C edx

- C esi y odd

- C edi common twos

- C ebp

- movl PARAM_SIZE, %eax

- pushl %esi FRAME_pushl()

- pushl %eax FRAME_pushl()

- pushl %ebx FRAME_pushl()

-ifdef(`PIC',`

- nop C code alignment

- call L(movl_eip_ebx)

-L(here):

- addl $_GLOBAL_OFFSET_TABLE_, %ebx

- call GSYM_PREFIX`'mpn_modexact_1_odd@PLT

-',`

- call GSYM_PREFIX`'mpn_modexact_1_odd

-')

- movl %esi, %edx C y odd

- movl SAVE_ESI, %esi

- movl %edi, %ebx C common twos

- movl SAVE_EDI, %edi

- addl $eval(FRAME - FRAME_TWO_OR_MORE), %esp

- orl %eax, %eax

- leal 1(%eax), %ecx

- jnz L(strip_x_and)

- movl %ebx, %ecx

- movl %edx, %eax

- shll %cl, %eax

- popl %ebx

- ret

-ifdef(`PIC',`

-L(movl_eip_ebx):

- movl (%esp), %ebx

- ret_internal

-')

-EPILOGUE()

« no previous file with comments | « gcc/gmp/mpn/x86/fat/diveby3.c ('k') | gcc/gmp/mpn/x86/k6/gmp-mparam.h » ('j') | no next file with comments »