gcc/gmp/mpn/x86/k7/gcd_1.asm - Issue 3050029: [gcc] GCC 4.5.0=>4.5.1

Unified Diff: gcc/gmp/mpn/x86/k7/gcd_1.asm

Issue 3050029: [gcc] GCC 4.5.0=>4.5.1 (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/nacl-toolchain.git

Patch Set: Created 10 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: gcc/gmp/mpn/x86/k7/gcd_1.asm

diff --git a/gcc/gmp/mpn/x86/k7/gcd_1.asm b/gcc/gmp/mpn/x86/k7/gcd_1.asm

deleted file mode 100644

index f912f43730411d68d3d1443fc26973b6285f0c23..0000000000000000000000000000000000000000

--- a/gcc/gmp/mpn/x86/k7/gcd_1.asm

+++ /dev/null

@@ -1,369 +0,0 @@

-dnl AMD K7 mpn_gcd_1 -- mpn by 1 gcd.

-dnl

-dnl This file is part of the GNU MP Library.

-dnl

-dnl The GNU MP Library is free software; you can redistribute it and/or

-dnl modify it under the terms of the GNU Lesser General Public License as

-dnl published by the Free Software Foundation; either version 3 of the

-dnl License, or (at your option) any later version.

-dnl

-dnl The GNU MP Library is distributed in the hope that it will be useful,

-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of

-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

-dnl Lesser General Public License for more details.

-dnl

-dnl You should have received a copy of the GNU Lesser General Public License

-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

-include(`../config.m4')

-C K7: 6.75 cycles/bit (approx) 1x1 gcd

-C 11.0 cycles/limb Nx1 reduction (modexact_1_odd)

-dnl Reduce using x%y if x is more than DIV_THRESHOLD bits bigger than y,

-dnl where x is the larger of the two. See tune/README for more.

-dnl

-dnl divl at 40 cycles compared to the gcd at about 7 cycles/bitpair

-dnl suggests 40/7*2=11.4 but 7 seems to be about right.

-deflit(DIV_THRESHOLD, 7)

-C table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.

-C

-C This is mixed in with the code, but as per the k7 optimization manual it's

-C a full cache line and suitably aligned so it won't get swapped between

-C code and data. Having it in TEXT rather than RODATA saves needing a GOT

-C entry when PIC.

-C

-C Actually, there doesn't seem to be a measurable difference between this in

-C its own cache line or plonked in the middle of the code. Presumably

-C since TEXT is read-only there are no worries about coherency.

-deflit(MASK, 63)

-deflit(MAXSHIFT, 6)

- TEXT

- ALIGN(64)

-L(table):

- .byte MAXSHIFT

-forloop(i,1,MASK,

-` .byte m4_count_trailing_zeros(i)

-')

-C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t limb);

-C

-defframe(PARAM_LIMB, 12)

-defframe(PARAM_SIZE, 8)

-defframe(PARAM_SRC, 4)

-defframe(SAVE_EBX, -4)

-defframe(SAVE_ESI, -8)

-defframe(SAVE_EDI, -12)

-defframe(SAVE_EBP, -16)

-defframe(CALL_DIVISOR,-20)

-defframe(CALL_SIZE, -24)

-defframe(CALL_SRC, -28)

-deflit(STACK_SPACE, 28)

- TEXT

- ALIGN(16)

-PROLOGUE(mpn_gcd_1)

-deflit(`FRAME',0)

- ASSERT(ne, `cmpl $0, PARAM_LIMB') C y!=0

- ASSERT(ae, `cmpl $1, PARAM_SIZE') C size>=1

- movl PARAM_SRC, %eax

- movl PARAM_LIMB, %edx

- subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)

- movl %esi, SAVE_ESI

- movl %ebx, SAVE_EBX

- movl (%eax), %esi C src low limb

-ifdef(`PIC',`

- movl %edi, SAVE_EDI

- call L(movl_eip_to_edi)

-L(here):

- addl $L(table)-L(here), %edi

-')

- movl %esi, %ebx

- orl %edx, %esi C x|y

- movl $-1, %ecx

-L(twos):

- incl %ecx

- shrl %esi

- jnc L(twos) C 3/4 chance of x or y odd already

- shrl %cl, %ebx

- shrl %cl, %edx

- movl %ecx, %esi C common twos

- movl PARAM_SIZE, %ecx

- cmpl $1, %ecx

- ja L(divide)

- C eax

- C ebx x

- C ecx

- C edx y

- C esi common twos

- C edi [PIC] L(table)

- C ebp

- movl %edx, %eax

- cmpl %ebx, %edx

- cmovb( %ebx, %eax) C swap to make x bigger than y

- cmovb( %edx, %ebx)

-L(strip_y):

- C eax x

- C ebx y

- C ecx

- C edx

- C esi common twos

- C edi [PIC] L(table)

- C ebp

- ASSERT(nz,`orl %ebx,%ebx')

- shrl %ebx

- jnc L(strip_y)

- rcll %ebx

- C eax x

- C ebx y (odd)

- C ecx

- C edx

- C esi common twos

- C edi [PIC] L(table)

- C ebp

- movl %eax, %ecx

- movl %ebx, %edx

- shrl $DIV_THRESHOLD, %eax

- cmpl %eax, %ebx

- movl %ecx, %eax

- ja L(strip_x_entry) C do x%y if x much bigger than y

- xorl %edx, %edx

- divl %ebx

- orl %edx, %edx

- movl %edx, %eax C remainder -> x

- movl %ebx, %edx C y

- jz L(done_ebx)

- jmp L(strip_x)

- C Offset 0x9D here for non-PIC. About 0.4 cycles/bit is saved by

- C ensuring the end of the jnz at the end of this loop doesn't cross

- C into the next cache line at 0xC0.

- C

- C PIC on the other hand is offset 0xAC here and extends to 0xC9, so

- C it crosses but doesn't suffer any measurable slowdown.

-L(top):

- C eax x

- C ebx y-x

- C ecx x-y

- C edx y

- C esi twos, for use at end

- C edi [PIC] L(table)

- cmovc( %ebx, %ecx) C if x-y gave carry, use x and y-x

- cmovc( %eax, %edx)

-L(strip_x):

- movl %ecx, %eax

-L(strip_x_entry):

- andl $MASK, %ecx

- ASSERT(nz, `orl %eax, %eax')

-ifdef(`PIC',`

- movb (%ecx,%edi), %cl

-',`

- movb L(table) (%ecx), %cl

-')

- shrl %cl, %eax

- cmpb $MAXSHIFT, %cl

- movl %eax, %ecx

- movl %edx, %ebx

- je L(strip_x)

- ASSERT(nz, `testl $1, %eax') C both odd

- ASSERT(nz, `testl $1, %edx')

- subl %eax, %ebx

- subl %edx, %ecx

- jnz L(top)

-L(done):

- movl %esi, %ecx

- movl SAVE_ESI, %esi

-ifdef(`PIC',`

- movl SAVE_EDI, %edi

-')

- shll %cl, %eax

- movl SAVE_EBX, %ebx

- addl $FRAME, %esp

- ret

-C -----------------------------------------------------------------------------

-C two or more limbs

-dnl MODEXACT_THRESHOLD is the size at which it's better to call

-dnl mpn_modexact_1_odd than do an inline loop.

-deflit(MODEXACT_THRESHOLD, ifdef(`PIC',6,5))

-L(divide):

- C eax src

- C ebx

- C ecx size

- C edx y

- C esi common twos

- C edi [PIC] L(table)

- C ebp

-L(divide_strip_y):

- ASSERT(nz,`orl %edx,%edx')

- shrl %edx

- jnc L(divide_strip_y)

- leal 1(%edx,%edx), %ebx C y now odd

- movl %ebp, SAVE_EBP

- movl %eax, %ebp

- movl -4(%eax,%ecx,4), %eax C src high limb

- cmp $MODEXACT_THRESHOLD, %ecx

- jae L(modexact)

- cmpl %ebx, %eax C high cmp divisor

- movl $0, %edx

- cmovc( %eax, %edx) C skip a div if high<divisor

- sbbl $0, %ecx

-L(divide_top):

- C eax scratch (quotient)

- C ebx y

- C ecx counter (size to 1, inclusive)

- C edx carry (remainder)

- C esi common twos

- C edi [PIC] L(table)

- C ebp src

- movl -4(%ebp,%ecx,4), %eax

- divl %ebx

- decl %ecx

- jnz L(divide_top)

- C eax

- C ebx y (odd)

- C ecx

- C edx x

- C esi common twos

- C edi [PIC] L(table)

- C ebp

- orl %edx, %edx

- movl SAVE_EBP, %ebp

- movl %edx, %eax

- movl %edx, %ecx

- movl %ebx, %edx

- jnz L(strip_x_entry)

-L(done_ebx):

- movl %ebx, %eax

- jmp L(done)

-L(modexact):

- C eax

- C ebx y

- C ecx size

- C edx

- C esi common twos

- C edi [PIC] L(table)

- C ebp src

-ifdef(`PIC',`

- movl %ebp, CALL_SRC

- movl %ebx, %ebp C y

- movl %edi, %ebx C L(table)

- addl $_GLOBAL_OFFSET_TABLE_+[.-L(table)], %ebx

- movl %ebp, CALL_DIVISOR

- movl %ecx, CALL_SIZE

- call GSYM_PREFIX`'mpn_modexact_1_odd@PLT

-',`

-dnl non-PIC

- movl %ebx, CALL_DIVISOR

- movl %ebp, CALL_SRC

- movl %ecx, CALL_SIZE

- call GSYM_PREFIX`'mpn_modexact_1_odd

-')

- C eax x

- C ebx [non-PIC] y

- C ecx

- C edx

- C esi common twos

- C edi [PIC] L(table)

- C ebp [PIC] y

- orl %eax, %eax

- movl ifdef(`PIC',`%ebp',`%ebx'), %edx

- movl SAVE_EBP, %ebp

- movl %eax, %ecx

- jnz L(strip_x_entry)

- movl %edx, %eax

- jmp L(done)

-ifdef(`PIC', `

-L(movl_eip_to_edi):

- movl (%esp), %edi

- ret_internal

-')

-EPILOGUE()

« no previous file with comments | « gcc/gmp/mpn/x86/k7/dive_1.asm ('k') | gcc/gmp/mpn/x86/k7/mmx/com_n.asm » ('j') | no next file with comments »