gcc/gmp/mpn/x86_64/mod_34lsub1.asm - Issue 3050029: [gcc] GCC 4.5.0=>4.5.1

Side by Side Diff: gcc/gmp/mpn/x86_64/mod_34lsub1.asm

Issue 3050029: [gcc] GCC 4.5.0=>4.5.1 (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/nacl-toolchain.git

Patch Set: Created 10 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.

2

3 dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007 Free Software Foundation,

4 dnl Inc.

5 dnl

6 dnl This file is part of the GNU MP Library.

7 dnl

8 dnl The GNU MP Library is free software; you can redistribute it and/or

9 dnl modify it under the terms of the GNU Lesser General Public License as

10 dnl published by the Free Software Foundation; either version 3 of the

11 dnl License, or (at your option) any later version.

12 dnl

13 dnl The GNU MP Library is distributed in the hope that it will be useful,

14 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of

15 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

16 dnl Lesser General Public License for more details.

17 dnl

18 dnl You should have received a copy of the GNU Lesser General Public License

19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

20

21 include(`../config.m4')

22

23

24 C cycles/limb

25 C K8,K9: 1.0

26 C K10: 1.12

27 C P4: 3.25

28 C P6-15 (Core2): 1.5

29 C P6-28 (Atom): 2.5

30

31

32 C INPUT PARAMETERS

33 C up rdi

34 C n rsi

35

36 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)

37

38 C TODO

39 C * Apply the movzwl tricks to the x86/k7 code

40 C * Review feed-in and wind-down code. In particular, try to avoid adcq and

41 C sbbq to placate Pentium4.

42 C * More unrolling and/or index addressing could bring time to under 1 c/l

43 C for Athlon64, approaching 0.67 c/l seems possible.

44 C * There are recurrences on the carry registers (r8, r9, r10) that might

45 C be the limiting factor for the Pentium4 speed. Splitting these into 6

46 C registers would help.

47 C * For ultimate Athlon64 performance, a sequence like this might be best.

48 C It should reach 0.5 c/l (limited by L1 cache bandwidth).

49 C

50 C addq (%rdi), %rax

51 C adcq 8(%rdi), %rcx

52 C adcq 16(%rdi), %rdx

53 C adcq $0, %r8

54 C addq 24(%rdi), %rax

55 C adcq 32(%rdi), %rcx

56 C adcq 40(%rdi), %rdx

57 C adcq $0, %r8

58 C ...

59

60

61 ASM_START()

62 TEXT

63 ALIGN(32)

64 PROLOGUE(mpn_mod_34lsub1)

65 C Returns a sum of pieces of {up,n} that is congruent to it mod 2^48-1 (no final reduction); "==" below means congruent mod 2^48-1

66 mov $0x0000FFFFFFFFFFFF, %r11 C r11 = 2^48-1, mask for the low 48 bits

67

68 sub $2, %rsi C rsi = n-2; the flags steer both ja and jb below

69 ja L(gt2) C n > 2: general loop

70

71 mov (%rdi), %rax C rax = up[0]; mov leaves the flags from the sub intact

72 nop C NOTE(review): purpose not visible here, presumably padding -- confirm

73 jb L(1) C borrow from the sub, i.e. n < 2: return up[0] as is

74 C n == 2 from here to L(1)

75 mov 8(%rdi), %rsi C rsi = up[1]

76 mov %rax, %rdx

77 shr $48, %rax C src[0] high (top 16 bits, weight 2^48 == 1)

78

79 and %r11, %rdx C src[0] low (low 48 bits)

80 add %rdx, %rax

81 mov %esi, %edx C edx = low 32 bits of src[1], zero-extended into rdx

82

83 shr $32, %rsi C src[1] high (weight 2^96 == 1)

84 add %rsi, %rax

85

86 shl $16, %rdx C src[1] low (weight 2^64 == 2^16)

87 add %rdx, %rax

88

89 L(1): ret

90

91 C n > 2: rax/rcx/rdx sum the limbs with index 0/1/2 mod 3; r10/r8/r9 count their carries-out

92 ALIGN(16)

93 L(gt2): xor %eax, %eax C 32-bit xor also clears the upper half

94 xor %ecx, %ecx

95 xor %edx, %edx

96 xor %r8, %r8

97 xor %r9, %r9

98 xor %r10, %r10

99

100 L(top): add (%rdi), %rax

101 adc $0, %r10 C count carries out of rax

102 add 8(%rdi), %rcx

103 adc $0, %r8 C count carries out of rcx

104 add 16(%rdi), %rdx

105 adc $0, %r9 C count carries out of rdx

106

107 sub $3,%rsi C three limbs consumed

108 jng L(end) C done; 24(%rdi) is the next unread limb, if any

109

110 add 24(%rdi), %rax

111 adc $0, %r10

112 add 32(%rdi), %rcx

113 adc $0, %r8

114 add 40(%rdi), %rdx

115 lea 48(%rdi), %rdi C advance six limbs; lea leaves CF for the adc below

116 adc $0, %r9

117

118 sub $3,%rsi

119 jg L(top)

120

121

122 add $-24, %rdi C rewind so 24(%rdi) is the next unread limb, as on the jng exit

123 L(end): add %r9, %rax C fold carry counts: carries out of rdx wrap to rax (2^192 == 1)

124 adc %r10, %rcx C carries out of rax belong in rcx

125 adc %r8, %rdx C carries out of rcx belong in rdx; final CF is consumed below

126

127 inc %rsi C inc/dec leave CF untouched

128 mov $0x1, %r10d C weight of a carry out of rdx: 2^192 == 1

129 js L(combine) C no leftover limb

130

131 mov $0x10000, %r10d C weight of a carry out of rax: 2^64 == 2^16

132 adc 24(%rdi), %rax C first leftover limb plus the pending carry

133 dec %rsi

134 js L(combine) C exactly one leftover limb

135

136 adc 32(%rdi), %rcx C second leftover limb plus the pending carry

137 mov $0x100000000, %r10 C weight of a carry out of rcx: 2^128 == 2^32

138

139 L(combine):

140 sbb %rsi, %rsi C carry: rsi = -CF (all ones if the last adc carried)

141 mov %rax, %rdi C 0mod3

142 shr $48, %rax C 0mod3 high

143

144 and %r10, %rsi C carry masked: its weight if set, else 0

145 and %r11, %rdi C 0mod3 low

146 mov %ecx, %r10d C 1mod3 (low 32 bits, zero-extended)

147

148 add %rsi, %rax C apply carry

149 shr $32, %rcx C 1mod3 high (weight 2^96 == 1)

150

151 add %rdi, %rax C apply 0mod3 low

152 movzwl %dx, %edi C 2mod3 (low 16 bits, zero-extended)

153 shl $16, %r10 C 1mod3 low (weight 2^64 == 2^16)

154

155 add %rcx, %rax C apply 1mod3 high

156 shr $16, %rdx C 2mod3 high (weight 2^144 == 1)

157

158 add %r10, %rax C apply 1mod3 low

159 shl $32, %rdi C 2mod3 low (weight 2^128 == 2^32)

160

161 add %rdx, %rax C apply 2mod3 high

162 add %rdi, %rax C apply 2mod3 low

163

164 ret

165 EPILOGUE()

OLD	NEW

« no previous file with comments | « gcc/gmp/mpn/x86_64/lshsub_n.asm ('k') | gcc/gmp/mpn/x86_64/mul_1.asm » ('j') | no next file with comments »