gcc/gmp/mpn/x86/pentium/mmx/rshift.asm - Issue 3050029: [gcc] GCC 4.5.0=>4.5.1

Side by Side Diff: gcc/gmp/mpn/x86/pentium/mmx/rshift.asm

Issue 3050029: [gcc] GCC 4.5.0=>4.5.1 (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/nacl-toolchain.git

Patch Set: Created 10 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 dnl Intel P5 mpn_rshift -- mpn right shift.

2

3 dnl Copyright 2000, 2002 Free Software Foundation, Inc.

4 dnl

5 dnl This file is part of the GNU MP Library.

6 dnl

7 dnl The GNU MP Library is free software; you can redistribute it and/or

8 dnl modify it under the terms of the GNU Lesser General Public License as

9 dnl published by the Free Software Foundation; either version 3 of the

10 dnl License, or (at your option) any later version.

11 dnl

12 dnl The GNU MP Library is distributed in the hope that it will be useful,

13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of

14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

15 dnl Lesser General Public License for more details.

16 dnl

17 dnl You should have received a copy of the GNU Lesser General Public License

18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

19

20 include(`../config.m4')

21

22

23 C P5: 1.75 cycles/limb.

24

25

26 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,

27 C unsigned shift);

28 C

29 C Shift src,size right by shift many bits and store the result in dst,size.

30 C Zeros are shifted in at the left. Return the bits shifted out at the

31 C right.

32 C

33 C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,

34 C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.

35 C

36 C Full speed depends on source and destination being aligned. Unaligned mmx

37 C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy

38 C setups and finish-ups are done to ensure alignment for the loop.

39 C

40 C MMX shifts work out a bit faster even for the simple loop.

41

42 defframe(PARAM_SHIFT,16)

43 defframe(PARAM_SIZE, 12)

44 defframe(PARAM_SRC, 8)

45 defframe(PARAM_DST, 4)

46 deflit(`FRAME',0)

47

48 dnl Minimum 5, because the unrolled loop can't handle less.

49 deflit(UNROLL_THRESHOLD, 5)

50

51 TEXT

52 ALIGN(8)

53

54 PROLOGUE(mpn_rshift)

55

56 pushl %ebx

57 pushl %edi

58 deflit(`FRAME',8)

59

60 movl PARAM_SIZE, %eax

61 movl PARAM_DST, %edx

62

63 movl PARAM_SRC, %ebx

64 movl PARAM_SHIFT, %ecx

65

66 cmp $UNROLL_THRESHOLD, %eax

67 jae L(unroll)

68

69 decl %eax

70 movl (%ebx), %edi C src low limb

71

72 jnz L(simple)

73

74 shrdl( %cl, %edi, %eax) C eax was decremented to zero

75

76 shrl %cl, %edi

77

78 movl %edi, (%edx) C dst low limb

79 popl %edi C risk of data cache bank clash

80

81 popl %ebx

82

83 ret

84

85

86 C -----------------------------------------------------------------------------

87 ALIGN(8)

88 L(simple):

89 C eax size-1

90 C ebx src

91 C ecx shift

92 C edx dst

93 C esi

94 C edi

95 C ebp

96 deflit(`FRAME',8)

97

98 movd (%ebx), %mm5 C src[0]

99 leal (%ebx,%eax,4), %ebx C &src[size-1]

100

101 movd %ecx, %mm6 C rshift

102 leal -4(%edx,%eax,4), %edx C &dst[size-2]

103

104 psllq $32, %mm5

105 negl %eax

106

107

108 C This loop is 5 or 8 cycles, with every second load unaligned and a wasted

109 C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4

110 C cycles and would be 8 in a simple loop. Using mmx helps the return value

111 C and last limb calculations too.

112

113 L(simple_top):

114 C eax counter, limbs, negative

115 C ebx &src[size-1]

116 C ecx return value

117 C edx &dst[size-2]

118 C

119 C mm0 scratch

120 C mm5 return value

121 C mm6 shift

122

123 movq (%ebx,%eax,4), %mm0

124 incl %eax

125

126 psrlq %mm6, %mm0

127

128 movd %mm0, (%edx,%eax,4)

129 jnz L(simple_top)

130

131

132 movd (%ebx), %mm0

133 psrlq %mm6, %mm5 C return value

134

135 psrlq %mm6, %mm0

136 popl %edi

137

138 movd %mm5, %eax

139 popl %ebx

140

141 movd %mm0, 4(%edx)

142

143 emms

144

145 ret

146

147

148 C -----------------------------------------------------------------------------

149 ALIGN(8)

150 L(unroll):

151 C eax size

152 C ebx src

153 C ecx shift

154 C edx dst

155 C esi

156 C edi

157 C ebp

158 deflit(`FRAME',8)

159

160 movd (%ebx), %mm5 C src[0]

161 movl $4, %edi

162

163 movd %ecx, %mm6 C rshift

164 testl %edi, %ebx

165

166 psllq $32, %mm5

167 jz L(start_src_aligned)

168

169

170 C src isn't aligned, process low limb separately (marked xxx) and

171 C step src and dst by one limb, making src aligned.

172 C

173 C source ebx

174 C --+-------+-------+-------+

175 C \| xxx \|

176 C --+-------+-------+-------+

177 C 4mod8 0mod8 4mod8

178 C

179 C dest edx

180 C --+-------+-------+

181 C \| \| xxx \|

182 C --+-------+-------+

183

184 movq (%ebx), %mm0 C unaligned load

185

186 psrlq %mm6, %mm0

187 addl $4, %ebx

188

189 decl %eax

190

191 movd %mm0, (%edx)

192 addl $4, %edx

193 L(start_src_aligned):

194

195

196 movq (%ebx), %mm1

197 testl %edi, %edx

198

199 psrlq %mm6, %mm5 C retval

200 jz L(start_dst_aligned)

201

202 C dst isn't aligned, add 4 to make it so, and pretend the shift is

203 C 32 bits extra. Low limb of dst (marked xxx) handled here

204 C separately.

205 C

206 C source ebx

207 C --+-------+-------+

208 C \| mm1 \|

209 C --+-------+-------+

210 C 4mod8 0mod8

211 C

212 C dest edx

213 C --+-------+-------+-------+

214 C \| xxx \|

215 C --+-------+-------+-------+

216 C 4mod8 0mod8 4mod8

217

218 movq %mm1, %mm0

219 addl $32, %ecx C new shift

220

221 psrlq %mm6, %mm0

222

223 movd %ecx, %mm6

224

225 movd %mm0, (%edx)

226 addl $4, %edx

227 L(start_dst_aligned):

228

229

230 movq 8(%ebx), %mm3

231 negl %ecx

232

233 movq %mm3, %mm2 C mm2 src qword

234 addl $64, %ecx

235

236 movd %ecx, %mm7

237 psrlq %mm6, %mm1

238

239 leal -12(%ebx,%eax,4), %ebx

240 leal -20(%edx,%eax,4), %edx

241

242 psllq %mm7, %mm3

243 subl $7, %eax C size-7

244

245 por %mm1, %mm3 C mm3 ready to store

246 negl %eax C -(size-7)

247

248 jns L(finish)

249

250

251 C This loop is the important bit, the rest is just support. Careful

252 C instruction scheduling achieves the claimed 1.75 c/l. The

253 C relevant parts of the pairing rules are:

254 C

255 C - mmx loads and stores execute only in the U pipe

256 C - only one mmx shift in a pair

257 C - wait one cycle before storing an mmx register result

258 C - the usual address generation interlock

259 C

260 C Two qword calculations are slightly interleaved. The instructions

261 C marked "C" belong to the second qword, and the "C prev" one is for

262 C the second qword from the previous iteration.

263

264 ALIGN(8)

265 L(unroll_loop):

266 C eax counter, limbs, negative

267 C ebx &src[size-12]

268 C ecx

269 C edx &dst[size-12]

270 C esi

271 C edi

272 C

273 C mm0

274 C mm1

275 C mm2 src qword from -8(%ebx,%eax,4)

276 C mm3 dst qword ready to store to -8(%edx,%eax,4)

277 C

278 C mm5 return value

279 C mm6 rshift

280 C mm7 lshift

281

282 movq (%ebx,%eax,4), %mm0

283 psrlq %mm6, %mm2

284

285 movq %mm0, %mm1

286 psllq %mm7, %mm0

287

288 movq %mm3, -8(%edx,%eax,4) C prev

289 por %mm2, %mm0

290

291 movq 8(%ebx,%eax,4), %mm3 C

292 psrlq %mm6, %mm1 C

293

294 movq %mm0, (%edx,%eax,4)

295 movq %mm3, %mm2 C

296

297 psllq %mm7, %mm3 C

298 addl $4, %eax

299

300 por %mm1, %mm3 C

301 js L(unroll_loop)

302

303

304 L(finish):

305 C eax 0 to 3 representing respectively 3 to 0 limbs remaining

306

307 testb $2, %al

308

309 jnz L(finish_no_two)

310

311 movq (%ebx,%eax,4), %mm0

312 psrlq %mm6, %mm2

313

314 movq %mm0, %mm1

315 psllq %mm7, %mm0

316

317 movq %mm3, -8(%edx,%eax,4) C prev

318 por %mm2, %mm0

319

320 movq %mm1, %mm2

321 movq %mm0, %mm3

322

323 addl $2, %eax

324 L(finish_no_two):

325

326

327 C eax 2 or 3 representing respectively 1 or 0 limbs remaining

328 C

329 C mm2 src prev qword, from -8(%ebx,%eax,4)

330 C mm3 dst qword, for -8(%edx,%eax,4)

331

332 testb $1, %al

333 popl %edi

334

335 movd %mm5, %eax C retval

336 jnz L(finish_zero)

337

338

339 C One extra limb, destination was aligned.

340 C

341 C source ebx

342 C +-------+---------------+--

343 C \| \| mm2 \|

344 C +-------+---------------+--

345 C

346 C dest edx

347 C +-------+---------------+---------------+--

348 C \| \| \| mm3 \|

349 C +-------+---------------+---------------+--

350 C

351 C mm6 = shift

352 C mm7 = ecx = 64-shift

353

354

355 C One extra limb, destination was unaligned.

356 C

357 C source ebx

358 C +-------+---------------+--

359 C \| \| mm2 \|

360 C +-------+---------------+--

361 C

362 C dest edx

363 C +---------------+---------------+--

364 C \| \| mm3 \|

365 C +---------------+---------------+--

366 C

367 C mm6 = shift+32

368 C mm7 = ecx = 64-(shift+32)

369

370

371 C In both cases there's one extra limb of src to fetch and combine

372 C with mm2 to make a qword at 8(%edx), and in the aligned case

373 C there's a further extra limb of dst to be formed.

374

375

376 movd 8(%ebx), %mm0

377 psrlq %mm6, %mm2

378

379 movq %mm0, %mm1

380 psllq %mm7, %mm0

381

382 movq %mm3, (%edx)

383 por %mm2, %mm0

384

385 psrlq %mm6, %mm1

386 andl $32, %ecx

387

388 popl %ebx

389 jz L(finish_one_unaligned)

390

391 C dst was aligned, must store one extra limb

392 movd %mm1, 16(%edx)

393 L(finish_one_unaligned):

394

395 movq %mm0, 8(%edx)

396

397 emms

398

399 ret

400

401

402 L(finish_zero):

403

404 C No extra limbs, destination was aligned.

405 C

406 C source ebx

407 C +---------------+--

408 C \| mm2 \|

409 C +---------------+--

410 C

411 C dest edx+4

412 C +---------------+---------------+--

413 C \| \| mm3 \|

414 C +---------------+---------------+--

415 C

416 C mm6 = shift

417 C mm7 = ecx = 64-shift

418

419

420 C No extra limbs, destination was unaligned.

421 C

422 C source ebx

423 C +---------------+--

424 C \| mm2 \|

425 C +---------------+--

426 C

427 C dest edx+4

428 C +-------+---------------+--

429 C \| \| mm3 \|

430 C +-------+---------------+--

431 C

432 C mm6 = shift+32

433 C mm7 = 64-(shift+32)

434

435

436 C The movd for the unaligned case is clearly the same data as the

437 C movq for the aligned case, it's just a choice between whether one

438 C or two limbs should be written.

439

440

441 movq %mm3, 4(%edx)

442 psrlq %mm6, %mm2

443

444 movd %mm2, 12(%edx)

445 andl $32, %ecx

446

447 popl %ebx

448 jz L(finish_zero_unaligned)

449

450 movq %mm2, 12(%edx)

451 L(finish_zero_unaligned):

452

453 emms

454

455 ret

456

457 EPILOGUE()

OLD	NEW

« no previous file with comments | « gcc/gmp/mpn/x86/pentium/mmx/mul_1.asm ('k') | gcc/gmp/mpn/x86/pentium/mode1o.asm » ('j') | no next file with comments »