| OLD | NEW |
| (Empty) |
| 1 dnl Intel P5 mpn_rshift -- mpn right shift. | |
| 2 | |
| 3 dnl Copyright 2000, 2002 Free Software Foundation, Inc. | |
| 4 dnl | |
| 5 dnl This file is part of the GNU MP Library. | |
| 6 dnl | |
| 7 dnl The GNU MP Library is free software; you can redistribute it and/or | |
| 8 dnl modify it under the terms of the GNU Lesser General Public License as | |
| 9 dnl published by the Free Software Foundation; either version 3 of the | |
| 10 dnl License, or (at your option) any later version. | |
| 11 dnl | |
| 12 dnl The GNU MP Library is distributed in the hope that it will be useful, | |
| 13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 15 dnl Lesser General Public License for more details. | |
| 16 dnl | |
| 17 dnl You should have received a copy of the GNU Lesser General Public License | |
| 18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. | |
| 19 | |
| 20 include(`../config.m4') | |
| 21 | |
| 22 | |
| 23 C P5: 1.75 cycles/limb. | |
| 24 | |
| 25 | |
| 26 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, | |
| 27 C unsigned shift); | |
| 28 C | |
| 29 C Shift src,size right by shift many bits and store the result in dst,size. | |
| 30 C Zeros are shifted in at the left. Return the bits shifted out at the | |
| 31 C right. | |
| 32 C | |
| 33 C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb, | |
| 34 C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l. | |
| 35 C | |
| 36 C Full speed depends on source and destination being aligned. Unaligned mmx | |
| 37 C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy | |
| 38 C setups and finish-ups are done to ensure alignment for the loop. | |
| 39 C | |
| 40 C MMX shifts work out a bit faster even for the simple loop. | |
| 41 | |
| 42 defframe(PARAM_SHIFT,16) C shift count argument, loaded into ecx at entry | |
| 43 defframe(PARAM_SIZE, 12) C size argument in limbs, loaded into eax at entry | |
| 44 defframe(PARAM_SRC, 8) C source pointer argument, loaded into ebx at entry | |
| 45 defframe(PARAM_DST, 4) C destination pointer argument, loaded into edx at entry | |
| 46 deflit(`FRAME',0) C NOTE(review): presumably the bytes pushed so far, used to locate the PARAM_ names; confirm in the included m4 definitions | |
| 47 C UNROLL_THRESHOLD is the smallest size sent to L(unroll) by the cmp and jae at function entry. | |
| 48 dnl Minimum 5, because the unrolled loop can't handle less. | |
| 49 deflit(UNROLL_THRESHOLD, 5) | |
| 50 | |
| 51 TEXT | |
| 52 ALIGN(8) | |
| 53 C Entry point. Dispatches on size: L(unroll) for UNROLL_THRESHOLD or more, L(simple) for smaller sizes above 1, and size 1 is finished right here. | |
| 54 PROLOGUE(mpn_rshift) | |
| 55 C ebx and edi are used as work registers, every exit path pops both again before its ret. | |
| 56 pushl %ebx | |
| 57 pushl %edi | |
| 58 deflit(`FRAME',8) | |
| 59 C Fetch the arguments: eax = size, edx = dst, ebx = src, ecx = shift. | |
| 60 movl PARAM_SIZE, %eax | |
| 61 movl PARAM_DST, %edx | |
| 62 | |
| 63 movl PARAM_SRC, %ebx | |
| 64 movl PARAM_SHIFT, %ecx | |
| 65 C NOTE(review): size == 0 gets no special handling anywhere below, presumably callers guarantee size >= 1; confirm. | |
| 66 cmp $UNROLL_THRESHOLD, %eax | |
| 67 jae L(unroll) C unsigned compare, taken when size >= UNROLL_THRESHOLD | |
| 68 | |
| 69 decl %eax C eax = size-1, zero flag set when size == 1 | |
| 70 movl (%ebx), %edi C src low limb | |
| 71 | |
| 72 jnz L(simple) C flags are still those of the decl | |
| 73 C size == 1: the shrdl shifts the low bits of edi into the zeroed eax, which becomes the return value. | |
| 74 shrdl( %cl, %edi, %eax) C eax was decremented to zero | |
| 75 | |
| 76 shrl %cl, %edi | |
| 77 | |
| 78 movl %edi, (%edx) C dst low limb | |
| 79 popl %edi C risk of data cache bank clash | |
| 80 | |
| 81 popl %ebx | |
| 82 | |
| 83 ret | |
| 84 | |
| 85 | |
| 86 C ----------------------------------------------------------------------------- | |
| 87 ALIGN(8) | |
| 88 L(simple): | |
| 89 C eax size-1 | |
| 90 C ebx src | |
| 91 C ecx shift | |
| 92 C edx dst | |
| 93 C esi | |
| 94 C edi | |
| 95 C ebp | |
| 96 deflit(`FRAME',8) | |
| 97 C Used for sizes of 2 or more that are below UNROLL_THRESHOLD, one limb is stored per pass of the loop. | |
| 98 movd (%ebx), %mm5 C src[0] | |
| 99 leal (%ebx,%eax,4), %ebx C &src[size-1] | |
| 100 | |
| 101 movd %ecx, %mm6 C rshift | |
| 102 leal -4(%edx,%eax,4), %edx C &dst[size-2] | |
| 103 | |
| 104 psllq $32, %mm5 C src[0] moved to the high dword, the later psrlq drops the shifted-out bits into the low dword | |
| 105 negl %eax C eax = -(size-1), counted up to zero by the loop | |
| 106 | |
| 107 | |
| 108 C This loop is 5 or 8 cycles, with every second load unaligned and a wasted | |
| 109 C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4 | |
| 110 C cycles and would be 8 in a simple loop. Using mmx helps the return value | |
| 111 C and last limb calculations too. | |
| 112 | |
| 113 L(simple_top): | |
| 114 C eax counter, limbs, negative | |
| 115 C ebx &src[size-1] | |
| 116 C ecx shift (not used in the loop, mm6 holds the copy) | |
| 117 C edx &dst[size-2] | |
| 118 C | |
| 119 C mm0 scratch | |
| 120 C mm5 return value | |
| 121 C mm6 shift | |
| 122 | |
| 123 movq (%ebx,%eax,4), %mm0 C two adjacent src limbs as one qword | |
| 124 incl %eax | |
| 125 | |
| 126 psrlq %mm6, %mm0 | |
| 127 | |
| 128 movd %mm0, (%edx,%eax,4) C low dword is the finished dst limb | |
| 129 jnz L(simple_top) C flags are from the incl | |
| 130 | |
| 131 C Top limb is loaded on its own with movd, so zeros are shifted in at the left. | |
| 132 movd (%ebx), %mm0 C src[size-1] | |
| 133 psrlq %mm6, %mm5 C return value | |
| 134 | |
| 135 psrlq %mm6, %mm0 | |
| 136 popl %edi | |
| 137 | |
| 138 movd %mm5, %eax C low dword of mm5 holds the bits shifted out of src[0] | |
| 139 popl %ebx | |
| 140 | |
| 141 movd %mm0, 4(%edx) C dst[size-1] | |
| 142 | |
| 143 emms | |
| 144 | |
| 145 ret | |
| 146 | |
| 147 | |
| 148 C ----------------------------------------------------------------------------- | |
| 149 ALIGN(8) | |
| 150 L(unroll): | |
| 151 C eax size | |
| 152 C ebx src | |
| 153 C ecx shift | |
| 154 C edx dst | |
| 155 C esi | |
| 156 C edi | |
| 157 C ebp | |
| 158 deflit(`FRAME',8) | |
| 159 C Setup for the unrolled loop: step src and then dst past a 4mod8 start, then prime mm2 and mm3 for the loop. | |
| 160 movd (%ebx), %mm5 C src[0] | |
| 161 movl $4, %edi C edi = 4, the address bit tested for 4mod8 | |
| 162 | |
| 163 movd %ecx, %mm6 C rshift | |
| 164 testl %edi, %ebx C zero flag set when the 4 bit of src is clear | |
| 165 | |
| 166 psllq $32, %mm5 | |
| 167 jz L(start_src_aligned) C flags are from the testl | |
| 168 | |
| 169 | |
| 170 C src isn't aligned, process low limb separately (marked xxx) and | |
| 171 C step src and dst by one limb, making src aligned. | |
| 172 C | |
| 173 C source ebx | |
| 174 C --+-------+-------+-------+ | |
| 175 C | xxx | | |
| 176 C --+-------+-------+-------+ | |
| 177 C 4mod8 0mod8 4mod8 | |
| 178 C | |
| 179 C dest edx | |
| 180 C --+-------+-------+ | |
| 181 C | | xxx | | |
| 182 C --+-------+-------+ | |
| 183 | |
| 184 movq (%ebx), %mm0 C unaligned load | |
| 185 | |
| 186 psrlq %mm6, %mm0 | |
| 187 addl $4, %ebx | |
| 188 | |
| 189 decl %eax C one limb fewer left to do | |
| 190 | |
| 191 movd %mm0, (%edx) C the dst limb marked xxx | |
| 192 addl $4, %edx | |
| 193 L(start_src_aligned): | |
| 194 | |
| 195 | |
| 196 movq (%ebx), %mm1 C first qword from the now aligned src | |
| 197 testl %edi, %edx C zero flag set when the 4 bit of dst is clear | |
| 198 | |
| 199 psrlq %mm6, %mm5 C retval | |
| 200 jz L(start_dst_aligned) C flags are from the testl | |
| 201 | |
| 202 C dst isn't aligned, add 4 to make it so, and pretend the shift is | |
| 203 C 32 bits extra. Low limb of dst (marked xxx) handled here | |
| 204 C separately. | |
| 205 C | |
| 206 C source ebx | |
| 207 C --+-------+-------+ | |
| 208 C | mm1 | | |
| 209 C --+-------+-------+ | |
| 210 C 4mod8 0mod8 | |
| 211 C | |
| 212 C dest edx | |
| 213 C --+-------+-------+-------+ | |
| 214 C | xxx | | |
| 215 C --+-------+-------+-------+ | |
| 216 C 4mod8 0mod8 4mod8 | |
| 217 | |
| 218 movq %mm1, %mm0 | |
| 219 addl $32, %ecx C new shift | |
| 220 | |
| 221 psrlq %mm6, %mm0 C mm6 is still the original shift here | |
| 222 | |
| 223 movd %ecx, %mm6 C mm6 = shift+32 from now on | |
| 224 | |
| 225 movd %mm0, (%edx) C the dst limb marked xxx | |
| 226 addl $4, %edx | |
| 227 L(start_dst_aligned): | |
| 228 | |
| 229 | |
| 230 movq 8(%ebx), %mm3 | |
| 231 negl %ecx | |
| 232 | |
| 233 movq %mm3, %mm2 C mm2 src qword | |
| 234 addl $64, %ecx C ecx = 64 minus the shift count held in mm6 | |
| 235 | |
| 236 movd %ecx, %mm7 C lshift | |
| 237 psrlq %mm6, %mm1 | |
| 238 | |
| 239 leal -12(%ebx,%eax,4), %ebx C &src[size-3], using the current src and size | |
| 240 leal -20(%edx,%eax,4), %edx C &dst[size-5], using the current dst and size | |
| 241 | |
| 242 psllq %mm7, %mm3 | |
| 243 subl $7, %eax C size-7 | |
| 244 | |
| 245 por %mm1, %mm3 C mm3 ready to store | |
| 246 negl %eax C -(size-7) | |
| 247 | |
| 248 jns L(finish) C not negative, so the loop is skipped | |
| 249 | |
| 250 | |
| 251 C This loop is the important bit, the rest is just support. Careful | |
| 252 C instruction scheduling achieves the claimed 1.75 c/l. The | |
| 253 C relevant parts of the pairing rules are: | |
| 254 C | |
| 255 C - mmx loads and stores execute only in the U pipe | |
| 256 C - only one mmx shift in a pair | |
| 257 C - wait one cycle before storing an mmx register result | |
| 258 C - the usual address generation interlock | |
| 259 C | |
| 260 C Two qword calculations are slightly interleaved. The instructions | |
| 261 C marked "C" belong to the second qword, and the "C prev" one is for | |
| 262 C the second qword from the previous iteration. | |
| 263 | |
| 264 ALIGN(8) | |
| 265 L(unroll_loop): | |
| 266 C eax counter, limbs, negative | |
| 267 C ebx &src[size-3] (set by the leal -12 before the loop) | |
| 268 C ecx lshift count, same value as mm7 (tested again after the loop) | |
| 269 C edx &dst[size-5] (set by the leal -20 before the loop) | |
| 270 C esi | |
| 271 C edi | |
| 272 C | |
| 273 C mm0 scratch | |
| 274 C mm1 scratch | |
| 275 C mm2 src qword from -8(%ebx,%eax,4) | |
| 276 C mm3 dst qword ready to store to -8(%edx,%eax,4) | |
| 277 C | |
| 278 C mm5 return value | |
| 279 C mm6 rshift | |
| 280 C mm7 lshift | |
| 281 | |
| 282 movq (%ebx,%eax,4), %mm0 C next src qword | |
| 283 psrlq %mm6, %mm2 C low part, from the previous src qword | |
| 284 | |
| 285 movq %mm0, %mm1 C copy kept for the second qword | |
| 286 psllq %mm7, %mm0 C high part, from this src qword | |
| 287 | |
| 288 movq %mm3, -8(%edx,%eax,4) C prev | |
| 289 por %mm2, %mm0 C first dst qword complete | |
| 290 | |
| 291 movq 8(%ebx,%eax,4), %mm3 C | |
| 292 psrlq %mm6, %mm1 C | |
| 293 | |
| 294 movq %mm0, (%edx,%eax,4) C store the first dst qword | |
| 295 movq %mm3, %mm2 C | |
| 296 | |
| 297 psllq %mm7, %mm3 C | |
| 298 addl $4, %eax C four limbs handled per pass | |
| 299 | |
| 300 por %mm1, %mm3 C | |
| 301 js L(unroll_loop) C flags are from the addl | |
| 302 | |
| 303 | |
| 304 L(finish): | |
| 305 C eax 0 to 3 representing respectively 3 to 0 limbs remaining | |
| 306 | |
| 307 testb $2, %al C bit clear for eax 0 or 1, meaning two or more limbs remain | |
| 308 | |
| 309 jnz L(finish_no_two) | |
| 310 C One more qword, done with the same steps as the first half of the loop body. | |
| 311 movq (%ebx,%eax,4), %mm0 | |
| 312 psrlq %mm6, %mm2 | |
| 313 | |
| 314 movq %mm0, %mm1 | |
| 315 psllq %mm7, %mm0 | |
| 316 | |
| 317 movq %mm3, -8(%edx,%eax,4) C prev | |
| 318 por %mm2, %mm0 | |
| 319 | |
| 320 movq %mm1, %mm2 C this src qword is now the previous one | |
| 321 movq %mm0, %mm3 C its dst qword is now the one pending store | |
| 322 | |
| 323 addl $2, %eax | |
| 324 L(finish_no_two): | |
| 325 | |
| 326 | |
| 327 C eax 2 or 3 representing respectively 1 or 0 limbs remaining | |
| 328 C | |
| 329 C mm2 src prev qword, from -8(%ebx,%eax,4) | |
| 330 C mm3 dst qword, for -8(%edx,%eax,4) | |
| 331 | |
| 332 testb $1, %al C bit set for eax 3, meaning no limbs remain | |
| 333 popl %edi | |
| 334 | |
| 335 movd %mm5, %eax C retval | |
| 336 jnz L(finish_zero) C flags are from the testb | |
| 337 | |
| 338 | |
| 339 C One extra limb, destination was aligned. | |
| 340 C | |
| 341 C source ebx | |
| 342 C +-------+---------------+-- | |
| 343 C | | mm2 | | |
| 344 C +-------+---------------+-- | |
| 345 C | |
| 346 C dest edx | |
| 347 C +-------+---------------+---------------+-- | |
| 348 C | | | mm3 | | |
| 349 C +-------+---------------+---------------+-- | |
| 350 C | |
| 351 C mm6 = shift | |
| 352 C mm7 = ecx = 64-shift | |
| 353 | |
| 354 | |
| 355 C One extra limb, destination was unaligned. | |
| 356 C | |
| 357 C source ebx | |
| 358 C +-------+---------------+-- | |
| 359 C | | mm2 | | |
| 360 C +-------+---------------+-- | |
| 361 C | |
| 362 C dest edx | |
| 363 C +---------------+---------------+-- | |
| 364 C | | mm3 | | |
| 365 C +---------------+---------------+-- | |
| 366 C | |
| 367 C mm6 = shift+32 | |
| 368 C mm7 = ecx = 64-(shift+32) | |
| 369 | |
| 370 | |
| 371 C In both cases there's one extra limb of src to fetch and combine | |
| 372 C with mm2 to make a qword at 8(%edx), and in the aligned case | |
| 373 C there's a further extra limb of dst to be formed. | |
| 374 | |
| 375 | |
| 376 movd 8(%ebx), %mm0 C the one extra src limb | |
| 377 psrlq %mm6, %mm2 | |
| 378 | |
| 379 movq %mm0, %mm1 | |
| 380 psllq %mm7, %mm0 | |
| 381 | |
| 382 movq %mm3, (%edx) C the pending dst qword | |
| 383 por %mm2, %mm0 | |
| 384 | |
| 385 psrlq %mm6, %mm1 | |
| 386 andl $32, %ecx C ecx is 64-shift or 64-(shift+32), zero result means dst was unaligned | |
| 387 C NOTE(review): bit 5 of ecx separates the two cases only if 0 < shift < 32, presumably guaranteed by callers; confirm. | |
| 388 popl %ebx | |
| 389 jz L(finish_one_unaligned) C flags are from the andl | |
| 390 | |
| 391 C dst was aligned, must store one extra limb | |
| 392 movd %mm1, 16(%edx) | |
| 393 L(finish_one_unaligned): | |
| 394 | |
| 395 movq %mm0, 8(%edx) | |
| 396 | |
| 397 emms | |
| 398 | |
| 399 ret | |
| 400 | |
| 401 | |
| 402 L(finish_zero): | |
| 403 | |
| 404 C No extra limbs, destination was aligned. | |
| 405 C | |
| 406 C source ebx | |
| 407 C +---------------+-- | |
| 408 C | mm2 | | |
| 409 C +---------------+-- | |
| 410 C | |
| 411 C dest edx+4 | |
| 412 C +---------------+---------------+-- | |
| 413 C | | mm3 | | |
| 414 C +---------------+---------------+-- | |
| 415 C | |
| 416 C mm6 = shift | |
| 417 C mm7 = ecx = 64-shift | |
| 418 | |
| 419 | |
| 420 C No extra limbs, destination was unaligned. | |
| 421 C | |
| 422 C source ebx | |
| 423 C +---------------+-- | |
| 424 C | mm2 | | |
| 425 C +---------------+-- | |
| 426 C | |
| 427 C dest edx+4 | |
| 428 C +-------+---------------+-- | |
| 429 C | | mm3 | | |
| 430 C +-------+---------------+-- | |
| 431 C | |
| 432 C mm6 = shift+32 | |
| 433 C mm7 = 64-(shift+32) | |
| 434 | |
| 435 | |
| 436 C The movd for the unaligned case is clearly the same data as the | |
| 437 C movq for the aligned case, it's just a choice between whether one | |
| 438 C or two limbs should be written. | |
| 439 | |
| 440 | |
| 441 movq %mm3, 4(%edx) C the pending dst qword | |
| 442 psrlq %mm6, %mm2 | |
| 443 | |
| 444 movd %mm2, 12(%edx) C one limb, all that the unaligned case needs | |
| 445 andl $32, %ecx C zero result means dst was unaligned | |
| 446 | |
| 447 popl %ebx | |
| 448 jz L(finish_zero_unaligned) C flags are from the andl | |
| 449 | |
| 450 movq %mm2, 12(%edx) C aligned case, the same data written again as two limbs | |
| 451 L(finish_zero_unaligned): | |
| 452 | |
| 453 emms | |
| 454 | |
| 455 ret | |
| 456 | |
| 457 EPILOGUE() | |
| OLD | NEW |