| OLD | NEW |
| (Empty) |
| 1 dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1. | |
| 2 | |
| 3 dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007 Free Software Foundation, | |
| 4 dnl Inc. | |
| 5 dnl | |
| 6 dnl This file is part of the GNU MP Library. | |
| 7 dnl | |
| 8 dnl The GNU MP Library is free software; you can redistribute it and/or | |
| 9 dnl modify it under the terms of the GNU Lesser General Public License as | |
| 10 dnl published by the Free Software Foundation; either version 3 of the | |
| 11 dnl License, or (at your option) any later version. | |
| 12 dnl | |
| 13 dnl The GNU MP Library is distributed in the hope that it will be useful, | |
| 14 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 15 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 16 dnl Lesser General Public License for more details. | |
| 17 dnl | |
| 18 dnl You should have received a copy of the GNU Lesser General Public License | |
| 19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. | |
| 20 | |
| 21 include(`../config.m4') | |
| 22 | |
| 23 | |
| 24 C cycles/limb | |
| 25 C K8,K9: 1.0 | |
| 26 C K10: 1.12 | |
| 27 C P4: 3.25 | |
| 28 C P6-15 (Core2): 1.5 | |
| 29 C P6-28 (Atom): 2.5 | |
| 30 | |
| 31 | |
| 32 C INPUT PARAMETERS | |
| 33 C up rdi | |
| 34 C n rsi | |
| 35 | |
| 36 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n) | |
| 37 | |
| 38 C TODO | |
| 39 C * Apply the movzwl tricks to the x86/k7 code | |
| 40 C * Review feed-in and wind-down code. In particular, try to avoid adcq and | |
| 41 C sbbq to placate Pentium4. | |
| 42 C * More unrolling and/or index addressing could bring time to under 1 c/l | |
| 43 C for Athlon64, approaching 0.67 c/l seems possible. | |
| 44 C * There are recurrences on the carry registers (r8, r9, r10) that might | |
| 45 C be the limiting factor for the Pentium4 speed. Splitting these into 6 | |
| 46 C registers would help. | |
| 47 C * For ultimate Athlon64 performance, a sequence like this might be best. | |
| 48 C It should reach 0.5 c/l (limited by L1 cache bandwidth). | |
| 49 C | |
| 50 C addq (%rdi), %rax | |
| 51 C adcq 8(%rdi), %rcx | |
| 52 C adcq 16(%rdi), %rdx | |
| 53 C adcq $0, %r8 | |
| 54 C addq 24(%rdi), %rax | |
| 55 C adcq 32(%rdi), %rcx | |
| 56 C adcq 40(%rdi), %rdx | |
| 57 C adcq $0, %r8 | |
| 58 C ... | |
| 59 | |
| 60 | |
| 61 ASM_START() | |
| 62 TEXT | |
| 63 ALIGN(32) | |
| 64 PROLOGUE(mpn_mod_34lsub1) | |
| 65 C Sum the limbs of {up,n} with 2^48 == 1 (mod 2^48-1); result not fully reduced. | |
| 66 mov $0x0000FFFFFFFFFFFF, %r11 C r11 = 2^48-1, low-48-bit mask | |
| 67 | |
| 68 sub $2, %rsi C rsi = n-2 | |
| 69 ja L(gt2) C n > 2: general summation loop | |
| 70 | |
| 71 mov (%rdi), %rax C rax = up[0]; mov/nop keep the flags from sub | |
| 72 nop | |
| 73 jb L(1) C borrow => n < 2: return up[0] unchanged | |
| 74 C n == 2: split both limbs at 48-bit boundaries and add the pieces | |
| 75 mov 8(%rdi), %rsi C rsi = up[1] | |
| 76 mov %rax, %rdx | |
| 77 shr $48, %rax C src[0] high 16 bits (bit 48, weight 1) | |
| 78 | |
| 79 and %r11, %rdx C src[0] low 48 bits | |
| 80 add %rdx, %rax | |
| 81 mov %esi, %edx C edx = low 32 bits of src[1], zero-extended | |
| 82 | |
| 83 shr $32, %rsi C src[1] high (bit 96 = 2*48, weight 1) | |
| 84 add %rsi, %rax | |
| 85 | |
| 86 shl $16, %rdx C src[1] low (bit 64, weight 2^16) | |
| 87 add %rdx, %rax | |
| 88 | |
| 89 L(1): ret | |
| 90 | |
| 91 | |
| 92 ALIGN(16) | |
| 93 L(gt2): xor %eax, %eax C rax = sum of limbs 0 mod 3 | |
| 94 xor %ecx, %ecx C rcx = sum of limbs 1 mod 3 | |
| 95 xor %edx, %edx C rdx = sum of limbs 2 mod 3 | |
| 96 xor %r8, %r8 C r8 = carries out of rcx | |
| 97 xor %r9, %r9 C r9 = carries out of rdx | |
| 98 xor %r10, %r10 C r10 = carries out of rax | |
| 99 | |
| 100 L(top): add (%rdi), %rax | |
| 101 adc $0, %r10 | |
| 102 add 8(%rdi), %rcx | |
| 103 adc $0, %r8 | |
| 104 add 16(%rdi), %rdx | |
| 105 adc $0, %r9 | |
| 106 | |
| 107 sub $3,%rsi C three limbs consumed | |
| 108 jng L(end) C at most 2 limbs left, at 24(%rdi) | |
| 109 | |
| 110 add 24(%rdi), %rax | |
| 111 adc $0, %r10 | |
| 112 add 32(%rdi), %rcx | |
| 113 adc $0, %r8 | |
| 114 add 40(%rdi), %rdx | |
| 115 lea 48(%rdi), %rdi C advance up by 6 limbs; lea leaves CF for the adc below | |
| 116 adc $0, %r9 | |
| 117 | |
| 118 sub $3,%rsi C three more limbs consumed | |
| 119 jg L(top) C more than 2 limbs left | |
| 120 | |
| 121 | |
| 122 add $-24, %rdi C rebase so leftover limbs sit at 24(%rdi), as on the jng path | |
| 123 L(end): add %r9, %rax C carries out of 2mod3 wrap to 0mod3 (2^192 == 1) | |
| 124 adc %r10, %rcx C carries out of 0mod3 go to 1mod3 | |
| 125 adc %r8, %rdx C carries out of 1mod3 go to 2mod3; CF stays pending | |
| 126 | |
| 127 inc %rsi C sets SF for js; inc/dec/mov below preserve CF | |
| 128 mov $0x1, %r10d C r10 = weight of pending carry (out of rdx: 1) | |
| 129 js L(combine) C no leftover limb | |
| 130 | |
| 131 mov $0x10000, %r10d C carry out of rax has weight 2^64 == 2^16 | |
| 132 adc 24(%rdi), %rax C add leftover 0mod3 limb plus pending carry | |
| 133 dec %rsi | |
| 134 js L(combine) C exactly one leftover limb | |
| 135 | |
| 136 adc 32(%rdi), %rcx C add leftover 1mod3 limb plus carry | |
| 137 mov $0x100000000, %r10 C carry out of rcx has weight 2^128 == 2^32 | |
| 138 | |
| 139 L(combine): | |
| 140 sbb %rsi, %rsi C carry: rsi = -CF (all ones or zero) | |
| 141 mov %rax, %rdi C 0mod3 | |
| 142 shr $48, %rax C 0mod3 high | |
| 143 | |
| 144 and %r10, %rsi C carry masked to its weight | |
| 145 and %r11, %rdi C 0mod3 low | |
| 146 mov %ecx, %r10d C 1mod3 low 32 bits, zero-extended | |
| 147 | |
| 148 add %rsi, %rax C apply carry | |
| 149 shr $32, %rcx C 1mod3 high | |
| 150 | |
| 151 add %rdi, %rax C apply 0mod3 low | |
| 152 movzwl %dx, %edi C 2mod3 low 16 bits | |
| 153 shl $16, %r10 C 1mod3 low | |
| 154 | |
| 155 add %rcx, %rax C apply 1mod3 high | |
| 156 shr $16, %rdx C 2mod3 high | |
| 157 | |
| 158 add %r10, %rax C apply 1mod3 low | |
| 159 shl $32, %rdi C 2mod3 low | |
| 160 | |
| 161 add %rdx, %rax C apply 2mod3 high | |
| 162 add %rdi, %rax C apply 2mod3 low | |
| 163 | |
| 164 ret C result in rax | |
| 165 EPILOGUE() | |
| OLD | NEW |