| OLD | NEW |
| (Empty) |
| 1 dnl x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number. | |
| 2 | |
| 3 dnl Copyright 2007, 2008 Free Software Foundation, Inc. | |
| 4 | |
| 5 dnl This file is part of the GNU MP Library. | |
| 6 | |
| 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify | |
| 8 dnl it under the terms of the GNU Lesser General Public License as published | |
| 9 dnl by the Free Software Foundation; either version 3 of the License, or (at | |
| 10 dnl your option) any later version. | |
| 11 | |
| 12 dnl The GNU MP Library is distributed in the hope that it will be useful, but | |
| 13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
| 14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public | |
| 15 dnl License for more details. | |
| 16 | |
| 17 dnl You should have received a copy of the GNU Lesser General Public License | |
| 18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. | |
| 19 | |
| 20 include(`../config.m4') | |
| 21 | |
| 22 | |
| 23 C norm frac | |
| 24 C K8 20 20 | |
| 25 C P4 73 73 | |
| 26 C P6-15 37 37 | |
| 27 | |
| 28 C TODO | |
| 29 C * Perhaps compute the inverse without relying on divq? Could either use | |
| 30 C Newton's method and mulq, or perhaps the faster fdiv. | |
| 31 C * The loop has not been carefully tuned, nor analysed for critical path | |
| 32 C length. It seems that 20 c/l is a bit long, compared to the 13 c/l for | |
| 33 C mpn_divrem_1. | |
| 34 C * Clean up. This code is really crude. | |
| 35 | |
| 36 | |
| 37 C INPUT PARAMETERS | |
| C Symbolic names for the five incoming argument registers. The function | |
| C body visible in this file uses the raw register names, not these macros, | |
| C so the names serve as documentation of the argument order. | |
| 38 define(`qp', `%rdi') | |
| 39 define(`fn', `%rsi') | |
| 40 define(`up_param', `%rdx') | |
| 41 define(`un_param', `%rcx') | |
| 42 define(`dp', `%r8') | |
| 43 | |
| C Name for the register holding the divisor inverse. Only the NEW loop | |
| C variant copies the inverse into %r9; the other variant multiplies by | |
| C %rdi directly. | |
| 44 define(`dinv', `%r9') | |
| 45 | |
| 46 | |
| 47 C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15 | |
| 48 C cnt qp d dinv | |
| 49 | |
| C ---------------------------------------------------------------------- | |
| C mpn_divrem_2 | |
| C | |
| C Divides the limb vector at up_param (un_param limbs), followed by fn | |
| C zero limbs at the low end, by the two-limb divisor at dp. | |
| C | |
| C In: rdi = qp, rsi = fn, rdx = up_param, rcx = un_param, r8 = dp | |
| C Out: rax = 1 if the divisor was subtracted once from the two top | |
| C limbs before the main loop, otherwise 0 | |
| C quotient limbs stored from qp[un+fn-3] down to qp[0] | |
| C the two remainder limbs are stored at 8(%r12) and 16(%r12) | |
| C Saves and restores r15, r14, r13, r12, rbp, rbx on the stack. | |
| C NOTE(review): the header says the divisor is normalized; nothing in this | |
| C code checks that - confirm callers guarantee it. | |
| C ---------------------------------------------------------------------- | |
| 50 ASM_START() | |
| 51 TEXT | |
| 52 ALIGN(16) | |
| 53 PROLOGUE(mpn_divrem_2) | |
| 54 | |
| C Register saves are interleaved with the initial loads. | |
| C rax = up + 8*un (one past the top limb), r13 = fn, rbp = qp, | |
| C r12 = address of the third limb from the top (up[un-3]). | |
| 55 push %r15 | |
| 56 lea (%rdx,%rcx,8), %rax | |
| 57 push %r14 | |
| 58 push %r13 | |
| 59 mov %rsi, %r13 | |
| 60 push %r12 | |
| 61 lea -24(%rax), %r12 | |
| 62 push %rbp | |
| 63 mov %rdi, %rbp | |
| 64 push %rbx | |
| C r11 = dp[1] (d1), r8 = dp[0] (d0), r9 = up[un-1], r10 = up[un-2]. | |
| 65 mov 8(%r8), %r11 | |
| 66 mov -8(%rax), %r9 | |
| 67 mov (%r8), %r8 | |
| 68 mov -16(%rax), %r10 | |
| C r15 accumulates the return value, starting at 0. | |
| 69 xor R32(%r15), R32(%r15) | |
| C If the top two limbs r9:r10 are >= the divisor r11:r8, go subtract it | |
| C once at L(23): skip when d1 > r9; otherwise subtract when d1 < r9 or | |
| C d0 <= r10. | |
| 70 cmp %r9, %r11 | |
| 71 ja L(2) | |
| 72 setb %dl | |
| 73 cmp %r10, %r8 | |
| 74 setbe %al | |
| 75 or %al, %dl | |
| 76 jne L(23) | |
| 77 L(2): | |
| C rbx = index of the highest quotient limb; negative means there is | |
| C nothing to produce, so go straight to the remainder store. | |
| 78 lea -3(%rcx,%r13), %rbx C un + fn - 3 | |
| 79 test %rbx, %rbx | |
| 80 js L(6) | |
| C Inverse computation: divide the 128-bit value (NOT d1):(all ones) by d1 | |
| C and keep the quotient in rdi. | |
| 81 mov %r11, %rdx | |
| 82 mov $-1, %rax | |
| 83 not %rdx | |
| 84 div %r11 | |
| C Adjust the inverse using d0. r14 = low limb of d1*inverse, rcx = high | |
| C limb of d0*inverse; rdx:r14 becomes a two-limb value starting from | |
| C rdx = -1 with d0 and rcx added in. | |
| C NOTE(review): presumably this refines the one-limb reciprocal into the | |
| C two-limb-divisor reciprocal - confirm against the reference C code. | |
| 85 mov %r11, %rdx | |
| 86 mov %rax, %rdi | |
| 87 imul %rax, %rdx | |
| 88 mov %rdx, %r14 | |
| 89 mul %r8 | |
| 90 mov %rdx, %rcx | |
| 91 mov $-1, %rdx | |
| 92 add %r8, %r14 | |
| 93 adc $0, %rdx | |
| 94 add %rcx, %r14 | |
| 95 adc $0, %rdx | |
| 96 js L(8) | |
| C Correction loop: decrement the inverse and subtract d1 from rdx:r14, | |
| C repeating while the result stays non-negative. | |
| 97 L(18): | |
| 98 dec %rdi | |
| 99 sub %r11, %r14 | |
| 100 sbb $0, %rdx | |
| 101 jns L(18) | |
| 102 L(8): | |
| 103 | |
| 104 C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15 | |
| 105 C n2 un n1 dinv qp d0 d1 up fn msl | |
| 106 C n2 un -d1 n1 dinv XX XX | |
| 107 | |
| 108 ifdef(`NEW',` | |
| C NEW variant setup: rbp = address of the highest quotient limb, | |
| C rcx = loop counter, rbx = high remainder limb, r9 = inverse, | |
| C r14 = low remainder limb, rsi = -d1. | |
| 109 lea (%rbp,%rbx,8), %rbp | |
| 110 mov %rbx, %rcx C un | |
| 111 mov %r9, %rbx | |
| 112 mov %rdi, %r9 C di | |
| 113 mov %r10, %r14 | |
| 114 mov %r11, %rsi | |
| 115 neg %rsi C -d1 | |
| 116 ALIGN(16) | |
| C One quotient limb per iteration. | |
| 117 L(loop): | |
| C Quotient estimate: rdx:rax = inverse * high limb, plus the remainder | |
| C pair rbx:r14. rdi = candidate q, r10 = low word q0 for the later test. | |
| 118 mov %r9, %rax C di ncp | |
| 119 mul %rbx C 0, 18 | |
| 120 add %r14, %rax C 4 | |
| 121 mov %rax, %r10 C q0 5 | |
| 122 adc %rbx, %rdx C 5 | |
| 123 mov %rdx, %rdi C q 6 | |
| C rbx = old low limb - q*d1 (low 64 bits); rdx:rax = q*d0. | |
| 124 imul %rsi, %rdx C 6 | |
| 125 mov %r8, %rax C ncp | |
| 126 lea (%rdx, %r14), %rbx C n1 -= ... 7 | |
| 127 mul %rdi C 7 | |
| C Next dividend limb: zero while the counter is below fn (signed | |
| C compare), otherwise loaded from (%r12), which then steps down. | |
| 128 xor R32(%r14), R32(%r14) C | |
| 129 cmp %rcx, %r13 C | |
| 130 jg L(19) C | |
| 131 mov (%r12), %r14 C | |
| 132 sub $8, %r12 C | |
| C Subtract the divisor and q*d0 from rbx:r14, and form q+1. | |
| 133 L(19): sub %r8, %r14 C ncp | |
| 134 sbb %r11, %rbx C 9 | |
| 135 sub %rax, %r14 C 11 | |
| 136 sbb %rdx, %rbx C 12 | |
| 137 inc %rdi C 7 | |
| C rdx = all ones when rbx >= q0 (unsigned), else 0. The mask undoes the | |
| C increment of q and adds the divisor back in that case. | |
| 138 xor R32(%rdx), R32(%rdx) C | |
| 139 cmp %r10, %rbx C 13 | |
| 140 mov %r8, %rax C d0 ncp | |
| 141 adc $-1, %rdx C mask 14 | |
| 142 add %rdx, %rdi C q-- 15 | |
| 143 and %rdx, %rax C d0 or 0 15 | |
| 144 and %r11, %rdx C d1 or 0 15 | |
| 145 add %rax, %r14 C 16 | |
| 146 adc %rdx, %rbx C 16 | |
| C If the high remainder limb is still >= d1, take the out-of-line path. | |
| 147 cmp %r11, %rbx C 17 | |
| 148 jae L(fix) C | |
| C Store the quotient limb and move to the next lower one. | |
| 149 L(bck): mov %rdi, (%rbp) C | |
| 150 sub $8, %rbp C | |
| 151 dec %rcx | |
| 152 jns L(loop) | |
| 153 | |
| C Move the final remainder into the registers that L(6) stores. | |
| 154 mov %r14, %r10 | |
| 155 mov %rbx, %r9 | |
| 156 ',` | |
| C Default variant setup: rbp = address of the highest quotient limb, | |
| C rcx = loop counter, rax = high remainder limb, rsi = low remainder | |
| C limb. The inverse stays in rdi. | |
| 157 lea (%rbp,%rbx,8), %rbp | |
| 158 mov %rbx, %rcx | |
| 159 mov %r9, %rax | |
| 160 mov %r10, %rsi | |
| 161 ALIGN(16) | |
| C One quotient limb per iteration. | |
| 162 L(loop): | |
| C Quotient estimate: rdx:rax = high limb * inverse, plus the remainder | |
| C pair r14:rsi. rbx = low word q0, rdx = candidate q, r10 = q + 1. | |
| 163 mov %rax, %r14 C 0, 19 | |
| 164 mul %rdi C 0 | |
| 165 mov %r11, %r9 C 1 | |
| 166 add %rsi, %rax C 4 | |
| 167 mov %rax, %rbx C q0 5 | |
| 168 adc %r14, %rdx C q 5 | |
| 169 lea 1(%rdx), %r10 C 6 | |
| 170 mov %rdx, %rax C 6 | |
| C rsi -= q*d1 (low 64 bits); then rdx:rax = q*d0. | |
| 171 imul %rdx, %r9 C 6 | |
| 172 sub %r9, %rsi C 10 | |
| 173 xor R32(%r9), R32(%r9) C | |
| 174 mul %r8 C 7 | |
| C Next dividend limb in r9: zero while the counter is below fn (signed | |
| C compare), otherwise loaded from (%r12), which then steps down. | |
| 175 cmp %rcx, %r13 C | |
| 176 jg L(13) C | |
| 177 mov (%r12), %r9 C | |
| 178 sub $8, %r12 C | |
| C Subtract the divisor and q*d0 from rsi:r9. | |
| 179 L(13): sub %r8, %r9 C ncp | |
| 180 sbb %r11, %rsi C 11 | |
| 181 sub %rax, %r9 C 11 | |
| 182 sbb %rdx, %rsi C 12 | |
| C rax = all ones when rsi >= q0 (unsigned), else 0. The mask turns q+1 | |
| C back into q and adds the divisor back in that case. | |
| 183 cmp %rbx, %rsi C 13 | |
| 184 sbb %rax, %rax C 14 | |
| 185 not %rax C 15 | |
| 186 add %rax, %r10 C 16 | |
| 187 mov %r8, %rbx C ncp | |
| 188 and %rax, %rbx C 16 | |
| 189 and %r11, %rax C 16 | |
| 190 add %rbx, %r9 C 17 | |
| 191 adc %rsi, %rax C 18 | |
| C If the high remainder limb in rax is still >= d1, take the out-of-line | |
| C path. L(fix) reuses the flags set by this compare. | |
| 192 cmp %rax, %r11 C 19 | |
| 193 jbe L(fix) C | |
| C Store the quotient limb and move to the next lower one. | |
| 194 L(bck): mov %r10, (%rbp) C | |
| 195 sub $8, %rbp C | |
| 196 mov %r9, %rsi C 18 | |
| 197 dec %rcx | |
| 198 jns L(loop) | |
| 199 | |
| C Move the final remainder into the registers that L(6) stores. | |
| 200 mov %rsi, %r10 | |
| 201 mov %rax, %r9 | |
| 202 ') | |
| C Common exit: store the remainder (r10 low limb, r9 high limb), restore | |
| C the saved registers and return r15 in rax. | |
| C For un >= 2 the loop loads un-2 limbs, so r12 is up-8 here and the | |
| C stores land in up[0] and up[1]. | |
| 203 L(6): | |
| 204 mov %r10, 8(%r12) | |
| 205 mov %r9, 16(%r12) | |
| 206 pop %rbx | |
| 207 pop %rbp | |
| 208 pop %r12 | |
| 209 pop %r13 | |
| 210 pop %r14 | |
| 211 mov %r15, %rax | |
| 212 pop %r15 | |
| 213 ret | |
| 214 | |
| C Top two limbs were >= divisor: return value becomes 1 and the divisor | |
| C is subtracted from r9:r10 before continuing at L(2). | |
| 215 L(23): inc R32(%r15) | |
| 216 sub %r8, %r10 | |
| 217 sbb %r11, %r9 | |
| 218 jmp L(2) | |
| 219 | |
| 220 ifdef(`NEW',` | |
| C NEW variant fix-up, entered with flags from cmp %r11, %rbx. | |
| C If rbx > d1, or rbx == d1 and r14 >= d0, increment q and subtract the | |
| C divisor once more; otherwise return to L(bck) unchanged. | |
| 221 L(fix): seta %dl | |
| 222 cmp %r8, %r14 | |
| 223 setae %al | |
| 224 orb %dl, %al | |
| 225 je L(bck) | |
| 226 inc %rdi | |
| 227 sub %r8, %r14 | |
| 228 sbb %r11, %rbx | |
| 229 jmp L(bck) | |
| 230 ',` | |
| C Default variant fix-up, entered with flags from cmp %rax, %r11. | |
| C If rax > d1, or rax == d1 and r9 >= d0, increment the quotient limb and | |
| C subtract the divisor once more; otherwise return to L(bck) unchanged. | |
| 231 L(fix): jb L(88) | |
| 232 cmp %r8, %r9 | |
| 233 jb L(bck) | |
| 234 L(88): inc %r10 | |
| 235 sub %r8, %r9 | |
| 236 sbb %r11, %rax | |
| 237 jmp L(bck) | |
| 238 ') | |
| 239 EPILOGUE() | |
| OLD | NEW |