| OLD | NEW |
| (Empty) |
| 1 dnl Alpha mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number. | |
| 2 | |
| 3 dnl Copyright 2007, 2008 Free Software Foundation, Inc. | |
| 4 | |
| 5 dnl This file is part of the GNU MP Library. | |
| 6 | |
| 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify | |
| 8 dnl it under the terms of the GNU Lesser General Public License as published | |
| 9 dnl by the Free Software Foundation; either version 3 of the License, or (at | |
| 10 dnl your option) any later version. | |
| 11 | |
| 12 dnl The GNU MP Library is distributed in the hope that it will be useful, but | |
| 13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
| 14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public | |
| 15 dnl License for more details. | |
| 16 | |
| 17 dnl You should have received a copy of the GNU Lesser General Public License | |
| 18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. | |
| 19 | |
| 20 include(`../config.m4') | |
| 21 | |
| 22 C norm frac | |
| 23 C ev4 | |
| 24 C ev5 70 70 | |
| 25 C ev6 29 29 | |
| 26 | |
| 27 C TODO | |
| 28 C * Perhaps inline mpn_invert_limb, that would allow us to not save/restore | |
| 29 C any registers (thus save ~10 cycles per call). | |
| 30 C * Use negated d1 and/or d0 to speed carry propagation. Might save a cycle | |
| 31 C or two. | |
| 32 C * Check cluster delays (for ev6). We very likely could save some cycles. | |
| 33 C * Use branch-free code for computing di. | |
| 34 C * CAVEAT: We rely on r19 not being clobbered by mpn_invert_limb call. | |
| 35 | |
| 36 C INPUT PARAMETERS (argument registers on entry; the code below uses the raw register names, not these macro names) | |
| 37 define(`qp', `r16') C quotient pointer; saved in the frame because r16 is reused as the mpn_invert_limb argument | |
| 38 define(`fn', `r17') C number of low (fraction) quotient limbs, computed with a zero dividend limb; copied to r15 | |
| 39 define(`up_param', `r18') C base address of the dividend limbs | |
| 40 define(`un_param', `r19') C number of dividend limbs; r19 is afterwards reused as the quotient index / loop counter | |
| 41 define(`dp', `r20') C address of the divisor: d0 (low limb) at offset 0, d1 (high limb) at offset 8 | |
| 42 C mpn_divrem_2: divide the un-limb number at up (extended by fn zero fraction limbs) by the normalized 2-limb divisor at dp; quotient limbs are stored at qp, the 2-limb remainder is written back to up, and the most significant quotient limb (0 or 1) is returned in r0. | |
| 43 ASM_START() | |
| 44 PROLOGUE(mpn_divrem_2) | |
| 45 ldgp r29, 0(r27) C compute gp relative to r27 | |
| 46 lda r30, -80(r30) C allocate an 80-byte stack frame | |
| 47 stq r26, 0(r30) C save return address | |
| 48 stq r9, 8(r30) C save r9-r13 and r15; they hold long-lived values below | |
| 49 stq r10, 16(r30) | |
| 50 stq r11, 24(r30) | |
| 51 stq r12, 32(r30) | |
| 52 stq r13, 40(r30) | |
| 53 C stq r14, 48(r30) | |
| 54 stq r15, 56(r30) | |
| 55 .prologue 1 | |
| 56 stq r16, 64(r30) C keep qp in the frame; r16 is reused as the mpn_invert_limb argument | |
| 57 bis r31, r17, r15 C r15 = fn | |
| 58 s8addq r19, r18, r13 C r13 = up + 8*un | |
| 59 lda r13, -24(r13) C r13 = address of up[un-3] | |
| 60 ldq r12, 8(r20) C d1 = dp[1] (high divisor limb) | |
| 61 ldq r10, 0(r20) C d0 = dp[0] (low divisor limb) | |
| 62 ldq r11, 16(r13) C n2 = up[un-1] | |
| 63 ldq r9, 8(r13) C n1 = up[un-2] | |
| 64 C Initial step: if (n2,n1) >= (d1,d0), subtract the divisor once and record a top quotient limb of 1. | |
| 65 bis r31, r31, r3 C most_significant_q_limb = 0 | |
| 66 cmpult r11, r12, r1 C n2 < d1: no initial subtraction | |
| 67 bne r1, L(L8) | |
| 68 cmpule r11, r12, r1 C n2 <= d1 (only equality is possible here) | |
| 69 cmpult r9, r10, r2 C r2 = borrow for: n1 -= d0 | |
| 70 and r1, r2, r1 | |
| 71 bne r1, L(L8) C n2 == d1 and n1 < d0: still smaller than the divisor | |
| 72 subq r11, r12, r11 C n2 -= d1 | |
| 73 subq r11, r2, r11 C n2 -= borrow | |
| 74 subq r9, r10, r9 C n1 -= d0 | |
| 75 lda r3, 1(r31) C most_significant_q_limb = 1 | |
| 76 L(L8): stq r3, 72(r30) C stash most_significant_q_limb for the return value | |
| 77 C r19 = un + fn - 3 = index of the first (highest) quotient limb stored by the loop; negative means no loop iterations. | |
| 78 addq r15, r19, r19 | |
| 79 lda r19, -3(r19) | |
| 80 blt r19, L(L10) C nothing for the loop to compute | |
| 81 bis r31, r12, r16 C argument: d1 | |
| 82 jsr r26, mpn_invert_limb C result di in r0. NOTE(review): r19 is live across this call and is not saved here; correctness relies on the callee not modifying it | |
| 83 ldgp r29, 0(r26) C re-establish gp after the call | |
| 84 mulq r0, r12, r4 C t0 = LO(di * d1) | |
| 85 umulh r0, r10, r2 C s1 = HI(di * d0) | |
| 86 addq r4, r10, r4 C t0 += d0 | |
| 87 cmpule r10, r4, r7 C r7 = (t0 >= d0), i.e. 1 when t0 += d0 did not carry | |
| 88 addq r4, r2, r4 C t0 += s1 | |
| 89 cmpult r4, r2, r1 C cy for: t0 += s1 | |
| 90 subq r1, r7, r7 C t1 (-1, 0, or 1) | |
| 91 blt r7, L(L42) C t1 < 0: di needs no adjustment | |
| 92 L(L22): C adjust: decrement di and subtract d1 from (t1,t0) until t1 < 0 | |
| 93 lda r0, -1(r0) C di-- | |
| 94 cmpult r4, r12, r1 C cy for: t0 -= d1 (below) | |
| 95 subq r7, r1, r7 C t1 -= cy | |
| 96 subq r4, r12, r4 C t0 -= d1 | |
| 97 bge r7, L(L22) | |
| 98 L(L42): | |
| 99 ldq r16, 64(r30) C reload qp | |
| 100 s8addq r19, r16, r16 C r16 = qp + 8*r19, where the first quotient limb is stored | |
| 101 ALIGN(16) | |
| 102 L(loop): C per quotient limb: r11 = n2, r9 = n1, r0 = di, r12 = d1, r10 = d0, r19 = index | |
| 103 mulq r11, r0, r5 C q0 (early) | |
| 104 umulh r11, r0, r6 C q (early) | |
| 105 addq r5, r9, r8 C q0 += n1 | |
| 106 addq r6, r11, r6 C q += n2 | |
| 107 cmpult r8, r5, r1 C cy for: q0 += n1 | |
| 108 addq r6, r1, r6 C q += cy | |
| 109 unop C no-op (presumably scheduling/alignment padding) | |
| 110 mulq r12, r6, r1 C LO(d1 * q) | |
| 111 umulh r10, r6, r7 C t1 = HI(d0 * q) | |
| 112 subq r9, r1, r9 C n1 -= LO(d1 * q) | |
| 113 mulq r10, r6, r4 C t0 = LO(d0 * q) | |
| 114 unop C no-op (presumably scheduling/alignment padding) | |
| 115 cmple r15, r19, r5 C r5 = (fn <= index); when 0 it doubles as n0 = 0 for fraction limbs | |
| 116 beq r5, L(L31) C fraction limb: skip the load, n0 stays 0 | |
| 117 ldq r5, 0(r13) C n0 = next lower dividend limb | |
| 118 lda r13, -8(r13) C step the dividend pointer down one limb | |
| 119 L(L31): subq r9, r12, r9 C n1 -= d1 | |
| 120 cmpult r5, r10, r1 C borrow for: n0 -= d0 | |
| 121 subq r9, r1, r9 C n1 -= borrow | |
| 122 subq r5, r10, r5 C n0 -= d0 | |
| 123 subq r9, r7, r9 C n1 -= t1 | |
| 124 cmpult r5, r4, r1 C borrow for: n0 -= t0 | |
| 125 subq r9, r1, r2 C n1 -= borrow (result kept in r2) | |
| 126 subq r5, r4, r5 C n0 -= t0 | |
| 127 cmpult r2, r8, r1 C (n1 < q0) | |
| 128 addq r6, r1, r6 C q += cond | |
| 129 lda r1, -1(r1) C mask = -(n1 >= q0) | |
| 130 and r1, r10, r4 C mask & d0 | |
| 131 addq r5, r4, r9 C n0 += mask & d0 (r9 becomes the next n1) | |
| 132 and r1, r12, r1 C mask & d1 | |
| 133 cmpult r9, r5, r11 C cy for: n0 += mask & d0 | |
| 134 addq r2, r1, r1 C n1 += mask & d1 | |
| 135 addq r1, r11, r11 C n1 += cy (r11 becomes the next n2) | |
| 136 cmpult r11, r12, r1 C r1 = (n2 < d1) | |
| 137 beq r1, L(fix) C n2 >= d1: remainder may still be >= divisor, check out of line | |
| 138 L(bck): stq r6, 0(r16) C store the quotient limb | |
| 139 lda r16, -8(r16) C step the quotient pointer down one limb | |
| 140 lda r19, -1(r19) C index-- | |
| 141 bge r19, L(loop) | |
| 142 C Store the 2-limb remainder, fetch the return value and restore the saved registers. | |
| 143 L(L10): stq r9, 8(r13) C low remainder limb; r13 has stepped down so this lands in up[0] | |
| 144 stq r11, 16(r13) C high remainder limb; lands in up[1] | |
| 145 ldq r0, 72(r30) C return value: most_significant_q_limb | |
| 146 ldq r26, 0(r30) C restore return address | |
| 147 ldq r9, 8(r30) C restore r9-r13 and r15 | |
| 148 ldq r10, 16(r30) | |
| 149 ldq r11, 24(r30) | |
| 150 ldq r12, 32(r30) | |
| 151 ldq r13, 40(r30) | |
| 152 C ldq r14, 48(r30) | |
| 153 ldq r15, 56(r30) | |
| 154 lda r30, 80(r30) C release the stack frame | |
| 155 ret r31, (r26), 1 | |
| 156 C fix: out-of-line correction, entered when n2 >= d1 after the main step. | |
| 157 L(fix): cmpule r11, r12, r1 C n2 <= d1 (only equality is possible here) | |
| 158 cmpult r9, r10, r2 C r2 = borrow for: n1 -= d0 | |
| 159 and r1, r2, r1 | |
| 160 bne r1, L(bck) C (n2,n1) < (d1,d0): no correction needed | |
| 161 subq r11, r12, r11 C n2 -= d1 | |
| 162 subq r11, r2, r11 C n2 -= borrow | |
| 163 subq r9, r10, r9 C n1 -= d0 | |
| 164 lda r6, 1(r6) C q++ | |
| 165 br L(bck) | |
| 166 EPILOGUE() | |
| 167 ASM_END() | |
| OLD | NEW |