| OLD | NEW |
| (Empty) |
| 1 dnl AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1. | |
| 2 | |
| 3 dnl Copyright 2000, 2001, 2002, 2004, 2005, 2008 Free Software Foundation, | |
| 4 dnl Inc. | |
| 5 dnl | |
| 6 dnl This file is part of the GNU MP Library. | |
| 7 dnl | |
| 8 dnl The GNU MP Library is free software; you can redistribute it and/or | |
| 9 dnl modify it under the terms of the GNU Lesser General Public License as | |
| 10 dnl published by the Free Software Foundation; either version 3 of the | |
| 11 dnl License, or (at your option) any later version. | |
| 12 dnl | |
| 13 dnl The GNU MP Library is distributed in the hope that it will be useful, | |
| 14 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 15 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 16 dnl Lesser General Public License for more details. | |
| 17 dnl | |
| 18 dnl You should have received a copy of the GNU Lesser General Public License | |
| 19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. | |
| 20 | |
| 21 include(`../config.m4') | |
| 22 | |
| 23 | |
| 24 C cycles/limb | |
| 25 C Athlon: 1 | |
| 26 C Hammer: 1 | |
| 27 | |
| 28 | |
| 29 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size) | |
| 30 C | |
| 31 C The loop form below and the 64 byte code alignment seem necessary for the | |
| 32 C claimed speed. This is a bit strange, since normally k7 isn't very | |
| 33 C sensitive to such things. Perhaps there has to be 6 instructions in the | |
| 34 C first 16 bytes for the BTB entry or something. | |
| 35 | |
| 36 defframe(PARAM_SIZE, 8) | |
| 37 defframe(PARAM_SRC, 4) | |
| 38 | |
| 39 dnl re-use parameter space: the size argument is copied into ecx at entry, after which its stack slot is free to hold the saved edi | |
| 40 define(SAVE_EDI, `PARAM_SIZE') | |
| 41 | |
| 42 TEXT | |
| 43 ALIGN(64) | |
| 44 PROLOGUE(mpn_mod_34lsub1) | |
| 45 deflit(`FRAME',0) | |
| 46 C Returns in eax a value congruent modulo 2^24-1 to the size-limb number at src; the value is not necessarily fully reduced | |
| 47 movl PARAM_SIZE, %ecx | |
| 48 movl PARAM_SRC, %edx | |
| 49 C ecx = size-2: above zero means three or more limbs, a borrow means fewer than two | |
| 50 subl $2, %ecx | |
| 51 ja L(three_or_more) | |
| 52 C movl does not alter flags, so the jb still tests the borrow from the subl | |
| 53 movl (%edx), %eax | |
| 54 jb L(one) | |
| 55 C exactly two limbs: add up the pieces of src[0] and src[1], each shifted to its weight modulo 2^24-1 | |
| 56 movl 4(%edx), %ecx | |
| 57 movl %eax, %edx | |
| 58 shrl $24, %eax C src[0] high 8 bits | |
| 59 | |
| 60 andl $0xFFFFFF, %edx C src[0] low 24 bits | |
| 61 addl %edx, %eax | |
| 62 movl %ecx, %edx | |
| 63 | |
| 64 andl $0xFFFF, %ecx | |
| 65 shrl $16, %edx C src[1] high 16 bits | |
| 66 addl %edx, %eax | |
| 67 | |
| 68 shll $8, %ecx C src[1] low 16 bits, moved up to weight 2^8 | |
| 69 addl %ecx, %eax | |
| 70 C a single limb arrives here with eax = src[0], which is returned without reduction | |
| 71 L(one): | |
| 72 ret | |
| 73 | |
| 74 | |
| 75 L(three_or_more): | |
| 76 C eax | |
| 77 C ebx | |
| 78 C ecx size-2 | |
| 79 C edx src | |
| 80 C esi | |
| 81 C edi | |
| 82 | |
| 83 pushl %ebx FRAME_pushl() | |
| 84 xorl %eax, %eax | |
| 85 xorl %ebx, %ebx | |
| 86 C edi goes into the size parameter slot instead of being pushed | |
| 87 movl %edi, SAVE_EDI | |
| 88 pushl %esi FRAME_pushl() | |
| 89 xorl %esi, %esi C and clear carry flag | |
| 90 | |
| 91 | |
| 92 C code offset 0x40 at this point | |
| 93 L(top): | |
| 94 C eax acc 0mod3 | |
| 95 C ebx acc 1mod3 | |
| 96 C ecx counter, limbs | |
| 97 C edx src | |
| 98 C esi acc 2mod3 | |
| 99 C edi | |
| 100 C pointer and counter are stepped with leal and decl, which leave the carry flag alone, so a single adcl chain runs through all limbs | |
| 101 leal 24(%edx), %edx | |
| 102 leal -2(%ecx), %ecx | |
| 103 adcl -24(%edx), %eax | |
| 104 adcl -20(%edx), %ebx | |
| 105 adcl -16(%edx), %esi | |
| 106 C counter is now down by 3 in total; leave the loop if fewer than three limbs remain | |
| 107 decl %ecx | |
| 108 jng L(done_loop) | |
| 109 | |
| 110 leal -2(%ecx), %ecx | |
| 111 adcl -12(%edx), %eax | |
| 112 adcl -8(%edx), %ebx | |
| 113 adcl -4(%edx), %esi | |
| 114 | |
| 115 decl %ecx | |
| 116 jg L(top) | |
| 117 | |
| 118 C step src so that the next unread limb is at -12(%edx), the same as on the early exit path | |
| 119 leal 12(%edx), %edx | |
| 120 | |
| 121 | |
| 122 L(done_loop): | |
| 123 C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively | |
| 124 C incl, decl and movl leave the carry flag alone. edi is set to the mask that gives the final carry its weight: 1, 2^8 or 2^16 | |
| 125 incl %ecx | |
| 126 movl $0xFFFFFFFF, %edi | |
| 127 js L(combine) | |
| 128 | |
| 129 adcl -12(%edx), %eax | |
| 130 decl %ecx | |
| 131 movl $0xFFFFFF00, %edi | |
| 132 js L(combine) | |
| 133 | |
| 134 adcl -8(%edx), %ebx | |
| 135 movl $0xFFFF0000, %edi | |
| 136 | |
| 137 | |
| 138 L(combine): | |
| 139 C eax acc 0mod3 | |
| 140 C ebx acc 1mod3 | |
| 141 C ecx | |
| 142 C edx | |
| 143 C esi acc 2mod3 | |
| 144 C edi mask | |
| 145 C each accumulator is split where its weight crosses a multiple of 24 bits, and all the pieces are summed into eax | |
| 146 sbbl %ecx, %ecx C carry: -1 if the last adcl carried, else 0 | |
| 147 movl %eax, %edx C 0mod3 | |
| 148 shrl $24, %eax C 0mod3 high | |
| 149 | |
| 150 andl %edi, %ecx C carry masked | |
| 151 andl $0x00FFFFFF, %edx C 0mod3 low | |
| 152 movl %ebx, %edi C 1mod3 | |
| 153 | |
| 154 subl %ecx, %eax C apply carry: subtracting the negative masked value adds its weight | |
| 155 shrl $16, %ebx C 1mod3 high | |
| 156 andl $0xFFFF, %edi | |
| 157 | |
| 158 addl %edx, %eax C apply 0mod3 low | |
| 159 movl %esi, %edx C 2mod3 | |
| 160 shll $8, %edi C 1mod3 low | |
| 161 | |
| 162 addl %ebx, %eax C apply 1mod3 high | |
| 163 shrl $8, %esi C 2mod3 high | |
| 164 movzbl %dl, %edx C 2mod3 low | |
| 165 | |
| 166 addl %edi, %eax C apply 1mod3 low | |
| 167 shll $16, %edx C 2mod3 low | |
| 168 | |
| 169 addl %esi, %eax C apply 2mod3 high | |
| 170 popl %esi FRAME_popl() | |
| 171 | |
| 172 movl SAVE_EDI, %edi | |
| 173 addl %edx, %eax C apply 2mod3 low | |
| 174 popl %ebx FRAME_popl() | |
| 175 C result is in eax | |
| 176 ret | |
| 177 | |
| 178 EPILOGUE() | |
| OLD | NEW |