| OLD | NEW |
| (Empty) |
| 1 dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n, | |
| 2 dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations. | |
| 3 | |
| 4 dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. | |
| 5 dnl | |
| 6 dnl This file is part of the GNU MP Library. | |
| 7 dnl | |
| 8 dnl The GNU MP Library is free software; you can redistribute it and/or | |
| 9 dnl modify it under the terms of the GNU Lesser General Public License as | |
| 10 dnl published by the Free Software Foundation; either version 3 of the | |
| 11 dnl License, or (at your option) any later version. | |
| 12 dnl | |
| 13 dnl The GNU MP Library is distributed in the hope that it will be useful, | |
| 14 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 15 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 16 dnl Lesser General Public License for more details. | |
| 17 dnl | |
| 18 dnl You should have received a copy of the GNU Lesser General Public License | |
| 19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. | |
| 20 | |
| 21 include(`../config.m4') | |
| 22 | |
| 23 NAILS_SUPPORT(0-31) | |
| 24 | |
| 25 | |
| 26 C alignment dst/src1/src2, A=0mod8, N=4mod8 | |
| 27 C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N | |
| 28 C | |
| 29 C K6-2 1.2 1.5 1.5 1.2 1.2 1.5 1.5 1.2 and,andn,ior,xor | |
| 30 C K6-2 1.5 1.75 2.0 1.75 1.75 2.0 1.75 1.5 iorn,xnor | |
| 31 C K6-2 1.75 2.0 2.0 2.0 2.0 2.0 2.0 1.75 nand,nior | |
| 32 C | |
| 33 C K6 1.5 1.68 1.75 1.2 1.75 1.75 1.68 1.5 and,andn,ior,xor | |
| 34 C K6 2.0 2.0 2.25 2.25 2.25 2.25 2.0 2.0 iorn,xnor | |
| 35 C K6 2.0 2.25 2.25 2.25 2.25 2.25 2.25 2.0 nand,nior | |
| 36 | |
| 37 | |
| 38 dnl M4_p and M4_i are the MMX and integer instructions | |
| 39 dnl M4_*_neg_dst means whether to complement (bitwise NOT) the final result before writing | |
| 40 dnl M4_*_neg_src2 means whether to complement (bitwise NOT) the src2 values before using them | |
| 41 | |
| dnl Usage: M4_choose_op(operation, p, p_neg_dst, p_neg_src2, | |
| dnl i, i_neg_dst, i_neg_src2) | |
| dnl | |
| dnl If the symbol OPERATION_<operation> is defined, define M4_function as | |
| dnl mpn_<operation>, M4_operation as <operation>, and the six M4_p* and M4_i* | |
| dnl settings from the remaining arguments, in the order shown. If the symbol | |
| dnl is not defined, nothing is defined. Exactly 7 arguments are required. | |
| 42 define(M4_choose_op, | |
| 43 m4_assert_numargs(7) | |
| 44 `ifdef(`OPERATION_$1',` | |
| 45 define(`M4_function', `mpn_$1') | |
| 46 define(`M4_operation', `$1') | |
| 47 define(`M4_p', `$2') | |
| 48 define(`M4_p_neg_dst', `$3') | |
| 49 define(`M4_p_neg_src2',`$4') | |
| 50 define(`M4_i', `$5') | |
| 51 define(`M4_i_neg_dst', `$6') | |
| 52 define(`M4_i_neg_src2',`$7') | |
| 53 ')') | |
| 54 | |
| 55 dnl xnor is done in "iorn" style because it's a touch faster than "nior" | |
| 56 dnl style (the two are equivalent for xor). | |
| 57 dnl | |
| 58 dnl pandn can't be used with nails. | |
| 59 | |
| dnl Columns: operation, MMX insn, MMX neg_dst, MMX neg_src2, | |
| dnl integer insn, integer neg_dst, integer neg_src2. | |
| dnl Only an entry whose OPERATION_xxx symbol is defined has any effect. | |
| dnl | |
| dnl andn_n has two variants: without nails the MMX side uses pandn with no | |
| dnl separate complement step, with nails it uses pand on a complemented src2. | |
| dnl The integer side is the same in both: andl on a complemented src2. | |
| 60 M4_choose_op( and_n, pand,0,0, andl,0,0) | |
| 61 ifelse(GMP_NAIL_BITS,0, | |
| 62 `M4_choose_op(andn_n, pandn,0,0, andl,0,1)', | |
| 63 `M4_choose_op(andn_n, pand,0,1, andl,0,1)') | |
| 64 M4_choose_op( nand_n, pand,1,0, andl,1,0) | |
| 65 M4_choose_op( ior_n, por,0,0, orl,0,0) | |
| 66 M4_choose_op( iorn_n, por,0,1, orl,0,1) | |
| 67 M4_choose_op( nior_n, por,1,0, orl,1,0) | |
| 68 M4_choose_op( xor_n, pxor,0,0, xorl,0,0) | |
| 69 M4_choose_op( xnor_n, pxor,0,1, xorl,0,1) | |
| 70 | |
| dnl M4_function is still undefined here if no entry above matched, in which | |
| dnl case m4_error is invoked with the message below. | |
| 71 ifdef(`M4_function',, | |
| 72 `m4_error(`Unrecognised or undefined OPERATION symbol | |
| 73 ')') | |
| 74 | |
| dnl One mpn_ name for each M4_choose_op operation above. | |
| dnl NOTE(review): the list is kept on a single line; the build scripts | |
| dnl presumably scan for this line textually, confirm before re-wrapping it. | |
| 75 MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) | |
| 76 | |
| 77 | |
| 78 C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, | |
| 79 C mp_size_t size); | |
| 80 C | |
| 81 C Do src1,size M4_operation src2,size, storing the result in dst,size. | |
| 82 C | |
| 83 C Unaligned movq loads and stores are a bit slower than aligned ones. The | |
| 84 C test at the start of the routine checks the alignment of src1 and if | |
| 85 C necessary processes one limb separately at the low end to make it aligned. | |
| 86 C | |
| 87 C The raw speeds without this alignment switch are as follows. | |
| 88 C | |
| 89 C alignment dst/src1/src2, A=0mod8, N=4mod8 | |
| 90 C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N | |
| 91 C | |
| 92 C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor | |
| 93 C K6 1.75 2.2 2.0 2.28 iorn,xnor | |
| 94 C K6 2.0 2.25 2.35 2.28 nand,nior | |
| 95 C | |
| 96 C | |
| 97 C Future: | |
| 98 C | |
| 99 C K6 can do one 64-bit load per cycle so each of these routines should be | |
| 100 C able to approach 1.0 c/l, if aligned. The basic and/andn/ior/xor might be | |
| 101 C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs. | |
| 102 C The others are 4 instructions per 2 limbs, and so can only approach 1.0 | |
| 103 C because there's nowhere to hide some loop control. | |
| 104 | |
| C Argument offsets, in the parameter order of the C prototype: dst, src1, | |
| C src2, size. defframe and deflit are defined elsewhere; presumably these | |
| C are stack offsets at function entry. | |
| 105 defframe(PARAM_SIZE,16) | |
| 106 defframe(PARAM_SRC2,12) | |
| 107 defframe(PARAM_SRC1,8) | |
| 108 defframe(PARAM_DST, 4) | |
| 109 deflit(`FRAME',0) | |
| 110 | |
| 111 TEXT | |
| 112 ALIGN(32) | |
| C Entry point. Register roles: eax=src1, ebx=src2, ecx=size, edx=dst. | |
| C ebx, and later esi, are pushed here and popped again before every ret. | |
| C mm0 and mm7 are used only on the path through L(top). | |
| C A size of 0 is not handled specially: it takes the single limb path | |
| C below and still reads and writes one limb. | |
| 113 PROLOGUE(M4_function) | |
| 114 movl PARAM_SIZE, %ecx | |
| 115 pushl %ebx FRAME_pushl() | |
| 116 | |
| 117 movl PARAM_SRC1, %eax | |
| 118 | |
| 119 movl PARAM_SRC2, %ebx | |
| 120 cmpl $1, %ecx | |
| 121 | |
| 122 movl PARAM_DST, %edx | |
| C Unsigned compare: a size above 1 goes to the general code. | |
| 123 ja L(two_or_more) | |
| 124 | |
| 125 | |
| C Single limb. ecx is reused as the data register, and ebx is restored | |
| C straight after the load since the src2 pointer is finished with. | |
| 126 movl (%ebx), %ecx | |
| 127 popl %ebx | |
| 128 ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)') | |
| 129 M4_i (%eax), %ecx | |
| 130 ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ecx)') | |
| 131 movl %ecx, (%edx) | |
| 132 | |
| 133 ret | |
| 134 | |
| 135 | |
| 136 L(two_or_more): | |
| 137 C eax src1 | |
| 138 C ebx src2 | |
| 139 C ecx size | |
| 140 C edx dst | |
| 141 C esi | |
| 142 C edi | |
| 143 C ebp | |
| 144 | |
| C Bit 2 of src1 set means src1 is not a multiple of 8. In that case do | |
| C one limb with the integer instruction and step all three pointers on by | |
| C 4 bytes, so that src1 is 0 mod 8 for the loop (assuming limbs are 4-byte | |
| C aligned). The size is reduced by one to match. | |
| 145 pushl %esi FRAME_pushl() | |
| 146 testl $4, %eax | |
| 147 jz L(alignment_ok) | |
| 148 | |
| 149 movl (%ebx), %esi | |
| 150 addl $4, %ebx | |
| 151 ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %esi)') | |
| 152 M4_i (%eax), %esi | |
| 153 addl $4, %eax | |
| 154 ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %esi)') | |
| 155 movl %esi, (%edx) | |
| 156 addl $4, %edx | |
| 157 decl %ecx | |
| 158 | |
| 159 L(alignment_ok): | |
| C esi keeps the remaining limb count. shrl leaves ecx as the number of | |
| C limb pairs and the carry flag as the low bit of the count. | |
| 160 movl %ecx, %esi | |
| 161 shrl %ecx | |
| 162 jnz L(still_two_or_more) | |
| 163 | |
| C No pairs, so exactly one limb is left (the count was at least 1 here, | |
| C being at least 2 at L(two_or_more) and decremented at most once). | |
| 164 movl (%ebx), %ecx | |
| 165 popl %esi | |
| 166 ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)') | |
| 167 M4_i (%eax), %ecx | |
| 168 ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ecx)') | |
| 169 popl %ebx | |
| 170 movl %ecx, (%edx) | |
| 171 ret | |
| 172 | |
| 173 | |
| 174 L(still_two_or_more): | |
| C mm7 is the complement mask for the loop, set up only when one of the | |
| C MMX complement steps is selected. | |
| 175 ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,` | |
| 176 pcmpeqd %mm7, %mm7 C all ones | |
| 177 ifelse(GMP_NAIL_BITS,0,,`psrld $GMP_NAIL_BITS, %mm7') C clear nails | |
| 178 ') | |
| 179 | |
| 180 ALIGN(16) | |
| 181 L(top): | |
| 182 C eax src1 | |
| 183 C ebx src2 | |
| 184 C ecx counter | |
| 185 C edx dst | |
| 186 C esi | |
| 187 C edi | |
| 188 C ebp | |
| 189 C | |
| 190 C carry bit is low of size | |
| 191 | |
| C Two limbs per iteration, working from the high pair down to the low | |
| C pair: -8(reg,ecx,8) addresses pair number ecx-1 and ecx counts down to 0. | |
| 192 movq -8(%ebx,%ecx,8), %mm0 | |
| 193 ifelse(M4_p_neg_src2,1,`pxor %mm7, %mm0') | |
| 194 M4_p -8(%eax,%ecx,8), %mm0 | |
| 195 ifelse(M4_p_neg_dst,1,` pxor %mm7, %mm0') | |
| 196 movq %mm0, -8(%edx,%ecx,8) | |
| 197 | |
| 198 loop L(top) | |
| 199 | |
| 200 | |
| C The carry tested here is still the one set by the shrl at | |
| C L(alignment_ok), so nothing in between may change the carry flag. | |
| C Carry set means an odd count: do the top limb, index esi-1, with the | |
| C integer instruction. ebx is free to be overwritten with data since it | |
| C is restored by the popl below. | |
| 201 jnc L(no_extra) | |
| 202 | |
| 203 movl -4(%ebx,%esi,4), %ebx | |
| 204 ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ebx)') | |
| 205 M4_i -4(%eax,%esi,4), %ebx | |
| 206 ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ebx)') | |
| 207 movl %ebx, -4(%edx,%esi,4) | |
| 208 L(no_extra): | |
| 209 | |
| C Only this exit has used MMX registers. emms_or_femms is defined | |
| C elsewhere; presumably it emits emms or femms to leave MMX state. | |
| 210 popl %esi | |
| 211 popl %ebx | |
| 212 emms_or_femms | |
| 213 ret | |
| 214 | |
| 215 EPILOGUE() | |
| OLD | NEW |