; This Source Code Form is subject to the terms of the Mozilla Public
; License, v. 2.0. If a copy of the MPL was not distributed with this
; file, You can obtain one at http://mozilla.org/MPL/2.0/.

;
; This code is converted from mpi_amd64_gas.asm for MASM for x64.
;

; ------------------------------------------------------------------------
;
; Implementation of s_mpv_mul_set_vec which exploits
; the 64X64->128 bit unsigned multiply instruction.
;
; ------------------------------------------------------------------------

; r = a * digit, r and a are vectors of length len
; returns the carry digit
; r and a are 64 bit aligned.
;
; uint64_t
; s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
;
.CODE

;-----------------------------------------------------------------------
; uint64_t
; s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
;
; r = a * digit; r and a are 64-bit-aligned vectors of 'len' limbs.
; Returns the carry-out digit (0 when len == 0).
;
; ABI:   Microsoft x64 (in: rcx = r, rdx = a, r8d = len, r9 = digit)
; After the remapping below the body uses the SysV register roles of
; the GAS source it was converted from:
;   rdi = r, rsi = a, r8 = limbs remaining, rcx = digit, r9 = carry
; Clobbers: rax, rcx, rdx, r8, r9, r11, flags (leaf; no SEH needed).
; NOTE(review): len is declared 'int' but is zero-extended; assumes the
; caller never passes a negative len — confirm against callers.
;-----------------------------------------------------------------------
s_mpv_mul_set_vec64 PROC

        ; Remap Microsoft x64 parameter registers onto the GAS (SysV)
        ; roles used by the original implementation.  rdi and rsi are
        ; callee-saved in the Microsoft ABI, so preserve them.
        push    rdi
        push    rsi

        mov     rdi, rcx                ; rdi = r
        mov     rsi, rdx                ; rsi = a
        mov     edx, r8d                ; rdx = len (zero-extended)
        mov     rcx, r9                 ; rcx = digit

        ; BUGFIX: zero the carry BEFORE the len==0 early exit.  The exit
        ; path (L17) returns r9, which previously still held the 'digit'
        ; argument, so len==0 wrongly returned digit instead of 0.
        xor     r9, r9                  ; cy = 0
        test    rdx, rdx
        jz      L17                     ; if (len == 0) return 0
        mov     r8, rdx                 ; r8 = len; rdx is clobbered by mul

        ; Main loop: 8 limbs per iteration.  Each step computes
        ; rdx:rax = a[i] * digit, adds the carry-in (r9), stores the low
        ; half to r[i] and keeps the high half as the next carry.  r11
        ; pre-loads the next source limb to overlap the load with mul.
L15:
        cmp     r8, 8
        jb      L16
        mov     rax, [0+rsi]
        mov     r11, [8+rsi]            ; prefetch a[i+1]
        mul     rcx                     ; rdx:rax = a[i] * digit
        add     rax, r9                 ; + carry-in
        adc     rdx, 0
        mov     [0+rdi], rax            ; r[i] = low half
        mov     r9, rdx                 ; carry-out
        mov     rax, r11
        mov     r11, [16+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [8+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [24+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [16+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [32+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [24+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [40+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [32+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [48+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [40+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [56+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [48+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [56+rdi], rax
        mov     r9, rdx
        add     rsi, 64                 ; advance both vectors by 8 limbs
        add     rdi, 64
        sub     r8, 8
        jz      L17
        jmp     L15

        ; Tail: 1..7 remaining limbs, one at a time.
L16:
        mov     rax, [0+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [0+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [8+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [8+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [16+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [16+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [24+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [24+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [32+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [32+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [40+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [40+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [48+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [48+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17

L17:
        mov     rax, r9                 ; return carry (0 if len == 0)
        pop     rsi
        pop     rdi
        ret

s_mpv_mul_set_vec64 ENDP

; ------------------------------------------------------------------------
;
; Implementation of s_mpv_mul_add_vec which exploits
; the 64X64->128 bit unsigned multiply instruction.
;
; ------------------------------------------------------------------------

; r += a * digit, r and a are vectors of length len
; returns the carry digit
; r and a are 64 bit aligned.
;
; uint64_t
; s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
;
;-----------------------------------------------------------------------
; uint64_t
; s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
;
; r += a * digit; r and a are 64-bit-aligned vectors of 'len' limbs.
; Returns the carry-out digit (0 when len == 0).
;
; ABI:   Microsoft x64 (in: rcx = r, rdx = a, r8d = len, r9 = digit)
; After the remapping below the body uses the SysV register roles of
; the GAS source it was converted from:
;   rdi = r, rsi = a, r8 = limbs remaining, rcx = digit,
;   r9 = carry, r10 = current r[i] (destination limb), r11 = next a[i]
; Clobbers: rax, rcx, rdx, r8, r9, r10, r11, flags (leaf; no SEH needed).
; NOTE(review): len is declared 'int' but is zero-extended; assumes the
; caller never passes a negative len — confirm against callers.
;-----------------------------------------------------------------------
s_mpv_mul_add_vec64 PROC

        ; Remap Microsoft x64 parameter registers onto the GAS (SysV)
        ; roles used by the original implementation.  rdi and rsi are
        ; callee-saved in the Microsoft ABI, so preserve them.
        push    rdi
        push    rsi

        mov     rdi, rcx                ; rdi = r
        mov     rsi, rdx                ; rsi = a
        mov     edx, r8d                ; rdx = len (zero-extended)
        mov     rcx, r9                 ; rcx = digit

        ; BUGFIX: zero the carry BEFORE the len==0 early exit.  The exit
        ; path (L27) returns r9, which previously still held the 'digit'
        ; argument, so len==0 wrongly returned digit instead of 0.
        xor     r9, r9                  ; cy = 0
        test    rdx, rdx
        jz      L27                     ; if (len == 0) return 0
        mov     r8, rdx                 ; r8 = len; rdx is clobbered by mul

        ; Main loop: 8 limbs per iteration.  Each step computes
        ; rdx:rax = a[i] * digit, adds the existing r[i] (r10) and the
        ; carry-in (r9); the low half is stored back to r[i] and the high
        ; half becomes the next carry.  r10/r11 pre-load the next
        ; destination/source limbs to overlap the loads with mul.
        ; Note: rax + r10 + r9 cannot overflow rdx:rax, since the product
        ; high half has headroom for two extra 64-bit addends.
L25:
        cmp     r8, 8
        jb      L26
        mov     rax, [0+rsi]
        mov     r10, [0+rdi]            ; r10 = r[i]
        mov     r11, [8+rsi]            ; prefetch a[i+1]
        mul     rcx                     ; rdx:rax = a[i] * digit
        add     rax, r10                ; + r[i]
        adc     rdx, 0
        mov     r10, [8+rdi]            ; prefetch r[i+1]
        add     rax, r9                 ; + carry-in
        adc     rdx, 0
        mov     [0+rdi], rax            ; r[i] = low half
        mov     r9, rdx                 ; carry-out
        mov     rax, r11
        mov     r11, [16+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [16+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [8+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [24+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [24+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [16+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [32+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [32+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [24+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [40+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [40+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [32+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [48+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [48+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [40+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [56+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [56+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [48+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [56+rdi], rax
        mov     r9, rdx
        add     rsi, 64                 ; advance both vectors by 8 limbs
        add     rdi, 64
        sub     r8, 8
        jz      L27
        jmp     L25

        ; Tail: 1..7 remaining limbs, one at a time.
L26:
        mov     rax, [0+rsi]
        mov     r10, [0+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [0+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [8+rsi]
        mov     r10, [8+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [8+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [16+rsi]
        mov     r10, [16+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [16+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [24+rsi]
        mov     r10, [24+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [24+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [32+rsi]
        mov     r10, [32+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [32+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [40+rsi]
        mov     r10, [40+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [40+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [48+rsi]
        mov     r10, [48+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [48+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27

L27:
        mov     rax, r9                 ; return carry (0 if len == 0)
        pop     rsi
        pop     rdi
        ret

s_mpv_mul_add_vec64 ENDP

END