#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Even though
# loops are aggressively modulo-scheduled in respect to references to
# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
# scheduling "glitch," because uprofile(1) indicates uniform sample
# distribution, as if all instruction bundles execute in 1.5 cycles.
# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% than code generated by vendor
# compiler.
# Register map.  These Perl scalars hold Alpha register *names* that are
# interpolated into the assembly templates below; the trailing comments
# give the corresponding numeric register for orientation.

# temporaries
$cnt  = "v0";		# $0
$t0   = "t0";
$t1   = "t1";
$t2   = "t2";
$Thi0 = "t3";		# $4
$Tlo0 = "t4";
$Thi1 = "t5";
$Tlo1 = "t6";
$rem  = "t7";		# $8
#################
# argument registers and working set
$Xi   = "a0";		# $16, input argument block
$Htbl = "a1";
$inp  = "a2";
$len  = "a3";
$nlo  = "a4";		# $20
$nhi  = "a5";
$Zhi  = "t8";
$Zlo  = "t9";
$Xhi  = "t10";		# $24
$Xlo  = "t11";
$remp = "t12";
$rem_4bit = "AT";	# $28

{ my $N;	# instantiation counter; interpolated into the .Looplo/.Loophi
		# label names so each call of loop() emits unique labels
sub loop() {

	# Emit the core 4-bit-table GF(2^128) multiply for one 16-byte block:
	# consumes $Xhi:$Xlo nibble-by-nibble (bytes 7..0 of $Xlo first, then
	# bytes 7..0 of $Xhi), looks up Htbl entries at $Htbl, and folds the
	# per-nibble carry through the rem_4bit table at $rem_4bit.  Result is
	# left in $Zhi:$Zlo (still byte-reversed; callers swap afterwards).
	# This only appends assembly text to $code -- nothing runs here.
	# NOTE(review): the instruction interleaving below is deliberately
	# modulo-scheduled (see file header); do not reorder.
	$N++;
$code.=<<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}
| 246 | |
# File preamble plus the entry of gcm_gmult_4bit(Xi, Htable): load the
# two halves of Xi and compute the address of the shared rem_4bit table
# PC-relatively via the br/.Lpic1 trick (no GOT access needed).
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	br	$rem_4bit,.Lpic1
.Lpic1:	lda	$rem_4bit,rem_4bit-.Lpic1($rem_4bit)
___
| 272 | |
# Emit the multiply body, then the gmult epilogue: byte-swap each 64-bit
# half of Z (srl/sll plus zapnot byte masks), swap the 32-bit halves
# (srl/sll by 32 and or), store the result back into Xi[ ] and return.
&loop();

$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___
| 320 | |
# Callee-saved registers used to hold the current 16-byte input block
# across the inner multiply.
$inhi="s0";
$inlo="s1";

# Entry of gcm_ghash_4bit(Xi, Htable, inp, len): save ra/s0/s1 on the
# stack, load the (possibly unaligned) first block with ldq_u pairs,
# load Xi, locate rem_4bit PC-relatively (.Lpic2), then start the outer
# per-block loop: align the block (extql/extqh/or), advance $inp,
# decrement $len by 16, and xor the block into Xi.
$code.=<<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	br	$rem_4bit,.Lpic2
.Lpic2:	lda	$rem_4bit,rem_4bit-.Lpic2($rem_4bit)

.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___
| 361 | |
# Emit the multiply body again (second instantiation, hence distinct
# .Looplo2/.Loophi2 labels), then the ghash tail: byte-swap Z as in
# gmult, interleaving the next block's ldq_u loads on the not-done path
# (beq $len,.Ldone), loop back to .Louter, and on .Ldone store Xi,
# restore s0/s1, pop the frame and return.  The shared rem_4bit data
# (per-nibble reduction constants, shifted into the top 16 bits by the
# assembler's <<48) and the CRYPTOGAMS id string follow.
&loop();

$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

.align	4
rem_4bit:
	.quad	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
	.quad	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
	.quad	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
	.quad	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___
# Emit the generated assembly.  If an output path was given on the
# command line, redirect STDOUT there first; otherwise print to the
# inherited STDOUT.  The original used an unchecked two-arg open
# (">$output"), which both ignores open failure and lets characters in
# $output be interpreted as part of the mode; the three-arg form with an
# error check fixes both.
$output = shift;
if (defined $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
print $code;
# Buffered write errors (e.g. disk full) only surface at close.
close STDOUT or die "error closing STDOUT: $!";