Index: openssl/crypto/modes/asm/ghash-alpha.pl
diff --git a/openssl/crypto/modes/asm/ghash-alpha.pl b/openssl/crypto/modes/asm/ghash-alpha.pl
deleted file mode 100644
index 6358b2750fabf54c0c96b8103fbd2057ae651ad0..0000000000000000000000000000000000000000
--- a/openssl/crypto/modes/asm/ghash-alpha.pl
+++ /dev/null
@@ -1,451 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# March 2010
-#
-# The module implements "4-bit" GCM GHASH function and underlying
-# single multiplication operation in GF(2^128). "4-bit" means that it
-# uses 256 bytes per-key table [+128 bytes shared table]. Even though
-# loops are aggressively modulo-scheduled in respect to references to
-# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
-# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
-# scheduling "glitch," because uprofile(1) indicates uniform sample
-# distribution, as if all instruction bundles execute in 1.5 cycles.
-# Meaning that it could have been even faster, yet 12 cycles is ~60%
-# better than gcc-generated code and ~80% better than code generated
-# by the vendor compiler.
-
-$cnt="v0";	# $0
-$t0="t0";
-$t1="t1";
-$t2="t2";
-$Thi0="t3";	# $4
-$Tlo0="t4";
-$Thi1="t5";
-$Tlo1="t6";
-$rem="t7";	# $8
-#################
-$Xi="a0";	# $16, input argument block
-$Htbl="a1";
-$inp="a2";
-$len="a3";
-$nlo="a4";	# $20
-$nhi="a5";
-$Zhi="t8";
-$Zlo="t9";
-$Xhi="t10";	# $24
-$Xlo="t11";
-$remp="t12";
-$rem_4bit="AT";	# $28
-
-{ my $N;
-  sub loop() {
-
-	$N++;
-$code.=<<___;
-.align	4
-	extbl	$Xlo,7,$nlo
-	and	$nlo,0xf0,$nhi
-	sll	$nlo,4,$nlo
-	and	$nlo,0xf0,$nlo
-
-	addq	$nlo,$Htbl,$nlo
-	ldq	$Zlo,8($nlo)
-	addq	$nhi,$Htbl,$nhi
-	ldq	$Zhi,0($nlo)
-
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	lda	$cnt,6(zero)
-	extbl	$Xlo,6,$nlo
-
-	ldq	$Tlo1,8($nhi)
-	s8addq	$remp,$rem_4bit,$remp
-	ldq	$Thi1,0($nhi)
-	srl	$Zlo,4,$Zlo
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	xor	$t0,$Zlo,$Zlo
-	and	$nlo,0xf0,$nhi
-
-	xor	$Tlo1,$Zlo,$Zlo
-	sll	$nlo,4,$nlo
-	xor	$Thi1,$Zhi,$Zhi
-	and	$nlo,0xf0,$nlo
-
-	addq	$nlo,$Htbl,$nlo
-	ldq	$Tlo0,8($nlo)
-	addq	$nhi,$Htbl,$nhi
-	ldq	$Thi0,0($nlo)
-
-.Looplo$N:
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	subq	$cnt,1,$cnt
-	srl	$Zlo,4,$Zlo
-
-	ldq	$Tlo1,8($nhi)
-	xor	$rem,$Zhi,$Zhi
-	ldq	$Thi1,0($nhi)
-	s8addq	$remp,$rem_4bit,$remp
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	xor	$t0,$Zlo,$Zlo
-	extbl	$Xlo,$cnt,$nlo
-
-	and	$nlo,0xf0,$nhi
-	xor	$Thi0,$Zhi,$Zhi
-	xor	$Tlo0,$Zlo,$Zlo
-	sll	$nlo,4,$nlo
-
-
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	and	$nlo,0xf0,$nlo
-	srl	$Zlo,4,$Zlo
-
-	s8addq	$remp,$rem_4bit,$remp
-	xor	$rem,$Zhi,$Zhi
-	addq	$nlo,$Htbl,$nlo
-	addq	$nhi,$Htbl,$nhi
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	ldq	$Tlo0,8($nlo)
-	xor	$t0,$Zlo,$Zlo
-
-	xor	$Tlo1,$Zlo,$Zlo
-	xor	$Thi1,$Zhi,$Zhi
-	ldq	$Thi0,0($nlo)
-	bne	$cnt,.Looplo$N
-
-
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	lda	$cnt,7(zero)
-	srl	$Zlo,4,$Zlo
-
-	ldq	$Tlo1,8($nhi)
-	xor	$rem,$Zhi,$Zhi
-	ldq	$Thi1,0($nhi)
-	s8addq	$remp,$rem_4bit,$remp
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	xor	$t0,$Zlo,$Zlo
-	extbl	$Xhi,$cnt,$nlo
-
-	and	$nlo,0xf0,$nhi
-	xor	$Thi0,$Zhi,$Zhi
-	xor	$Tlo0,$Zlo,$Zlo
-	sll	$nlo,4,$nlo
-
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	and	$nlo,0xf0,$nlo
-	srl	$Zlo,4,$Zlo
-
-	s8addq	$remp,$rem_4bit,$remp
-	xor	$rem,$Zhi,$Zhi
-	addq	$nlo,$Htbl,$nlo
-	addq	$nhi,$Htbl,$nhi
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	ldq	$Tlo0,8($nlo)
-	xor	$t0,$Zlo,$Zlo
-
-	xor	$Tlo1,$Zlo,$Zlo
-	xor	$Thi1,$Zhi,$Zhi
-	ldq	$Thi0,0($nlo)
-	unop
-
-
-.Loophi$N:
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	subq	$cnt,1,$cnt
-	srl	$Zlo,4,$Zlo
-
-	ldq	$Tlo1,8($nhi)
-	xor	$rem,$Zhi,$Zhi
-	ldq	$Thi1,0($nhi)
-	s8addq	$remp,$rem_4bit,$remp
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	xor	$t0,$Zlo,$Zlo
-	extbl	$Xhi,$cnt,$nlo
-
-	and	$nlo,0xf0,$nhi
-	xor	$Thi0,$Zhi,$Zhi
-	xor	$Tlo0,$Zlo,$Zlo
-	sll	$nlo,4,$nlo
-
-
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	and	$nlo,0xf0,$nlo
-	srl	$Zlo,4,$Zlo
-
-	s8addq	$remp,$rem_4bit,$remp
-	xor	$rem,$Zhi,$Zhi
-	addq	$nlo,$Htbl,$nlo
-	addq	$nhi,$Htbl,$nhi
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	ldq	$Tlo0,8($nlo)
-	xor	$t0,$Zlo,$Zlo
-
-	xor	$Tlo1,$Zlo,$Zlo
-	xor	$Thi1,$Zhi,$Zhi
-	ldq	$Thi0,0($nlo)
-	bne	$cnt,.Loophi$N
-
-
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	srl	$Zlo,4,$Zlo
-
-	ldq	$Tlo1,8($nhi)
-	xor	$rem,$Zhi,$Zhi
-	ldq	$Thi1,0($nhi)
-	s8addq	$remp,$rem_4bit,$remp
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	xor	$t0,$Zlo,$Zlo
-
-	xor	$Tlo0,$Zlo,$Zlo
-	xor	$Thi0,$Zhi,$Zhi
-
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	srl	$Zlo,4,$Zlo
-
-	s8addq	$remp,$rem_4bit,$remp
-	xor	$rem,$Zhi,$Zhi
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	xor	$Tlo1,$Zlo,$Zlo
-	xor	$Thi1,$Zhi,$Zhi
-	xor	$t0,$Zlo,$Zlo
-	xor	$rem,$Zhi,$Zhi
-___
-}}
-
-$code=<<___;
-#ifdef __linux__
-#include <asm/regdef.h>
-#else
-#include <asm.h>
-#include <regdef.h>
-#endif
-
-.text
-
-.set	noat
-.set	noreorder
-.globl	gcm_gmult_4bit
-.align	4
-.ent	gcm_gmult_4bit
-gcm_gmult_4bit:
-	.frame	sp,0,ra
-	.prologue 0
-
-	ldq	$Xlo,8($Xi)
-	ldq	$Xhi,0($Xi)
-
-	br	$rem_4bit,.Lpic1
-.Lpic1:	lda	$rem_4bit,rem_4bit-.Lpic1($rem_4bit)
-___
-
-	&loop();
-
-$code.=<<___;
-	srl	$Zlo,24,$t0	# byte swap
-	srl	$Zlo,8,$t1
-
-	sll	$Zlo,8,$t2
-	sll	$Zlo,24,$Zlo
-	zapnot	$t0,0x11,$t0
-	zapnot	$t1,0x22,$t1
-
-	zapnot	$Zlo,0x88,$Zlo
-	or	$t0,$t1,$t0
-	zapnot	$t2,0x44,$t2
-
-	or	$Zlo,$t0,$Zlo
-	srl	$Zhi,24,$t0
-	srl	$Zhi,8,$t1
-
-	or	$Zlo,$t2,$Zlo
-	sll	$Zhi,8,$t2
-	sll	$Zhi,24,$Zhi
-
-	srl	$Zlo,32,$Xlo
-	sll	$Zlo,32,$Zlo
-
-	zapnot	$t0,0x11,$t0
-	zapnot	$t1,0x22,$t1
-	or	$Zlo,$Xlo,$Xlo
-
-	zapnot	$Zhi,0x88,$Zhi
-	or	$t0,$t1,$t0
-	zapnot	$t2,0x44,$t2
-
-	or	$Zhi,$t0,$Zhi
-	or	$Zhi,$t2,$Zhi
-
-	srl	$Zhi,32,$Xhi
-	sll	$Zhi,32,$Zhi
-
-	or	$Zhi,$Xhi,$Xhi
-	stq	$Xlo,8($Xi)
-	stq	$Xhi,0($Xi)
-
-	ret	(ra)
-.end	gcm_gmult_4bit
-___
-
-$inhi="s0";
-$inlo="s1";
-
-$code.=<<___;
-.globl	gcm_ghash_4bit
-.align	4
-.ent	gcm_ghash_4bit
-gcm_ghash_4bit:
-	lda	sp,-32(sp)
-	stq	ra,0(sp)
-	stq	s0,8(sp)
-	stq	s1,16(sp)
-	.mask	0x04000600,-32
-	.frame	sp,32,ra
-	.prologue 0
-
-	ldq_u	$inhi,0($inp)
-	ldq_u	$Thi0,7($inp)
-	ldq_u	$inlo,8($inp)
-	ldq_u	$Tlo0,15($inp)
-	ldq	$Xhi,0($Xi)
-	ldq	$Xlo,8($Xi)
-
-	br	$rem_4bit,.Lpic2
-.Lpic2:	lda	$rem_4bit,rem_4bit-.Lpic2($rem_4bit)
-
-.Louter:
-	extql	$inhi,$inp,$inhi
-	extqh	$Thi0,$inp,$Thi0
-	or	$inhi,$Thi0,$inhi
-	lda	$inp,16($inp)
-
-	extql	$inlo,$inp,$inlo
-	extqh	$Tlo0,$inp,$Tlo0
-	or	$inlo,$Tlo0,$inlo
-	subq	$len,16,$len
-
-	xor	$Xlo,$inlo,$Xlo
-	xor	$Xhi,$inhi,$Xhi
-___
-
-	&loop();
-
-$code.=<<___;
-	srl	$Zlo,24,$t0	# byte swap
-	srl	$Zlo,8,$t1
-
-	sll	$Zlo,8,$t2
-	sll	$Zlo,24,$Zlo
-	zapnot	$t0,0x11,$t0
-	zapnot	$t1,0x22,$t1
-
-	zapnot	$Zlo,0x88,$Zlo
-	or	$t0,$t1,$t0
-	zapnot	$t2,0x44,$t2
-
-	or	$Zlo,$t0,$Zlo
-	srl	$Zhi,24,$t0
-	srl	$Zhi,8,$t1
-
-	or	$Zlo,$t2,$Zlo
-	sll	$Zhi,8,$t2
-	sll	$Zhi,24,$Zhi
-
-	srl	$Zlo,32,$Xlo
-	sll	$Zlo,32,$Zlo
-	beq	$len,.Ldone
-
-	zapnot	$t0,0x11,$t0
-	zapnot	$t1,0x22,$t1
-	or	$Zlo,$Xlo,$Xlo
-	ldq_u	$inhi,0($inp)
-
-	zapnot	$Zhi,0x88,$Zhi
-	or	$t0,$t1,$t0
-	zapnot	$t2,0x44,$t2
-	ldq_u	$Thi0,7($inp)
-
-	or	$Zhi,$t0,$Zhi
-	or	$Zhi,$t2,$Zhi
-	ldq_u	$inlo,8($inp)
-	ldq_u	$Tlo0,15($inp)
-
-	srl	$Zhi,32,$Xhi
-	sll	$Zhi,32,$Zhi
-
-	or	$Zhi,$Xhi,$Xhi
-	br	zero,.Louter
-
-.Ldone:
-	zapnot	$t0,0x11,$t0
-	zapnot	$t1,0x22,$t1
-	or	$Zlo,$Xlo,$Xlo
-
-	zapnot	$Zhi,0x88,$Zhi
-	or	$t0,$t1,$t0
-	zapnot	$t2,0x44,$t2
-
-	or	$Zhi,$t0,$Zhi
-	or	$Zhi,$t2,$Zhi
-
-	srl	$Zhi,32,$Xhi
-	sll	$Zhi,32,$Zhi
-
-	or	$Zhi,$Xhi,$Xhi
-
-	stq	$Xlo,8($Xi)
-	stq	$Xhi,0($Xi)
-
-	.set	noreorder
-	/*ldq	ra,0(sp)*/
-	ldq	s0,8(sp)
-	ldq	s1,16(sp)
-	lda	sp,32(sp)
-	ret	(ra)
-.end	gcm_ghash_4bit
-
-.align	4
-rem_4bit:
-	.quad	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
-	.quad	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
-	.quad	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
-	.quad	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
-.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
-.align	4
-
-___
-$output=shift and open STDOUT,">$output";
-print $code;
-close STDOUT;
-
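
The file removed above was the Alpha-specific assembly backend for the two GHASH entry points, gcm_gmult_4bit and gcm_ghash_4bit. For reference, here is a minimal C sketch of the 4-bit table-driven multiplication these routines implement, modeled on OpenSSL's portable path in crypto/modes/gcm128.c; the bswap64 helper is an illustrative local function, and a little-endian host (as the Alpha is) is assumed:

#include <stdint.h>

/* One table entry: the hi/lo halves of i*H for a 4-bit index i.
 * 16 entries of 16 bytes make up the 256-byte per-key table the
 * deleted module's header comment refers to. */
typedef struct { uint64_t hi, lo; } u128;

/* Reduction constants, pre-shifted into the top 16 bits -- the
 * 128-byte shared table the Alpha code stores as rem_4bit
 * (.quad 0x0000<<48, 0x1C20<<48, ...). */
static const uint64_t rem_4bit[16] = {
    0x0000ULL << 48, 0x1C20ULL << 48, 0x3840ULL << 48, 0x2460ULL << 48,
    0x7080ULL << 48, 0x6CA0ULL << 48, 0x48C0ULL << 48, 0x54E0ULL << 48,
    0xE100ULL << 48, 0xFD20ULL << 48, 0xD940ULL << 48, 0xC560ULL << 48,
    0x9180ULL << 48, 0x8DA0ULL << 48, 0xA9C0ULL << 48, 0xB5E0ULL << 48
};

static uint64_t bswap64(uint64_t x)     /* illustrative helper */
{
    x = (x >> 32) | (x << 32);
    x = ((x & 0xFFFF0000FFFF0000ULL) >> 16) | ((x & 0x0000FFFF0000FFFFULL) << 16);
    return ((x & 0xFF00FF00FF00FF00ULL) >> 8) | ((x & 0x00FF00FF00FF00FFULL) << 8);
}

/* Xi = Xi * H in GF(2^128), consuming one 4-bit nibble per step. */
void gcm_gmult_4bit(uint64_t Xi[2], const u128 Htable[16])
{
    const uint8_t *xi = (const uint8_t *)Xi;
    u128 Z;
    uint64_t rem;
    int cnt = 15;
    unsigned nlo = xi[15], nhi = nlo >> 4;

    nlo &= 0xf;
    Z = Htable[nlo];                    /* lowest nibble seeds Z */

    for (;;) {
        /* shift Z right 4 bits, folding the dropped bits back in */
        rem  = Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];
        Z.hi ^= Htable[nhi].hi;         /* high nibble of this byte */
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;

        nlo  = xi[cnt];
        nhi  = nlo >> 4;
        nlo &= 0xf;

        rem  = Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];
        Z.hi ^= Htable[nlo].hi;         /* low nibble of next byte */
        Z.lo ^= Htable[nlo].lo;
    }

    /* store back big-endian, as the Alpha code's zapnot byte-swap
     * sequence does before the final stq of $Xlo/$Xhi */
    Xi[0] = bswap64(Z.hi);
    Xi[1] = bswap64(Z.lo);
}

The deleted .Looplo/.Loophi pairs are this inner loop modulo-scheduled over the low and high halves of Xi, and gcm_ghash_4bit is the same multiplication with 16 input bytes XORed into Xi first, as the .Louter code does with its ldq_u/extql/extqh unaligned loads.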