Index: openssl/crypto/bn/bn_asm.c |
=================================================================== |
--- openssl/crypto/bn/bn_asm.c (revision 105093) |
+++ openssl/crypto/bn/bn_asm.c (working copy) |
@@ -75,6 +75,7 @@ |
assert(num >= 0); |
if (num <= 0) return(c1); |
+#ifndef OPENSSL_SMALL_FOOTPRINT |
while (num&~3) |
{ |
mul_add(rp[0],ap[0],w,c1); |
@@ -83,11 +84,11 @@ |
mul_add(rp[3],ap[3],w,c1); |
ap+=4; rp+=4; num-=4; |
} |
- if (num) |
+#endif |
+ while (num) |
{ |
- mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1; |
- mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1; |
- mul_add(rp[2],ap[2],w,c1); return c1; |
+ mul_add(rp[0],ap[0],w,c1); |
+ ap++; rp++; num--; |
} |
return(c1); |
@@ -100,6 +101,7 @@ |
assert(num >= 0); |
if (num <= 0) return(c1); |
+#ifndef OPENSSL_SMALL_FOOTPRINT |
while (num&~3) |
{ |
mul(rp[0],ap[0],w,c1); |
@@ -108,11 +110,11 @@ |
mul(rp[3],ap[3],w,c1); |
ap+=4; rp+=4; num-=4; |
} |
- if (num) |
+#endif |
+ while (num) |
{ |
- mul(rp[0],ap[0],w,c1); if (--num == 0) return c1; |
- mul(rp[1],ap[1],w,c1); if (--num == 0) return c1; |
- mul(rp[2],ap[2],w,c1); |
+ mul(rp[0],ap[0],w,c1); |
+ ap++; rp++; num--; |
} |
return(c1); |
} |
@@ -121,6 +123,8 @@ |
{ |
assert(n >= 0); |
if (n <= 0) return; |
+ |
+#ifndef OPENSSL_SMALL_FOOTPRINT |
while (n&~3) |
{ |
sqr(r[0],r[1],a[0]); |
@@ -129,11 +133,11 @@ |
sqr(r[6],r[7],a[3]); |
a+=4; r+=8; n-=4; |
} |
- if (n) |
+#endif |
+ while (n) |
{ |
- sqr(r[0],r[1],a[0]); if (--n == 0) return; |
- sqr(r[2],r[3],a[1]); if (--n == 0) return; |
- sqr(r[4],r[5],a[2]); |
+ sqr(r[0],r[1],a[0]); |
+ a++; r+=2; n--; |
} |
} |
@@ -150,19 +154,21 @@ |
bl=LBITS(w); |
bh=HBITS(w); |
- for (;;) |
+#ifndef OPENSSL_SMALL_FOOTPRINT |
+ while (num&~3) |
{ |
mul_add(rp[0],ap[0],bl,bh,c); |
- if (--num == 0) break; |
mul_add(rp[1],ap[1],bl,bh,c); |
- if (--num == 0) break; |
mul_add(rp[2],ap[2],bl,bh,c); |
- if (--num == 0) break; |
mul_add(rp[3],ap[3],bl,bh,c); |
- if (--num == 0) break; |
- ap+=4; |
- rp+=4; |
+ ap+=4; rp+=4; num-=4; |
} |
+#endif |
+ while (num) |
+ { |
+ mul_add(rp[0],ap[0],bl,bh,c); |
+ ap++; rp++; num--; |
+ } |
return(c); |
} |
@@ -177,19 +183,21 @@ |
bl=LBITS(w); |
bh=HBITS(w); |
- for (;;) |
+#ifndef OPENSSL_SMALL_FOOTPRINT |
+ while (num&~3) |
{ |
mul(rp[0],ap[0],bl,bh,carry); |
- if (--num == 0) break; |
mul(rp[1],ap[1],bl,bh,carry); |
- if (--num == 0) break; |
mul(rp[2],ap[2],bl,bh,carry); |
- if (--num == 0) break; |
mul(rp[3],ap[3],bl,bh,carry); |
- if (--num == 0) break; |
- ap+=4; |
- rp+=4; |
+ ap+=4; rp+=4; num-=4; |
} |
+#endif |
+ while (num) |
+ { |
+ mul(rp[0],ap[0],bl,bh,carry); |
+ ap++; rp++; num--; |
+ } |
return(carry); |
} |
@@ -197,23 +205,22 @@ |
{ |
assert(n >= 0); |
if (n <= 0) return; |
- for (;;) |
+ |
+#ifndef OPENSSL_SMALL_FOOTPRINT |
+ while (n&~3) |
{ |
sqr64(r[0],r[1],a[0]); |
- if (--n == 0) break; |
- |
sqr64(r[2],r[3],a[1]); |
- if (--n == 0) break; |
- |
sqr64(r[4],r[5],a[2]); |
- if (--n == 0) break; |
- |
sqr64(r[6],r[7],a[3]); |
- if (--n == 0) break; |
- |
- a+=4; |
- r+=8; |
+ a+=4; r+=8; n-=4; |
} |
+#endif |
+ while (n) |
+ { |
+ sqr64(r[0],r[1],a[0]); |
+ a++; r+=2; n--; |
+ } |
} |
#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ |
@@ -303,32 +310,31 @@ |
assert(n >= 0); |
if (n <= 0) return((BN_ULONG)0); |
- for (;;) |
+#ifndef OPENSSL_SMALL_FOOTPRINT |
+ while (n&~3) |
{ |
ll+=(BN_ULLONG)a[0]+b[0]; |
r[0]=(BN_ULONG)ll&BN_MASK2; |
ll>>=BN_BITS2; |
- if (--n <= 0) break; |
- |
ll+=(BN_ULLONG)a[1]+b[1]; |
r[1]=(BN_ULONG)ll&BN_MASK2; |
ll>>=BN_BITS2; |
- if (--n <= 0) break; |
- |
ll+=(BN_ULLONG)a[2]+b[2]; |
r[2]=(BN_ULONG)ll&BN_MASK2; |
ll>>=BN_BITS2; |
- if (--n <= 0) break; |
- |
ll+=(BN_ULLONG)a[3]+b[3]; |
r[3]=(BN_ULONG)ll&BN_MASK2; |
ll>>=BN_BITS2; |
- if (--n <= 0) break; |
- |
- a+=4; |
- b+=4; |
- r+=4; |
+ a+=4; b+=4; r+=4; n-=4; |
} |
+#endif |
+ while (n) |
+ { |
+ ll+=(BN_ULLONG)a[0]+b[0]; |
+ r[0]=(BN_ULONG)ll&BN_MASK2; |
+ ll>>=BN_BITS2; |
+ a++; b++; r++; n--; |
+ } |
return((BN_ULONG)ll); |
} |
#else /* !BN_LLONG */ |
@@ -340,7 +346,8 @@ |
if (n <= 0) return((BN_ULONG)0); |
c=0; |
- for (;;) |
+#ifndef OPENSSL_SMALL_FOOTPRINT |
+ while (n&~3) |
{ |
t=a[0]; |
t=(t+c)&BN_MASK2; |
@@ -348,36 +355,37 @@ |
l=(t+b[0])&BN_MASK2; |
c+=(l < t); |
r[0]=l; |
- if (--n <= 0) break; |
- |
t=a[1]; |
t=(t+c)&BN_MASK2; |
c=(t < c); |
l=(t+b[1])&BN_MASK2; |
c+=(l < t); |
r[1]=l; |
- if (--n <= 0) break; |
- |
t=a[2]; |
t=(t+c)&BN_MASK2; |
c=(t < c); |
l=(t+b[2])&BN_MASK2; |
c+=(l < t); |
r[2]=l; |
- if (--n <= 0) break; |
- |
t=a[3]; |
t=(t+c)&BN_MASK2; |
c=(t < c); |
l=(t+b[3])&BN_MASK2; |
c+=(l < t); |
r[3]=l; |
- if (--n <= 0) break; |
- |
- a+=4; |
- b+=4; |
- r+=4; |
+ a+=4; b+=4; r+=4; n-=4; |
} |
+#endif |
+ while(n) |
+ { |
+ t=a[0]; |
+ t=(t+c)&BN_MASK2; |
+ c=(t < c); |
+ l=(t+b[0])&BN_MASK2; |
+ c+=(l < t); |
+ r[0]=l; |
+ a++; b++; r++; n--; |
+ } |
return((BN_ULONG)c); |
} |
#endif /* !BN_LLONG */ |
@@ -390,36 +398,35 @@ |
assert(n >= 0); |
if (n <= 0) return((BN_ULONG)0); |
- for (;;) |
+#ifndef OPENSSL_SMALL_FOOTPRINT |
+ while (n&~3) |
{ |
t1=a[0]; t2=b[0]; |
r[0]=(t1-t2-c)&BN_MASK2; |
if (t1 != t2) c=(t1 < t2); |
- if (--n <= 0) break; |
- |
t1=a[1]; t2=b[1]; |
r[1]=(t1-t2-c)&BN_MASK2; |
if (t1 != t2) c=(t1 < t2); |
- if (--n <= 0) break; |
- |
t1=a[2]; t2=b[2]; |
r[2]=(t1-t2-c)&BN_MASK2; |
if (t1 != t2) c=(t1 < t2); |
- if (--n <= 0) break; |
- |
t1=a[3]; t2=b[3]; |
r[3]=(t1-t2-c)&BN_MASK2; |
if (t1 != t2) c=(t1 < t2); |
- if (--n <= 0) break; |
- |
- a+=4; |
- b+=4; |
- r+=4; |
+ a+=4; b+=4; r+=4; n-=4; |
} |
+#endif |
+ while (n) |
+ { |
+ t1=a[0]; t2=b[0]; |
+ r[0]=(t1-t2-c)&BN_MASK2; |
+ if (t1 != t2) c=(t1 < t2); |
+ a++; b++; r++; n--; |
+ } |
return(c); |
} |
-#ifdef BN_MUL_COMBA |
+#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) |
#undef bn_mul_comba8 |
#undef bn_mul_comba4 |
@@ -820,18 +827,134 @@ |
r[6]=c1; |
r[7]=c2; |
} |
+ |
+#ifdef OPENSSL_NO_ASM |
+#ifdef OPENSSL_BN_ASM_MONT |
+#include <alloca.h> |
+/* |
+ * This is essentially a reference implementation, which may or may not |
+ * result in a performance improvement. E.g. on IA-32 this routine was |
+ * observed to give 40% faster rsa1024 private key operations and 10% |
+ * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only |
+ * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a |
+ * reference implementation, one to be used as a starting point for |
+ * platform-specific assembler. The numbers mentioned apply to compiler- |
+ * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and |
+ * can vary not only from platform to platform, but even between compiler |
+ * versions. Assembler vs. assembler improvement coefficients can |
+ * [and are known to] differ and are to be documented elsewhere. |
+ */ |
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num) |
+ { |
+ BN_ULONG c0,c1,ml,*tp,n0; |
+#ifdef mul64 |
+ BN_ULONG mh; |
+#endif |
+ volatile BN_ULONG *vp; /* volatile alias of tp; used only to zero the scratch words before returning */ |
+ int i=0,j; /* i must start at 0 here: the first loop iteration is entered via goto, bypassing the for-init */ |
+ |
+#if 0 /* template for platform-specific implementation */ |
+ if (ap==bp) return bn_sqr_mont(rp,ap,np,n0p,num); |
+#endif |
+ vp = tp = alloca((num+2)*sizeof(BN_ULONG)); /* stack scratch: words 0..num-1 plus two overflow words tp[num], tp[num+1] */ |
+ |
+ n0 = *n0p; |
+ |
+ c0 = 0; |
+ ml = bp[0]; /* first pass: tp[] is filled from ap[] and bp[0] via mul, with c0 passed as the running carry argument */ |
+#ifdef mul64 |
+ mh = HBITS(ml); |
+ ml = LBITS(ml); |
+ for (j=0;j<num;++j) |
+ mul(tp[j],ap[j],ml,mh,c0); |
+#else |
+ for (j=0;j<num;++j) |
+ mul(tp[j],ap[j],ml,c0); |
+#endif |
+ |
+ tp[num] = c0; /* carry left over from the first pass */ |
+ tp[num+1] = 0; |
+ goto enter; /* jump into the loop body at i==0, skipping the mul_add pass that the mul pass above replaces */ |
+ |
+ for(i=0;i<num;i++) |
+ { |
+ c0 = 0; |
+ ml = bp[i]; /* accumulate the ap[] by bp[i] pass into tp[] */ |
+#ifdef mul64 |
+ mh = HBITS(ml); |
+ ml = LBITS(ml); |
+ for (j=0;j<num;++j) |
+ mul_add(tp[j],ap[j],ml,mh,c0); |
+#else |
+ for (j=0;j<num;++j) |
+ mul_add(tp[j],ap[j],ml,c0); |
+#endif |
+ c1 = (tp[num] + c0)&BN_MASK2; /* fold the pass carry into tp[num] */ |
+ tp[num] = c1; |
+ tp[num+1] = (c1<c0?1:0); /* 1 if that addition wrapped around */ |
+ enter: |
+ c1 = tp[0]; |
+ ml = (c1*n0)&BN_MASK2; /* multiplier applied to np[] in this iteration, derived from tp[0] and n0 */ |
+ c0 = 0; |
+#ifdef mul64 |
+ mh = HBITS(ml); |
+ ml = LBITS(ml); |
+ mul_add(c1,np[0],ml,mh,c0); /* c1 is overwritten below without being stored; only c0 is carried forward */ |
+#else |
+ mul_add(c1,ml,np[0],c0); /* c1 is overwritten below without being stored; only c0 is carried forward */ |
+#endif |
+ for(j=1;j<num;j++) |
+ { |
+ c1 = tp[j]; |
+#ifdef mul64 |
+ mul_add(c1,np[j],ml,mh,c0); |
+#else |
+ mul_add(c1,ml,np[j],c0); |
+#endif |
+ tp[j-1] = c1&BN_MASK2; /* stored one index lower: tp moves down by one word per iteration */ |
+ } |
+ c1 = (tp[num] + c0)&BN_MASK2; |
+ tp[num-1] = c1; /* the overflow words move down one index as well */ |
+ tp[num] = tp[num+1] + (c1<c0?1:0); |
+ } |
+ |
+ if (tp[num]!=0 || tp[num-1]>=np[num-1]) /* overflow word set, or top word not below np's top word: try tp - np */ |
+ { |
+ c0 = bn_sub_words(rp,tp,np,num); /* writes the num-word difference to rp; c0 is its return value (presumably the borrow) */ |
+ if (tp[num]!=0 || c0==0) /* keep the difference already in rp */ |
+ { |
+ for(i=0;i<num+2;i++) vp[i] = 0; /* zero all num+2 scratch words */ |
+ return 1; |
+ } |
+ } |
+ for(i=0;i<num;i++) rp[i] = tp[i], vp[i] = 0; /* otherwise tp itself is the result: copy it out, zeroing as we go */ |
+ vp[num] = 0; |
+ vp[num+1] = 0; |
+ return 1; /* every path through this variant returns 1 */ |
+ } |
+#else |
+/* |
+ * A return value of 0 indicates that the multiplication/convolution was |
+ * not performed, signalling the caller to fall back to the alternative/ |
+ * original code path. |
+ */ |
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num) |
+{ return 0; } |
+#endif /* OPENSSL_BN_ASM_MONT */ |
+#endif |
+ |
#else /* !BN_MUL_COMBA */ |
/* hmm... is it faster just to do a multiply? */ |
#undef bn_sqr_comba4 |
-void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) |
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) |
{ |
BN_ULONG t[8]; |
bn_sqr_normal(r,a,4,t); |
} |
#undef bn_sqr_comba8 |
-void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) |
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) |
{ |
BN_ULONG t[16]; |
bn_sqr_normal(r,a,8,t); |
@@ -857,4 +980,51 @@ |
r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]); |
} |
+#ifdef OPENSSL_NO_ASM |
+#ifdef OPENSSL_BN_ASM_MONT |
+#include <alloca.h> |
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num) |
+ { |
+ BN_ULONG c0,c1,*tp,n0=*n0p; |
+ volatile BN_ULONG *vp; /* volatile alias of tp; used only to zero the scratch words before returning */ |
+ int i=0,j; |
+ |
+ vp = tp = alloca((num+2)*sizeof(BN_ULONG)); /* stack scratch: words 0..num-1 plus two overflow words tp[num], tp[num+1] */ |
+ |
+ for(i=0;i<=num;i++) tp[i]=0; /* clears tp[0..num]; tp[num+1] is assigned in the loop before it is read */ |
+ |
+ for(i=0;i<num;i++) |
+ { |
+ c0 = bn_mul_add_words(tp,ap,num,bp[i]); /* accumulate the ap[] by bp[i] pass into tp[]; c0 is the returned word */ |
+ c1 = (tp[num] + c0)&BN_MASK2; |
+ tp[num] = c1; |
+ tp[num+1] = (c1<c0?1:0); /* 1 if the addition into tp[num] wrapped around */ |
+ |
+ c0 = bn_mul_add_words(tp,np,num,tp[0]*n0); /* accumulate np[] times the multiplier tp[0]*n0 into tp[] */ |
+ c1 = (tp[num] + c0)&BN_MASK2; |
+ tp[num] = c1; |
+ tp[num+1] += (c1<c0?1:0); |
+ for(j=0;j<=num;j++) tp[j]=tp[j+1]; /* move tp down by one word, discarding tp[0] */ |
+ } |
+ |
+ if (tp[num]!=0 || tp[num-1]>=np[num-1]) /* overflow word set, or top word not below np's top word: try tp - np */ |
+ { |
+ c0 = bn_sub_words(rp,tp,np,num); /* writes the num-word difference to rp; c0 is its return value (presumably the borrow) */ |
+ if (tp[num]!=0 || c0==0) /* keep the difference already in rp */ |
+ { |
+ for(i=0;i<num+2;i++) vp[i] = 0; /* zero all num+2 scratch words */ |
+ return 1; |
+ } |
+ } |
+ for(i=0;i<num;i++) rp[i] = tp[i], vp[i] = 0; /* otherwise tp itself is the result: copy it out, zeroing as we go */ |
+ vp[num] = 0; |
+ vp[num+1] = 0; |
+ return 1; /* every path through this variant returns 1 */ |
+ } |
+#else |
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num) |
+{ return 0; } /* stub: always returns 0 and leaves rp untouched, i.e. nothing was computed */ |
+#endif /* OPENSSL_BN_ASM_MONT */ |
+#endif |
+ |
#endif /* !BN_MUL_COMBA */ |