Index: third_party/boringssl/win-x86/crypto/chacha/chacha-x86.asm |
diff --git a/third_party/boringssl/win-x86/crypto/chacha/chacha-x86.asm b/third_party/boringssl/win-x86/crypto/chacha/chacha-x86.asm |
index 283c8e4f0ceb022ab2237a59bff43109fc3beea8..3ba31a2b35094e291232eb1dcc60333f9ffc34b0 100644 |
--- a/third_party/boringssl/win-x86/crypto/chacha/chacha-x86.asm |
+++ b/third_party/boringssl/win-x86/crypto/chacha/chacha-x86.asm |
@@ -272,13 +272,11 @@ L$004loop: |
xor esi,DWORD [36+ebx] |
xor edx,DWORD [48+ebx] |
xor edi,DWORD [56+ebx] |
- mov DWORD [16+esp],ebp |
- mov ebp,DWORD [esp] |
- mov DWORD [32+esp],ecx |
- mov DWORD [36+esp],esi |
- mov DWORD [48+esp],edx |
- mov DWORD [56+esp],edi |
- mov DWORD [eax],ebp |
+ mov DWORD [16+eax],ebp |
+ mov DWORD [32+eax],ecx |
+ mov DWORD [36+eax],esi |
+ mov DWORD [48+eax],edx |
+ mov DWORD [56+eax],edi |
mov ebp,DWORD [4+esp] |
mov ecx,DWORD [8+esp] |
mov esi,DWORD [12+esp] |
@@ -295,42 +293,34 @@ L$004loop: |
xor edx,DWORD [20+ebx] |
xor edi,DWORD [24+ebx] |
mov DWORD [4+eax],ebp |
- mov ebp,DWORD [16+esp] |
mov DWORD [8+eax],ecx |
mov DWORD [12+eax],esi |
- mov DWORD [16+eax],ebp |
mov DWORD [20+eax],edx |
mov DWORD [24+eax],edi |
- mov ecx,DWORD [28+esp] |
- mov edx,DWORD [32+esp] |
- mov edi,DWORD [36+esp] |
- add ecx,DWORD [92+esp] |
- mov ebp,DWORD [40+esp] |
- xor ecx,DWORD [28+ebx] |
+ mov ebp,DWORD [28+esp] |
+ mov ecx,DWORD [40+esp] |
mov esi,DWORD [44+esp] |
- mov DWORD [28+eax],ecx |
- mov DWORD [32+eax],edx |
- mov DWORD [36+eax],edi |
- add ebp,DWORD [104+esp] |
- add esi,DWORD [108+esp] |
- xor ebp,DWORD [40+ebx] |
- xor esi,DWORD [44+ebx] |
- mov DWORD [40+eax],ebp |
- mov DWORD [44+eax],esi |
- mov ecx,DWORD [48+esp] |
- mov esi,DWORD [56+esp] |
mov edx,DWORD [52+esp] |
mov edi,DWORD [60+esp] |
+ add ebp,DWORD [92+esp] |
+ add ecx,DWORD [104+esp] |
+ add esi,DWORD [108+esp] |
add edx,DWORD [116+esp] |
add edi,DWORD [124+esp] |
+ xor ebp,DWORD [28+ebx] |
+ xor ecx,DWORD [40+ebx] |
+ xor esi,DWORD [44+ebx] |
xor edx,DWORD [52+ebx] |
xor edi,DWORD [60+ebx] |
lea ebx,[64+ebx] |
- mov DWORD [48+eax],ecx |
+ mov DWORD [28+eax],ebp |
+ mov ebp,DWORD [esp] |
+ mov DWORD [40+eax],ecx |
mov ecx,DWORD [160+esp] |
+ mov DWORD [44+eax],esi |
mov DWORD [52+eax],edx |
- mov DWORD [56+eax],esi |
mov DWORD [60+eax],edi |
+ mov DWORD [eax],ebp |
lea eax,[64+eax] |
sub ecx,64 |
jnz NEAR L$003outer_loop |
@@ -723,14 +713,24 @@ L$010loop: |
punpcklqdq xmm6,xmm7 |
punpckhqdq xmm1,xmm2 |
punpckhqdq xmm3,xmm7 |
- movdqa [ebx-128],xmm0 |
+ movdqu xmm4,[esi-128] |
+ movdqu xmm5,[esi-64] |
+ movdqu xmm2,[esi] |
+ movdqu xmm7,[64+esi] |
+ lea esi,[16+esi] |
+ pxor xmm4,xmm0 |
movdqa xmm0,[ebx-64] |
- movdqa [ebx-112],xmm1 |
- movdqa [ebx-96],xmm6 |
- movdqa [ebx-80],xmm3 |
+ pxor xmm5,xmm1 |
movdqa xmm1,[ebx-48] |
+ pxor xmm6,xmm2 |
movdqa xmm2,[ebx-32] |
+ pxor xmm7,xmm3 |
movdqa xmm3,[ebx-16] |
+ movdqu [edi-128],xmm4 |
+ movdqu [edi-64],xmm5 |
+ movdqu [edi],xmm6 |
+ movdqu [64+edi],xmm7 |
+ lea edi,[16+edi] |
paddd xmm0,[ebp-64] |
paddd xmm1,[ebp-48] |
paddd xmm2,[ebp-32] |
@@ -747,14 +747,24 @@ L$010loop: |
punpcklqdq xmm6,xmm7 |
punpckhqdq xmm1,xmm2 |
punpckhqdq xmm3,xmm7 |
- movdqa [ebx-64],xmm0 |
+ movdqu xmm4,[esi-128] |
+ movdqu xmm5,[esi-64] |
+ movdqu xmm2,[esi] |
+ movdqu xmm7,[64+esi] |
+ lea esi,[16+esi] |
+ pxor xmm4,xmm0 |
movdqa xmm0,[ebx] |
- movdqa [ebx-48],xmm1 |
- movdqa [ebx-32],xmm6 |
- movdqa [ebx-16],xmm3 |
+ pxor xmm5,xmm1 |
movdqa xmm1,[16+ebx] |
+ pxor xmm6,xmm2 |
movdqa xmm2,[32+ebx] |
+ pxor xmm7,xmm3 |
movdqa xmm3,[48+ebx] |
+ movdqu [edi-128],xmm4 |
+ movdqu [edi-64],xmm5 |
+ movdqu [edi],xmm6 |
+ movdqu [64+edi],xmm7 |
+ lea edi,[16+edi] |
paddd xmm0,[ebp] |
paddd xmm1,[16+ebp] |
paddd xmm2,[32+ebp] |
@@ -771,14 +781,24 @@ L$010loop: |
punpcklqdq xmm6,xmm7 |
punpckhqdq xmm1,xmm2 |
punpckhqdq xmm3,xmm7 |
- movdqa [ebx],xmm0 |
+ movdqu xmm4,[esi-128] |
+ movdqu xmm5,[esi-64] |
+ movdqu xmm2,[esi] |
+ movdqu xmm7,[64+esi] |
+ lea esi,[16+esi] |
+ pxor xmm4,xmm0 |
movdqa xmm0,[64+ebx] |
- movdqa [16+ebx],xmm1 |
- movdqa [32+ebx],xmm6 |
- movdqa [48+ebx],xmm3 |
+ pxor xmm5,xmm1 |
movdqa xmm1,[80+ebx] |
+ pxor xmm6,xmm2 |
movdqa xmm2,[96+ebx] |
+ pxor xmm7,xmm3 |
movdqa xmm3,[112+ebx] |
+ movdqu [edi-128],xmm4 |
+ movdqu [edi-64],xmm5 |
+ movdqu [edi],xmm6 |
+ movdqu [64+edi],xmm7 |
+ lea edi,[16+edi] |
paddd xmm0,[64+ebp] |
paddd xmm1,[80+ebp] |
paddd xmm2,[96+ebp] |
@@ -795,60 +815,20 @@ L$010loop: |
punpcklqdq xmm6,xmm7 |
punpckhqdq xmm1,xmm2 |
punpckhqdq xmm3,xmm7 |
- movdqa [64+ebx],xmm0 |
- movdqa [80+ebx],xmm1 |
- movdqa [96+ebx],xmm6 |
- movdqa [112+ebx],xmm3 |
- movdqu xmm0,[esi-128] |
- movdqu xmm1,[esi-112] |
- movdqu xmm2,[esi-96] |
- movdqu xmm3,[esi-80] |
- pxor xmm0,[ebx-128] |
- pxor xmm1,[ebx-64] |
- pxor xmm2,[ebx] |
- pxor xmm3,[64+ebx] |
- movdqu [edi-128],xmm0 |
- movdqu [edi-112],xmm1 |
- movdqu [edi-96],xmm2 |
- movdqu [edi-80],xmm3 |
- movdqu xmm0,[esi-64] |
- movdqu xmm1,[esi-48] |
- movdqu xmm2,[esi-32] |
- movdqu xmm3,[esi-16] |
- pxor xmm0,[ebx-112] |
- pxor xmm1,[ebx-48] |
- pxor xmm2,[16+ebx] |
- pxor xmm3,[80+ebx] |
- movdqu [edi-64],xmm0 |
- movdqu [edi-48],xmm1 |
- movdqu [edi-32],xmm2 |
- movdqu [edi-16],xmm3 |
- movdqu xmm0,[esi] |
- movdqu xmm1,[16+esi] |
- movdqu xmm2,[32+esi] |
- movdqu xmm3,[48+esi] |
- pxor xmm0,[ebx-96] |
- pxor xmm1,[ebx-32] |
- pxor xmm2,[32+ebx] |
- pxor xmm3,[96+ebx] |
- movdqu [edi],xmm0 |
- movdqu [16+edi],xmm1 |
- movdqu [32+edi],xmm2 |
- movdqu [48+edi],xmm3 |
- movdqu xmm0,[64+esi] |
- movdqu xmm1,[80+esi] |
- movdqu xmm2,[96+esi] |
- movdqu xmm3,[112+esi] |
- pxor xmm0,[ebx-80] |
- pxor xmm1,[ebx-16] |
- pxor xmm2,[48+ebx] |
- pxor xmm3,[112+ebx] |
- movdqu [64+edi],xmm0 |
- movdqu [80+edi],xmm1 |
- movdqu [96+edi],xmm2 |
- movdqu [112+edi],xmm3 |
- lea esi,[256+esi] |
- lea edi,[256+edi] |
+ movdqu xmm4,[esi-128] |
+ movdqu xmm5,[esi-64] |
+ movdqu xmm2,[esi] |
+ movdqu xmm7,[64+esi] |
+ lea esi,[208+esi] |
+ pxor xmm4,xmm0 |
+ pxor xmm5,xmm1 |
+ pxor xmm6,xmm2 |
+ pxor xmm7,xmm3 |
+ movdqu [edi-128],xmm4 |
+ movdqu [edi-64],xmm5 |
+ movdqu [edi],xmm6 |
+ movdqu [64+edi],xmm7 |
+ lea edi,[208+edi] |
sub ecx,256 |
jnc NEAR L$009outer_loop |
add ecx,256 |