| Index: mozilla/security/nss/lib/freebl/arcfour-amd64-masm.asm
|
| diff --git a/mozilla/security/nss/lib/freebl/arcfour-amd64-masm.asm b/mozilla/security/nss/lib/freebl/arcfour-amd64-masm.asm
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..1601c4f899895dd7ee2d0825a987268442c0d6bd
|
| --- /dev/null
|
| +++ b/mozilla/security/nss/lib/freebl/arcfour-amd64-masm.asm
|
| @@ -0,0 +1,125 @@
|
| +; This Source Code Form is subject to the terms of the Mozilla Public
|
| +; License, v. 2.0. If a copy of the MPL was not distributed with this
|
| +; file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
| +
|
| +; ** ARCFOUR implementation optimized for AMD64.
|
| +; **
|
| +; ** The throughput achieved by this code is about 320 MBytes/sec, on
|
| +; ** a 1.8 GHz AMD Opteron (rev C0) processor.
|
| +
|
| +.CODE
|
| +
|
| +; extern void ARCFOUR(RC4Context *cx, unsigned long long inputLen,
|
| +; const unsigned char *input, unsigned char *output);
|
| +;
|
| +; ABI: Microsoft x64 (rcx = cx, rdx = inputLen, r8 = input, r9 = output).
|
| +; The context is read as: x at [cx], y at [cx+8], and the table d[] as
|
| +; 8-byte slots starting at [cx+16].
|
| +; Writes inputLen bytes of (input XOR keystream) to output and stores the
|
| +; low byte of the updated x and y back into the context.
|
| +; Clobbers rax, rcx, rdx, r8, r9, r11 and flags; rbp, rbx, rsi and rdi
|
| +; are saved in the prolog and restored before returning.
|
| +;
|
| +; The prolog pushes nonvolatile registers, so the procedure is declared
|
| +; FRAME and every push is paired with .pushreg. This emits the unwind
|
| +; data the Windows x64 ABI requires for any function that changes rsp;
|
| +; without it a fault inside the loops cannot be unwound past this frame.
|
| +
|
| +
|
| +ARCFOUR PROC FRAME
|
| +
|
| + push rbp
|
| + .pushreg rbp
|
| + push rbx
|
| + .pushreg rbx
|
| + push rsi
|
| + .pushreg rsi
|
| + push rdi
|
| + .pushreg rdi
|
| + .endprolog
|
| +
|
| + mov rbp, rcx ; key = ARG(key)
|
| + mov rbx, rdx ; rbx = ARG(len)
|
| + mov rsi, r8 ; in = ARG(in)
|
| + mov rdi, r9 ; out = ARG(out)
|
| + mov rcx, [rbp] ; x = key->x
|
| + mov rdx, [rbp+8] ; y = key->y
|
| + add rbp, 16 ; d = key->data
|
| + inc rcx ; x++ (x runs one step ahead; undone at Lfinished)
|
| + and rcx, 0ffh ; x &= 0xff
|
| + lea rbx, [rbx+rsi-8] ; rbx = in+len-8
|
| + mov r9, rbx ; tmp = in+len-8 (rbx is reused as scratch below)
|
| + mov rax, [rbp+rcx*8] ; tx = d[x]
|
| + cmp rbx, rsi ; cmp in with in+len-8
|
| + jl Lend ; fewer than 8 bytes: skip the 8-byte loop
|
| +
|
| +Lstart:
|
| + add rsi, 8 ; increment in
|
| + add rdi, 8 ; increment out
|
| +
|
| + ;
|
| + ; generate the next 8 bytes of the rc4 stream into r8
|
| + ;
|
| +
|
| + mov r11, 8 ; byte counter
|
| +
|
| +@@:
|
| + add dl, al ; y += tx
|
| + mov ebx, [rbp+rdx*8] ; ty = d[y]
|
| + mov [rbp+rcx*8], ebx ; d[x] = ty
|
| + add bl, al ; val = ty + tx
|
| + mov [rbp+rdx*8], eax ; d[y] = tx
|
| + inc cl ; x++ (NEXT ROUND)
|
| + mov eax, [rbp+rcx*8] ; tx = d[x] (NEXT ROUND)
|
| + mov r8b, [rbp+rbx*8] ; val = d[val]
|
| + dec r11b ; sets ZF for the jnz below
|
| + ror r8, 8 ; (ror does not change ZF)
|
| + jnz @b
|
| +
|
| + ;
|
| + ; xor 8 bytes
|
| + ;
|
| +
|
| + xor r8, [rsi-8]
|
| + cmp rsi, r9 ; cmp in+len-8 with in
|
| + mov [rdi-8], r8 ; mov leaves the flags from cmp intact
|
| + jle Lstart
|
| +
|
| +Lend:
|
| + add r9, 8 ; tmp = in+len
|
| +
|
| + ;
|
| + ; handle the last bytes, one by one
|
| + ;
|
| +
|
| +@@:
|
| + cmp r9, rsi ; cmp in with in+len
|
| + jle Lfinished ; jump if (in+len <= in)
|
| + add dl, al ; y += tx
|
| + mov ebx, [rbp+rdx*8] ; ty = d[y]
|
| + mov [rbp+rcx*8], ebx ; d[x] = ty
|
| + add bl, al ; val = ty + tx
|
| + mov [rbp+rdx*8], eax ; d[y] = tx
|
| + inc cl ; x++ (NEXT ROUND)
|
| + mov eax, [rbp+rcx*8] ; tx = d[x] (NEXT ROUND)
|
| + mov r8b, [rbp+rbx*8] ; val = d[val]
|
| + xor r8b, [rsi] ; xor 1 byte
|
| + mov [rdi], r8b
|
| + inc rsi ; in++
|
| + inc rdi ; out++
|
| + jmp @b
|
| +
|
| +Lfinished:
|
| + dec rcx ; x-- (undo the one-step lookahead)
|
| + mov [rbp-8], dl ; key->y = y (only the low byte is written)
|
| + mov [rbp-16], cl ; key->x = x (only the low byte is written)
|
| +
|
| + pop rdi
|
| + pop rsi
|
| + pop rbx
|
| + pop rbp
|
| + ret
|
| +
|
| +ARCFOUR ENDP
|
| +
|
| +END
|
|
|