Chromium Code Reviews| Index: newlib/libc/machine/x86_64/memset.S |
| diff --git a/newlib/libc/machine/x86_64/memset.S b/newlib/libc/machine/x86_64/memset.S |
| index c16cd58106ee87c78b82f640a273e63496ddcca4..a46dbfcbf6a63174045b4269f5055d95e73a673b 100644 |
| --- a/newlib/libc/machine/x86_64/memset.S |
| +++ b/newlib/libc/machine/x86_64/memset.S |
| @@ -13,78 +13,119 @@ |
| .global SYM (memset) |
| SOTYPE_FUNCTION(memset) |
| + /* |
| + * Avoid revealing the sandbox base address. In particular, we never read from |
| + * r11, and avoid using the string instructions (stos) because they leave the |
| + * full 64 bits in rdi. |
| + */ |
| + |
| SYM (memset): |
| movl edi, r9d /* Save return value */ |
| movl esi, eax |
| movl edx, ecx |
| cmpl $16, edx |
| - jb byte_set |
| + jb .Lbyte_set |
| movl edi, r8d /* Align on quad word boundary */ |
| andl $7, r8d |
| - jz quadword_aligned |
| + jz .Lquadword_aligned |
| movl $8, ecx |
| subl r8d, ecx |
| subl ecx, edx |
| - rep stosb al, %nacl:(rdi), r15 |
| +.Lheader_loop: |
| + movb al, %nacl:(r15, rdi) |
| + inc edi |
| + dec ecx |
| + jnz .Lheader_loop |
| + |
| movl edx, ecx |
| -quadword_aligned: |
| +.Lquadword_aligned: |
| movabs $0x0101010101010101, r8 |
| movzbl sil, eax |
| imul r8, rax |
| cmpl $256, edx |
| - jb quadword_set |
| + jb .Lquadword_set |
| - shrl $7, ecx /* Store 128 bytes at a time with minimum cache polution */ |
| + shrl $7, ecx /* Store 128 bytes at a time */ |
| + pushq $0 |
| + movl ebp, (rsp) |
| .p2align 4 |
| -loop: |
| - movntiq rax, %nacl: (r15,rdi) |
| - movntiq rax, %nacl: 8 (r15,rdi) |
| - movntiq rax, %nacl: 16 (r15,rdi) |
| - movntiq rax, %nacl: 24 (r15,rdi) |
| - movntiq rax, %nacl: 32 (r15,rdi) |
| - movntiq rax, %nacl: 40 (r15,rdi) |
| - movntiq rax, %nacl: 48 (r15,rdi) |
| - movntiq rax, %nacl: 56 (r15,rdi) |
| - movntiq rax, %nacl: 64 (r15,rdi) |
| - movntiq rax, %nacl: 72 (r15,rdi) |
| - movntiq rax, %nacl: 80 (r15,rdi) |
| - movntiq rax, %nacl: 88 (r15,rdi) |
| - movntiq rax, %nacl: 96 (r15,rdi) |
| - movntiq rax, %nacl: 104 (r15,rdi) |
| - movntiq rax, %nacl: 112 (r15,rdi) |
| - movntiq rax, %nacl: 120 (r15,rdi) |
| +.Lquadword_aligned_loop: |
| + naclrestbp edi, r15 |
| + movq rax, (rbp) |
| + movq rax, 8 (rbp) |
| + movq rax, 16 (rbp) |
| + movq rax, 24 (rbp) |
| + movq rax, 32 (rbp) |
| + movq rax, 40 (rbp) |
| + movq rax, 48 (rbp) |
| + movq rax, 56 (rbp) |
| + movq rax, 64 (rbp) |
| + movq rax, 72 (rbp) |
| + movq rax, 80 (rbp) |
| + movq rax, 88 (rbp) |
| + movq rax, 96 (rbp) |
| + movq rax, 104 (rbp) |
| + movq rax, 112 (rbp) |
| + movq rax, 120 (rbp) |
| leal 128 (rdi), edi |
|
jvoung (off chromium)
2015/02/27 20:33:52
Might have been able to add to ebp and then restore… [rest of comment truncated in page capture]
Derek Schuff
2015/02/27 21:03:19
I like it. Done.
|
| dec ecx |
| - jnz loop |
| + jnz .Lquadword_aligned_loop |
| + |
| + popq r8 |
| + naclrestbp r8d, r15 |
| - sfence |
| movl edx, ecx |
| andl $127, ecx |
| - rep stosb al, %nacl:(rdi), r15 |
| + jz .Lfooter_loop_end |
| +.Lfooter_loop: |
| + movb al, %nacl:(r15, rdi) |
| + inc edi |
| + dec ecx |
| + jnz .Lfooter_loop |
| +.Lfooter_loop_end: |
| movl r9d, eax |
| pop r11 |
| nacljmp r11d, r15 |
| -byte_set: |
| - rep stosb al, %nacl:(rdi), r15 |
| +.Lbyte_set: |
| + testl ecx, ecx |
| + jz .Lbyte_set_end |
| +.Lbyte_set_loop: |
| + movb al, %nacl:(r15, rdi) |
| + inc edi |
| + dec ecx |
| + jnz .Lbyte_set_loop |
| +.Lbyte_set_end: |
| movl r9d, eax |
| pop r11 |
| nacljmp r11d, r15 |
| -quadword_set: |
| +.Lquadword_set: |
| shrl $3, ecx |
| + jz .Lquadword_loop_end |
| .p2align 4 |
| - rep stosq rax, %nacl:(rdi), r15 |
| +.Lquadword_loop: |
| + movq rax, %nacl:(r15, rdi) |
| + add $8, edi |
| + dec ecx |
| + jnz .Lquadword_loop |
| +.Lquadword_loop_end: |
| movl edx, ecx |
| andl $7, ecx |
| - rep stosb al, %nacl:(rdi), r15 /* Store the remaining bytes */ |
| + jz .Lquadword_footer_end |
| +.Lquadword_footer: |
| + movb al, %nacl:(r15, rdi) |
| + inc edi |
| + dec ecx |
| + jnz .Lquadword_footer |
| +.Lquadword_footer_end: |
| movl r9d, eax |
| pop r11 |
| nacljmp r11d, r15 |