Index: newlib/libc/machine/x86_64/memset.S
diff --git a/newlib/libc/machine/x86_64/memset.S b/newlib/libc/machine/x86_64/memset.S
index c16cd58106ee87c78b82f640a273e63496ddcca4..f7a4165de755894c841e3ee391ef19abc463b1fb 100644
--- a/newlib/libc/machine/x86_64/memset.S
+++ b/newlib/libc/machine/x86_64/memset.S
@@ -13,78 +13,126 @@
 	.global SYM (memset)
 	SOTYPE_FUNCTION(memset)
+	/*
+	 * Avoid revealing the sandbox base address.  In particular, we never read
+	 * from %r11, and we avoid using the string instructions (stos) because
+	 * they leave the full 64 bits in %rdi.
+	 */
+
 SYM (memset):
 	movl %edi, %r9d /* Save return value */
 	movl %esi, %eax
 	movl %edx, %ecx
 	cmpl $16, %edx
-	jb byte_set
+	jb .Lbyte_set
 	movl %edi, %r8d /* Align on quad word boundary */
 	andl $7, %r8d
-	jz quadword_aligned
+	jz .Lquadword_aligned
 	movl $8, %ecx
 	subl %r8d, %ecx
 	subl %ecx, %edx
-	rep stosb %al, %nacl:(%rdi), %r15
+.Lheader_loop:
+	movb %al, %nacl:(%r15, %rdi)
+	inc %edi
+	dec %ecx
+	jnz .Lheader_loop
+
 	movl %edx, %ecx
-quadword_aligned:
+.Lquadword_aligned:
 	movabs $0x0101010101010101, %r8
 	movzbl %sil, %eax
 	imul %r8, %rax
 	cmpl $256, %edx
-	jb quadword_set
+	jb .Lquadword_set
-	shrl $7, %ecx /* Store 128 bytes at a time with minimum cache polution */
+	shrl $7, %ecx /* Store 128 bytes at a time */
+
+	/* Save the frame pointer and use it as a base to avoid repeated masking */
+	pushq $0
+	movl %ebp, (%rsp)
+	naclrestbp %edi, %r15
 	.p2align 4
-loop:
-	movntiq %rax, %nacl: (%r15,%rdi)
-	movntiq %rax, %nacl: 8 (%r15,%rdi)
-	movntiq %rax, %nacl: 16 (%r15,%rdi)
-	movntiq %rax, %nacl: 24 (%r15,%rdi)
-	movntiq %rax, %nacl: 32 (%r15,%rdi)
-	movntiq %rax, %nacl: 40 (%r15,%rdi)
-	movntiq %rax, %nacl: 48 (%r15,%rdi)
-	movntiq %rax, %nacl: 56 (%r15,%rdi)
-	movntiq %rax, %nacl: 64 (%r15,%rdi)
-	movntiq %rax, %nacl: 72 (%r15,%rdi)
-	movntiq %rax, %nacl: 80 (%r15,%rdi)
-	movntiq %rax, %nacl: 88 (%r15,%rdi)
-	movntiq %rax, %nacl: 96 (%r15,%rdi)
-	movntiq %rax, %nacl: 104 (%r15,%rdi)
-	movntiq %rax, %nacl: 112 (%r15,%rdi)
-	movntiq %rax, %nacl: 120 (%r15,%rdi)
-
-	leal 128 (%rdi), %edi
+.Lquadword_aligned_loop:
+
+	movq %rax, (%rbp)
+	movq %rax, 8 (%rbp)
+	movq %rax, 16 (%rbp)
+	movq %rax, 24 (%rbp)
+	movq %rax, 32 (%rbp)
+	movq %rax, 40 (%rbp)
+	movq %rax, 48 (%rbp)
+	movq %rax, 56 (%rbp)
+	movq %rax, 64 (%rbp)
+	movq %rax, 72 (%rbp)
+	movq %rax, 80 (%rbp)
+	movq %rax, 88 (%rbp)
+	movq %rax, 96 (%rbp)
+	movq %rax, 104 (%rbp)
+	movq %rax, 112 (%rbp)
+	movq %rax, 120 (%rbp)
+
+	.bundle_lock
+	leal 128 (%rbp), %ebp
+	add %r15, %rbp
+	.bundle_unlock
 	dec %ecx
-	jnz loop
+	jnz .Lquadword_aligned_loop
+
+	movl %ebp, %edi
+	popq %r8
+	naclrestbp %r8d, %r15
-	sfence
 	movl %edx, %ecx
 	andl $127, %ecx
-	rep stosb %al, %nacl:(%rdi), %r15
+	jz .Lfooter_loop_end
+.Lfooter_loop:
+	movb %al, %nacl:(%r15, %rdi)
+	inc %edi
+	dec %ecx
+	jnz .Lfooter_loop
+.Lfooter_loop_end:
 	movl %r9d, %eax
 	pop %r11
 	nacljmp %r11d, %r15
-byte_set:
-	rep stosb %al, %nacl:(%rdi), %r15
+.Lbyte_set:
+	testl %ecx, %ecx
+	jz .Lbyte_set_end
+.Lbyte_set_loop:
+	movb %al, %nacl:(%r15, %rdi)
+	inc %edi
+	dec %ecx
+	jnz .Lbyte_set_loop
+.Lbyte_set_end:
 	movl %r9d, %eax
 	pop %r11
 	nacljmp %r11d, %r15
-quadword_set:
+.Lquadword_set:
 	shrl $3, %ecx
+	jz .Lquadword_loop_end
 	.p2align 4
-	rep stosq %rax, %nacl:(%rdi), %r15
+.Lquadword_loop:
+	movq %rax, %nacl:(%r15, %rdi)
+	add $8, %edi
+	dec %ecx
+	jnz .Lquadword_loop
+.Lquadword_loop_end:
 	movl %edx, %ecx
 	andl $7, %ecx
-	rep stosb %al, %nacl:(%rdi), %r15 /* Store the remaining bytes */
+	jz .Lquadword_footer_end
+.Lquadword_footer:
+	movb %al, %nacl:(%r15, %rdi)
+	inc %edi
+	dec %ecx
+	jnz .Lquadword_footer
+.Lquadword_footer_end:
 	movl %r9d, %eax
 	pop %r11
 	nacljmp %r11d, %r15
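
For reference, here is a rough C model of the control flow the patched routine implements. It is not part of the patch and memset_model is a name made up for this sketch; it only shows the structure: the short-count byte loop (.Lbyte_set_loop), the alignment header (.Lheader_loop), the replicated 8-byte pattern built by the movabs/movzbl/imul sequence and stored quadword at a time (.Lquadword_loop and the unrolled 128-byte .Lquadword_aligned_loop), and the byte tails (.Lfooter_loop / .Lquadword_footer).

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative model only; the real implementation is the assembly above. */
static void *memset_model(void *dst, int c, size_t n)
{
    unsigned char *p = dst;
    unsigned char byte = (unsigned char)c;
    /* movabs $0x0101010101010101 / movzbl %sil / imul: replicate the byte. */
    uint64_t pattern = 0x0101010101010101ull * byte;

    if (n < 16) {                   /* .Lbyte_set: short fills, one byte at a time */
        while (n--)
            *p++ = byte;
        return dst;
    }

    while ((uintptr_t)p & 7) {      /* .Lheader_loop: align to an 8-byte boundary */
        *p++ = byte;
        n--;
    }

    while (n >= 8) {                /* .Lquadword_loop / .Lquadword_aligned_loop */
        memcpy(p, &pattern, 8);     /* one quadword store */
        p += 8;
        n -= 8;
    }

    while (n--)                     /* .Lfooter_loop / .Lquadword_footer: tail bytes */
        *p++ = byte;
    return dst;
}

int main(void)
{
    unsigned char buf[300];
    memset_model(buf, 0xAB, sizeof buf);
    for (size_t i = 0; i < sizeof buf; i++)
        assert(buf[i] == 0xAB);
    return 0;
}

The model keeps the destination as an ordinary pointer throughout; the point of the assembly changes is that the real routine instead works with 32-bit offsets (%edi, %ebp) and only forms full sandboxed addresses via %r15, so the sandbox base never ends up in a register an exploit could read, as it would after rep stos.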