Index: newlib/libc/machine/x86_64/memcpy.S
diff --git a/newlib/libc/machine/x86_64/memcpy.S b/newlib/libc/machine/x86_64/memcpy.S
index 0041d5f52352004a8ee62db454beacf027e79ea5..619d7e6423c5ee9317a66f8e34a370c4dcb20a25 100644
--- a/newlib/libc/machine/x86_64/memcpy.S
+++ b/newlib/libc/machine/x86_64/memcpy.S
@@ -16,81 +16,115 @@
 SYM (memcpy):
 	movl edi, eax /* Store destination in return value */
 	cmpl $16, edx
-	jb byte_copy
+	jb .Lbyte_copy
 	movl edi, r8d /* Align destination on quad word boundary */
 	andl $7, r8d
-	jz quadword_aligned
+	jz .Lquadword_aligned
 	movl $8, ecx
 	subl r8d, ecx
 	subl ecx, edx
-	rep movsb %nacl:(rsi), %nacl:(rdi), r15
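+	/*
+	 * Copy the unaligned head one byte at a time; see the comment
+	 * before .Lloop for why rep movsb is avoided.
+	 */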
+.Lheader_loop:
+	movb %nacl:(r15, rsi), r8b
+	inc esi
+	movb r8b, %nacl:(r15, rdi)
+	inc edi
+	dec ecx
+	jnz .Lheader_loop
-quadword_aligned:
-	movl esi, esi /* We must clear top half for prefetch */
+.Lquadword_aligned:
 	cmpl $256, edx
-	jb quadword_copy
+	jb .Lquadword_copy
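+	/*
+	 * Copies under 256 bytes take the simple quadword loop; the
+	 * unrolled path below has to save and restore state first.
+	 */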
 	pushq rax
 	pushq r12
 	pushq r13
 	pushq r14
-	movl edx, ecx /* Copy 128 bytes at a time with minimum cache polution */
+	movl edx, ecx /* Copy 128 bytes at a time */
 	shrl $7, ecx
+	/*
+	 * Avoid revealing the sandbox base address.  In particular this
+	 * means that we don't do the following:
+	 *     movq 32(r15,rsi), r11
+	 *     ...
+	 *     movq r11, %nacl:32(r15,rdi)
+	 * because the latter instruction might be reached via a direct or
+	 * indirect jump while r11 contains the sandbox base address in its
+	 * top 32 bits, and it would then write the sandbox base address
+	 * into memory.  We treat r11 as a write-only register so that it
+	 * is never revealed to user code, and spill rdx to use in its
+	 * place.  Additionally, we avoid string instructions (movs)
+	 * because they leave full 64-bit addresses, including the sandbox
+	 * base, in rsi/rdi.
+	 */
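+	/*
+	 * The loop below clobbers rbp, so save it first.  A plain
+	 * pushq rbp would write the sandbox base (rbp's top 32 bits)
+	 * to memory, so push a zero and store only the low 32 bits.
+	 */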
+	pushq $0
+	movl ebp, (rsp)
+	pushq rdx /* Save byte count */
 	.p2align 4
-loop:
-	prefetchnta 768 (r15,rsi)
-	prefetchnta 832 (r15,rsi)
-
-	movq %nacl: (r15,rsi), rax
-	movq %nacl: 8 (r15,rsi), r8
-	movq %nacl: 16 (r15,rsi), r9
-	movq %nacl: 24 (r15,rsi), r10
-	movq %nacl: 32 (r15,rsi), r11
-	movq %nacl: 40 (r15,rsi), r12
-	movq %nacl: 48 (r15,rsi), r13
-	movq %nacl: 56 (r15,rsi), r14
-
-	movntiq rax, %nacl: (r15,rdi)
-	movntiq r8 , %nacl: 8 (r15,rdi)
-	movntiq r9 , %nacl: 16 (r15,rdi)
-	movntiq r10, %nacl: 24 (r15,rdi)
-	movntiq r11, %nacl: 32 (r15,rdi)
-	movntiq r12, %nacl: 40 (r15,rdi)
-	movntiq r13, %nacl: 48 (r15,rdi)
-	movntiq r14, %nacl: 56 (r15,rdi)
-
-	movq %nacl: 64 (r15,rsi), rax
-	movq %nacl: 72 (r15,rsi), r8
-	movq %nacl: 80 (r15,rsi), r9
-	movq %nacl: 88 (r15,rsi), r10
-	movq %nacl: 96 (r15,rsi), r11
-	movq %nacl: 104 (r15,rsi), r12
-	movq %nacl: 112 (r15,rsi), r13
-	movq %nacl: 120 (r15,rsi), r14
-
-	movntiq rax, %nacl: 64 (r15,rdi)
-	movntiq r8 , %nacl: 72 (r15,rdi)
-	movntiq r9 , %nacl: 80 (r15,rdi)
-	movntiq r10, %nacl: 88 (r15,rdi)
-	movntiq r11, %nacl: 96 (r15,rdi)
-	movntiq r12, %nacl: 104 (r15,rdi)
-	movntiq r13, %nacl: 112 (r15,rdi)
-	movntiq r14, %nacl: 120 (r15,rdi)
+
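+	/*
+	 * naclrestbp reloads rbp from an untrusted 32-bit offset plus
+	 * the sandbox base in r15 (roughly: movl esi, ebp then
+	 * addq r15, rbp), so the plain movq accesses through rbp below
+	 * stay inside the sandbox.
+	 */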
+.Lloop:
+	naclrestbp esi, r15
+	movq (rbp), rax
+	movq 8 (rbp), r8
+	movq 16 (rbp), r9
+	movq 24 (rbp), r10
+	movq 32 (rbp), rdx
+	movq 40 (rbp), r12
+	movq 48 (rbp), r13
+	movq 56 (rbp), r14
+
+	naclrestbp edi, r15
+	movq rax, (rbp)
+	movq r8 , 8 (rbp)
+	movq r9 , 16 (rbp)
+	movq r10, 24 (rbp)
+	movq rdx, 32 (rbp)
+	movq r12, 40 (rbp)
+	movq r13, 48 (rbp)
+	movq r14, 56 (rbp)
+
+	naclrestbp esi, r15
+	movq 64 (rbp), rax
+	movq 72 (rbp), r8
+	movq 80 (rbp), r9
+	movq 88 (rbp), r10
+	movq 96 (rbp), rdx
+	movq 104 (rbp), r12
+	movq 112 (rbp), r13
+	movq 120 (rbp), r14
+
+	naclrestbp edi, r15
+	movq rax, 64 (rbp)
+	movq r8 , 72 (rbp)
+	movq r9 , 80 (rbp)
+	movq r10, 88 (rbp)
+	movq rdx, 96 (rbp)
+	movq r12, 104 (rbp)
+	movq r13, 112 (rbp)
+	movq r14, 120 (rbp)
 	leal 128 (rsi), esi
 	leal 128 (rdi), edi
 	dec ecx
-	jnz loop
+	jnz .Lloop
-	sfence
-	movl edx, ecx
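+	/*
+	 * No sfence is needed here any more: the non-temporal movnti
+	 * stores were replaced by ordinary movq stores above.
+	 */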
+	popq rcx /* Restore byte count */
+	popq rax
+	naclrestbp eax, r15
+	/* Copy the remaining bytes */
 	andl $127, ecx
-	rep movsb %nacl:(rsi), %nacl:(rdi), r15
+	jz .Lrep1_end
+.Lrep1:
+	movb %nacl:(r15, rsi), r8b
+	inc esi
+	movb r8b, %nacl:(r15, rdi)
+	inc edi
+	dec ecx
+	jnz .Lrep1
+.Lrep1_end:
 	popq r14
 	popq r13
 	popq r12
@@ -99,20 +133,44 @@ loop:
 	nacljmp r11d, r15
-byte_copy:
-	movl edx, ecx
-	rep movsb %nacl:(rsi), %nacl:(rdi), r15
+.Lbyte_copy:
+	testl edx, edx
+	jz .Lbyte_copy_end
+.Lbyte_copy_loop:
+	movb %nacl:(r15, rsi), r8b
+	inc esi
+	movb r8b, %nacl:(r15, rdi)
+	inc edi
+	dec edx
+	jnz .Lbyte_copy_loop
+.Lbyte_copy_end:
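+	/*
+	 * NaCl return sequence: nacljmp clears the low bits of r11d for
+	 * bundle alignment, adds the sandbox base in r15, and jumps, so
+	 * control cannot leave the sandbox.
+	 */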
 	pop r11
 	nacljmp r11d, r15
-quadword_copy:
+.Lquadword_copy:
 	movl edx, ecx
 	shrl $3, ecx
+	jz .Lrep2_end
 	.p2align 4
-	rep movsq %nacl:(rsi), %nacl:(rdi), r15
-	movl edx, ecx
-	andl $7, ecx
-	rep movsb %nacl:(rsi), %nacl:(rdi), r15 /* Copy the remaining bytes */
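+	/* Copy whole quadwords first; the byte tail follows at .Lrep3. */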
+.Lrep2:
+	movq %nacl:(r15, rsi), r8
+	add $8, esi
+	movq r8, %nacl:(r15, rdi)
+	add $8, edi
+	dec ecx
+	jnz .Lrep2
+.Lrep2_end:
+	andl $7, edx
+	jz .Lrep3_end
+.Lrep3:
+	/* Copy the remaining bytes */
+	movb %nacl:(r15, rsi), r8b
+	inc esi
+	movb r8b, %nacl:(r15, rdi)
+	inc edi
+	dec edx
+	jnz .Lrep3
+.Lrep3_end:
 	pop r11
 	nacljmp r11d, r15