| Index: newlib/libc/machine/x86_64/memcpy.S
|
| diff --git a/newlib/libc/machine/x86_64/memcpy.S b/newlib/libc/machine/x86_64/memcpy.S
|
| index 0041d5f52352004a8ee62db454beacf027e79ea5..619d7e6423c5ee9317a66f8e34a370c4dcb20a25 100644
|
| --- a/newlib/libc/machine/x86_64/memcpy.S
|
| +++ b/newlib/libc/machine/x86_64/memcpy.S
|
| @@ -16,81 +16,115 @@
|
| SYM (memcpy):
|
| movl edi, eax /* Store destination in return value */
|
| cmpl $16, edx
|
| - jb byte_copy
|
| + jb .Lbyte_copy
|
|
|
| movl edi, r8d /* Align destination on quad word boundary */
|
| andl $7, r8d
|
| - jz quadword_aligned
|
| + jz .Lquadword_aligned
|
| movl $8, ecx
|
| subl r8d, ecx
|
| subl ecx, edx
|
|
|
| - rep movsb %nacl:(rsi), %nacl:(rdi), r15
|
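| + /* Copy the unaligned head one byte at a time; rep movsb is avoided
|
| + * for the reason given in the comment before the 128-byte loop below. */
|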
| +.Lheader_loop:
|
| + movb %nacl:(r15, rsi), r8b
|
| + inc esi
|
| + movb r8b, %nacl:(r15, rdi)
|
| + inc edi
|
| + dec ecx
|
| + jnz .Lheader_loop
|
|
|
| -quadword_aligned:
|
| - movl esi, esi /* We must clear top half for prefetch */
|
| +.Lquadword_aligned:
|
| cmpl $256, edx
|
| - jb quadword_copy
|
| + jb .Lquadword_copy
|
|
|
| pushq rax
|
| pushq r12
|
| pushq r13
|
| pushq r14
|
|
|
| - movl edx, ecx /* Copy 128 bytes at a time with minimum cache polution */
|
| + movl edx, ecx /* Copy 128 bytes at a time */
|
| shrl $7, ecx
|
|
|
| + /*
|
| + * Avoid revealing the sandbox base address.
|
| + * In particular this means that we don't do the following:
|
| + * movq 32(r15,rsi), r11
|
| + * ...
|
| + * movq r11, %nacl:32(r15,rdi)
|
| + * because the latter instruction might be reached via a direct or
|
| + * indirect jump when r11 contains the sandbox base address in its
|
| + * top 32 bits, and this would write the sandbox base address into
|
| + * memory. We treat r11 as a write-only register to avoid
|
| + * revealing the sandbox base address to user code.
|
| + * Instead, we spill rdx and use that. Additionally, we avoid string
|
| + * instructions (movs) because they leave full 64-bit addresses, and
|
| + * hence the sandbox base address, in rsi/rdi.
|
| + */
|
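| + /* Save only the low 32 bits of rbp: pushing the full register would
|
| + * store the sandbox base address (held in its upper half) on the stack. */
|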
| + pushq $0
|
| + movl ebp, (rsp)
|
| + pushq rdx /* Save byte count */
|
| .p2align 4
|
| -loop:
|
| - prefetchnta 768 (r15,rsi)
|
| - prefetchnta 832 (r15,rsi)
|
| -
|
| - movq %nacl: (r15,rsi), rax
|
| - movq %nacl: 8 (r15,rsi), r8
|
| - movq %nacl: 16 (r15,rsi), r9
|
| - movq %nacl: 24 (r15,rsi), r10
|
| - movq %nacl: 32 (r15,rsi), r11
|
| - movq %nacl: 40 (r15,rsi), r12
|
| - movq %nacl: 48 (r15,rsi), r13
|
| - movq %nacl: 56 (r15,rsi), r14
|
| -
|
| - movntiq rax, %nacl: (r15,rdi)
|
| - movntiq r8 , %nacl: 8 (r15,rdi)
|
| - movntiq r9 , %nacl: 16 (r15,rdi)
|
| - movntiq r10, %nacl: 24 (r15,rdi)
|
| - movntiq r11, %nacl: 32 (r15,rdi)
|
| - movntiq r12, %nacl: 40 (r15,rdi)
|
| - movntiq r13, %nacl: 48 (r15,rdi)
|
| - movntiq r14, %nacl: 56 (r15,rdi)
|
| -
|
| - movq %nacl: 64 (r15,rsi), rax
|
| - movq %nacl: 72 (r15,rsi), r8
|
| - movq %nacl: 80 (r15,rsi), r9
|
| - movq %nacl: 88 (r15,rsi), r10
|
| - movq %nacl: 96 (r15,rsi), r11
|
| - movq %nacl: 104 (r15,rsi), r12
|
| - movq %nacl: 112 (r15,rsi), r13
|
| - movq %nacl: 120 (r15,rsi), r14
|
| -
|
| - movntiq rax, %nacl: 64 (r15,rdi)
|
| - movntiq r8 , %nacl: 72 (r15,rdi)
|
| - movntiq r9 , %nacl: 80 (r15,rdi)
|
| - movntiq r10, %nacl: 88 (r15,rdi)
|
| - movntiq r11, %nacl: 96 (r15,rdi)
|
| - movntiq r12, %nacl: 104 (r15,rdi)
|
| - movntiq r13, %nacl: 112 (r15,rdi)
|
| - movntiq r14, %nacl: 120 (r15,rdi)
|
| +
|
| +.Lloop:
|
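| + /* Rebuild rbp as a sandboxed pointer, r15 plus the 32-bit offset in
|
| + * esi, so the eight loads below can use plain rbp-relative addressing. */
|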
| + naclrestbp esi, r15
|
| + movq (rbp), rax
|
| + movq 8 (rbp), r8
|
| + movq 16 (rbp), r9
|
| + movq 24 (rbp), r10
|
| + movq 32 (rbp), rdx
|
| + movq 40 (rbp), r12
|
| + movq 48 (rbp), r13
|
| + movq 56 (rbp), r14
|
| +
|
| + naclrestbp edi, r15
|
| + movq rax, (rbp)
|
| + movq r8 , 8 (rbp)
|
| + movq r9 , 16 (rbp)
|
| + movq r10, 24 (rbp)
|
| + movq rdx, 32 (rbp)
|
| + movq r12, 40 (rbp)
|
| + movq r13, 48 (rbp)
|
| + movq r14, 56 (rbp)
|
| +
|
| + naclrestbp esi, r15
|
| + movq 64 (rbp), rax
|
| + movq 72 (rbp), r8
|
| + movq 80 (rbp), r9
|
| + movq 88 (rbp), r10
|
| + movq 96 (rbp), rdx
|
| + movq 104 (rbp), r12
|
| + movq 112 (rbp), r13
|
| + movq 120 (rbp), r14
|
| +
|
| + naclrestbp edi, r15
|
| + movq rax, 64 (rbp)
|
| + movq r8 , 72 (rbp)
|
| + movq r9 , 80 (rbp)
|
| + movq r10, 88 (rbp)
|
| + movq rdx, 96 (rbp)
|
| + movq r12, 104 (rbp)
|
| + movq r13, 112 (rbp)
|
| + movq r14, 120 (rbp)
|
|
|
| leal 128 (rsi), esi
|
| leal 128 (rdi), edi
|
|
|
| dec ecx
|
| - jnz loop
|
| + jnz .Lloop
|
|
|
| - sfence
|
| - movl edx, ecx
|
| + popq rcx /* Restore byte count */
|
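| + /* Recover the spilled rbp offset and rebuild rbp relative to r15 */
|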
| + popq rax
|
| + naclrestbp eax, r15
|
| + /* Copy the remaining bytes */
|
| andl $127, ecx
|
| - rep movsb %nacl:(rsi), %nacl:(rdi), r15
|
| + jz .Lrep1_end
|
| +.Lrep1:
|
| + movb %nacl:(r15, rsi), r8b
|
| + inc esi
|
| + movb r8b, %nacl:(r15, rdi)
|
| + inc edi
|
| + dec ecx
|
| + jnz .Lrep1
|
| +.Lrep1_end:
|
| popq r14
|
| popq r13
|
| popq r12
|
| @@ -99,20 +133,44 @@ loop:
|
| nacljmp r11d, r15
|
|
|
|
|
| -byte_copy:
|
| - movl edx, ecx
|
| - rep movsb %nacl:(rsi), %nacl:(rdi), r15
|
| +.Lbyte_copy:
|
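| + /* Unlike rep movsb, the explicit loop must check for a zero count */
|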
| + testl edx, edx
|
| + jz .Lbyte_copy_end
|
| +.Lbyte_copy_loop:
|
| + movb %nacl:(r15, rsi), r8b
|
| + inc esi
|
| + movb r8b, %nacl:(r15, rdi)
|
| + inc edi
|
| + dec edx
|
| + jnz .Lbyte_copy_loop
|
| +.Lbyte_copy_end:
|
| pop r11
|
| nacljmp r11d, r15
|
|
|
|
|
| -quadword_copy:
|
| +.Lquadword_copy:
|
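| + /* Copies of fewer than 256 bytes: quadwords first, then a byte tail */
|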
| movl edx, ecx
|
| shrl $3, ecx
|
| + jz .Lrep2_end
|
| .p2align 4
|
| - rep movsq %nacl:(rsi), %nacl:(rdi), r15
|
| - movl edx, ecx
|
| - andl $7, ecx
|
| - rep movsb %nacl:(rsi), %nacl:(rdi), r15 /* Copy the remaining bytes */
|
| +.Lrep2:
|
| + movq %nacl:(r15, rsi), r8
|
| + add $8, esi
|
| + movq r8, %nacl:(r15, rdi)
|
| + add $8, edi
|
| + dec ecx
|
| + jnz .Lrep2
|
| +.Lrep2_end:
|
| + andl $7, edx
|
| + jz .Lrep3_end
|
| +.Lrep3:
|
| + /* Copy the remaining bytes */
|
| + movb %nacl:(r15, rsi), r8b
|
| + inc esi
|
| + movb r8b, %nacl:(r15, rdi)
|
| + inc edi
|
| + dec edx
|
| + jnz .Lrep3
|
| +.Lrep3_end:
|
| pop r11
|
| nacljmp r11d, r15
|
|
|