Chromium Code Reviews
| Index: newlib/libc/machine/x86_64/memcpy.S |
| diff --git a/newlib/libc/machine/x86_64/memcpy.S b/newlib/libc/machine/x86_64/memcpy.S |
| index 0041d5f52352004a8ee62db454beacf027e79ea5..8492ce0dd017a2640166507493f5917bb5e72796 100644 |
| --- a/newlib/libc/machine/x86_64/memcpy.S |
| +++ b/newlib/libc/machine/x86_64/memcpy.S |
| @@ -16,21 +16,26 @@ |
| SYM (memcpy): |
| movl edi, eax /* Store destination in return value */ |
| cmpl $16, edx |
| - jb byte_copy |
| + jb .Lbyte_copy |
| movl edi, r8d /* Align destination on quad word boundary */ |
| andl $7, r8d |
| - jz quadword_aligned |
| + jz .Lquadword_aligned |
| movl $8, ecx |
| subl r8d, ecx |
| subl ecx, edx |
| - rep movsb %nacl:(rsi), %nacl:(rdi), r15 |
| +.Lheader_loop: |
| + movb %nacl:(r15, rsi), r8b |
| + inc esi |
| + movb r8b, %nacl:(r15, rdi) |
| + inc edi |
| + dec ecx |
| + jnz .Lheader_loop |
| -quadword_aligned: |
| - movl esi, esi /* We must clear top half for prefetch */ |
| +.Lquadword_aligned: |
| cmpl $256, edx |
| - jb quadword_copy |
| + jb .Lquadword_copy |
| pushq rax |
| pushq r12 |
| @@ -40,57 +45,85 @@ quadword_aligned: |
| movl edx, ecx /* Copy 128 bytes at a time with minimum cache polution */ |
|
Mark Seaborn
2015/02/20 00:43:45
Remove "with minimum cache polution"?
Derek Schuff
2015/02/23 17:19:43
Done.
|
| shrl $7, ecx |
| + /* |
| + * Avoid revealing the sandbox base address. |
| + * In particular this means that we don't do the following: |
| + * movq 32(r15,rsi), r11 |
| + * ... |
| + * movq r11, %nacl:32(r15,rdi) |
| + * because the latter instruction might be reached via a direct or |
| + * indirect jump when r11 contains the sandbox base address in its |
| + * top 32 bits, and this would write the sandbox base address into |
| + * memory. We treat r11 as a write-only register to avoid |
| + * revealing the sandbox base address to user code. |
| + * Instead, we spill rdx and use that. Additionally, we avoid string |
| + * instructions (movs) because they leave the full 64 bits in rsi/rdi. |
| + */ |
| + pushq $0 |
| + movl ebp, (rsp) |
| + pushq rdx |
|
Mark Seaborn
2015/02/20 00:43:45
Maybe add "/* Save byte count */"
Derek Schuff
2015/02/23 17:19:43
Done.
|
| .p2align 4 |
| -loop: |
| - prefetchnta 768 (r15,rsi) |
| - prefetchnta 832 (r15,rsi) |
| - |
| - movq %nacl: (r15,rsi), rax |
| - movq %nacl: 8 (r15,rsi), r8 |
| - movq %nacl: 16 (r15,rsi), r9 |
| - movq %nacl: 24 (r15,rsi), r10 |
| - movq %nacl: 32 (r15,rsi), r11 |
| - movq %nacl: 40 (r15,rsi), r12 |
| - movq %nacl: 48 (r15,rsi), r13 |
| - movq %nacl: 56 (r15,rsi), r14 |
| - |
| - movntiq rax, %nacl: (r15,rdi) |
| - movntiq r8 , %nacl: 8 (r15,rdi) |
| - movntiq r9 , %nacl: 16 (r15,rdi) |
| - movntiq r10, %nacl: 24 (r15,rdi) |
| - movntiq r11, %nacl: 32 (r15,rdi) |
| - movntiq r12, %nacl: 40 (r15,rdi) |
| - movntiq r13, %nacl: 48 (r15,rdi) |
| - movntiq r14, %nacl: 56 (r15,rdi) |
| - |
| - movq %nacl: 64 (r15,rsi), rax |
| - movq %nacl: 72 (r15,rsi), r8 |
| - movq %nacl: 80 (r15,rsi), r9 |
| - movq %nacl: 88 (r15,rsi), r10 |
| - movq %nacl: 96 (r15,rsi), r11 |
| - movq %nacl: 104 (r15,rsi), r12 |
| - movq %nacl: 112 (r15,rsi), r13 |
| - movq %nacl: 120 (r15,rsi), r14 |
| - |
| - movntiq rax, %nacl: 64 (r15,rdi) |
| - movntiq r8 , %nacl: 72 (r15,rdi) |
| - movntiq r9 , %nacl: 80 (r15,rdi) |
| - movntiq r10, %nacl: 88 (r15,rdi) |
| - movntiq r11, %nacl: 96 (r15,rdi) |
| - movntiq r12, %nacl: 104 (r15,rdi) |
| - movntiq r13, %nacl: 112 (r15,rdi) |
| - movntiq r14, %nacl: 120 (r15,rdi) |
| + |
| +.Lloop: |
| + naclrestbp esi, r15 |
| + movq (rbp), rax |
| + movq 8 (rbp), r8 |
| + movq 16 (rbp), r9 |
| + movq 24 (rbp), r10 |
| + movq 32 (rbp), rdx |
| + movq 40 (rbp), r12 |
| + movq 48 (rbp), r13 |
| + movq 56 (rbp), r14 |
| + |
| + naclrestbp edi, r15 |
| + movq rax, (rbp) |
| + movq r8 , 8 (rbp) |
| + movq r9 , 16 (rbp) |
| + movq r10, 24 (rbp) |
| + movq rdx, 32 (rbp) |
| + movq r12, 40 (rbp) |
| + movq r13, 48 (rbp) |
| + movq r14, 56 (rbp) |
| + |
| + naclrestbp esi, r15 |
| + movq 64 (rbp), rax |
| + movq 72 (rbp), r8 |
| + movq 80 (rbp), r9 |
| + movq 88 (rbp), r10 |
| + movq 96 (rbp), rdx |
| + movq 104 (rbp), r12 |
| + movq 112 (rbp), r13 |
| + movq 120 (rbp), r14 |
| + |
| + naclrestbp edi, r15 |
| + movq rax, 64 (rbp) |
| + movq r8 , 72 (rbp) |
| + movq r9 , 80 (rbp) |
| + movq r10, 88 (rbp) |
| + movq rdx, 96 (rbp) |
| + movq r12, 104 (rbp) |
| + movq r13, 112 (rbp) |
| + movq r14, 120 (rbp) |
| leal 128 (rsi), esi |
| leal 128 (rdi), edi |
| dec ecx |
| - jnz loop |
| + jnz .Lloop |
| - sfence |
| - movl edx, ecx |
| + popq rcx |
|
Mark Seaborn
2015/02/20 00:43:45
and "/* Restore byte count */"
Derek Schuff
2015/02/23 17:19:43
Done.
|
| + popq rax |
| + naclrestbp eax, r15 |
| andl $127, ecx |
|
Mark Seaborn
2015/02/20 00:43:45
add "/* Copy the remaining bytes */"
Derek Schuff
2015/02/23 17:19:43
Done.
|
| - rep movsb %nacl:(rsi), %nacl:(rdi), r15 |
| + jz .Lrep1_end |
| +.Lrep1: |
| + movb %nacl:(r15, rsi), r8b |
| + inc esi |
| + movb r8b, %nacl:(r15, rdi) |
| + inc edi |
| + dec ecx |
| + jnz .Lrep1 |
| +.Lrep1_end: |
| popq r14 |
| popq r13 |
| popq r12 |
| @@ -99,20 +132,45 @@ loop: |
| nacljmp r11d, r15 |
| -byte_copy: |
| - movl edx, ecx |
| - rep movsb %nacl:(rsi), %nacl:(rdi), r15 |
| +.Lbyte_copy: |
| + testl edx, edx |
| + jz .Lbyte_copy_end |
| +.Lbyte_copy_loop: |
| + movb %nacl:(r15, rsi), r8b |
| + inc esi |
| + movb r8b, %nacl:(r15, rdi) |
| + inc edi |
| + dec edx |
| + jnz .Lbyte_copy_loop |
| +.Lbyte_copy_end: |
| pop r11 |
| nacljmp r11d, r15 |
| -quadword_copy: |
| +.Lquadword_copy: |
| movl edx, ecx |
| shrl $3, ecx |
| + jz .Lrep2_end |
| .p2align 4 |
| - rep movsq %nacl:(rsi), %nacl:(rdi), r15 |
| +.Lrep2: |
| + movq %nacl:(r15, rsi), r8 |
| + add $8, esi |
| + movq r8, %nacl:(r15, rdi) |
| + add $8, edi |
| + dec ecx |
| + jnz .Lrep2 |
| +.Lrep2_end: |
| movl edx, ecx |
|
Mark Seaborn
2015/02/20 00:43:45
You could use edx from here to the end, and avoid the extra `movl edx, ecx` [comment truncated in this capture — presumably it ends here; verify against the original review thread].
Derek Schuff
2015/02/23 17:19:43
Done.
|
| andl $7, ecx |
| - rep movsb %nacl:(rsi), %nacl:(rdi), r15 /* Copy the remaining bytes */ |
| + jz .Lrep3_end |
| +.Lrep3: |
| + /* Copy the remaining bytes */ |
| + movb %nacl:(r15, rsi), r8b |
| + inc esi |
| + movb r8b, %nacl:(r15, rdi) |
| + inc edi |
| + dec ecx |
| + jnz .Lrep3 |
| +.Lrep3_end: |
| pop r11 |
| nacljmp r11d, r15 |