Index: sandbox/linux/seccomp/syscall.cc |
=================================================================== |
--- sandbox/linux/seccomp/syscall.cc (revision 45661) |
+++ sandbox/linux/seccomp/syscall.cc (working copy) |
@@ -46,8 +46,17 @@ |
".globl playground$syscallWrapper\n" |
".type playground$syscallWrapper, @function\n" |
#if defined(__x86_64__) |
+ // Check for rt_sigreturn(). It needs to be handled specially. |
+ "cmp $15, %rax\n" // NR_rt_sigreturn |
+ "jnz 1f\n" |
+ "add $0x90, %rsp\n" // pop return addresses and red zone |
+ "0:syscall\n" // rt_sigreturn() is unrestricted |
+ "mov $66, %edi\n" // rt_sigreturn() should never return |
+ "mov $231, %eax\n" // NR_exit_group |
+ "jmp 0b\n" |
+ |
// Save all registers |
- "push %rbp\n" |
+ "1:push %rbp\n" |
"mov %rsp, %rbp\n" |
"push %rbx\n" |
"push %rcx\n" |
@@ -70,7 +79,7 @@ |
// Check range of system call |
"cmp playground$maxSyscall(%rip), %eax\n" |
- "ja 1f\n" |
+ "ja 3f\n" |
// Retrieve function call from system call table (c.f. syscall_table.c). |
// We have three different types of entries; zero for denied system calls, |
@@ -86,9 +95,9 @@ |
// Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise |
// jump to fallback handler. |
"cmp $1, %r10\n" |
- "jbe 1f\n" |
+ "jbe 3f\n" |
"call *%r10\n" |
- "0:" |
+ "2:" |
// Restore CPU registers, except for %rax which was set by the system call. |
"pop %r15\n" |
@@ -113,7 +122,7 @@ |
// Return to caller |
"ret\n" |
- "1:" |
+ "3:" |
// If we end up calling a specific handler, we don't need to know the |
// system call number. However, in the generic case, we do. Shift |
// registers so that the system call number becomes visible as the |
@@ -129,10 +138,55 @@ |
// Call default handler. |
"call playground$defaultSystemCallHandler\n" |
"pop %r9\n" |
- "jmp 0b\n" |
+ "jmp 2b\n" |
#elif defined(__i386__) |
+ "cmp $119, %eax\n" // NR_sigreturn |
+ "jnz 1f\n" |
+ "add $0x4, %esp\n" // pop return address |
+ "0:int $0x80\n" // sigreturn() is unrestricted |
+ "mov $66, %ebx\n" // sigreturn() should never return |
+ "mov %ebx, %eax\n" // NR_exit |
+ "jmp 0b\n" |
+ "1:cmp $173, %eax\n" // NR_rt_sigreturn |
+ "jnz 3f\n" |
+ |
+ // Convert rt_sigframe into sigframe, allowing us to call sigreturn(). |
+ // This is possible since the first part of signal stack frames have |
+ // stayed very stable since the earliest kernel versions. While never |
+ // officially documented, lots of user space applications rely on this |
+ // part of the ABI, and kernel developers have been careful to maintain |
+ // backwards compatibility. |
+ // In general, the rt_sigframe includes a lot of extra information that |
+ // the signal handler can look at. Most notably, this means a complete |
+ // siginfo record. |
+ // Fortunately though, the kernel doesn't look at any of this extra data |
+ // when returning from a signal handler. So, we can safely convert an |
+ // rt_sigframe to a legacy sigframe, discarding the extra data in the |
+ // process. Interestingly, the legacy signal frame is actually larger than |
+ // the rt signal frame, as it includes a lot more padding. |
+ "sub $0x1C8, %esp\n" // a legacy signal stack is much larger |
+ "mov 0x1CC(%esp), %eax\n" // push signal number |
+ "push %eax\n" |
+ "lea 0x270(%esp), %esi\n" // copy siginfo register values |
+ "lea 0x4(%esp), %edi\n" // into new location |
+ "mov $0x16, %ecx\n" |
+ "cld\n" |
+ "rep movsl\n" |
+ "mov 0x2C8(%esp), %ebx\n" // copy first half of signal mask |
+ "mov %ebx, 0x54(%esp)\n" |
+ "lea 2f, %esi\n" |
+ "push %esi\n" // push restorer function |
+ "lea 0x2D4(%esp), %edi\n" // patch up retcode magic numbers |
+ "movb $2, %cl\n" |
+ "rep movsl\n" |
+ "ret\n" // return to restorer function |
+ "2:pop %eax\n" // remove dummy argument (signo) |
+ "mov $119, %eax\n" // NR_sigaction |
+ "int $0x80\n" |
+ |
+ |
// Preserve all registers |
- "push %ebx\n" |
+ "3:push %ebx\n" |
"push %ecx\n" |
"push %edx\n" |
"push %esi\n" |
@@ -150,7 +204,7 @@ |
// Check range of system call |
"cmp playground$maxSyscall, %eax\n" |
- "ja 5f\n" |
+ "ja 9f\n" |
// We often have long sequences of calls to gettimeofday(). This is |
// needlessly expensive. Coalesce them into a single call. |
@@ -164,9 +218,9 @@ |
// or maybe, if we have recently seen requests to compute |
// the time. There might be a repeated pattern of those. |
"cmp $78, %eax\n" // __NR_gettimeofday |
- "jnz 2f\n" |
+ "jnz 6f\n" |
"cmp %eax, %fs:0x102C-0x58\n" // last system call |
- "jnz 0f\n" |
+ "jnz 4f\n" |
// This system call and the last system call prior to this one both are |
// calls to gettimeofday(). Try to avoid making the new call and just |
@@ -174,7 +228,7 @@ |
// Just in case the caller is spinning on the result from gettimeofday(), |
// every so often, call the actual system call. |
"decl %fs:0x1030-0x58\n" // countdown calls to gettimofday() |
- "jz 0f\n" |
+ "jz 4f\n" |
// Atomically read the 64bit word representing last-known timestamp and |
// return it to the caller. On x86-32 this is a little more complicated and |
@@ -186,11 +240,11 @@ |
"mov %edx, 4(%ebx)\n" |
"xor %eax, %eax\n" |
"add $28, %esp\n" |
- "jmp 4f\n" |
+ "jmp 8f\n" |
// This is a call to gettimeofday(), but we don't have a valid cached |
// result, yet. |
- "0:mov %eax, %fs:0x102C-0x58\n" // remember syscall number |
+ "4:mov %eax, %fs:0x102C-0x58\n" // remember syscall number |
"movl $500, %fs:0x1030-0x58\n" // make system call, each 500 invocations |
"call playground$defaultSystemCallHandler\n" |
@@ -201,17 +255,17 @@ |
"mov 0(%ebx), %ebx\n" |
"mov 100f, %eax\n" |
"mov 101f, %edx\n" |
- "1:lock; cmpxchg8b 100f\n" |
- "jnz 1b\n" |
+ "5:lock; cmpxchg8b 100f\n" |
+ "jnz 5b\n" |
"xor %eax, %eax\n" |
- "jmp 6f\n" |
+ "jmp 10f\n" |
// Remember the number of the last system call made. We deliberately do |
// not remember calls to gettid(), as we have often seen long sequences |
// of calls to just gettimeofday() and gettid(). In that situation, we |
// would still like to coalesce the gettimeofday() calls. |
- "2:cmp $224, %eax\n" // __NR_gettid |
- "jz 3f\n" |
+ "6:cmp $224, %eax\n" // __NR_gettid |
+ "jz 7f\n" |
"mov %eax, %fs:0x102C-0x58\n" // remember syscall number |
// Retrieve function call from system call table (c.f. syscall_table.c). |
@@ -219,7 +273,7 @@ |
// that should be handled by the defaultSystemCallHandler(); minus one |
// for unrestricted system calls that need to be forwarded to the trusted |
// thread; and function pointers to specific handler functions. |
- "3:shl $3, %eax\n" |
+ "7:shl $3, %eax\n" |
"lea playground$syscallTable, %ebx\n" |
"add %ebx, %eax\n" |
"mov 0(%eax), %eax\n" |
@@ -227,13 +281,13 @@ |
// Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise |
// jump to fallback handler. |
"cmp $1, %eax\n" |
- "jbe 5f\n" |
+ "jbe 9f\n" |
"add $4, %esp\n" |
"call *%eax\n" |
"add $24, %esp\n" |
// Restore CPU registers, except for %eax which was set by the system call. |
- "4:pop %ebp\n" |
+ "8:pop %ebp\n" |
"pop %edi\n" |
"pop %esi\n" |
"pop %edx\n" |
@@ -244,9 +298,9 @@ |
"ret\n" |
// Call default handler. |
- "5:call playground$defaultSystemCallHandler\n" |
- "6:add $28, %esp\n" |
- "jmp 4b\n" |
+ "9:call playground$defaultSystemCallHandler\n" |
+ "10:add $28, %esp\n" |
+ "jmp 8b\n" |
".pushsection \".bss\"\n" |
".balign 8\n" |
@@ -267,9 +321,9 @@ |
void* arg5) { |
// TODO(markus): The following comment is currently not true, we do intercept these system calls. Try to fix that. |
- // We try to avoid intercepting read(), write(), and sigreturn(), as |
- // these system calls are not restricted in Seccomp mode. But depending on |
- // the exact instruction sequence in libc, we might not be able to reliably |
+ // We try to avoid intercepting read(), and write(), as these system calls |
+ // are not restricted in Seccomp mode. But depending on the exact |
+ // instruction sequence in libc, we might not be able to reliably |
// filter out these system calls at the time when we instrument the code. |
SysCalls sys; |
long rc; |
@@ -283,10 +337,6 @@ |
Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call"); |
rc = sys.write((long)arg0, arg1, (size_t)arg2); |
break; |
- case __NR_rt_sigreturn: |
- Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call"); |
- rc = sys.rt_sigreturn((unsigned long)arg0); |
- break; |
default: |
if (Debug::isEnabled()) { |
// In debug mode, prevent stderr from being closed |