| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "debug.h" | |
| 6 #include "sandbox_impl.h" | |
| 7 #include "syscall_table.h" | |
| 8 | |
| 9 namespace playground { | |
| 10 | |
// TODO(markus): change this into a function that returns the address of the
//               assembly code. If that isn't possible for sandbox_clone, then
//               move that function into a *.S file

// File-scope assembly that defines two entry points in the .text section:
//
//   playground$sandbox_clone   - trampoline used for clone(); it computes the
//                                address of the registers saved by the system
//                                call wrapper, hands it to
//                                playground$sandbox__clone as an extra
//                                argument and tail-jumps there.
//   playground$syscallWrapper  - entry point reached by the untrusted code in
//                                place of a system call. It special-cases the
//                                sigreturn family, saves the caller's
//                                registers, range-checks the system call
//                                number against playground$maxSyscall, and then
//                                either calls the handler stored in
//                                playground$syscallTable or falls back to
//                                playground$defaultSystemCallHandler.
//
// Only x86-64 and i386 are supported; any other target fails to compile.
asm(
    ".pushsection .text, \"ax\", @progbits\n"

    // This is the special wrapper for the clone() system call. The code
    // relies on the stack layout of the system call wrapper (c.f. below). It
    // passes the stack pointer as an additional argument to sandbox__clone(),
    // so that upon starting the child, register values can be restored and
    // the child can start executing at the correct IP, instead of trying to
    // run in the trusted thread.
    "playground$sandbox_clone:"
    ".globl playground$sandbox_clone\n"
    ".type playground$sandbox_clone, @function\n"
  #if defined(__x86_64__)
    // Skip the 8 byte return address into the system call wrapper. The
    // following bytes are the saved register values that we need to restore
    // upon return from clone() in the new thread.
    "lea 8(%rsp), %r9\n"
    "jmp playground$sandbox__clone\n"
  #elif defined(__i386__)
    // As i386 passes function arguments on the stack, we need to skip a few
    // more values before we can get to the saved registers. The computed
    // address is written into the stack slot at 24(%esp) before jumping.
    "lea 28(%esp), %eax\n"
    "mov %eax, 24(%esp)\n"
    "jmp playground$sandbox__clone\n"
  #else
  #error Unsupported target platform
  #endif
    ".size playground$sandbox_clone, .-playground$sandbox_clone\n"


    // This is the wrapper which is called by the untrusted code, trying to
    // make a system call.
    "playground$syscallWrapper:"
    ".internal playground$syscallWrapper\n"
    ".globl playground$syscallWrapper\n"
    ".type playground$syscallWrapper, @function\n"
  #if defined(__x86_64__)
    // Check for rt_sigreturn(). It needs to be handled specially.
    "cmp $15, %rax\n"                // NR_rt_sigreturn
    "jnz 1f\n"
    "add $0x90, %rsp\n"              // pop return addresses and red zone
    "0:syscall\n"                    // rt_sigreturn() is unrestricted
    "mov $66, %edi\n"                // rt_sigreturn() should never return;
    "mov $231, %eax\n"               // if it does: NR_exit_group, status 66
    "jmp 0b\n"

    // Save all registers
    "1:push %rbp\n"
    "mov %rsp, %rbp\n"
    "push %rbx\n"
    "push %rcx\n"
    "push %rdx\n"
    "push %rsi\n"
    "push %rdi\n"
    "push %r8\n"
    "push %r9\n"
    "push %r10\n"
    "push %r11\n"
    "push %r12\n"
    "push %r13\n"
    "push %r14\n"
    "push %r15\n"

    // Convert from syscall calling conventions to C calling conventions.
    // System calls have a subtly different register ordering than the user-
    // space x86-64 ABI: the fourth argument arrives in %r10 rather than %rcx.
    "mov %r10, %rcx\n"

    // Check range of system call
    "cmp playground$maxSyscall(%rip), %eax\n"
    "ja 3f\n"

    // Retrieve function call from system call table (c.f. syscall_table.c).
    // Entries are indexed with a 16 byte stride and the handler is read from
    // offset 0 of the entry. An entry is either a marker value that sends the
    // call to defaultSystemCallHandler(), or a pointer to a specific handler
    // function.
    "mov %rax, %r10\n"
    "shl $4, %r10\n"
    "lea playground$syscallTable(%rip), %r11\n"
    "add %r11, %r10\n"
    "mov 0(%r10), %r10\n"

    // Jump to function if the entry is above 1 (unsigned compare), otherwise
    // jump to fallback handler. Values 0 and 1 are therefore the two marker
    // values (denied / UNRESTRICTED_SYSCALL).
    // NOTE(review): the previous comment described the unrestricted marker as
    // "minus one", which would not be caught by this compare - confirm the
    // definition of UNRESTRICTED_SYSCALL.
    "cmp $1, %r10\n"
    "jbe 3f\n"
    "call *%r10\n"
    "2:"

    // Restore CPU registers, except for %rax which was set by the system call.
    "pop %r15\n"
    "pop %r14\n"
    "pop %r13\n"
    "pop %r12\n"
    "pop %r11\n"
    "pop %r10\n"
    "pop %r9\n"
    "pop %r8\n"
    "pop %rdi\n"
    "pop %rsi\n"
    "pop %rdx\n"
    "pop %rcx\n"
    "pop %rbx\n"
    "pop %rbp\n"

    // Remove fake return address. This is added in the patching code in
    // library.cc and it makes stack traces a little cleaner.
    "add $8, %rsp\n"

    // Return to caller
    "ret\n"

    "3:"
    // If we end up calling a specific handler, we don't need to know the
    // system call number. However, in the generic case, we do. Shift
    // registers so that the system call number becomes visible as the
    // first function argument. The old sixth argument (%r9) no longer fits
    // in a register and is pushed onto the stack instead.
    "push %r9\n"
    "mov %r8, %r9\n"
    "mov %rcx, %r8\n"
    "mov %rdx, %rcx\n"
    "mov %rsi, %rdx\n"
    "mov %rdi, %rsi\n"
    "mov %rax, %rdi\n"

    // Call default handler, then drop the stack argument again.
    "call playground$defaultSystemCallHandler\n"
    "pop %r9\n"
    "jmp 2b\n"
  #elif defined(__i386__)
    "cmp $119, %eax\n"               // NR_sigreturn
    "jnz 1f\n"
    "add $0x4, %esp\n"               // pop return address
    "0:int $0x80\n"                  // sigreturn() is unrestricted
    "mov $66, %ebx\n"                // sigreturn() should never return;
    // Bug fix: this used to be "mov %ebx, %eax", which loaded the exit status
    // (66) as the system call number instead of NR_exit.
    "mov $1, %eax\n"                 // if it does: NR_exit, status 66
    "jmp 0b\n"
    "1:cmp $173, %eax\n"             // NR_rt_sigreturn
    "jnz 3f\n"

    // Convert rt_sigframe into sigframe, allowing us to call sigreturn().
    // This is possible since the first part of signal stack frames have
    // stayed very stable since the earliest kernel versions. While never
    // officially documented, lots of user space applications rely on this
    // part of the ABI, and kernel developers have been careful to maintain
    // backwards compatibility.
    // In general, the rt_sigframe includes a lot of extra information that
    // the signal handler can look at. Most notably, this means a complete
    // siginfo record.
    // Fortunately though, the kernel doesn't look at any of this extra data
    // when returning from a signal handler. So, we can safely convert an
    // rt_sigframe to a legacy sigframe, discarding the extra data in the
    // process. Interestingly, the legacy signal frame is actually larger than
    // the rt signal frame, as it includes a lot more padding.
    "sub $0x1C8, %esp\n"             // a legacy signal stack is much larger
    "mov 0x1CC(%esp), %eax\n"        // push signal number
    "push %eax\n"
    "lea 0x270(%esp), %esi\n"        // copy siginfo register values
    "lea 0x4(%esp), %edi\n"          // into new location
    "mov $0x16, %ecx\n"
    "cld\n"
    "rep movsl\n"
    "mov 0x2C8(%esp), %ebx\n"        // copy first half of signal mask
    "mov %ebx, 0x54(%esp)\n"
    "lea 2f, %esi\n"
    "push %esi\n"                    // push restorer function
    "lea 0x2D4(%esp), %edi\n"        // patch up retcode magic numbers
    "movb $2, %cl\n"
    "rep movsl\n"
    "ret\n"                          // return to restorer function
    "2:pop %eax\n"                   // remove dummy argument (signo)
    "mov $119, %eax\n"               // NR_sigreturn
    "int $0x80\n"


    // Preserve all registers
    "3:push %ebx\n"
    "push %ecx\n"
    "push %edx\n"
    "push %esi\n"
    "push %edi\n"
    "push %ebp\n"

    // Convert from syscall calling conventions to C calling conventions:
    // push the six argument registers and the system call number (7 words,
    // 28 bytes) so that they form the handler's stack arguments.
    "push %ebp\n"
    "push %edi\n"
    "push %esi\n"
    "push %edx\n"
    "push %ecx\n"
    "push %ebx\n"
    "push %eax\n"

    // Check range of system call
    "cmp playground$maxSyscall, %eax\n"
    "ja 9f\n"

    // We often have long sequences of calls to gettimeofday(). This is
    // needlessly expensive. Coalesce them into a single call.
    //
    // We keep track of state in TLS storage that we can access through
    // the %fs segment register. See trusted_thread.cc for the exact
    // memory layout.
    //
    // TODO(markus): maybe, we should proactively call gettimeofday() and
    //               clock_gettime(), whenever we talk to the trusted thread?
    //               or maybe, if we have recently seen requests to compute
    //               the time. There might be a repeated pattern of those.
    "cmp $78, %eax\n"                // __NR_gettimeofday
    "jnz 6f\n"
    "cmp %eax, %fs:0x102C-0x58\n"    // last system call
    "jnz 4f\n"

    // This system call and the last system call prior to this one both are
    // calls to gettimeofday(). Try to avoid making the new call and just
    // return the same result as in the previous call.
    // Just in case the caller is spinning on the result from gettimeofday(),
    // every so often, call the actual system call.
    "decl %fs:0x1030-0x58\n"         // countdown calls to gettimeofday()
    "jz 4f\n"

    // Atomically read the 64bit word representing last-known timestamp and
    // return it to the caller. On x86-32 this is a little more complicated and
    // requires the use of the cmpxchg8b instruction. Whether or not the
    // compare succeeds, %edx:%eax ends up holding the current value at 100f;
    // it is then stored through the caller's first argument (%ebx).
    "mov %ebx, %eax\n"
    "mov %ecx, %edx\n"
    "lock; cmpxchg8b 100f\n"
    "mov %eax, 0(%ebx)\n"
    "mov %edx, 4(%ebx)\n"
    "xor %eax, %eax\n"               // report success
    "add $28, %esp\n"                // drop the 7 pushed argument words
    "jmp 8f\n"

    // This is a call to gettimeofday(), but we don't have a valid cached
    // result, yet.
    "4:mov %eax, %fs:0x102C-0x58\n"  // remember syscall number
    "movl $500, %fs:0x1030-0x58\n"   // make system call, each 500 invocations
    "call playground$defaultSystemCallHandler\n"

    // Returned from gettimeofday(). Remember return value, in case the
    // application calls us again right away.
    // Again, this has to happen atomically and requires cmpxchg8b.
    // NOTE(review): the handler's return value in %eax is overwritten here and
    // 0 is returned unconditionally - confirm that this is intended.
    "mov 4(%ebx), %ecx\n"
    "mov 0(%ebx), %ebx\n"
    "mov 100f, %eax\n"
    "mov 101f, %edx\n"
    "5:lock; cmpxchg8b 100f\n"
    "jnz 5b\n"                       // retry until the 64bit store succeeds
    "xor %eax, %eax\n"
    "jmp 10f\n"

    // Remember the number of the last system call made. We deliberately do
    // not remember calls to gettid(), as we have often seen long sequences
    // of calls to just gettimeofday() and gettid(). In that situation, we
    // would still like to coalesce the gettimeofday() calls.
    "6:cmp $224, %eax\n"             // __NR_gettid
    "jz 7f\n"
    "mov %eax, %fs:0x102C-0x58\n"    // remember syscall number

    // Retrieve function call from system call table (c.f. syscall_table.c).
    // Entries are indexed with an 8 byte stride and the handler is read from
    // offset 0 of the entry. An entry is either a marker value that sends the
    // call to defaultSystemCallHandler(), or a pointer to a specific handler
    // function.
    "7:shl $3, %eax\n"
    "lea playground$syscallTable, %ebx\n"
    "add %ebx, %eax\n"
    "mov 0(%eax), %eax\n"

    // Jump to function if the entry is above 1 (unsigned compare), otherwise
    // jump to fallback handler. Values 0 and 1 are therefore the two marker
    // values (denied / UNRESTRICTED_SYSCALL).
    // NOTE(review): the previous comment described the unrestricted marker as
    // "minus one", which would not be caught by this compare - confirm the
    // definition of UNRESTRICTED_SYSCALL.
    "cmp $1, %eax\n"
    "jbe 9f\n"
    "add $4, %esp\n"                 // specific handlers don't get the number
    "call *%eax\n"
    "add $24, %esp\n"                // drop the 6 remaining argument words

    // Restore CPU registers, except for %eax which was set by the system call.
    "8:pop %ebp\n"
    "pop %edi\n"
    "pop %esi\n"
    "pop %edx\n"
    "pop %ecx\n"
    "pop %ebx\n"

    // Return to caller
    "ret\n"

    // Call default handler.
    "9:call playground$defaultSystemCallHandler\n"
    "10:add $28, %esp\n"             // drop the 7 pushed argument words
    "jmp 8b\n"

    // 8 byte aligned storage for the cached gettimeofday() result
    // (100: low word, 101: high word).
    ".pushsection \".bss\"\n"
    ".balign 8\n"
    "100:.byte 0, 0, 0, 0\n"
    "101:.byte 0, 0, 0, 0\n"
    ".popsection\n"

  #else
  #error Unsupported target platform
  #endif
    ".size playground$syscallWrapper, .-playground$syscallWrapper\n"
    ".popsection\n"
);
| 317 | |
| 318 | |
| 319 void* Sandbox::defaultSystemCallHandler(int syscallNum, void* arg0, void* arg1, | |
| 320 void* arg2, void* arg3, void* arg4, | |
| 321 void* arg5) { | |
| 322 // TODO(markus): The following comment is currently not true, we do intercept
these system calls. Try to fix that. | |
| 323 | |
| 324 // We try to avoid intercepting read(), and write(), as these system calls | |
| 325 // are not restricted in Seccomp mode. But depending on the exact | |
| 326 // instruction sequence in libc, we might not be able to reliably | |
| 327 // filter out these system calls at the time when we instrument the code. | |
| 328 SysCalls sys; | |
| 329 long rc; | |
| 330 long long tm; | |
| 331 switch (syscallNum) { | |
| 332 case __NR_read: | |
| 333 Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call"); | |
| 334 rc = sys.read((long)arg0, arg1, (size_t)arg2); | |
| 335 break; | |
| 336 case __NR_write: | |
| 337 Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call"); | |
| 338 rc = sys.write((long)arg0, arg1, (size_t)arg2); | |
| 339 break; | |
| 340 default: | |
| 341 if (Debug::isEnabled()) { | |
| 342 // In debug mode, prevent stderr from being closed | |
| 343 if (syscallNum == __NR_close && arg0 == (void *)2) | |
| 344 return 0; | |
| 345 } | |
| 346 | |
| 347 if ((unsigned)syscallNum <= maxSyscall && | |
| 348 syscallTable[syscallNum].handler == UNRESTRICTED_SYSCALL) { | |
| 349 Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call"); | |
| 350 perform_unrestricted: | |
| 351 struct { | |
| 352 int sysnum; | |
| 353 void* unrestricted_req[6]; | |
| 354 } __attribute__((packed)) request = { | |
| 355 syscallNum, { arg0, arg1, arg2, arg3, arg4, arg5 } }; | |
| 356 | |
| 357 int thread = threadFdPub(); | |
| 358 void* rc; | |
| 359 if (write(sys, thread, &request, sizeof(request)) != sizeof(request) || | |
| 360 read(sys, thread, &rc, sizeof(rc)) != sizeof(rc)) { | |
| 361 die("Failed to forward unrestricted system call"); | |
| 362 } | |
| 363 Debug::elapsed(tm, syscallNum); | |
| 364 return rc; | |
| 365 } else if (Debug::isEnabled()) { | |
| 366 Debug::syscall(&tm, syscallNum, | |
| 367 "In production mode, this call would be disallowed"); | |
| 368 goto perform_unrestricted; | |
| 369 } else { | |
| 370 return (void *)-ENOSYS; | |
| 371 } | |
| 372 } | |
| 373 if (rc < 0) { | |
| 374 rc = -sys.my_errno; | |
| 375 } | |
| 376 Debug::elapsed(tm, syscallNum); | |
| 377 return (void *)rc; | |
| 378 } | |
| 379 | |
| 380 } // namespace | |
| OLD | NEW |