| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "library.h" | |
| 6 #include "sandbox_impl.h" | |
| 7 #include "syscall_table.h" | |
| 8 | |
| 9 namespace playground { | |
| 10 | |
| 11 // Global variables | |
| 12 int Sandbox::proc_self_maps_ = -1; | |
| 13 enum Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN; | |
| 14 int Sandbox::pid_; | |
| 15 int Sandbox::processFdPub_; | |
| 16 int Sandbox::cloneFdPub_; | |
| 17 Sandbox::SysCalls::kernel_sigaction Sandbox::sa_segv_; | |
| 18 Sandbox::ProtectedMap Sandbox::protectedMap_; | |
| 19 std::vector<SecureMem::Args*> Sandbox::secureMemPool_; | |
| 20 | |
| 21 bool Sandbox::sendFd(int transport, int fd0, int fd1, const void* buf, | |
| 22 size_t len) { | |
| 23 int fds[2], count = 0; | |
| 24 if (fd0 >= 0) { fds[count++] = fd0; } | |
| 25 if (fd1 >= 0) { fds[count++] = fd1; } | |
| 26 if (!count) { | |
| 27 return false; | |
| 28 } | |
| 29 char cmsg_buf[CMSG_SPACE(count*sizeof(int))]; | |
| 30 memset(cmsg_buf, 0, sizeof(cmsg_buf)); | |
| 31 struct SysCalls::kernel_iovec iov[2] = { { 0 } }; | |
| 32 struct SysCalls::kernel_msghdr msg = { 0 }; | |
| 33 int dummy = 0; | |
| 34 iov[0].iov_base = &dummy; | |
| 35 iov[0].iov_len = sizeof(dummy); | |
| 36 if (buf && len > 0) { | |
| 37 iov[1].iov_base = const_cast<void *>(buf); | |
| 38 iov[1].iov_len = len; | |
| 39 } | |
| 40 msg.msg_iov = iov; | |
| 41 msg.msg_iovlen = (buf && len > 0) ? 2 : 1; | |
| 42 msg.msg_control = cmsg_buf; | |
| 43 msg.msg_controllen = CMSG_LEN(count*sizeof(int)); | |
| 44 struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); | |
| 45 cmsg->cmsg_level = SOL_SOCKET; | |
| 46 cmsg->cmsg_type = SCM_RIGHTS; | |
| 47 cmsg->cmsg_len = CMSG_LEN(count*sizeof(int)); | |
| 48 memcpy(CMSG_DATA(cmsg), fds, count*sizeof(int)); | |
| 49 SysCalls sys; | |
| 50 return NOINTR_SYS(sys.sendmsg(transport, &msg, 0)) == | |
| 51 (ssize_t)(sizeof(dummy) + ((buf && len > 0) ? len : 0)); | |
| 52 } | |
| 53 | |
| 54 bool Sandbox::getFd(int transport, int* fd0, int* fd1, void* buf, size_t*len) { | |
| 55 int count = 0; | |
| 56 int *err = NULL; | |
| 57 if (fd0) { | |
| 58 count++; | |
| 59 err = fd0; | |
| 60 *fd0 = -1; | |
| 61 } | |
| 62 if (fd1) { | |
| 63 if (!count++) { | |
| 64 err = fd1; | |
| 65 } | |
| 66 *fd1 = -1; | |
| 67 } | |
| 68 if (!count) { | |
| 69 return false; | |
| 70 } | |
| 71 char cmsg_buf[CMSG_SPACE(count*sizeof(int))]; | |
| 72 memset(cmsg_buf, 0, sizeof(cmsg_buf)); | |
| 73 struct SysCalls::kernel_iovec iov[2] = { { 0 } }; | |
| 74 struct SysCalls::kernel_msghdr msg = { 0 }; | |
| 75 iov[0].iov_base = err; | |
| 76 iov[0].iov_len = sizeof(int); | |
| 77 if (buf && len && *len > 0) { | |
| 78 iov[1].iov_base = buf; | |
| 79 iov[1].iov_len = *len; | |
| 80 } | |
| 81 msg.msg_iov = iov; | |
| 82 msg.msg_iovlen = (buf && len && *len > 0) ? 2 : 1; | |
| 83 msg.msg_control = cmsg_buf; | |
| 84 msg.msg_controllen = CMSG_LEN(count*sizeof(int)); | |
| 85 SysCalls sys; | |
| 86 ssize_t bytes = NOINTR_SYS(sys.recvmsg(transport, &msg, 0)); | |
| 87 if (len) { | |
| 88 *len = bytes > (int)sizeof(int) ? | |
| 89 bytes - sizeof(int) : 0; | |
| 90 } | |
| 91 if (bytes != (ssize_t)(sizeof(int) + ((buf && len && *len > 0) ? *len : 0))){ | |
| 92 *err = bytes >= 0 ? 0 : -EBADF; | |
| 93 return false; | |
| 94 } | |
| 95 if (*err) { | |
| 96 // "err" is the first four bytes of the payload. If these are non-zero, | |
| 97 // the sender on the other side of the socketpair sent us an errno value. | |
| 98 // We don't expect to get any file handles in this case. | |
| 99 return false; | |
| 100 } | |
| 101 struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); | |
| 102 if ((msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) || | |
| 103 !cmsg || | |
| 104 cmsg->cmsg_level != SOL_SOCKET || | |
| 105 cmsg->cmsg_type != SCM_RIGHTS || | |
| 106 cmsg->cmsg_len != CMSG_LEN(count*sizeof(int))) { | |
| 107 *err = -EBADF; | |
| 108 return false; | |
| 109 } | |
| 110 if (fd1) { *fd1 = ((int *)CMSG_DATA(cmsg))[--count]; } | |
| 111 if (fd0) { *fd0 = ((int *)CMSG_DATA(cmsg))[--count]; } | |
| 112 return true; | |
| 113 } | |
| 114 | |
| 115 void Sandbox::setupSignalHandlers() { | |
| 116 // Set SIGCHLD to SIG_DFL so that waitpid() can work | |
| 117 SysCalls sys; | |
| 118 struct SysCalls::kernel_sigaction sa; | |
| 119 memset(&sa, 0, sizeof(sa)); | |
| 120 sa.sa_handler_ = SIG_DFL; | |
| 121 sys.sigaction(SIGCHLD, &sa, NULL); | |
| 122 | |
| 123 // Set up SEGV handler for dealing with RDTSC instructions, system calls | |
| 124 // that have been rewritten to use INT0, for sigprocmask() emulation, for | |
| 125 // the creation of threads, and for user-provided SEGV handlers. | |
| 126 sa.sa_sigaction_ = segv(); | |
| 127 sa.sa_flags = SA_SIGINFO | SA_NODEFER; | |
| 128 sys.sigaction(SIGSEGV, &sa, &sa_segv_); | |
| 129 | |
| 130 // Unblock SIGSEGV and SIGCHLD | |
| 131 SysCalls::kernel_sigset_t mask; | |
| 132 memset(&mask, 0x00, sizeof(mask)); | |
| 133 mask.sig[0] |= (1 << (SIGSEGV - 1)) | (1 << (SIGCHLD - 1)); | |
| 134 sys.sigprocmask(SIG_UNBLOCK, &mask, 0); | |
| 135 } | |
| 136 | |
| 137 void (*Sandbox::segv())(int signo, SysCalls::siginfo *context, void *unused) { | |
| 138 void (*fnc)(int signo, SysCalls::siginfo *context, void *unused); | |
| 139 asm volatile( | |
| 140 "call 999f\n" | |
| 141 #if defined(__x86_64__) | |
| 142 // Inspect instruction at the point where the segmentation fault | |
| 143 // happened. If it is RDTSC, forward the request to the trusted | |
| 144 // thread. | |
| 145 "mov $-3, %%r14\n" // request for RDTSC | |
| 146 "mov 0xB0(%%rsp), %%r15\n" // %rip at time of segmentation fault | |
| 147 "cmpw $0x310F, (%%r15)\n" // RDTSC | |
| 148 "jz 0f\n" | |
| 149 "cmpw $0x010F, (%%r15)\n" // RDTSCP | |
| 150 "jnz 8f\n" | |
| 151 "cmpb $0xF9, 2(%%r15)\n" | |
| 152 "jnz 8f\n" | |
| 153 "mov $-4, %%r14\n" // request for RDTSCP | |
| 154 "0:" | |
| 155 #ifndef NDEBUG | |
| 156 "lea 100f(%%rip), %%rdi\n" | |
| 157 "call playground$debugMessage\n" | |
| 158 #endif | |
| 159 "sub $4, %%rsp\n" | |
| 160 "push %%r14\n" | |
| 161 "mov %%gs:16, %%edi\n" // fd = threadFdPub | |
| 162 "mov %%rsp, %%rsi\n" // buf = %rsp | |
| 163 "mov $4, %%edx\n" // len = sizeof(int) | |
| 164 "1:mov $1, %%eax\n" // NR_write | |
| 165 "syscall\n" | |
| 166 "cmp %%rax, %%rdx\n" | |
| 167 "jz 5f\n" | |
| 168 "cmp $-4, %%eax\n" // EINTR | |
| 169 "jz 1b\n" | |
| 170 "2:add $12, %%rsp\n" | |
| 171 "movq $0, 0x98(%%rsp)\n" // %rax at time of segmentation fault | |
| 172 "movq $0, 0x90(%%rsp)\n" // %rdx at time of segmentation fault | |
| 173 "cmpw $0x310F, (%%r15)\n" // RDTSC | |
| 174 "jz 3f\n" | |
| 175 "movq $0, 0xA0(%%rsp)\n" // %rcx at time of segmentation fault | |
| 176 "3:addq $2, 0xB0(%%rsp)\n" // %rip at time of segmentation fault | |
| 177 "cmpw $0x010F, (%%r15)\n" // RDTSC | |
| 178 "jnz 4f\n" | |
| 179 "addq $1, 0xB0(%%rsp)\n" // %rip at time of segmentation fault | |
| 180 "4:ret\n" | |
| 181 "5:mov $12, %%edx\n" // len = 3*sizeof(int) | |
| 182 "6:mov $0, %%eax\n" // NR_read | |
| 183 "syscall\n" | |
| 184 "cmp $-4, %%eax\n" // EINTR | |
| 185 "jz 6b\n" | |
| 186 "cmp %%rax, %%rdx\n" | |
| 187 "jnz 2b\n" | |
| 188 "mov 0(%%rsp), %%eax\n" | |
| 189 "mov 4(%%rsp), %%edx\n" | |
| 190 "mov 8(%%rsp), %%ecx\n" | |
| 191 "add $12, %%rsp\n" | |
| 192 "mov %%rdx, 0x90(%%rsp)\n" // %rdx at time of segmentation fault | |
| 193 "cmpw $0x310F, (%%r15)\n" // RDTSC | |
| 194 "jz 7f\n" | |
| 195 "mov %%rcx, 0xA0(%%rsp)\n" // %rcx at time of segmentation fault | |
| 196 "7:mov %%rax, 0x98(%%rsp)\n" // %rax at time of segmentation fault | |
| 197 "jmp 3b\n" | |
| 198 | |
| 199 // If the instruction is INT 0, then this was probably the result | |
| 200 // of playground::Library being unable to find a way to safely | |
| 201 // rewrite the system call instruction. Retrieve the CPU register | |
| 202 // at the time of the segmentation fault and invoke syscallWrapper(). | |
| 203 "8:cmpw $0x00CD, (%%r15)\n" // INT $0x0 | |
| 204 "jnz 16f\n" | |
| 205 #ifndef NDEBUG | |
| 206 "lea 200f(%%rip), %%rdi\n" | |
| 207 "call playground$debugMessage\n" | |
| 208 #endif | |
| 209 "mov 0x98(%%rsp), %%rax\n" // %rax at time of segmentation fault | |
| 210 "mov 0x70(%%rsp), %%rdi\n" // %rdi at time of segmentation fault | |
| 211 "mov 0x78(%%rsp), %%rsi\n" // %rsi at time of segmentation fault | |
| 212 "mov 0x90(%%rsp), %%rdx\n" // %rdx at time of segmentation fault | |
| 213 "mov 0x40(%%rsp), %%r10\n" // %r10 at time of segmentation fault | |
| 214 "mov 0x30(%%rsp), %%r8\n" // %r8 at time of segmentation fault | |
| 215 "mov 0x38(%%rsp), %%r9\n" // %r9 at time of segmentation fault | |
| 216 | |
| 217 // Handle rt_sigprocmask() | |
| 218 "cmp $14, %%rax\n" // NR_rt_sigprocmask | |
| 219 "jnz 12f\n" | |
| 220 "mov $-22, %%rax\n" // -EINVAL | |
| 221 "cmp $8, %%r10\n" // %r10 = sigsetsize (8 bytes = 64 signals) | |
| 222 "jl 7b\n" | |
| 223 "mov 0x130(%%rsp), %%r10\n" // signal mask at time of segmentation fault | |
| 224 "test %%rsi, %%rsi\n" // only set mask, if set is non-NULL | |
| 225 "jz 11f\n" | |
| 226 "mov 0(%%rsi), %%rsi\n" | |
| 227 "cmp $0, %%rdi\n" // %rdi = how (SIG_BLOCK) | |
| 228 "jnz 9f\n" | |
| 229 "or %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault | |
| 230 "jmp 11f\n" | |
| 231 "9:cmp $1, %%rdi\n" // %rdi = how (SIG_UNBLOCK) | |
| 232 "jnz 10f\n" | |
| 233 "xor $-1, %%rsi\n" | |
| 234 "and %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault | |
| 235 "jmp 11f\n" | |
| 236 "10:cmp $2, %%rdi\n" // %rdi = how (SIG_SETMASK) | |
| 237 "jnz 7b\n" | |
| 238 "mov %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault | |
| 239 "11:xor %%rax, %%rax\n" | |
| 240 "test %%rdx, %%rdx\n" // only return old mask, if set is non-NULL | |
| 241 "jz 7b\n" | |
| 242 "mov %%r10, 0(%%rdx)\n" // old_set | |
| 243 "jmp 7b\n" | |
| 244 | |
| 245 // Handle rt_sigreturn() | |
| 246 "12:cmp $15, %%rax\n" // NR_rt_sigreturn | |
| 247 "jnz 14f\n" | |
| 248 "mov 0xA8(%%rsp), %%rsp\n" // %rsp at time of segmentation fault | |
| 249 "13:syscall\n" // rt_sigreturn() is unrestricted | |
| 250 "mov $66, %%edi\n" // rt_sigreturn() should never return | |
| 251 "mov $231, %%eax\n" // NR_exit_group | |
| 252 "jmp 13b\n" | |
| 253 | |
| 254 // Copy signal frame onto new stack. See clone.cc for details | |
| 255 "14:cmp $56+0xF000, %%rax\n" // NR_clone + 0xF000 | |
| 256 "jnz 15f\n" | |
| 257 "lea 8(%%rsp), %%rax\n" // retain stack frame upon returning | |
| 258 "mov %%rax, 0xA8(%%rsp)\n" // %rsp at time of segmentation fault | |
| 259 "jmp 7b\n" | |
| 260 | |
| 261 // Forward system call to syscallWrapper() | |
| 262 "15:lea 7b(%%rip), %%rcx\n" | |
| 263 "push %%rcx\n" | |
| 264 "push 0xB8(%%rsp)\n" // %rip at time of segmentation fault | |
| 265 "lea playground$syscallWrapper(%%rip), %%rcx\n" | |
| 266 "jmp *%%rcx\n" | |
| 267 | |
| 268 // In order to implement SA_NODEFER, we have to keep track of recursive | |
| 269 // calls to SIGSEGV handlers. This means we have to increment a counter | |
| 270 // before calling the user's signal handler, and decrement it on | |
| 271 // leaving the user's signal handler. | |
| 272 // Some signal handlers look at the return address of the signal | |
| 273 // stack, and more importantly "gdb" uses the call to rt_sigreturn() | |
| 274 // as a magic signature when doing stacktraces. So, we have to use | |
| 275 // a little more unusual code to regain control after the user's | |
| 276 // signal handler is done. We adjust the return address to point to | |
| 277 // non-executable memory. And when we trigger another SEGV we pop the | |
| 278 // extraneous signal frame and then call rt_sigreturn(). | |
| 279 // N.B. We currently do not correctly adjust the SEGV counter, if the | |
| 280 // user's signal handler exits in way other than by returning (e.g. by | |
| 281 // directly calling rt_sigreturn(), or by calling siglongjmp()). | |
| 282 "16:lea 22f(%%rip), %%r14\n" | |
| 283 "cmp %%r14, %%r15\n" | |
| 284 "jnz 17f\n" // check if returning from user's handler | |
| 285 "decl %%gs:0x105C-0xE0\n" // decrement SEGV recursion counter | |
| 286 "mov 0xA8(%%rsp), %%rsp\n" // %rsp at time of segmentation fault | |
| 287 "mov $0xF, %%eax\n" // NR_rt_sigreturn | |
| 288 "syscall\n" | |
| 289 | |
| 290 // This was a genuine segmentation fault. Check Sandbox::sa_segv_ for | |
| 291 // what we are supposed to do. | |
| 292 "17:mov playground$sa_segv@GOTPCREL(%%rip), %%rax\n" | |
| 293 "cmp $0, 0(%%rax)\n" // SIG_DFL | |
| 294 "jz 18f\n" | |
| 295 "cmp $1, 0(%%rax)\n" // SIG_IGN | |
| 296 "jnz 19f\n" // can't really ignore synchronous signals | |
| 297 | |
| 298 // Trigger the kernel's default signal disposition. The only way we can | |
| 299 // do this from seccomp mode is by blocking the signal and retriggering | |
| 300 // it. | |
| 301 "18:orb $4, 0x131(%%rsp)\n" // signal mask at time of segmentation fault | |
| 302 "ret\n" | |
| 303 | |
| 304 // Check sa_flags: | |
| 305 // - We can ignore SA_NOCLDSTOP, SA_NOCLDWAIT, and SA_RESTART as they | |
| 306 // do not have any effect for SIGSEGV. | |
| 307 // - On x86-64, we can also ignore SA_SIGINFO, as the calling | |
| 308 // conventions for sa_handler() are a subset of the conventions for | |
| 309 // sa_sigaction(). | |
| 310 // - We have to always register our signal handler with SA_NODEFER so | |
| 311 // that the user's signal handler can make system calls which might | |
| 312 // require additional help from our SEGV handler. | |
| 313 // - If the user's signal handler wasn't supposed to be SA_NODEFER, then | |
| 314 // we emulate this behavior by keeping track of a recursion counter. | |
| 315 // | |
| 316 // TODO(markus): If/when we add support for sigaltstack(), we have to | |
| 317 // handle SA_ONSTACK. | |
| 318 "19:cmpl $0, %%gs:0x105C-0xE0\n"// check if we failed inside of SEGV handler | |
| 319 "jnz 18b\n" // if so, then terminate program | |
| 320 "mov 0(%%rax), %%rbx\n" // sa_segv_.sa_sigaction | |
| 321 "mov 8(%%rax), %%rcx\n" // sa_segv_.sa_flags | |
| 322 "btl $31, %%ecx\n" // SA_RESETHAND | |
| 323 "jnc 20f\n" | |
| 324 "movq $0, 0(%%rax)\n" // set handler to SIG_DFL | |
| 325 "20:btl $30, %%ecx\n" // SA_NODEFER | |
| 326 "jc 21f\n" | |
| 327 "mov %%r14, 0(%%rsp)\n" // trigger a SEGV on return, so that we can | |
| 328 "incl %%gs:0x105C-0xE0\n" // clean up state; incr. recursion counter | |
| 329 "21:jmp *%%rbx\n" // call user's signal handler | |
| 330 | |
| 331 | |
| 332 // Non-executable version of the restorer function. We use this to | |
| 333 // trigger a SEGV upon returning from the user's signal handler, giving | |
| 334 // us an ability to clean up prior to returning from the SEGV handler. | |
| 335 ".pushsection .data\n" // move code into non-executable section | |
| 336 "22:mov $0xF, %%rax\n" // gdb looks for this signature when doing | |
| 337 "syscall\n" // backtraces | |
| 338 ".popsection\n" | |
| 339 #elif defined(__i386__) | |
| 340 // Inspect instruction at the point where the segmentation fault | |
| 341 // happened. If it is RDTSC, forward the request to the trusted | |
| 342 // thread. | |
| 343 "mov $-3, %%ebx\n" // request for RDTSC | |
| 344 "mov 0xDC(%%esp), %%ebp\n" // %eip at time of segmentation fault | |
| 345 "cmpw $0x310F, (%%ebp)\n" // RDTSC | |
| 346 "jz 0f\n" | |
| 347 "cmpw $0x010F, (%%ebp)\n" // RDTSCP | |
| 348 "jnz 9f\n" | |
| 349 "cmpb $0xF9, 2(%%ebp)\n" | |
| 350 "jnz 9f\n" | |
| 351 "mov $-4, %%ebx\n" // request for RDTSCP | |
| 352 "0:" | |
| 353 #ifndef NDEBUG | |
| 354 "lea 100f, %%eax\n" | |
| 355 "push %%eax\n" | |
| 356 "call playground$debugMessage\n" | |
| 357 "sub $4, %%esp\n" | |
| 358 #else | |
| 359 "sub $8, %%esp\n" // allocate buffer for receiving timestamp | |
| 360 #endif | |
| 361 "push %%ebx\n" | |
| 362 "mov %%fs:16, %%ebx\n" // fd = threadFdPub | |
| 363 "mov %%esp, %%ecx\n" // buf = %esp | |
| 364 "mov $4, %%edx\n" // len = sizeof(int) | |
| 365 "1:mov %%edx, %%eax\n" // NR_write | |
| 366 "int $0x80\n" | |
| 367 "cmp %%eax, %%edx\n" | |
| 368 "jz 7f\n" | |
| 369 "cmp $-4, %%eax\n" // EINTR | |
| 370 "jz 1b\n" | |
| 371 "2:add $12, %%esp\n" // remove temporary buffer from stack | |
| 372 "xor %%eax, %%eax\n" | |
| 373 "movl $0, 0xC8(%%esp)\n" // %edx at time of segmentation fault | |
| 374 "cmpw $0x310F, (%%ebp)\n" // RDTSC | |
| 375 "jz 3f\n" | |
| 376 "movl $0, 0xCC(%%esp)\n" // %ecx at time of segmentation fault | |
| 377 "3:mov %%eax, 0xD0(%%esp)\n" // %eax at time of segmentation fault | |
| 378 "4:mov 0xDC(%%esp), %%ebp\n" // %eip at time of segmentation fault | |
| 379 "addl $2, 0xDC(%%esp)\n" // %eip at time of segmentation fault | |
| 380 "cmpw $0x010F, (%%ebp)\n" // RDTSCP | |
| 381 "jnz 5f\n" | |
| 382 "addl $1, 0xDC(%%esp)\n" // %eip at time of segmentation fault | |
| 383 "5:sub $0x1C8, %%esp\n" // a legacy signal stack is much larger | |
| 384 "mov 0x1CC(%%esp), %%eax\n" // push signal number | |
| 385 "push %%eax\n" | |
| 386 "lea 0x270(%%esp), %%esi\n" // copy siginfo register values | |
| 387 "lea 0x4(%%esp), %%edi\n" // into new location | |
| 388 "mov $22, %%ecx\n" | |
| 389 "cld\n" | |
| 390 "rep movsl\n" | |
| 391 "mov 0x2C8(%%esp), %%ebx\n" // copy first half of signal mask | |
| 392 "mov %%ebx, 0x54(%%esp)\n" | |
| 393 "lea 6f, %%esi\n" // copy "magic" restorer function | |
| 394 "push %%esi\n" // push restorer function | |
| 395 "lea 0x2D4(%%esp), %%edi\n" // patch up retcode magic numbers | |
| 396 "movb $2, %%cl\n" | |
| 397 "rep movsl\n" | |
| 398 "ret\n" // return to restorer function | |
| 399 | |
| 400 // The restorer function is sometimes used by gdb as a magic marker to | |
| 401 // recognize signal stack frames. Don't change any of the next three | |
| 402 // instructions. | |
| 403 "6:pop %%eax\n" // remove dummy argument (signo) | |
| 404 "mov $119, %%eax\n" // NR_sigreturn | |
| 405 "int $0x80\n" | |
| 406 "7:mov $12, %%edx\n" // len = 3*sizeof(int) | |
| 407 "8:mov $3, %%eax\n" // NR_read | |
| 408 "int $0x80\n" | |
| 409 "cmp $-4, %%eax\n" // EINTR | |
| 410 "jz 8b\n" | |
| 411 "cmp %%eax, %%edx\n" | |
| 412 "jnz 2b\n" | |
| 413 "pop %%eax\n" | |
| 414 "pop %%edx\n" | |
| 415 "pop %%ecx\n" | |
| 416 "mov %%edx, 0xC8(%%esp)\n" // %edx at time of segmentation fault | |
| 417 "cmpw $0x310F, (%%ebp)\n" // RDTSC | |
| 418 "jz 3b\n" | |
| 419 "mov %%ecx, 0xCC(%%esp)\n" // %ecx at time of segmentation fault | |
| 420 "jmp 3b\n" | |
| 421 | |
| 422 // If the instruction is INT 0, then this was probably the result | |
| 423 // of playground::Library being unable to find a way to safely | |
| 424 // rewrite the system call instruction. Retrieve the CPU register | |
| 425 // at the time of the segmentation fault and invoke syscallWrapper(). | |
| 426 "9:cmpw $0x00CD, (%%ebp)\n" // INT $0x0 | |
| 427 "jnz 20f\n" | |
| 428 #ifndef NDEBUG | |
| 429 "lea 200f, %%eax\n" | |
| 430 "push %%eax\n" | |
| 431 "call playground$debugMessage\n" | |
| 432 "add $0x4, %%esp\n" | |
| 433 #endif | |
| 434 "mov 0xD0(%%esp), %%eax\n" // %eax at time of segmentation fault | |
| 435 "mov 0xC4(%%esp), %%ebx\n" // %ebx at time of segmentation fault | |
| 436 "mov 0xCC(%%esp), %%ecx\n" // %ecx at time of segmentation fault | |
| 437 "mov 0xC8(%%esp), %%edx\n" // %edx at time of segmentation fault | |
| 438 "mov 0xB8(%%esp), %%esi\n" // %esi at time of segmentation fault | |
| 439 "mov 0xB4(%%esp), %%edi\n" // %edi at time of segmentation fault | |
| 440 "mov 0xB2(%%esp), %%ebp\n" // %ebp at time of segmentation fault | |
| 441 | |
| 442 // Handle sigprocmask() and rt_sigprocmask() | |
| 443 "cmp $175, %%eax\n" // NR_rt_sigprocmask | |
| 444 "jnz 10f\n" | |
| 445 "mov $-22, %%eax\n" // -EINVAL | |
| 446 "cmp $8, %%esi\n" // %esi = sigsetsize (8 bytes = 64 signals) | |
| 447 "jl 3b\n" | |
| 448 "jmp 11f\n" | |
| 449 "10:cmp $126, %%eax\n" // NR_sigprocmask | |
| 450 "jnz 15f\n" | |
| 451 "mov $-22, %%eax\n" | |
| 452 "11:mov 0xFC(%%esp), %%edi\n" // signal mask at time of segmentation fault | |
| 453 "mov 0x100(%%esp), %%ebp\n" | |
| 454 "test %%ecx, %%ecx\n" // only set mask, if set is non-NULL | |
| 455 "jz 14f\n" | |
| 456 "mov 0(%%ecx), %%esi\n" | |
| 457 "mov 4(%%ecx), %%ecx\n" | |
| 458 "cmp $0, %%ebx\n" // %ebx = how (SIG_BLOCK) | |
| 459 "jnz 12f\n" | |
| 460 "or %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault | |
| 461 "or %%ecx, 0x100(%%esp)\n" | |
| 462 "jmp 14f\n" | |
| 463 "12:cmp $1, %%ebx\n" // %ebx = how (SIG_UNBLOCK) | |
| 464 "jnz 13f\n" | |
| 465 "xor $-1, %%esi\n" | |
| 466 "xor $-1, %%ecx\n" | |
| 467 "and %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault | |
| 468 "and %%ecx, 0x100(%%esp)\n" | |
| 469 "jmp 14f\n" | |
| 470 "13:cmp $2, %%ebx\n" // %ebx = how (SIG_SETMASK) | |
| 471 "jnz 3b\n" | |
| 472 "mov %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault | |
| 473 "mov %%ecx, 0x100(%%esp)\n" | |
| 474 "14:xor %%eax, %%eax\n" | |
| 475 "test %%edx, %%edx\n" // only return old mask, if set is non-NULL | |
| 476 "jz 3b\n" | |
| 477 "mov %%edi, 0(%%edx)\n" // old_set | |
| 478 "mov %%ebp, 4(%%edx)\n" | |
| 479 "jmp 3b\n" | |
| 480 | |
| 481 // Handle sigreturn() and rt_sigreturn() | |
| 482 // See syscall.cc for a discussion on how we can emulate rt_sigreturn() | |
| 483 // by calling sigreturn() with a suitably adjusted stack. | |
| 484 "15:cmp $119, %%eax\n" // NR_sigreturn | |
| 485 "jnz 17f\n" | |
| 486 "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault | |
| 487 "16:int $0x80\n" // sigreturn() is unrestricted | |
| 488 "17:cmp $173, %%eax\n" // NR_rt_sigreturn | |
| 489 "jnz 18f\n" | |
| 490 "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault | |
| 491 "sub $4, %%esp\n" // add fake return address | |
| 492 "jmp 4b\n" | |
| 493 | |
| 494 // Copy signal frame onto new stack. In the process, we have to convert | |
| 495 // it from an RT signal frame to a legacy signal frame. | |
| 496 // See clone.cc for details | |
| 497 "18:cmp $120+0xF000, %%eax\n" // NR_clone + 0xF000 | |
| 498 "jnz 19f\n" | |
| 499 "lea -0x1C8(%%esp), %%eax\n"// retain stack frame upon returning | |
| 500 "mov %%eax, 0xC0(%%esp)\n" // %esp at time of segmentation fault | |
| 501 "jmp 3b\n" | |
| 502 | |
| 503 // Forward system call to syscallWrapper() | |
| 504 "19:call playground$syscallWrapper\n" | |
| 505 "jmp 3b\n" | |
| 506 | |
| 507 // In order to implement SA_NODEFER, we have to keep track of recursive | |
| 508 // calls to SIGSEGV handlers. This means we have to increment a counter | |
| 509 // before calling the user's signal handler, and decrement it on | |
| 510 // leaving the user's signal handler. | |
| 511 // Some signal handlers look at the return address of the signal | |
| 512 // stack, and more importantly "gdb" uses the call to {,rt_}sigreturn() | |
| 513 // as a magic signature when doing stacktraces. So, we have to use | |
| 514 // a little more unusual code to regain control after the user's | |
| 515 // signal handler is done. We adjust the return address to point to | |
| 516 // non-executable memory. And when we trigger another SEGV we pop the | |
| 517 // extraneous signal frame and then call sigreturn(). | |
| 518 // N.B. We currently do not correctly adjust the SEGV counter, if the | |
| 519 // user's signal handler exits in way other than by returning (e.g. by | |
| 520 // directly calling {,rt_}sigreturn(), or by calling siglongjmp()). | |
| 521 "20:lea 30f, %%edi\n" // rt-style restorer function | |
| 522 "lea 31f, %%esi\n" // legacy restorer function | |
| 523 "cmp %%ebp, %%edi\n" // check if returning from user's handler | |
| 524 "jnz 21f\n" | |
| 525 "decl %%fs:0x1040-0x58\n" // decrement SEGV recursion counter | |
| 526 "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault | |
| 527 "jmp 29f\n" | |
| 528 "21:cmp %%ebp, %%esi\n" // check if returning from user's handler | |
| 529 "jnz 22f\n" | |
| 530 "decl %%fs:0x1040-0x58\n" // decrement SEGV recursion counter | |
| 531 "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault | |
| 532 "jmp 6b\n" | |
| 533 | |
| 534 // This was a genuine segmentation fault. Check Sandbox::sa_segv_ for | |
| 535 // what we are supposed to do. | |
| 536 "22:lea playground$sa_segv, %%eax\n" | |
| 537 "cmp $0, 0(%%eax)\n" // SIG_DFL | |
| 538 "jz 23f\n" | |
| 539 "cmp $1, 0(%%eax)\n" // SIG_IGN | |
| 540 "jnz 24f\n" // can't really ignore synchronous signals | |
| 541 | |
| 542 // Trigger the kernel's default signal disposition. The only way we can | |
| 543 // do this from seccomp mode is by blocking the signal and retriggering | |
| 544 // it. | |
| 545 "23:orb $4, 0xFD(%%esp)\n" // signal mask at time of segmentation fault | |
| 546 "jmp 5b\n" | |
| 547 | |
| 548 // Check sa_flags: | |
| 549 // - We can ignore SA_NOCLDSTOP, SA_NOCLDWAIT, and SA_RESTART as they | |
| 550 // do not have any effect for SIGSEGV. | |
| 551 // - We have to always register our signal handler with SA_NODEFER so | |
| 552 // that the user's signal handler can make system calls which might | |
| 553 // require additional help from our SEGV handler. | |
| 554 // - If the user's signal handler wasn't supposed to be SA_NODEFER, then | |
| 555 // we emulate this behavior by keeping track of a recursion counter. | |
| 556 // | |
| 557 // TODO(markus): If/when we add support for sigaltstack(), we have to | |
| 558 // handle SA_ONSTACK. | |
| 559 "24:cmpl $0, %%fs:0x1040-0x58\n"// check if we failed inside of SEGV handler | |
| 560 "jnz 23b\n" // if so, then terminate program | |
| 561 "mov 0(%%eax), %%ebx\n" // sa_segv_.sa_sigaction | |
| 562 "mov 4(%%eax), %%ecx\n" // sa_segv_.sa_flags | |
| 563 "btl $31, %%ecx\n" // SA_RESETHAND | |
| 564 "jnc 25f\n" | |
| 565 "movl $0, 0(%%eax)\n" // set handler to SIG_DFL | |
| 566 "25:btl $30, %%ecx\n" // SA_NODEFER | |
| 567 "jc 28f\n" | |
| 568 "btl $2, %%ecx\n" // SA_SIGINFO | |
| 569 "jnc 26f\n" | |
| 570 "mov %%edi, 0(%%esp)\n" // trigger a SEGV on return | |
| 571 "incl %%fs:0x1040-0x58\n" // increment recursion counter | |
| 572 "jmp *%%ebx\n" // call user's signal handler | |
| 573 "26:mov %%esi, 0(%%esp)\n" | |
| 574 "incl %%fs:0x1040-0x58\n" // increment recursion counter | |
| 575 | |
| 576 // We always register the signal handler to give us rt-style signal | |
| 577 // frames. But if the user asked for legacy signal frames, we must | |
| 578 // convert the signal frame prior to calling the user's signal handler. | |
| 579 "27:sub $0x1C8, %%esp\n" // a legacy signal stack is much larger | |
| 580 "mov 0x1CC(%%esp), %%eax\n" // push signal number | |
| 581 "push %%eax\n" | |
| 582 "mov 0x1CC(%%esp), %%eax\n" // push restorer function | |
| 583 "push %%eax\n" | |
| 584 "lea 0x274(%%esp), %%esi\n" // copy siginfo register values | |
| 585 "lea 0x8(%%esp), %%edi\n" // into new location | |
| 586 "mov $22, %%ecx\n" | |
| 587 "cld\n" | |
| 588 "rep movsl\n" | |
| 589 "mov 0x2CC(%%esp), %%eax\n" // copy first half of signal mask | |
| 590 "mov %%eax, 0x58(%%esp)\n" | |
| 591 "lea 31f, %%esi\n" | |
| 592 "lea 0x2D4(%%esp), %%edi\n" // patch up retcode magic numbers | |
| 593 "movb $2, %%cl\n" | |
| 594 "rep movsl\n" | |
| 595 "jmp *%%ebx\n" // call user's signal handler | |
| 596 "28:lea 6b, %%eax\n" // set appropriate restorer function | |
| 597 "mov %%eax, 0(%%esp)\n" | |
| 598 "btl $2, %%ecx\n" // SA_SIGINFO | |
| 599 "jnc 27b\n" | |
| 600 "lea 29f, %%eax\n" | |
| 601 "mov %%eax, 0(%%esp)\n" // set appropriate restorer function | |
| 602 "jmp *%%ebx\n" // call user's signal handler | |
| 603 "29:pushl $30f\n" // emulate rt_sigreturn() | |
| 604 "jmp 5b\n" | |
| 605 | |
| 606 // Non-executable versions of the restorer function. We use these to | |
| 607 // trigger a SEGV upon returning from the user's signal handler, giving | |
| 608 // us an ability to clean up prior to returning from the SEGV handler. | |
| 609 ".pushsection .data\n" // move code into non-executable section | |
| 610 "30:mov $173, %%eax\n" // NR_rt_sigreturn | |
| 611 "int $0x80\n" // gdb looks for this signature when doing | |
| 612 ".byte 0\n" // backtraces | |
| 613 "31:pop %%eax\n" | |
| 614 "mov $119, %%eax\n" // NR_sigreturn | |
| 615 "int $0x80\n" | |
| 616 ".popsection\n" | |
| 617 #else | |
| 618 #error Unsupported target platform | |
| 619 #endif | |
| 620 ".pushsection \".rodata\"\n" | |
| 621 #ifndef NDEBUG | |
| 622 "100:.asciz \"RDTSC(P): Executing handler\\n\"\n" | |
| 623 "200:.asciz \"INT $0x0: Executing handler\\n\"\n" | |
| 624 #endif | |
| 625 ".popsection\n" | |
| 626 "999:pop %0\n" | |
| 627 : "=g"(fnc) | |
| 628 : | |
| 629 : "memory" | |
| 630 #if defined(__x86_64__) | |
| 631 , "rsp" | |
| 632 #elif defined(__i386__) | |
| 633 , "esp" | |
| 634 #endif | |
| 635 ); | |
| 636 return fnc; | |
| 637 } | |
| 638 | |
| 639 SecureMem::Args* Sandbox::getSecureMem() { | |
| 640 // Check trusted_thread.cc for the magic offset that gets us from the TLS | |
| 641 // to the beginning of the secure memory area. | |
| 642 SecureMem::Args* ret; | |
| 643 #if defined(__x86_64__) | |
| 644 asm volatile( | |
| 645 "movq %%gs:-0xE0, %0\n" | |
| 646 : "=q"(ret)); | |
| 647 #elif defined(__i386__) | |
| 648 asm volatile( | |
| 649 "movl %%fs:-0x58, %0\n" | |
| 650 : "=r"(ret)); | |
| 651 #else | |
| 652 #error Unsupported target platform | |
| 653 #endif | |
| 654 return ret; | |
| 655 } | |
| 656 | |
| 657 void Sandbox::snapshotMemoryMappings(int processFd, int proc_self_maps) { | |
| 658 SysCalls sys; | |
| 659 if (sys.lseek(proc_self_maps, 0, SEEK_SET) || | |
| 660 !sendFd(processFd, proc_self_maps, -1, NULL, 0)) { | |
| 661 failure: | |
| 662 die("Cannot access /proc/self/maps"); | |
| 663 } | |
| 664 int dummy; | |
| 665 if (read(sys, processFd, &dummy, sizeof(dummy)) != sizeof(dummy)) { | |
| 666 goto failure; | |
| 667 } | |
| 668 } | |
| 669 | |
| 670 int Sandbox::supportsSeccompSandbox(int proc_fd) { | |
| 671 if (status_ != STATUS_UNKNOWN) { | |
| 672 return status_ != STATUS_UNSUPPORTED; | |
| 673 } | |
| 674 int fds[2]; | |
| 675 SysCalls sys; | |
| 676 if (sys.pipe(fds)) { | |
| 677 status_ = STATUS_UNSUPPORTED; | |
| 678 return 0; | |
| 679 } | |
| 680 pid_t pid; | |
| 681 switch ((pid = sys.fork())) { | |
| 682 case -1: | |
| 683 status_ = STATUS_UNSUPPORTED; | |
| 684 return 0; | |
| 685 case 0: { | |
| 686 int devnull = sys.open("/dev/null", O_RDWR, 0); | |
| 687 if (devnull >= 0) { | |
| 688 sys.dup2(devnull, 0); | |
| 689 sys.dup2(devnull, 1); | |
| 690 sys.dup2(devnull, 2); | |
| 691 sys.close(devnull); | |
| 692 } | |
| 693 if (proc_fd >= 0) { | |
| 694 setProcSelfMaps(sys.openat(proc_fd, "self/maps", O_RDONLY, 0)); | |
| 695 } | |
| 696 startSandbox(); | |
| 697 write(sys, fds[1], "", 1); | |
| 698 | |
| 699 // Try to tell the trusted thread to shut down the entire process in an | |
| 700 // orderly fashion | |
| 701 defaultSystemCallHandler(__NR_exit_group, 0, 0, 0, 0, 0, 0); | |
| 702 | |
| 703 // If that did not work (e.g. because the kernel does not know about the | |
| 704 // exit_group() system call), make a direct _exit() system call instead. | |
| 705 // This system call is unrestricted in seccomp mode, so it will always | |
| 706 // succeed. Normally, we don't like it, because unlike exit_group() it | |
| 707 // does not terminate any other thread. But since we know that | |
| 708 // exit_group() exists in all kernels which support kernel-level threads, | |
| 709 // this is OK we only get here for old kernels where _exit() is OK. | |
| 710 sys._exit(0); | |
| 711 } | |
| 712 default: | |
| 713 NOINTR_SYS(sys.close(fds[1])); | |
| 714 char ch; | |
| 715 if (read(sys, fds[0], &ch, 1) != 1) { | |
| 716 status_ = STATUS_UNSUPPORTED; | |
| 717 } else { | |
| 718 status_ = STATUS_AVAILABLE; | |
| 719 } | |
| 720 int rc; | |
| 721 NOINTR_SYS(sys.waitpid(pid, &rc, 0)); | |
| 722 NOINTR_SYS(sys.close(fds[0])); | |
| 723 return status_ != STATUS_UNSUPPORTED; | |
| 724 } | |
| 725 } | |
| 726 | |
| 727 void Sandbox::setProcSelfMaps(int proc_self_maps) { | |
| 728 proc_self_maps_ = proc_self_maps; | |
| 729 } | |
| 730 | |
| 731 void Sandbox::startSandbox() { | |
| 732 if (status_ == STATUS_UNSUPPORTED) { | |
| 733 die("The seccomp sandbox is not supported on this computer"); | |
| 734 } else if (status_ == STATUS_ENABLED) { | |
| 735 return; | |
| 736 } | |
| 737 | |
| 738 SysCalls sys; | |
| 739 if (proc_self_maps_ < 0) { | |
| 740 proc_self_maps_ = sys.open("/proc/self/maps", O_RDONLY, 0); | |
| 741 if (proc_self_maps_ < 0) { | |
| 742 die("Cannot access \"/proc/self/maps\""); | |
| 743 } | |
| 744 } | |
| 745 | |
| 746 // The pid is unchanged for the entire program, so we can retrieve it once | |
| 747 // and store it in a global variable. | |
| 748 pid_ = sys.getpid(); | |
| 749 | |
| 750 // Block all signals, except for the RDTSC handler | |
| 751 setupSignalHandlers(); | |
| 752 | |
| 753 // Get socketpairs for talking to the trusted process | |
| 754 int pair[4]; | |
| 755 if (sys.socketpair(AF_UNIX, SOCK_STREAM, 0, pair) || | |
| 756 sys.socketpair(AF_UNIX, SOCK_STREAM, 0, pair+2)) { | |
| 757 die("Failed to create trusted thread"); | |
| 758 } | |
| 759 processFdPub_ = pair[0]; | |
| 760 cloneFdPub_ = pair[2]; | |
| 761 SecureMemArgs* secureMem = createTrustedProcess(pair[0], pair[1], | |
| 762 pair[2], pair[3]); | |
| 763 | |
| 764 // We find all libraries that have system calls and redirect the system | |
| 765 // calls to the sandbox. If we miss any system calls, the application will be | |
| 766 // terminated by the kernel's seccomp code. So, from a security point of | |
| 767 // view, if this code fails to identify system calls, we are still behaving | |
| 768 // correctly. | |
| 769 { | |
| 770 Maps maps(proc_self_maps_); | |
| 771 const char *libs[] = { "ld", "libc", "librt", "libpthread", NULL }; | |
| 772 | |
| 773 // Intercept system calls in the VDSO segment (if any). This has to happen | |
| 774 // before intercepting system calls in any of the other libraries, as | |
| 775 // the main kernel entry point might be inside of the VDSO and we need to | |
| 776 // determine its address before we can compare it to jumps from inside | |
| 777 // other libraries. | |
| 778 for (Maps::const_iterator iter = maps.begin(); iter != maps.end(); ++iter){ | |
| 779 Library* library = *iter; | |
| 780 if (library->isVDSO() && library->parseElf()) { | |
| 781 library->makeWritable(true); | |
| 782 library->patchSystemCalls(); | |
| 783 library->makeWritable(false); | |
| 784 break; | |
| 785 } | |
| 786 } | |
| 787 | |
| 788 // Intercept system calls in libraries that are known to have them. | |
| 789 for (Maps::const_iterator iter = maps.begin(); iter != maps.end(); ++iter){ | |
| 790 Library* library = *iter; | |
| 791 const char* mapping = iter.name().c_str(); | |
| 792 | |
| 793 // Find the actual base name of the mapped library by skipping past any | |
| 794 // SPC and forward-slashes. We don't want to accidentally find matches, | |
| 795 // because the directory name included part of our well-known lib names. | |
| 796 // | |
| 797 // Typically, prior to pruning, entries would look something like this: | |
| 798 // 08:01 2289011 /lib/libc-2.7.so | |
| 799 for (const char *delim = " /"; *delim; ++delim) { | |
| 800 const char* skip = strrchr(mapping, *delim); | |
| 801 if (skip) { | |
| 802 mapping = skip + 1; | |
| 803 } | |
| 804 } | |
| 805 | |
| 806 for (const char **ptr = libs; *ptr; ptr++) { | |
| 807 const char *name = strstr(mapping, *ptr); | |
| 808 if (name == mapping) { | |
| 809 char ch = name[strlen(*ptr)]; | |
| 810 if (ch < 'A' || (ch > 'Z' && ch < 'a') || ch > 'z') { | |
| 811 if (library->parseElf()) { | |
| 812 library->makeWritable(true); | |
| 813 library->patchSystemCalls(); | |
| 814 library->makeWritable(false); | |
| 815 break; | |
| 816 } | |
| 817 } | |
| 818 } | |
| 819 } | |
| 820 } | |
| 821 } | |
| 822 | |
| 823 // Take a snapshot of the current memory mappings. These mappings will be | |
| 824 // off-limits to all future mmap(), munmap(), mremap(), and mprotect() calls. | |
| 825 snapshotMemoryMappings(processFdPub_, proc_self_maps_); | |
| 826 NOINTR_SYS(sys.close(proc_self_maps_)); | |
| 827 proc_self_maps_ = -1; | |
| 828 | |
| 829 // Creating the trusted thread enables sandboxing | |
| 830 createTrustedThread(processFdPub_, cloneFdPub_, secureMem); | |
| 831 | |
| 832 // We can no longer check for sandboxing support at this point, but we also | |
| 833 // know for a fact that it is available (as we just turned it on). So update | |
| 834 // the status to reflect this information. | |
| 835 status_ = STATUS_ENABLED; | |
| 836 } | |
| 837 | |
| 838 } // namespace | |
| OLD | NEW |