OLD | NEW |
| (Empty) |
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 | |
#include <string>

#include "library.h"
#include "sandbox_impl.h"
#include "syscall_table.h"
8 | |
9 namespace playground { | |
10 | |
11 // Global variables | |
12 int Sandbox::proc_self_maps_ = -1; | |
13 enum Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN; | |
14 int Sandbox::pid_; | |
15 int Sandbox::processFdPub_; | |
16 int Sandbox::cloneFdPub_; | |
17 Sandbox::SysCalls::kernel_sigaction Sandbox::sa_segv_; | |
18 Sandbox::ProtectedMap Sandbox::protectedMap_; | |
19 std::vector<SecureMem::Args*> Sandbox::secureMemPool_; | |
20 | |
21 bool Sandbox::sendFd(int transport, int fd0, int fd1, const void* buf, | |
22 size_t len) { | |
23 int fds[2], count = 0; | |
24 if (fd0 >= 0) { fds[count++] = fd0; } | |
25 if (fd1 >= 0) { fds[count++] = fd1; } | |
26 if (!count) { | |
27 return false; | |
28 } | |
29 char cmsg_buf[CMSG_SPACE(count*sizeof(int))]; | |
30 memset(cmsg_buf, 0, sizeof(cmsg_buf)); | |
31 struct SysCalls::kernel_iovec iov[2] = { { 0 } }; | |
32 struct SysCalls::kernel_msghdr msg = { 0 }; | |
33 int dummy = 0; | |
34 iov[0].iov_base = &dummy; | |
35 iov[0].iov_len = sizeof(dummy); | |
36 if (buf && len > 0) { | |
37 iov[1].iov_base = const_cast<void *>(buf); | |
38 iov[1].iov_len = len; | |
39 } | |
40 msg.msg_iov = iov; | |
41 msg.msg_iovlen = (buf && len > 0) ? 2 : 1; | |
42 msg.msg_control = cmsg_buf; | |
43 msg.msg_controllen = CMSG_LEN(count*sizeof(int)); | |
44 struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); | |
45 cmsg->cmsg_level = SOL_SOCKET; | |
46 cmsg->cmsg_type = SCM_RIGHTS; | |
47 cmsg->cmsg_len = CMSG_LEN(count*sizeof(int)); | |
48 memcpy(CMSG_DATA(cmsg), fds, count*sizeof(int)); | |
49 SysCalls sys; | |
50 return NOINTR_SYS(sys.sendmsg(transport, &msg, 0)) == | |
51 (ssize_t)(sizeof(dummy) + ((buf && len > 0) ? len : 0)); | |
52 } | |
53 | |
54 bool Sandbox::getFd(int transport, int* fd0, int* fd1, void* buf, size_t*len) { | |
55 int count = 0; | |
56 int *err = NULL; | |
57 if (fd0) { | |
58 count++; | |
59 err = fd0; | |
60 *fd0 = -1; | |
61 } | |
62 if (fd1) { | |
63 if (!count++) { | |
64 err = fd1; | |
65 } | |
66 *fd1 = -1; | |
67 } | |
68 if (!count) { | |
69 return false; | |
70 } | |
71 char cmsg_buf[CMSG_SPACE(count*sizeof(int))]; | |
72 memset(cmsg_buf, 0, sizeof(cmsg_buf)); | |
73 struct SysCalls::kernel_iovec iov[2] = { { 0 } }; | |
74 struct SysCalls::kernel_msghdr msg = { 0 }; | |
75 iov[0].iov_base = err; | |
76 iov[0].iov_len = sizeof(int); | |
77 if (buf && len && *len > 0) { | |
78 iov[1].iov_base = buf; | |
79 iov[1].iov_len = *len; | |
80 } | |
81 msg.msg_iov = iov; | |
82 msg.msg_iovlen = (buf && len && *len > 0) ? 2 : 1; | |
83 msg.msg_control = cmsg_buf; | |
84 msg.msg_controllen = CMSG_LEN(count*sizeof(int)); | |
85 SysCalls sys; | |
86 ssize_t bytes = NOINTR_SYS(sys.recvmsg(transport, &msg, 0)); | |
87 if (len) { | |
88 *len = bytes > (int)sizeof(int) ? | |
89 bytes - sizeof(int) : 0; | |
90 } | |
91 if (bytes != (ssize_t)(sizeof(int) + ((buf && len && *len > 0) ? *len : 0))){ | |
92 *err = bytes >= 0 ? 0 : -EBADF; | |
93 return false; | |
94 } | |
95 if (*err) { | |
96 // "err" is the first four bytes of the payload. If these are non-zero, | |
97 // the sender on the other side of the socketpair sent us an errno value. | |
98 // We don't expect to get any file handles in this case. | |
99 return false; | |
100 } | |
101 struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); | |
102 if ((msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) || | |
103 !cmsg || | |
104 cmsg->cmsg_level != SOL_SOCKET || | |
105 cmsg->cmsg_type != SCM_RIGHTS || | |
106 cmsg->cmsg_len != CMSG_LEN(count*sizeof(int))) { | |
107 *err = -EBADF; | |
108 return false; | |
109 } | |
110 if (fd1) { *fd1 = ((int *)CMSG_DATA(cmsg))[--count]; } | |
111 if (fd0) { *fd0 = ((int *)CMSG_DATA(cmsg))[--count]; } | |
112 return true; | |
113 } | |
114 | |
115 void Sandbox::setupSignalHandlers() { | |
116 // Set SIGCHLD to SIG_DFL so that waitpid() can work | |
117 SysCalls sys; | |
118 struct SysCalls::kernel_sigaction sa; | |
119 memset(&sa, 0, sizeof(sa)); | |
120 sa.sa_handler_ = SIG_DFL; | |
121 sys.sigaction(SIGCHLD, &sa, NULL); | |
122 | |
123 // Set up SEGV handler for dealing with RDTSC instructions, system calls | |
124 // that have been rewritten to use INT0, for sigprocmask() emulation, for | |
125 // the creation of threads, and for user-provided SEGV handlers. | |
126 sa.sa_sigaction_ = segv(); | |
127 sa.sa_flags = SA_SIGINFO | SA_NODEFER; | |
128 sys.sigaction(SIGSEGV, &sa, &sa_segv_); | |
129 | |
130 // Unblock SIGSEGV and SIGCHLD | |
131 SysCalls::kernel_sigset_t mask; | |
132 memset(&mask, 0x00, sizeof(mask)); | |
133 mask.sig[0] |= (1 << (SIGSEGV - 1)) | (1 << (SIGCHLD - 1)); | |
134 sys.sigprocmask(SIG_UNBLOCK, &mask, 0); | |
135 } | |
136 | |
137 void (*Sandbox::segv())(int signo, SysCalls::siginfo *context, void *unused) { | |
138 void (*fnc)(int signo, SysCalls::siginfo *context, void *unused); | |
139 asm volatile( | |
140 "call 999f\n" | |
141 #if defined(__x86_64__) | |
142 // Inspect instruction at the point where the segmentation fault | |
143 // happened. If it is RDTSC, forward the request to the trusted | |
144 // thread. | |
145 "mov $-3, %%r14\n" // request for RDTSC | |
146 "mov 0xB0(%%rsp), %%r15\n" // %rip at time of segmentation fault | |
147 "cmpw $0x310F, (%%r15)\n" // RDTSC | |
148 "jz 0f\n" | |
149 "cmpw $0x010F, (%%r15)\n" // RDTSCP | |
150 "jnz 8f\n" | |
151 "cmpb $0xF9, 2(%%r15)\n" | |
152 "jnz 8f\n" | |
153 "mov $-4, %%r14\n" // request for RDTSCP | |
154 "0:" | |
155 #ifndef NDEBUG | |
156 "lea 100f(%%rip), %%rdi\n" | |
157 "call playground$debugMessage\n" | |
158 #endif | |
159 "sub $4, %%rsp\n" | |
160 "push %%r14\n" | |
161 "mov %%gs:16, %%edi\n" // fd = threadFdPub | |
162 "mov %%rsp, %%rsi\n" // buf = %rsp | |
163 "mov $4, %%edx\n" // len = sizeof(int) | |
164 "1:mov $1, %%eax\n" // NR_write | |
165 "syscall\n" | |
166 "cmp %%rax, %%rdx\n" | |
167 "jz 5f\n" | |
168 "cmp $-4, %%eax\n" // EINTR | |
169 "jz 1b\n" | |
170 "2:add $12, %%rsp\n" | |
171 "movq $0, 0x98(%%rsp)\n" // %rax at time of segmentation fault | |
172 "movq $0, 0x90(%%rsp)\n" // %rdx at time of segmentation fault | |
173 "cmpw $0x310F, (%%r15)\n" // RDTSC | |
174 "jz 3f\n" | |
175 "movq $0, 0xA0(%%rsp)\n" // %rcx at time of segmentation fault | |
176 "3:addq $2, 0xB0(%%rsp)\n" // %rip at time of segmentation fault | |
177 "cmpw $0x010F, (%%r15)\n" // RDTSC | |
178 "jnz 4f\n" | |
179 "addq $1, 0xB0(%%rsp)\n" // %rip at time of segmentation fault | |
180 "4:ret\n" | |
181 "5:mov $12, %%edx\n" // len = 3*sizeof(int) | |
182 "6:mov $0, %%eax\n" // NR_read | |
183 "syscall\n" | |
184 "cmp $-4, %%eax\n" // EINTR | |
185 "jz 6b\n" | |
186 "cmp %%rax, %%rdx\n" | |
187 "jnz 2b\n" | |
188 "mov 0(%%rsp), %%eax\n" | |
189 "mov 4(%%rsp), %%edx\n" | |
190 "mov 8(%%rsp), %%ecx\n" | |
191 "add $12, %%rsp\n" | |
192 "mov %%rdx, 0x90(%%rsp)\n" // %rdx at time of segmentation fault | |
193 "cmpw $0x310F, (%%r15)\n" // RDTSC | |
194 "jz 7f\n" | |
195 "mov %%rcx, 0xA0(%%rsp)\n" // %rcx at time of segmentation fault | |
196 "7:mov %%rax, 0x98(%%rsp)\n" // %rax at time of segmentation fault | |
197 "jmp 3b\n" | |
198 | |
199 // If the instruction is INT 0, then this was probably the result | |
200 // of playground::Library being unable to find a way to safely | |
201 // rewrite the system call instruction. Retrieve the CPU register | |
202 // at the time of the segmentation fault and invoke syscallWrapper(). | |
203 "8:cmpw $0x00CD, (%%r15)\n" // INT $0x0 | |
204 "jnz 16f\n" | |
205 #ifndef NDEBUG | |
206 "lea 200f(%%rip), %%rdi\n" | |
207 "call playground$debugMessage\n" | |
208 #endif | |
209 "mov 0x98(%%rsp), %%rax\n" // %rax at time of segmentation fault | |
210 "mov 0x70(%%rsp), %%rdi\n" // %rdi at time of segmentation fault | |
211 "mov 0x78(%%rsp), %%rsi\n" // %rsi at time of segmentation fault | |
212 "mov 0x90(%%rsp), %%rdx\n" // %rdx at time of segmentation fault | |
213 "mov 0x40(%%rsp), %%r10\n" // %r10 at time of segmentation fault | |
214 "mov 0x30(%%rsp), %%r8\n" // %r8 at time of segmentation fault | |
215 "mov 0x38(%%rsp), %%r9\n" // %r9 at time of segmentation fault | |
216 | |
217 // Handle rt_sigprocmask() | |
218 "cmp $14, %%rax\n" // NR_rt_sigprocmask | |
219 "jnz 12f\n" | |
220 "mov $-22, %%rax\n" // -EINVAL | |
221 "cmp $8, %%r10\n" // %r10 = sigsetsize (8 bytes = 64 signals) | |
222 "jl 7b\n" | |
223 "mov 0x130(%%rsp), %%r10\n" // signal mask at time of segmentation fault | |
224 "test %%rsi, %%rsi\n" // only set mask, if set is non-NULL | |
225 "jz 11f\n" | |
226 "mov 0(%%rsi), %%rsi\n" | |
227 "cmp $0, %%rdi\n" // %rdi = how (SIG_BLOCK) | |
228 "jnz 9f\n" | |
229 "or %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault | |
230 "jmp 11f\n" | |
231 "9:cmp $1, %%rdi\n" // %rdi = how (SIG_UNBLOCK) | |
232 "jnz 10f\n" | |
233 "xor $-1, %%rsi\n" | |
234 "and %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault | |
235 "jmp 11f\n" | |
236 "10:cmp $2, %%rdi\n" // %rdi = how (SIG_SETMASK) | |
237 "jnz 7b\n" | |
238 "mov %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault | |
239 "11:xor %%rax, %%rax\n" | |
240 "test %%rdx, %%rdx\n" // only return old mask, if set is non-NULL | |
241 "jz 7b\n" | |
242 "mov %%r10, 0(%%rdx)\n" // old_set | |
243 "jmp 7b\n" | |
244 | |
245 // Handle rt_sigreturn() | |
246 "12:cmp $15, %%rax\n" // NR_rt_sigreturn | |
247 "jnz 14f\n" | |
248 "mov 0xA8(%%rsp), %%rsp\n" // %rsp at time of segmentation fault | |
249 "13:syscall\n" // rt_sigreturn() is unrestricted | |
250 "mov $66, %%edi\n" // rt_sigreturn() should never return | |
251 "mov $231, %%eax\n" // NR_exit_group | |
252 "jmp 13b\n" | |
253 | |
254 // Copy signal frame onto new stack. See clone.cc for details | |
255 "14:cmp $56+0xF000, %%rax\n" // NR_clone + 0xF000 | |
256 "jnz 15f\n" | |
257 "lea 8(%%rsp), %%rax\n" // retain stack frame upon returning | |
258 "mov %%rax, 0xA8(%%rsp)\n" // %rsp at time of segmentation fault | |
259 "jmp 7b\n" | |
260 | |
261 // Forward system call to syscallWrapper() | |
262 "15:lea 7b(%%rip), %%rcx\n" | |
263 "push %%rcx\n" | |
264 "push 0xB8(%%rsp)\n" // %rip at time of segmentation fault | |
265 "lea playground$syscallWrapper(%%rip), %%rcx\n" | |
266 "jmp *%%rcx\n" | |
267 | |
268 // In order to implement SA_NODEFER, we have to keep track of recursive | |
269 // calls to SIGSEGV handlers. This means we have to increment a counter | |
270 // before calling the user's signal handler, and decrement it on | |
271 // leaving the user's signal handler. | |
272 // Some signal handlers look at the return address of the signal | |
273 // stack, and more importantly "gdb" uses the call to rt_sigreturn() | |
274 // as a magic signature when doing stacktraces. So, we have to use | |
275 // a little more unusual code to regain control after the user's | |
276 // signal handler is done. We adjust the return address to point to | |
277 // non-executable memory. And when we trigger another SEGV we pop the | |
278 // extraneous signal frame and then call rt_sigreturn(). | |
279 // N.B. We currently do not correctly adjust the SEGV counter, if the | |
280 // user's signal handler exits in way other than by returning (e.g. by | |
281 // directly calling rt_sigreturn(), or by calling siglongjmp()). | |
282 "16:lea 22f(%%rip), %%r14\n" | |
283 "cmp %%r14, %%r15\n" | |
284 "jnz 17f\n" // check if returning from user's handler | |
285 "decl %%gs:0x105C-0xE0\n" // decrement SEGV recursion counter | |
286 "mov 0xA8(%%rsp), %%rsp\n" // %rsp at time of segmentation fault | |
287 "mov $0xF, %%eax\n" // NR_rt_sigreturn | |
288 "syscall\n" | |
289 | |
290 // This was a genuine segmentation fault. Check Sandbox::sa_segv_ for | |
291 // what we are supposed to do. | |
292 "17:mov playground$sa_segv@GOTPCREL(%%rip), %%rax\n" | |
293 "cmp $0, 0(%%rax)\n" // SIG_DFL | |
294 "jz 18f\n" | |
295 "cmp $1, 0(%%rax)\n" // SIG_IGN | |
296 "jnz 19f\n" // can't really ignore synchronous signals | |
297 | |
298 // Trigger the kernel's default signal disposition. The only way we can | |
299 // do this from seccomp mode is by blocking the signal and retriggering | |
300 // it. | |
301 "18:orb $4, 0x131(%%rsp)\n" // signal mask at time of segmentation fault | |
302 "ret\n" | |
303 | |
304 // Check sa_flags: | |
305 // - We can ignore SA_NOCLDSTOP, SA_NOCLDWAIT, and SA_RESTART as they | |
306 // do not have any effect for SIGSEGV. | |
307 // - On x86-64, we can also ignore SA_SIGINFO, as the calling | |
308 // conventions for sa_handler() are a subset of the conventions for | |
309 // sa_sigaction(). | |
310 // - We have to always register our signal handler with SA_NODEFER so | |
311 // that the user's signal handler can make system calls which might | |
312 // require additional help from our SEGV handler. | |
313 // - If the user's signal handler wasn't supposed to be SA_NODEFER, then | |
314 // we emulate this behavior by keeping track of a recursion counter. | |
315 // | |
316 // TODO(markus): If/when we add support for sigaltstack(), we have to | |
317 // handle SA_ONSTACK. | |
318 "19:cmpl $0, %%gs:0x105C-0xE0\n"// check if we failed inside of SEGV handler | |
319 "jnz 18b\n" // if so, then terminate program | |
320 "mov 0(%%rax), %%rbx\n" // sa_segv_.sa_sigaction | |
321 "mov 8(%%rax), %%rcx\n" // sa_segv_.sa_flags | |
322 "btl $31, %%ecx\n" // SA_RESETHAND | |
323 "jnc 20f\n" | |
324 "movq $0, 0(%%rax)\n" // set handler to SIG_DFL | |
325 "20:btl $30, %%ecx\n" // SA_NODEFER | |
326 "jc 21f\n" | |
327 "mov %%r14, 0(%%rsp)\n" // trigger a SEGV on return, so that we can | |
328 "incl %%gs:0x105C-0xE0\n" // clean up state; incr. recursion counter | |
329 "21:jmp *%%rbx\n" // call user's signal handler | |
330 | |
331 | |
332 // Non-executable version of the restorer function. We use this to | |
333 // trigger a SEGV upon returning from the user's signal handler, giving | |
334 // us an ability to clean up prior to returning from the SEGV handler. | |
335 ".pushsection .data\n" // move code into non-executable section | |
336 "22:mov $0xF, %%rax\n" // gdb looks for this signature when doing | |
337 "syscall\n" // backtraces | |
338 ".popsection\n" | |
339 #elif defined(__i386__) | |
340 // Inspect instruction at the point where the segmentation fault | |
341 // happened. If it is RDTSC, forward the request to the trusted | |
342 // thread. | |
343 "mov $-3, %%ebx\n" // request for RDTSC | |
344 "mov 0xDC(%%esp), %%ebp\n" // %eip at time of segmentation fault | |
345 "cmpw $0x310F, (%%ebp)\n" // RDTSC | |
346 "jz 0f\n" | |
347 "cmpw $0x010F, (%%ebp)\n" // RDTSCP | |
348 "jnz 9f\n" | |
349 "cmpb $0xF9, 2(%%ebp)\n" | |
350 "jnz 9f\n" | |
351 "mov $-4, %%ebx\n" // request for RDTSCP | |
352 "0:" | |
353 #ifndef NDEBUG | |
354 "lea 100f, %%eax\n" | |
355 "push %%eax\n" | |
356 "call playground$debugMessage\n" | |
357 "sub $4, %%esp\n" | |
358 #else | |
359 "sub $8, %%esp\n" // allocate buffer for receiving timestamp | |
360 #endif | |
361 "push %%ebx\n" | |
362 "mov %%fs:16, %%ebx\n" // fd = threadFdPub | |
363 "mov %%esp, %%ecx\n" // buf = %esp | |
364 "mov $4, %%edx\n" // len = sizeof(int) | |
365 "1:mov %%edx, %%eax\n" // NR_write | |
366 "int $0x80\n" | |
367 "cmp %%eax, %%edx\n" | |
368 "jz 7f\n" | |
369 "cmp $-4, %%eax\n" // EINTR | |
370 "jz 1b\n" | |
371 "2:add $12, %%esp\n" // remove temporary buffer from stack | |
372 "xor %%eax, %%eax\n" | |
373 "movl $0, 0xC8(%%esp)\n" // %edx at time of segmentation fault | |
374 "cmpw $0x310F, (%%ebp)\n" // RDTSC | |
375 "jz 3f\n" | |
376 "movl $0, 0xCC(%%esp)\n" // %ecx at time of segmentation fault | |
377 "3:mov %%eax, 0xD0(%%esp)\n" // %eax at time of segmentation fault | |
378 "4:mov 0xDC(%%esp), %%ebp\n" // %eip at time of segmentation fault | |
379 "addl $2, 0xDC(%%esp)\n" // %eip at time of segmentation fault | |
380 "cmpw $0x010F, (%%ebp)\n" // RDTSCP | |
381 "jnz 5f\n" | |
382 "addl $1, 0xDC(%%esp)\n" // %eip at time of segmentation fault | |
383 "5:sub $0x1C8, %%esp\n" // a legacy signal stack is much larger | |
384 "mov 0x1CC(%%esp), %%eax\n" // push signal number | |
385 "push %%eax\n" | |
386 "lea 0x270(%%esp), %%esi\n" // copy siginfo register values | |
387 "lea 0x4(%%esp), %%edi\n" // into new location | |
388 "mov $22, %%ecx\n" | |
389 "cld\n" | |
390 "rep movsl\n" | |
391 "mov 0x2C8(%%esp), %%ebx\n" // copy first half of signal mask | |
392 "mov %%ebx, 0x54(%%esp)\n" | |
393 "lea 6f, %%esi\n" // copy "magic" restorer function | |
394 "push %%esi\n" // push restorer function | |
395 "lea 0x2D4(%%esp), %%edi\n" // patch up retcode magic numbers | |
396 "movb $2, %%cl\n" | |
397 "rep movsl\n" | |
398 "ret\n" // return to restorer function | |
399 | |
400 // The restorer function is sometimes used by gdb as a magic marker to | |
401 // recognize signal stack frames. Don't change any of the next three | |
402 // instructions. | |
403 "6:pop %%eax\n" // remove dummy argument (signo) | |
404 "mov $119, %%eax\n" // NR_sigreturn | |
405 "int $0x80\n" | |
406 "7:mov $12, %%edx\n" // len = 3*sizeof(int) | |
407 "8:mov $3, %%eax\n" // NR_read | |
408 "int $0x80\n" | |
409 "cmp $-4, %%eax\n" // EINTR | |
410 "jz 8b\n" | |
411 "cmp %%eax, %%edx\n" | |
412 "jnz 2b\n" | |
413 "pop %%eax\n" | |
414 "pop %%edx\n" | |
415 "pop %%ecx\n" | |
416 "mov %%edx, 0xC8(%%esp)\n" // %edx at time of segmentation fault | |
417 "cmpw $0x310F, (%%ebp)\n" // RDTSC | |
418 "jz 3b\n" | |
419 "mov %%ecx, 0xCC(%%esp)\n" // %ecx at time of segmentation fault | |
420 "jmp 3b\n" | |
421 | |
422 // If the instruction is INT 0, then this was probably the result | |
423 // of playground::Library being unable to find a way to safely | |
424 // rewrite the system call instruction. Retrieve the CPU register | |
425 // at the time of the segmentation fault and invoke syscallWrapper(). | |
426 "9:cmpw $0x00CD, (%%ebp)\n" // INT $0x0 | |
427 "jnz 20f\n" | |
428 #ifndef NDEBUG | |
429 "lea 200f, %%eax\n" | |
430 "push %%eax\n" | |
431 "call playground$debugMessage\n" | |
432 "add $0x4, %%esp\n" | |
433 #endif | |
434 "mov 0xD0(%%esp), %%eax\n" // %eax at time of segmentation fault | |
435 "mov 0xC4(%%esp), %%ebx\n" // %ebx at time of segmentation fault | |
436 "mov 0xCC(%%esp), %%ecx\n" // %ecx at time of segmentation fault | |
437 "mov 0xC8(%%esp), %%edx\n" // %edx at time of segmentation fault | |
438 "mov 0xB8(%%esp), %%esi\n" // %esi at time of segmentation fault | |
439 "mov 0xB4(%%esp), %%edi\n" // %edi at time of segmentation fault | |
440 "mov 0xB2(%%esp), %%ebp\n" // %ebp at time of segmentation fault | |
441 | |
442 // Handle sigprocmask() and rt_sigprocmask() | |
443 "cmp $175, %%eax\n" // NR_rt_sigprocmask | |
444 "jnz 10f\n" | |
445 "mov $-22, %%eax\n" // -EINVAL | |
446 "cmp $8, %%esi\n" // %esi = sigsetsize (8 bytes = 64 signals) | |
447 "jl 3b\n" | |
448 "jmp 11f\n" | |
449 "10:cmp $126, %%eax\n" // NR_sigprocmask | |
450 "jnz 15f\n" | |
451 "mov $-22, %%eax\n" | |
452 "11:mov 0xFC(%%esp), %%edi\n" // signal mask at time of segmentation fault | |
453 "mov 0x100(%%esp), %%ebp\n" | |
454 "test %%ecx, %%ecx\n" // only set mask, if set is non-NULL | |
455 "jz 14f\n" | |
456 "mov 0(%%ecx), %%esi\n" | |
457 "mov 4(%%ecx), %%ecx\n" | |
458 "cmp $0, %%ebx\n" // %ebx = how (SIG_BLOCK) | |
459 "jnz 12f\n" | |
460 "or %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault | |
461 "or %%ecx, 0x100(%%esp)\n" | |
462 "jmp 14f\n" | |
463 "12:cmp $1, %%ebx\n" // %ebx = how (SIG_UNBLOCK) | |
464 "jnz 13f\n" | |
465 "xor $-1, %%esi\n" | |
466 "xor $-1, %%ecx\n" | |
467 "and %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault | |
468 "and %%ecx, 0x100(%%esp)\n" | |
469 "jmp 14f\n" | |
470 "13:cmp $2, %%ebx\n" // %ebx = how (SIG_SETMASK) | |
471 "jnz 3b\n" | |
472 "mov %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault | |
473 "mov %%ecx, 0x100(%%esp)\n" | |
474 "14:xor %%eax, %%eax\n" | |
475 "test %%edx, %%edx\n" // only return old mask, if set is non-NULL | |
476 "jz 3b\n" | |
477 "mov %%edi, 0(%%edx)\n" // old_set | |
478 "mov %%ebp, 4(%%edx)\n" | |
479 "jmp 3b\n" | |
480 | |
481 // Handle sigreturn() and rt_sigreturn() | |
482 // See syscall.cc for a discussion on how we can emulate rt_sigreturn() | |
483 // by calling sigreturn() with a suitably adjusted stack. | |
484 "15:cmp $119, %%eax\n" // NR_sigreturn | |
485 "jnz 17f\n" | |
486 "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault | |
487 "16:int $0x80\n" // sigreturn() is unrestricted | |
488 "17:cmp $173, %%eax\n" // NR_rt_sigreturn | |
489 "jnz 18f\n" | |
490 "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault | |
491 "sub $4, %%esp\n" // add fake return address | |
492 "jmp 4b\n" | |
493 | |
494 // Copy signal frame onto new stack. In the process, we have to convert | |
495 // it from an RT signal frame to a legacy signal frame. | |
496 // See clone.cc for details | |
497 "18:cmp $120+0xF000, %%eax\n" // NR_clone + 0xF000 | |
498 "jnz 19f\n" | |
499 "lea -0x1C8(%%esp), %%eax\n"// retain stack frame upon returning | |
500 "mov %%eax, 0xC0(%%esp)\n" // %esp at time of segmentation fault | |
501 "jmp 3b\n" | |
502 | |
503 // Forward system call to syscallWrapper() | |
504 "19:call playground$syscallWrapper\n" | |
505 "jmp 3b\n" | |
506 | |
507 // In order to implement SA_NODEFER, we have to keep track of recursive | |
508 // calls to SIGSEGV handlers. This means we have to increment a counter | |
509 // before calling the user's signal handler, and decrement it on | |
510 // leaving the user's signal handler. | |
511 // Some signal handlers look at the return address of the signal | |
512 // stack, and more importantly "gdb" uses the call to {,rt_}sigreturn() | |
513 // as a magic signature when doing stacktraces. So, we have to use | |
514 // a little more unusual code to regain control after the user's | |
515 // signal handler is done. We adjust the return address to point to | |
516 // non-executable memory. And when we trigger another SEGV we pop the | |
517 // extraneous signal frame and then call sigreturn(). | |
518 // N.B. We currently do not correctly adjust the SEGV counter, if the | |
519 // user's signal handler exits in way other than by returning (e.g. by | |
520 // directly calling {,rt_}sigreturn(), or by calling siglongjmp()). | |
521 "20:lea 30f, %%edi\n" // rt-style restorer function | |
522 "lea 31f, %%esi\n" // legacy restorer function | |
523 "cmp %%ebp, %%edi\n" // check if returning from user's handler | |
524 "jnz 21f\n" | |
525 "decl %%fs:0x1040-0x58\n" // decrement SEGV recursion counter | |
526 "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault | |
527 "jmp 29f\n" | |
528 "21:cmp %%ebp, %%esi\n" // check if returning from user's handler | |
529 "jnz 22f\n" | |
530 "decl %%fs:0x1040-0x58\n" // decrement SEGV recursion counter | |
531 "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault | |
532 "jmp 6b\n" | |
533 | |
534 // This was a genuine segmentation fault. Check Sandbox::sa_segv_ for | |
535 // what we are supposed to do. | |
536 "22:lea playground$sa_segv, %%eax\n" | |
537 "cmp $0, 0(%%eax)\n" // SIG_DFL | |
538 "jz 23f\n" | |
539 "cmp $1, 0(%%eax)\n" // SIG_IGN | |
540 "jnz 24f\n" // can't really ignore synchronous signals | |
541 | |
542 // Trigger the kernel's default signal disposition. The only way we can | |
543 // do this from seccomp mode is by blocking the signal and retriggering | |
544 // it. | |
545 "23:orb $4, 0xFD(%%esp)\n" // signal mask at time of segmentation fault | |
546 "jmp 5b\n" | |
547 | |
548 // Check sa_flags: | |
549 // - We can ignore SA_NOCLDSTOP, SA_NOCLDWAIT, and SA_RESTART as they | |
550 // do not have any effect for SIGSEGV. | |
551 // - We have to always register our signal handler with SA_NODEFER so | |
552 // that the user's signal handler can make system calls which might | |
553 // require additional help from our SEGV handler. | |
554 // - If the user's signal handler wasn't supposed to be SA_NODEFER, then | |
555 // we emulate this behavior by keeping track of a recursion counter. | |
556 // | |
557 // TODO(markus): If/when we add support for sigaltstack(), we have to | |
558 // handle SA_ONSTACK. | |
559 "24:cmpl $0, %%fs:0x1040-0x58\n"// check if we failed inside of SEGV handler | |
560 "jnz 23b\n" // if so, then terminate program | |
561 "mov 0(%%eax), %%ebx\n" // sa_segv_.sa_sigaction | |
562 "mov 4(%%eax), %%ecx\n" // sa_segv_.sa_flags | |
563 "btl $31, %%ecx\n" // SA_RESETHAND | |
564 "jnc 25f\n" | |
565 "movl $0, 0(%%eax)\n" // set handler to SIG_DFL | |
566 "25:btl $30, %%ecx\n" // SA_NODEFER | |
567 "jc 28f\n" | |
568 "btl $2, %%ecx\n" // SA_SIGINFO | |
569 "jnc 26f\n" | |
570 "mov %%edi, 0(%%esp)\n" // trigger a SEGV on return | |
571 "incl %%fs:0x1040-0x58\n" // increment recursion counter | |
572 "jmp *%%ebx\n" // call user's signal handler | |
573 "26:mov %%esi, 0(%%esp)\n" | |
574 "incl %%fs:0x1040-0x58\n" // increment recursion counter | |
575 | |
576 // We always register the signal handler to give us rt-style signal | |
577 // frames. But if the user asked for legacy signal frames, we must | |
578 // convert the signal frame prior to calling the user's signal handler. | |
579 "27:sub $0x1C8, %%esp\n" // a legacy signal stack is much larger | |
580 "mov 0x1CC(%%esp), %%eax\n" // push signal number | |
581 "push %%eax\n" | |
582 "mov 0x1CC(%%esp), %%eax\n" // push restorer function | |
583 "push %%eax\n" | |
584 "lea 0x274(%%esp), %%esi\n" // copy siginfo register values | |
585 "lea 0x8(%%esp), %%edi\n" // into new location | |
586 "mov $22, %%ecx\n" | |
587 "cld\n" | |
588 "rep movsl\n" | |
589 "mov 0x2CC(%%esp), %%eax\n" // copy first half of signal mask | |
590 "mov %%eax, 0x58(%%esp)\n" | |
591 "lea 31f, %%esi\n" | |
592 "lea 0x2D4(%%esp), %%edi\n" // patch up retcode magic numbers | |
593 "movb $2, %%cl\n" | |
594 "rep movsl\n" | |
595 "jmp *%%ebx\n" // call user's signal handler | |
596 "28:lea 6b, %%eax\n" // set appropriate restorer function | |
597 "mov %%eax, 0(%%esp)\n" | |
598 "btl $2, %%ecx\n" // SA_SIGINFO | |
599 "jnc 27b\n" | |
600 "lea 29f, %%eax\n" | |
601 "mov %%eax, 0(%%esp)\n" // set appropriate restorer function | |
602 "jmp *%%ebx\n" // call user's signal handler | |
603 "29:pushl $30f\n" // emulate rt_sigreturn() | |
604 "jmp 5b\n" | |
605 | |
606 // Non-executable versions of the restorer function. We use these to | |
607 // trigger a SEGV upon returning from the user's signal handler, giving | |
608 // us an ability to clean up prior to returning from the SEGV handler. | |
609 ".pushsection .data\n" // move code into non-executable section | |
610 "30:mov $173, %%eax\n" // NR_rt_sigreturn | |
611 "int $0x80\n" // gdb looks for this signature when doing | |
612 ".byte 0\n" // backtraces | |
613 "31:pop %%eax\n" | |
614 "mov $119, %%eax\n" // NR_sigreturn | |
615 "int $0x80\n" | |
616 ".popsection\n" | |
617 #else | |
618 #error Unsupported target platform | |
619 #endif | |
620 ".pushsection \".rodata\"\n" | |
621 #ifndef NDEBUG | |
622 "100:.asciz \"RDTSC(P): Executing handler\\n\"\n" | |
623 "200:.asciz \"INT $0x0: Executing handler\\n\"\n" | |
624 #endif | |
625 ".popsection\n" | |
626 "999:pop %0\n" | |
627 : "=g"(fnc) | |
628 : | |
629 : "memory" | |
630 #if defined(__x86_64__) | |
631 , "rsp" | |
632 #elif defined(__i386__) | |
633 , "esp" | |
634 #endif | |
635 ); | |
636 return fnc; | |
637 } | |
638 | |
639 SecureMem::Args* Sandbox::getSecureMem() { | |
640 // Check trusted_thread.cc for the magic offset that gets us from the TLS | |
641 // to the beginning of the secure memory area. | |
642 SecureMem::Args* ret; | |
643 #if defined(__x86_64__) | |
644 asm volatile( | |
645 "movq %%gs:-0xE0, %0\n" | |
646 : "=q"(ret)); | |
647 #elif defined(__i386__) | |
648 asm volatile( | |
649 "movl %%fs:-0x58, %0\n" | |
650 : "=r"(ret)); | |
651 #else | |
652 #error Unsupported target platform | |
653 #endif | |
654 return ret; | |
655 } | |
656 | |
657 void Sandbox::snapshotMemoryMappings(int processFd, int proc_self_maps) { | |
658 SysCalls sys; | |
659 if (sys.lseek(proc_self_maps, 0, SEEK_SET) || | |
660 !sendFd(processFd, proc_self_maps, -1, NULL, 0)) { | |
661 failure: | |
662 die("Cannot access /proc/self/maps"); | |
663 } | |
664 int dummy; | |
665 if (read(sys, processFd, &dummy, sizeof(dummy)) != sizeof(dummy)) { | |
666 goto failure; | |
667 } | |
668 } | |
669 | |
670 int Sandbox::supportsSeccompSandbox(int proc_fd) { | |
671 if (status_ != STATUS_UNKNOWN) { | |
672 return status_ != STATUS_UNSUPPORTED; | |
673 } | |
674 int fds[2]; | |
675 SysCalls sys; | |
676 if (sys.pipe(fds)) { | |
677 status_ = STATUS_UNSUPPORTED; | |
678 return 0; | |
679 } | |
680 pid_t pid; | |
681 switch ((pid = sys.fork())) { | |
682 case -1: | |
683 status_ = STATUS_UNSUPPORTED; | |
684 return 0; | |
685 case 0: { | |
686 int devnull = sys.open("/dev/null", O_RDWR, 0); | |
687 if (devnull >= 0) { | |
688 sys.dup2(devnull, 0); | |
689 sys.dup2(devnull, 1); | |
690 sys.dup2(devnull, 2); | |
691 sys.close(devnull); | |
692 } | |
693 if (proc_fd >= 0) { | |
694 setProcSelfMaps(sys.openat(proc_fd, "self/maps", O_RDONLY, 0)); | |
695 } | |
696 startSandbox(); | |
697 write(sys, fds[1], "", 1); | |
698 | |
699 // Try to tell the trusted thread to shut down the entire process in an | |
700 // orderly fashion | |
701 defaultSystemCallHandler(__NR_exit_group, 0, 0, 0, 0, 0, 0); | |
702 | |
703 // If that did not work (e.g. because the kernel does not know about the | |
704 // exit_group() system call), make a direct _exit() system call instead. | |
705 // This system call is unrestricted in seccomp mode, so it will always | |
706 // succeed. Normally, we don't like it, because unlike exit_group() it | |
707 // does not terminate any other thread. But since we know that | |
708 // exit_group() exists in all kernels which support kernel-level threads, | |
709 // this is OK we only get here for old kernels where _exit() is OK. | |
710 sys._exit(0); | |
711 } | |
712 default: | |
713 NOINTR_SYS(sys.close(fds[1])); | |
714 char ch; | |
715 if (read(sys, fds[0], &ch, 1) != 1) { | |
716 status_ = STATUS_UNSUPPORTED; | |
717 } else { | |
718 status_ = STATUS_AVAILABLE; | |
719 } | |
720 int rc; | |
721 NOINTR_SYS(sys.waitpid(pid, &rc, 0)); | |
722 NOINTR_SYS(sys.close(fds[0])); | |
723 return status_ != STATUS_UNSUPPORTED; | |
724 } | |
725 } | |
726 | |
727 void Sandbox::setProcSelfMaps(int proc_self_maps) { | |
728 proc_self_maps_ = proc_self_maps; | |
729 } | |
730 | |
731 void Sandbox::startSandbox() { | |
732 if (status_ == STATUS_UNSUPPORTED) { | |
733 die("The seccomp sandbox is not supported on this computer"); | |
734 } else if (status_ == STATUS_ENABLED) { | |
735 return; | |
736 } | |
737 | |
738 SysCalls sys; | |
739 if (proc_self_maps_ < 0) { | |
740 proc_self_maps_ = sys.open("/proc/self/maps", O_RDONLY, 0); | |
741 if (proc_self_maps_ < 0) { | |
742 die("Cannot access \"/proc/self/maps\""); | |
743 } | |
744 } | |
745 | |
746 // The pid is unchanged for the entire program, so we can retrieve it once | |
747 // and store it in a global variable. | |
748 pid_ = sys.getpid(); | |
749 | |
750 // Block all signals, except for the RDTSC handler | |
751 setupSignalHandlers(); | |
752 | |
753 // Get socketpairs for talking to the trusted process | |
754 int pair[4]; | |
755 if (sys.socketpair(AF_UNIX, SOCK_STREAM, 0, pair) || | |
756 sys.socketpair(AF_UNIX, SOCK_STREAM, 0, pair+2)) { | |
757 die("Failed to create trusted thread"); | |
758 } | |
759 processFdPub_ = pair[0]; | |
760 cloneFdPub_ = pair[2]; | |
761 SecureMemArgs* secureMem = createTrustedProcess(pair[0], pair[1], | |
762 pair[2], pair[3]); | |
763 | |
764 // We find all libraries that have system calls and redirect the system | |
765 // calls to the sandbox. If we miss any system calls, the application will be | |
766 // terminated by the kernel's seccomp code. So, from a security point of | |
767 // view, if this code fails to identify system calls, we are still behaving | |
768 // correctly. | |
769 { | |
770 Maps maps(proc_self_maps_); | |
771 const char *libs[] = { "ld", "libc", "librt", "libpthread", NULL }; | |
772 | |
773 // Intercept system calls in the VDSO segment (if any). This has to happen | |
774 // before intercepting system calls in any of the other libraries, as | |
775 // the main kernel entry point might be inside of the VDSO and we need to | |
776 // determine its address before we can compare it to jumps from inside | |
777 // other libraries. | |
778 for (Maps::const_iterator iter = maps.begin(); iter != maps.end(); ++iter){ | |
779 Library* library = *iter; | |
780 if (library->isVDSO() && library->parseElf()) { | |
781 library->makeWritable(true); | |
782 library->patchSystemCalls(); | |
783 library->makeWritable(false); | |
784 break; | |
785 } | |
786 } | |
787 | |
788 // Intercept system calls in libraries that are known to have them. | |
789 for (Maps::const_iterator iter = maps.begin(); iter != maps.end(); ++iter){ | |
790 Library* library = *iter; | |
791 const char* mapping = iter.name().c_str(); | |
792 | |
793 // Find the actual base name of the mapped library by skipping past any | |
794 // SPC and forward-slashes. We don't want to accidentally find matches, | |
795 // because the directory name included part of our well-known lib names. | |
796 // | |
797 // Typically, prior to pruning, entries would look something like this: | |
798 // 08:01 2289011 /lib/libc-2.7.so | |
799 for (const char *delim = " /"; *delim; ++delim) { | |
800 const char* skip = strrchr(mapping, *delim); | |
801 if (skip) { | |
802 mapping = skip + 1; | |
803 } | |
804 } | |
805 | |
806 for (const char **ptr = libs; *ptr; ptr++) { | |
807 const char *name = strstr(mapping, *ptr); | |
808 if (name == mapping) { | |
809 char ch = name[strlen(*ptr)]; | |
810 if (ch < 'A' || (ch > 'Z' && ch < 'a') || ch > 'z') { | |
811 if (library->parseElf()) { | |
812 library->makeWritable(true); | |
813 library->patchSystemCalls(); | |
814 library->makeWritable(false); | |
815 break; | |
816 } | |
817 } | |
818 } | |
819 } | |
820 } | |
821 } | |
822 | |
823 // Take a snapshot of the current memory mappings. These mappings will be | |
824 // off-limits to all future mmap(), munmap(), mremap(), and mprotect() calls. | |
825 snapshotMemoryMappings(processFdPub_, proc_self_maps_); | |
826 NOINTR_SYS(sys.close(proc_self_maps_)); | |
827 proc_self_maps_ = -1; | |
828 | |
829 // Creating the trusted thread enables sandboxing | |
830 createTrustedThread(processFdPub_, cloneFdPub_, secureMem); | |
831 | |
832 // We can no longer check for sandboxing support at this point, but we also | |
833 // know for a fact that it is available (as we just turned it on). So update | |
834 // the status to reflect this information. | |
835 status_ = STATUS_ENABLED; | |
836 } | |
837 | |
838 } // namespace | |
OLD | NEW |