sandbox/linux/seccomp/syscall.cc - Issue 3225010: Pull seccomp-sandbox in via DEPS rather than using an in-tree copy...

Side by Side Diff: sandbox/linux/seccomp/syscall.cc

Issue 3225010: Pull seccomp-sandbox in via DEPS rather than using an in-tree copy... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: '' Created 10 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "debug.h"

6 #include "sandbox_impl.h"

7 #include "syscall_table.h"

8

9 namespace playground {

10

11 // TODO(markus): change this into a function that returns the address of the assembly code. If that isn't possible for sandbox_clone, then move that function into a *.S file

12 asm(

13 ".pushsection .text, \"ax\", @progbits\n"

14

15 // This is the special wrapper for the clone() system call. The code

16 // relies on the stack layout of the system call wrapper (c.f. below). It

17 // passes the stack pointer as an additional argument to sandbox__clone(),

18 // so that upon starting the child, register values can be restored and

19 // the child can start executing at the correct IP, instead of trying to

20 // run in the trusted thread.

21 "playground$sandbox_clone:"

22 ".globl playground$sandbox_clone\n"

23 ".type playground$sandbox_clone, @function\n"

24 #if defined(__x86_64__)

25 // Skip the 8 byte return address into the system call wrapper. The

26 // following bytes are the saved register values that we need to restore

27 // upon return from clone() in the new thread.

28 "lea 8(%rsp), %r9\n" // extra argument: address just past the return address

29 "jmp playground$sandbox__clone\n" // tail call, no new return address is pushed

30 #elif defined(__i386__)

31 // As i386 passes function arguments on the stack, we need to skip a few

32 // more values before we can get to the saved registers.

33 "lea 28(%esp), %eax\n"

34 "mov %eax, 24(%esp)\n" // store the computed address into the stack slot at 24(%esp)

35 "jmp playground$sandbox__clone\n" // tail call, no new return address is pushed

36 #else

37 #error Unsupported target platform

38 #endif

39 ".size playground$sandbox_clone, .-playground$sandbox_clone\n"

40

41

42 // This is the wrapper which is called by the untrusted code, trying to

43 // make a system call.

44 "playground$syscallWrapper:"

45 ".internal playground$syscallWrapper\n"

46 ".globl playground$syscallWrapper\n"

47 ".type playground$syscallWrapper, @function\n"

48 #if defined(__x86_64__)

49 // Check for rt_sigreturn(). It needs to be handled specially.

50 "cmp $15, %rax\n" // NR_rt_sigreturn

51 "jnz 1f\n"

52 "add $0x90, %rsp\n" // pop return addresses and red zone

53 "0:syscall\n" // rt_sigreturn() is unrestricted

54 "mov $66, %edi\n" // rt_sigreturn() should never return

55 "mov $231, %eax\n" // NR_exit_group

56 "jmp 0b\n" // loop back to the syscall instruction, now issuing exit_group(66)

57

58 // Save all registers

59 "1:push %rbp\n"

60 "mov %rsp, %rbp\n"

61 "push %rbx\n"

62 "push %rcx\n"

63 "push %rdx\n"

64 "push %rsi\n"

65 "push %rdi\n"

66 "push %r8\n"

67 "push %r9\n"

68 "push %r10\n"

69 "push %r11\n"

70 "push %r12\n"

71 "push %r13\n"

72 "push %r14\n"

73 "push %r15\n"

74

75 // Convert from syscall calling conventions to C calling conventions.

76 // System calls have a subtly different register ordering than the user-

77 // space x86-64 ABI.

78 "mov %r10, %rcx\n"

79

80 // Check range of system call

81 "cmp playground$maxSyscall(%rip), %eax\n"

82 "ja 3f\n" // number above maxSyscall: go straight to the default handler

83

84 // Retrieve function call from system call table (c.f. syscall_table.c).

85 // We have three different types of entries; zero for denied system calls,

86 // that should be handled by the defaultSystemCallHandler(); the

87 // UNRESTRICTED_SYSCALL marker for system calls that need to be forwarded to

88 // the trusted thread; and function pointers to specific handler functions.

89 "mov %rax, %r10\n"

90 "shl $4, %r10\n" // system call number * 16 = byte offset of the table entry

91 "lea playground$syscallTable(%rip), %r11\n"

92 "add %r11, %r10\n"

93 "mov 0(%r10), %r10\n" // first word of the entry is the handler

94

95 // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise

96 // jump to fallback handler.

97 "cmp $1, %r10\n" // NOTE(review): unsigned test, so only 0 and 1 fall back; presumably UNRESTRICTED_SYSCALL == 1 - confirm

98 "jbe 3f\n"

99 "call *%r10\n"

100 "2:"

101

102 // Restore CPU registers, except for %rax which was set by the system call.

103 "pop %r15\n"

104 "pop %r14\n"

105 "pop %r13\n"

106 "pop %r12\n"

107 "pop %r11\n"

108 "pop %r10\n"

109 "pop %r9\n"

110 "pop %r8\n"

111 "pop %rdi\n"

112 "pop %rsi\n"

113 "pop %rdx\n"

114 "pop %rcx\n"

115 "pop %rbx\n"

116 "pop %rbp\n"

117

118 // Remove fake return address. This is added in the patching code in

119 // library.cc and it makes stack traces a little cleaner.

120 "add $8, %rsp\n"

121

122 // Return to caller

123 "ret\n"

124

125 "3:"

126 // If we end up calling a specific handler, we don't need to know the

127 // system call number. However, in the generic case, we do. Shift

128 // registers so that the system call number becomes visible as the

129 // first function argument.

130 "push %r9\n" // the last argument no longer fits in a register after the shift, so it goes on the stack

131 "mov %r8, %r9\n"

132 "mov %rcx, %r8\n"

133 "mov %rdx, %rcx\n"

134 "mov %rsi, %rdx\n"

135 "mov %rdi, %rsi\n"

136 "mov %rax, %rdi\n"

137

138 // Call default handler.

139 "call playground$defaultSystemCallHandler\n"

140 "pop %r9\n" // remove the stack-passed argument pushed above

141 "jmp 2b\n"

142 #elif defined(__i386__)

143 "cmp $119, %eax\n" // NR_sigreturn

144 "jnz 1f\n"

145 "add $0x4, %esp\n" // pop return address

146 "0:int $0x80\n" // sigreturn() is unrestricted

147 "mov $66, %ebx\n" // sigreturn() should never return

148 "mov %ebx, %eax\n" // NR_exit; NOTE(review): this loads 66 into %eax, whereas i386 __NR_exit is 1 - confirm

149 "jmp 0b\n"

150 "1:cmp $173, %eax\n" // NR_rt_sigreturn

151 "jnz 3f\n"

152

153 // Convert rt_sigframe into sigframe, allowing us to call sigreturn().

154 // This is possible since the first part of signal stack frames have

155 // stayed very stable since the earliest kernel versions. While never

156 // officially documented, lots of user space applications rely on this

157 // part of the ABI, and kernel developers have been careful to maintain

158 // backwards compatibility.

159 // In general, the rt_sigframe includes a lot of extra information that

160 // the signal handler can look at. Most notably, this means a complete

161 // siginfo record.

162 // Fortunately though, the kernel doesn't look at any of this extra data

163 // when returning from a signal handler. So, we can safely convert an

164 // rt_sigframe to a legacy sigframe, discarding the extra data in the

165 // process. Interestingly, the legacy signal frame is actually larger than

166 // the rt signal frame, as it includes a lot more padding.

167 "sub $0x1C8, %esp\n" // a legacy signal stack is much larger

168 "mov 0x1CC(%esp), %eax\n" // push signal number

169 "push %eax\n"

170 "lea 0x270(%esp), %esi\n" // copy siginfo register values

171 "lea 0x4(%esp), %edi\n" // into new location

172 "mov $0x16, %ecx\n" // 0x16 = 22 32-bit words

173 "cld\n"

174 "rep movsl\n"

175 "mov 0x2C8(%esp), %ebx\n" // copy first half of signal mask

176 "mov %ebx, 0x54(%esp)\n"

177 "lea 2f, %esi\n"

178 "push %esi\n" // push restorer function

179 "lea 0x2D4(%esp), %edi\n" // patch up retcode magic numbers

180 "movb $2, %cl\n" // %ecx is zero after the previous rep, so this sets the count to 2 words

181 "rep movsl\n"

182 "ret\n" // return to restorer function

183 "2:pop %eax\n" // remove dummy argument (signo)

184 "mov $119, %eax\n" // NR_sigreturn

185 "int $0x80\n"

186

187

188 // Preserve all registers

189 "3:push %ebx\n"

190 "push %ecx\n"

191 "push %edx\n"

192 "push %esi\n"

193 "push %edi\n"

194 "push %ebp\n"

195

196 // Convert from syscall calling conventions to C calling conventions

197 "push %ebp\n"

198 "push %edi\n"

199 "push %esi\n"

200 "push %edx\n"

201 "push %ecx\n"

202 "push %ebx\n"

203 "push %eax\n"

204

205 // Check range of system call

206 "cmp playground$maxSyscall, %eax\n"

207 "ja 9f\n" // number above maxSyscall: go straight to the default handler

208

209 // We often have long sequences of calls to gettimeofday(). This is

210 // needlessly expensive. Coalesce them into a single call.

211 //

212 // We keep track of state in TLS storage that we can access through

213 // the %fs segment register. See trusted_thread.cc for the exact

214 // memory layout.

215 //

216 // TODO(markus): maybe, we should proactively call gettimeofday() and

217 // clock_gettime(), whenever we talk to the trusted thread?

218 // or maybe, if we have recently seen requests to compute

219 // the time. There might be a repeated pattern of those.

220 "cmp $78, %eax\n" // __NR_gettimeofday

221 "jnz 6f\n"

222 "cmp %eax, %fs:0x102C-0x58\n" // last system call

223 "jnz 4f\n"

224

225 // This system call and the last system call prior to this one both are

226 // calls to gettimeofday(). Try to avoid making the new call and just

227 // return the same result as in the previous call.

228 // Just in case the caller is spinning on the result from gettimeofday(),

229 // every so often, call the actual system call.

230 "decl %fs:0x1030-0x58\n" // countdown calls to gettimeofday()

231 "jz 4f\n"

232

233 // Atomically read the 64bit word representing last-known timestamp and

234 // return it to the caller. On x86-32 this is a little more complicated and

235 // requires the use of the cmpxchg8b instruction.

236 "mov %ebx, %eax\n"

237 "mov %ecx, %edx\n"

238 "lock; cmpxchg8b 100f\n"

239 "mov %eax, 0(%ebx)\n" // write the cached value into the buffer passed as first argument

240 "mov %edx, 4(%ebx)\n"

241 "xor %eax, %eax\n" // report success

242 "add $28, %esp\n" // drop the seven words pushed for the C call

243 "jmp 8f\n"

244

245 // This is a call to gettimeofday(), but we don't have a valid cached

246 // result, yet.

247 "4:mov %eax, %fs:0x102C-0x58\n" // remember syscall number

248 "movl $500, %fs:0x1030-0x58\n" // make system call, each 500 invocations

249 "call playground$defaultSystemCallHandler\n"

250

251 // Returned from gettimeofday(). Remember return value, in case the

252 // application calls us again right away.

253 // Again, this has to happen atomically and requires cmpxchg8b.

254 "mov 4(%ebx), %ecx\n"

255 "mov 0(%ebx), %ebx\n"

256 "mov 100f, %eax\n"

257 "mov 101f, %edx\n"

258 "5:lock; cmpxchg8b 100f\n"

259 "jnz 5b\n" // retry until the 64-bit store succeeds

260 "xor %eax, %eax\n"

261 "jmp 10f\n"

262

263 // Remember the number of the last system call made. We deliberately do

264 // not remember calls to gettid(), as we have often seen long sequences

265 // of calls to just gettimeofday() and gettid(). In that situation, we

266 // would still like to coalesce the gettimeofday() calls.

267 "6:cmp $224, %eax\n" // __NR_gettid

268 "jz 7f\n"

269 "mov %eax, %fs:0x102C-0x58\n" // remember syscall number

270

271 // Retrieve function call from system call table (c.f. syscall_table.c).

272 // We have three different types of entries; zero for denied system calls,

273 // that should be handled by the defaultSystemCallHandler(); the

274 // UNRESTRICTED_SYSCALL marker for system calls that need to be forwarded to

275 // the trusted thread; and function pointers to specific handler functions.

276 "7:shl $3, %eax\n" // system call number * 8 = byte offset of the table entry

277 "lea playground$syscallTable, %ebx\n"

278 "add %ebx, %eax\n"

279 "mov 0(%eax), %eax\n" // first word of the entry is the handler

280

281 // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise

282 // jump to fallback handler.

283 "cmp $1, %eax\n" // NOTE(review): unsigned test, so only 0 and 1 fall back; presumably UNRESTRICTED_SYSCALL == 1 - confirm

284 "jbe 9f\n"

285 "add $4, %esp\n" // drop the pushed system call number; specific handlers only take the arguments

286 "call *%eax\n"

287 "add $24, %esp\n" // drop the six pushed arguments

288

289 // Restore CPU registers, except for %eax which was set by the system call.

290 "8:pop %ebp\n"

291 "pop %edi\n"

292 "pop %esi\n"

293 "pop %edx\n"

294 "pop %ecx\n"

295 "pop %ebx\n"

296

297 // Return to caller

298 "ret\n"

299

300 // Call default handler.

301 "9:call playground$defaultSystemCallHandler\n"

302 "10:add $28, %esp\n" // drop the system call number and the six arguments

303 "jmp 8b\n"

304

305 ".pushsection \".bss\"\n"

306 ".balign 8\n"

307 "100:.byte 0, 0, 0, 0\n" // 100/101: 64-bit slot holding the last gettimeofday() result

308 "101:.byte 0, 0, 0, 0\n"

309 ".popsection\n"

310

311 #else

312 #error Unsupported target platform

313 #endif

314 ".size playground$syscallWrapper, .-playground$syscallWrapper\n"

315 ".popsection\n"

316 );

317

318

319 void* Sandbox::defaultSystemCallHandler(int syscallNum, void* arg0, void* arg1,

320 void* arg2, void* arg3, void* arg4,

321 void* arg5) {

322 // Fallback handler for system calls without a dedicated handler. read() and write() are issued directly through SysCalls; calls whose syscallTable entry is UNRESTRICTED_SYSCALL are forwarded over the threadFdPub() descriptor; everything else yields -ENOSYS, unless Debug::isEnabled(), in which case it is forwarded as well. The result is returned as a void*; a failed direct read()/write() is reported as -sys.my_errno.

323 // TODO(markus): The comment below is currently not true, we do intercept these system calls. Try to fix that.

324 // We try to avoid intercepting read(), and write(), as these system calls

325 // are not restricted in Seccomp mode. But depending on the exact

326 // instruction sequence in libc, we might not be able to reliably

327 // filter out these system calls at the time when we instrument the code.

328 SysCalls sys;

329 long rc;

330 long long tm;

331 switch (syscallNum) {

332 case __NR_read:

333 Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");

334 rc = sys.read((long)arg0, arg1, (size_t)arg2);

335 break;

336 case __NR_write:

337 Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");

338 rc = sys.write((long)arg0, arg1, (size_t)arg2);

339 break;

340 default:

341 if (Debug::isEnabled()) {

342 // In debug mode, prevent stderr from being closed

343 if (syscallNum == __NR_close && arg0 == (void *)2)

344 return 0;

345 }

346

347 if ((unsigned)syscallNum <= maxSyscall &&

348 syscallTable[syscallNum].handler == UNRESTRICTED_SYSCALL) {

349 Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");

350 perform_unrestricted: // also entered via goto from the debug-only branch below

351 struct { // request as written to the descriptor: system call number, then the six arguments

352 int sysnum;

353 void* unrestricted_req[6];

354 } __attribute__((packed)) request = {

355 syscallNum, { arg0, arg1, arg2, arg3, arg4, arg5 } };

356

357 int thread = threadFdPub();

358 void* reply; // distinct name: must not shadow the outer "long rc"

359 if (write(sys, thread, &request, sizeof(request)) != sizeof(request) ||

360 read(sys, thread, &reply, sizeof(reply)) != sizeof(reply)) {

361 die("Failed to forward unrestricted system call");

362 }

363 Debug::elapsed(tm, syscallNum);

364 return reply;

365 } else if (Debug::isEnabled()) {

366 Debug::syscall(&tm, syscallNum,

367 "In production mode, this call would be disallowed");

368 goto perform_unrestricted;

369 } else {

370 return (void *)-ENOSYS;

371 }

372 }

373 if (rc < 0) { // only reached from the read()/write() cases; every default branch returns above

374 rc = -sys.my_errno;

375 }

376 Debug::elapsed(tm, syscallNum);

377 return (void *)rc;

378 }

379

380 } // namespace

OLD	NEW

« no previous file with comments | « sandbox/linux/seccomp/syscall.h ('k') | sandbox/linux/seccomp/syscall_table.h » ('j') | no next file with comments »