The Linux kernel exposes a lot of system calls to userland processes, but no single process uses all of them. In most cases a process uses only a very limited set of system calls and leaves the rest unused. Letting a process call any system call is harmful. For example, if a system call that the process never uses in normal execution is implemented with a security issue, the process can still easily trigger it. Also, if a process is compromised, the attacker will usually run a piece of shellcode, and this may invoke system calls that the process would never make in normal execution (such as execve). So reducing the set of system calls that a process can make is useful.
Seccomp filtering is such a mechanism: it specifies a filter for a process’s incoming system calls. The name seccomp originates from Secure Computing. At first there was only a strict seccomp mode: once a process is set to strict mode, it can only call the ‘read’, ‘write’, ‘_exit’ and ‘sigreturn’ system calls. Of course this is not flexible and not very useful. Later seccomp added a filter mode. The process can set the seccomp policy to filter mode and add a new filter program to the kernel. This filter program is a Berkeley Packet Filter (BPF) program, as with socket filters, except that the data operated on is related to the system call being made: the system call number and the system call arguments. With a BPF program added to the kernel, the process can easily define its own policy to filter system calls, letting the kernel reject a system call or send a SIGSYS signal to the process. Note that a BPF program can’t dereference pointers, so seccomp can only evaluate the system call arguments directly.
The idea behind seccomp is very simple; the following picture shows it. First, the process sets the seccomp policy to strict or filter mode. This causes the kernel to set the seccomp flag in task_struct, and if the process chose filter mode, the kernel also adds the program to a seccomp filter list in task_struct. Later, for every system call the process makes, the kernel checks it against the seccomp filter.
The following code shows strict seccomp mode usage. By default a process has seccomp disabled, which means it can make any system call. After setting strict mode, it can only make the four system calls listed above, so even prctl itself can’t be called.
#include <stdio.h>
#include <sys/prctl.h>
#include <linux/seccomp.h>
#include <unistd.h>
/*
 * Demonstrates strict seccomp mode.
 *
 * Prints the current seccomp state, switches the process to
 * SECCOMP_MODE_STRICT and then issues another prctl call. When strict mode
 * was entered successfully the process is killed by that call, so the last
 * message is never printed.
 *
 * Returns 1 if strict mode could not be enabled.
 */
int main(void) {
    int ret;

    /* PR_GET_SECCOMP yields 0 while no seccomp mode is active. */
    ret = prctl(PR_GET_SECCOMP);
    if (ret == 0) {
        printf("no seccomp enabled!\n");
    }
    else {
        printf("seccomp enabled!\n");
    }

    /*
     * Push buffered output out now: once the process is killed, anything
     * still sitting in the stdio buffer (e.g. when stdout is a pipe) is lost.
     */
    fflush(stdout);

    /* Without strict mode the rest of the demo would be misleading. */
    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT)) {
        perror("prctl(PR_SET_SECCOMP)");
        return 1;
    }

    /* prctl is not among the four allowed system calls: this kills us. */
    ret = prctl(PR_GET_SECCOMP);
    printf("you should not see me!\n");
    return 0;
}
Following is the result.
test@ubuntu:~$ ./prctl
no seccomp enabled!
Killed
The following code shows filter mode usage. The BPF program instruction spec can be found here. The BPF program here allows only the prctl and write system calls. Also note that we need to call prctl(PR_SET_NO_NEW_PRIVS) to make sure the current process and its child processes can’t be granted new privileges (for example, by running a setuid binary).
#include <stdio.h>
#include <sys/prctl.h>
#include <linux/seccomp.h>
#include <unistd.h>
#include <linux/filter.h>
#include <stddef.h>
#include <sys/syscall.h>
/*
 * Demonstrates seccomp filter mode.
 *
 * Installs a BPF filter that allows only the prctl and write system calls
 * and kills the process on any other one, then triggers the kill with dup2.
 *
 * Returns 1 if no_new_privs or the filter could not be set.
 */
int main(void) {
    int ret;

    /*
     * Load the system call number, then compare it against each allowed
     * call. For every BPF_JUMP a match (jt = 0) continues with the next
     * instruction, the ALLOW return; a mismatch (jf = 1) skips over it.
     * Anything that matches neither reaches SECCOMP_RET_KILL.
     *
     * NOTE: this demo does not inspect seccomp_data.arch; the kernel's
     * seccomp filter documentation recommends checking it in real filters.
     */
    struct sock_filter filter[] = {
        BPF_STMT(BPF_LD+BPF_W+BPF_ABS, (offsetof(struct seccomp_data, nr))),
        BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_prctl, 0, 1),
        BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
        BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_write, 0, 1),
        BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
        BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
    };
    struct sock_fprog prog = {
        .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
        .filter = filter,
    };

    ret = prctl(PR_GET_SECCOMP);
    if (ret == 0) {
        printf("no seccomp enabled!\n");
    }
    else {
        printf("seccomp enabled!\n");
    }

    /*
     * Errors are reported by returning from main instead of calling exit():
     * the program does not include <stdlib.h>, so exit() would be an
     * implicit declaration.
     */
    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
        perror("prctl(NO_NEW_PRIVS)");
        return 1;
    }
    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
        perror("prctl(SECCOMP)");
        return 1;
    }

    /* prctl is allowed by the filter, so this call still works. */
    ret = prctl(PR_GET_SECCOMP);
    if (ret == 0) {
        printf("no seccomp enabled!\n");
    }
    else {
        printf("seccomp enabled = %d!\n", ret);
    }

    /*
     * Flush while we still can (write is allowed): the dup2 below kills the
     * process and would discard anything left in the stdio buffer.
     */
    fflush(stdout);

    /* dup2 is not allowed by the filter: the process is killed here. */
    dup2(1,2);
    printf("you should not see me!\n");
    return 0;
}
Following is the result.
test@ubuntu:~$ ./filter
no seccomp enabled!
seccomp enabled = 2!
Bad system call (core dumped)
We can use prctl to get/set the seccomp policy. The newer seccomp system call can also be used to set the seccomp policy. The implementation of the prctl system call is in kernel/sys.c. The function ‘prctl_set_seccomp’ is called when the first argument of prctl is PR_SET_SECCOMP; the following shows the code.
/*
 * prctl() entry point for PR_SET_SECCOMP: translate the requested seccomp
 * mode into a do_seccomp() operation. The prctl interface carries no flags,
 * so do_seccomp() is always called with flags == 0.
 *
 * Returns the result of do_seccomp(), or -EINVAL for an unknown mode.
 */
long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
    if (seccomp_mode == SECCOMP_MODE_STRICT) {
        /*
         * Strict mode set through prctl has always ignored the filter
         * argument; hand NULL to do_seccomp() so its internal check passes.
         */
        return do_seccomp(SECCOMP_SET_MODE_STRICT, 0, NULL);
    }

    if (seccomp_mode == SECCOMP_MODE_FILTER)
        return do_seccomp(SECCOMP_SET_MODE_FILTER, 0, filter);

    return -EINVAL;
}
It sets ‘op’ according to the seccomp mode and calls ‘do_seccomp’.
/*
 * Common implementation behind the seccomp operations: dispatch on 'op'.
 *
 * Returns the result of the selected mode setter, or -EINVAL when 'op' is
 * unknown or when strict mode is requested with flags or an argument.
 */
static long do_seccomp(unsigned int op, unsigned int flags,
                       const char __user *uargs)
{
    if (op == SECCOMP_SET_MODE_FILTER)
        return seccomp_set_mode_filter(flags, uargs);

    if (op != SECCOMP_SET_MODE_STRICT)
        return -EINVAL;

    /* Strict mode accepts neither flags nor a user argument. */
    if (flags != 0 || uargs != NULL)
        return -EINVAL;

    return seccomp_set_mode_strict();
}
For strict mode, ‘do_seccomp’ calls ‘seccomp_set_mode_strict’ directly.
/*
 * Switch the current task to strict seccomp mode.
 *
 * Returns 0 on success, or -EINVAL when seccomp_may_assign_mode() refuses
 * the transition.
 */
static long seccomp_set_mode_strict(void)
{
    const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
    long ret = -EINVAL;

    /* The mode check and the assignment both happen under siglock. */
    spin_lock_irq(&current->sighand->siglock);

    if (!seccomp_may_assign_mode(seccomp_mode))
        goto out;

#ifdef TIF_NOTSC
    disable_TSC();
#endif
    seccomp_assign_mode(current, seccomp_mode, 0);
    ret = 0;

out:
    spin_unlock_irq(&current->sighand->siglock);

    return ret;
}
‘seccomp_may_assign_mode’ makes sure that seccomp has not been set yet; if it has been set and its previous value is not the same as SECCOMP_MODE_STRICT, it returns false. So here we can see that once we set the seccomp mode we can’t change it. The state of a seccomp’ed process is stored in the ‘seccomp’ field of task_struct. The definition of ‘seccomp’ is as follows:
/* Seccomp state of a task; stored in the 'seccomp' field of task_struct. */
struct seccomp {
/* Seccomp mode the task is currently in. */
int mode;
/* Links the filters installed while in seccomp filter mode. */
struct seccomp_filter *filter;
};
/* A single seccomp filter as installed for filter mode. */
struct seccomp_filter {
/* Initialised to 1 by seccomp_prepare_filter(); presumably a reference count -- TODO confirm. */
atomic_t usage;
/* Presumably the filter that was attached before this one -- TODO confirm. */
struct seccomp_filter *prev;
/* BPF program built from the user-supplied sock_fprog. */
struct bpf_prog *prog;
};
The ‘mode’ field indicates the seccomp mode and ‘filter’ is used to link all of the filters in seccomp filter mode. Once the ‘seccomp_may_assign_mode’ check has passed, ‘seccomp_set_mode_strict’ calls ‘seccomp_assign_mode’ to set the seccomp mode. It just sets ‘task->seccomp.mode’ and sets ‘TIF_SECCOMP’ in ‘task_struct->thread_info->flags’. If the ‘op’ argument of ‘do_seccomp’ is ‘SECCOMP_SET_MODE_FILTER’, userland wants to set the seccomp mode to filter, and ‘do_seccomp’ calls ‘seccomp_set_mode_filter’.
/*
 * Install a user-supplied filter and switch the current task to seccomp
 * filter mode.
 *
 * @flags:  SECCOMP_FILTER_FLAG_* bits; anything outside
 *          SECCOMP_FILTER_FLAG_MASK is rejected.
 * @filter: userland pointer describing the filter program.
 *
 * Returns 0 on success. Returns -EINVAL for invalid flags, when the mode
 * may not be assigned, or when taking cred_guard_mutex is interrupted.
 * Otherwise returns the error from preparing or attaching the filter.
 */
static long seccomp_set_mode_filter(unsigned int flags,
                                    const char __user *filter)
{
    const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
    struct seccomp_filter *prepared = NULL;
    long ret = -EINVAL;

    /* Validate flags. */
    if (flags & ~SECCOMP_FILTER_FLAG_MASK)
        return -EINVAL;

    /* Prepare the new filter before holding any locks. */
    prepared = seccomp_prepare_user_filter(filter);
    if (IS_ERR(prepared))
        return PTR_ERR(prepared);

    /*
     * Make sure we cannot change seccomp or nnp state via TSYNC
     * while another thread is in the middle of calling exec.
     */
    if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
        mutex_lock_killable(&current->signal->cred_guard_mutex))
        goto out_free;

    spin_lock_irq(&current->sighand->siglock);

    if (!seccomp_may_assign_mode(seccomp_mode))
        goto out;

    ret = seccomp_attach_filter(flags, prepared);
    if (ret)
        goto out;
    /* Do not free the successfully attached filter. */
    prepared = NULL;

    seccomp_assign_mode(current, seccomp_mode, flags);
out:
    spin_unlock_irq(&current->sighand->siglock);
    if (flags & SECCOMP_FILTER_FLAG_TSYNC)
        mutex_unlock(&current->signal->cred_guard_mutex);
out_free:
    /* 'prepared' is NULL here once the filter has been attached. */
    seccomp_filter_free(prepared);
    return ret;
}
‘seccomp_prepare_user_filter’ is used to prepare the new filter. It mainly calls ‘seccomp_prepare_filter’, which checks the argument provided by userland and then does a permission check: as the comment says, only a process that has ‘CAP_SYS_ADMIN in its namespace or be running with no_new_privs’ is allowed to set seccomp to filter mode.
static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *sfilter;
int ret;
const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE);
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
/*
* Installing a seccomp filter requires that the task has
* CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
* This avoids scenarios where unprivileged tasks can affect the
* behavior of privileged children.
*/
if (!task_no_new_privs(current) &&
security_capable_noaudit(current_cred(), current_user_ns(),
CAP_SYS_ADMIN) != 0)
return ERR_PTR(-EACCES);
/* Allocate a new seccomp_filter */
sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
if (!sfilter)
return ERR_PTR(-ENOMEM);
ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
seccomp_check_filter, save_orig);
if (ret < 0) {
kfree(sfilter);
return ERR_PTR(ret);
}
atomic_set(&sfilter->usage, 1);
return sfilter;
}
Then ‘seccomp_prepare_filter’ calls ‘bpf_prog_create_from_user’ to copy the filter program from userland to the kernel and also perform a sanity check. After preparing a filter, ‘seccomp_set_mode_filter’ calls ‘seccomp_attach_filter’ to attach this filter to the process. It simply links the new filter into the ‘task_struct->seccomp.filter’ list. Finally it sets ‘task_struct->seccomp.mode’ to SECCOMP_MODE_FILTER and sets ‘TIF_SECCOMP’ in ‘task_struct->thread_info->flags’.
Once the process has set a seccomp mode, the system calls it can make are restricted or filtered. On system call entry, the kernel checks ‘_TIF_WORK_SYSCALL_ENTRY’ (arch/x86/entry/entry_64.S), and if the result is not 0, some work has to be done before dispatching the system call.
/* Thread-info flags that are checked on system call entry. */
#define _TIF_WORK_SYSCALL_ENTRY \
    (_TIF_SYSCALL_TRACE      | \
     _TIF_SYSCALL_EMU        | \
     _TIF_SYSCALL_AUDIT      | \
     _TIF_SECCOMP            | \
     _TIF_SINGLESTEP         | \
     _TIF_SYSCALL_TRACEPOINT | \
     _TIF_NOHZ)
Here we can see that ‘_TIF_SECCOMP’ is part of ‘_TIF_WORK_SYSCALL_ENTRY’. First it calls ‘syscall_trace_enter_phase1’, and that function calls ‘seccomp_phase1’.
/*
 * First phase of the seccomp check for the system call being entered.
 *
 * @sd: seccomp data for the call; when NULL the syscall number is read from
 *      the current task's registers instead.
 *
 * Returns SECCOMP_PHASE1_OK when the call may proceed, or whatever
 * __seccomp_phase1_filter() returns in filter mode. Any other mode is a bug.
 */
u32 seccomp_phase1(struct seccomp_data *sd)
{
    int mode = current->seccomp.mode;
    int this_syscall;

    if (sd)
        this_syscall = sd->nr;
    else
        this_syscall = syscall_get_nr(current, task_pt_regs(current));

    /* Seccomp checking is suspended for this task through ptrace. */
    if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
        unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
        return SECCOMP_PHASE1_OK;

    if (mode == SECCOMP_MODE_STRICT) {
        __secure_computing_strict(this_syscall);    /* may call do_exit */
        return SECCOMP_PHASE1_OK;
    }
#ifdef CONFIG_SECCOMP_FILTER
    if (mode == SECCOMP_MODE_FILTER)
        return __seccomp_phase1_filter(this_syscall, sd);
#endif
    BUG();
}
For strict mode it calls ‘__secure_computing_strict’, which compares ‘this_syscall’ with the four allowed system calls and calls do_exit with SIGKILL if the current syscall is not one of them.
/*
 * Strict-mode check: return normally if this_syscall is on the whitelist,
 * otherwise audit the violation and terminate the task with SIGKILL.
 */
static void __secure_computing_strict(int this_syscall)
{
int *syscall_whitelist = mode1_syscalls;
#ifdef CONFIG_COMPAT
/* Compat tasks are checked against the 32-bit whitelist instead. */
if (is_compat_task())
syscall_whitelist = mode1_syscalls_32;
#endif
/*
 * The list is terminated by a 0 entry. The first entry is compared before
 * the terminator test, so a list whose first entry is 0 still matches
 * syscall number 0.
 * NOTE(review): presumably this matters because read is syscall 0 on some
 * architectures -- confirm.
 */
do {
if (*syscall_whitelist == this_syscall)
return;
} while (*++syscall_whitelist);
#ifdef SECCOMP_DEBUG
dump_stack();
#endif
/* Not whitelisted: record the violation, then kill the task. */
audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
do_exit(SIGKILL);
}
/*
 * System calls that a compat task may still make in strict mode; the list
 * walked by __secure_computing_strict() ends at the 0 entry.
 */
static int mode1_syscalls_32[] = {
    __NR_seccomp_read_32,
    __NR_seccomp_write_32,
    __NR_seccomp_exit_32,
    __NR_seccomp_sigreturn_32,
    0,    /* null terminated */
};
For filter mode, it calls ‘__seccomp_phase1_filter’ to run the filter in ‘task_struct->seccomp.filter’.
/*
 * Filter-mode check: run the seccomp filters on the system call described
 * by sd and act on the result.
 *
 * Returns SECCOMP_PHASE1_OK when the call is allowed, SECCOMP_PHASE1_SKIP
 * when it must be skipped (ERRNO and TRAP actions), or the raw filter
 * result for SECCOMP_RET_TRACE. For SECCOMP_RET_KILL and unknown actions
 * the task exits with SIGSYS and the function does not return.
 */
static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
{
u32 filter_ret, action;
int data;
/*
* Make sure that any changes to mode from another thread have
* been seen after TIF_SECCOMP was seen.
*/
rmb();
/* Split the filter result into its action part and its data part. */
filter_ret = seccomp_run_filters(sd);
data = filter_ret & SECCOMP_RET_DATA;
action = filter_ret & SECCOMP_RET_ACTION;
switch (action) {
case SECCOMP_RET_ERRNO:
/* Set low-order bits as an errno, capped at MAX_ERRNO. */
if (data > MAX_ERRNO)
data = MAX_ERRNO;
syscall_set_return_value(current, task_pt_regs(current),
-data, 0);
goto skip;
case SECCOMP_RET_TRAP:
/* Show the handler the original registers. */
syscall_rollback(current, task_pt_regs(current));
/* Let the filter pass back 16 bits of data. */
seccomp_send_sigsys(this_syscall, data);
goto skip;
case SECCOMP_RET_TRACE:
return filter_ret; /* Save the rest for phase 2. */
case SECCOMP_RET_ALLOW:
return SECCOMP_PHASE1_OK;
case SECCOMP_RET_KILL:
default:
/* Unknown actions are handled exactly like SECCOMP_RET_KILL. */
audit_seccomp(this_syscall, SIGSYS, action);
do_exit(SIGSYS);
}
unreachable();
skip:
/* Shared tail of the ERRNO and TRAP cases: audit, then skip the call. */
audit_seccomp(this_syscall, 0, action);
return SECCOMP_PHASE1_SKIP;
}
Notice that only ‘SECCOMP_RET_TRACE’ causes ‘syscall_trace_enter_phase2’ to be called in entry_64.S. If the filter’s return action is ‘SECCOMP_RET_KILL’, the process exits with the SIGSYS signal.