|
Date: Mon, 8 Aug 2011 21:39:13 +0400 From: Vasiliy Kulikov <segoon@...nwall.com> To: kernel-hardening@...ts.openwall.com Cc: Will Drewry <wad@...omium.org> Subject: Re: 32/64 bitness restriction for pid namespace On Sun, Aug 07, 2011 at 15:00 +0400, Vasiliy Kulikov wrote: > Solar, Will, all - > > The new sysctl is introduced, abi.bitness_locked. If set to 1, it locks > all tasks inside of current pid namespace to the bitness of init task > (pid_ns->child_reaper). After that (1) all syscalls of other bitness > return -ENOSYS and (2) loading ELF binaries of another bitness is > prohibited (as if the corresponding CONFIG_BINFMT_*=N). If there is any > task which differs in bitness, the locking fails. > > TODO: > > * Fix a race of sysctl against fork(). Done. > * Denied syscall should behave as if it doesn't exist. I suppose the best way of handling denied 32 bit syscalls is pretending IA32_EMULATION=n, 64 bit syscalls - as if it is 32-bit kernel on 64-bit CPU. Simplified handling copied from interrupt handling with proper signal delivery will be implemented. 64 bit SYSCALL - #UD => SIGILL. 32-bit SYSCALL - 64-bit kernel without IA32_EMULATION simply returns -ENOSYS here. 32-bit SYSENTER - #GP(0) => SIGSEGV. 32-bit int 80h - #NP => SIGBUS. Other changes: - fixed an uninitialized variable usage. - moved the check before "orl $TS_COMPAT,TI_status(%r10)". - sysctl is persistent with IA32_EMULATION=n. 
The new version: diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a0e866d..39a6544 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -151,6 +151,8 @@ ENTRY(ia32_sysenter_target) .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) + testl $_TIF_SYSCALL32_DENIED,TI_flags(%r10) + jnz ia32_deniedsys orl $TS_COMPAT,TI_status(%r10) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE @@ -310,6 +312,8 @@ ENTRY(ia32_cstar_target) .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) + testl $_TIF_SYSCALL32_DENIED,TI_flags(%r10) + jnz ia32_deniedsys orl $TS_COMPAT,TI_status(%r10) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE @@ -421,6 +425,8 @@ ENTRY(ia32_syscall) this could be a problem. */ SAVE_ARGS 0,1,0 GET_THREAD_INFO(%r10) + testl $_TIF_SYSCALL32_DENIED,TI_flags(%r10) + jnz ia32_deniedsys orl $TS_COMPAT,TI_status(%r10) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) jnz ia32_tracesys @@ -453,6 +459,12 @@ ia32_badsys: movq $-ENOSYS,%rax jmp ia32_sysret +ia32_deniedsys: + /* FIXME: need SIGSEGV delivery or similar */ + movq $0,ORIG_RAX-ARGOFFSET(%rsp) + movq $-ENOSYS,%rax + jmp ia32_sysret + quiet_ni_syscall: movq $-ENOSYS,%rax ret diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index f2ad216..fb054c7 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -153,9 +153,10 @@ do { \ * This is used to ensure we don't load something for the wrong architecture. 
*/ #define elf_check_arch(x) \ - ((x)->e_machine == EM_X86_64) + ((x)->e_machine == EM_X86_64 && !test_thread_flag(TIF_SYSCALL64_DENIED)) -#define compat_elf_check_arch(x) elf_check_arch_ia32(x) +#define compat_elf_check_arch(x) \ + (elf_check_arch_ia32(x) && !test_thread_flag(TIF_SYSCALL32_DENIED)) static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a1fe5c1..7faebde 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -95,6 +95,8 @@ struct thread_info { #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ +#define TIF_SYSCALL32_DENIED 29 /* 32 bit syscalls are denied */ +#define TIF_SYSCALL64_DENIED 30 /* 64 bit syscalls are denied */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -117,6 +119,8 @@ struct thread_info { #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) +#define _TIF_SYSCALL32_DENIED (1 << TIF_SYSCALL32_DENIED) +#define _TIF_SYSCALL64_DENIED (1 << TIF_SYSCALL64_DENIED) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ @@ -259,9 +263,14 @@ static inline void set_restore_sigmask(void) ti->status |= TS_RESTORE_SIGMASK; set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags); } -#endif /* !__ASSEMBLY__ */ -#ifndef __ASSEMBLY__ +#ifdef CONFIG_IA32_EMULATION +#define __HAVE_ARCH_POST_FORK + +extern void arch_post_fork(struct task_struct *task); + +#endif /* CONFIG_IA32_EMULATION */ + extern void arch_task_cache_init(void); extern void free_thread_info(struct thread_info *ti); extern int arch_dup_task_struct(struct task_struct *dst, 
struct task_struct *src); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0410557..a200ff3 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -86,6 +86,7 @@ obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_SYSCTL) += syscall_restrict.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_APB_TIMER) += apb_timer.o diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e13329d..1774685 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -474,6 +474,8 @@ ENTRY(system_call_after_swapgs) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET GET_THREAD_INFO(%rcx) + testl $_TIF_SYSCALL64_DENIED,TI_flags(%rcx) + jnz deniedsys testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) jnz tracesys system_call_fastpath: @@ -541,6 +543,10 @@ sysret_signal: badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) jmp ret_from_sys_call +deniedsys: + /* FIXME: need SIGSEGV delivery or similar */ + movq $-ENOSYS,RAX-ARGOFFSET(%rsp) + jmp ret_from_sys_call #ifdef CONFIG_AUDITSYSCALL /* diff --git a/arch/x86/kernel/syscall_restrict.c b/arch/x86/kernel/syscall_restrict.c new file mode 100644 index 0000000..a5c8ffa --- /dev/null +++ b/arch/x86/kernel/syscall_restrict.c @@ -0,0 +1,187 @@ +#include <linux/thread_info.h> +#include <linux/pid_namespace.h> +#include <linux/sysctl.h> + +#ifdef CONFIG_IA32_EMULATION + +static bool pid_ns_contains_task(struct pid_namespace *pid_ns, + struct task_struct *task) +{ + struct pid_namespace *ns = NULL; + + if (task->nsproxy) + ns = task->nsproxy->pid_ns; + + for (; ns; ns = ns->parent) { + if (ns == pid_ns) + return true; + } + + return false; +} + +static int task_get_bitness(struct task_struct *task) +{ + if (test_ti_thread_flag(task_thread_info(task), TIF_IA32)) + return 32; + else + return 64; +} + +static bool pidns_locked(struct pid_namespace *pid_ns) +{ + struct 
thread_info *ti = task_thread_info(pid_ns->child_reaper); + + return test_ti_thread_flag(ti, TIF_SYSCALL32_DENIED) || + test_ti_thread_flag(ti, TIF_SYSCALL64_DENIED); +} + +static int bits_to_flags(int bits) +{ + switch (bits) { + case 32: + return TIF_SYSCALL64_DENIED; + case 64: + return TIF_SYSCALL32_DENIED; + default: + return -EINVAL; + } +} + +static int __pidns_may_lock_bitness(struct pid_namespace *pid_ns, int bits) +{ + struct task_struct *p, *thread; + int old_bits; + + do_each_thread(p, thread) { + if (!pid_ns_contains_task(pid_ns, thread)) + continue; + + old_bits = task_get_bitness(thread); + if (old_bits != bits) { + pr_err("Inconsistent syscall restriction detected! " + "Parent ns tries to restrict syscalls to %d " + "bits while some task is %d bit.", + bits, old_bits); + return -EINVAL; + } + } while_each_thread(p, thread); + + return 0; +} + +void arch_post_fork(struct task_struct *task) +{ + int clear_bit_nr; + + if (!pidns_locked(current->nsproxy->pid_ns)) + return; + + clear_bit_nr = bits_to_flags(task_get_bitness(current)); + set_tsk_thread_flag(task, clear_bit_nr); +} + +/* Called with tasklist_lock held and rcu read lock taken */ +static int __bitness_lock(struct pid_namespace *pid_ns, int bits) +{ + u32 clear_bit_nr; + struct task_struct *p, *thread; + + clear_bit_nr = bits_to_flags(bits); + + /* Yes, it is awfully slow, but it is called once per ns (if any) */ + do_each_thread(p, thread) { + if (!pid_ns_contains_task(pid_ns, thread)) + continue; + + set_tsk_thread_flag(thread, clear_bit_nr); + } while_each_thread(p, thread); + + return 0; +} + +static int bitness_lock(struct pid_namespace *pid_ns) +{ + int rc, new_bits; + + rcu_read_lock(); + write_lock_irq(&tasklist_lock); + + new_bits = task_get_bitness(pid_ns->child_reaper); + rc = __pidns_may_lock_bitness(pid_ns, new_bits); + if (!rc) + rc = __bitness_lock(pid_ns, new_bits); + + write_unlock_irq(&tasklist_lock); + rcu_read_unlock(); + return rc; +} + +static int bitness_locked_handler(struct ctl_table 
*table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int rc, new_bits, old_bits; + struct ctl_table tbl = { + .procname = table->procname, + .data = &new_bits, + .maxlen = sizeof(unsigned int), + .mode = 0644, + }; + + old_bits = new_bits = pidns_locked(current->nsproxy->pid_ns); + rc = proc_dointvec(&tbl, write, buffer, lenp, ppos); + if (rc || !write) + return rc; + + if (!capable(CAP_SYS_ADMIN) || (new_bits == 0 && old_bits)) + return -EACCES; + if (new_bits && old_bits) + return 0; + return bitness_lock(current->nsproxy->pid_ns); +} + +static struct ctl_table abi_syscall_restrict[] = { + { + .procname = "bitness_locked", + .mode = 0644, + .proc_handler = bitness_locked_handler + }, + {} +}; + +#else /* CONFIG_IA32_EMULATION */ + +static int one = 1; + +static struct ctl_table abi_syscall_restrict[] = { + { + .procname = "bitness_locked", + .data = &one, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one, + }, + {} +}; + +#endif /* CONFIG_IA32_EMULATION */ + + +static struct ctl_table abi_root[] = { + { + .procname = "abi", + .mode = 0555, + .child = abi_syscall_restrict + }, + {} +}; + +__init int syscall_restrict_init(void) +{ + register_sysctl_table(abi_root); + return 0; +} +device_initcall(syscall_restrict_init); diff --git a/kernel/fork.c b/kernel/fork.c index e7ceaca..55e4455 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1039,6 +1039,10 @@ static void posix_cpu_timers_init(struct task_struct *tsk) INIT_LIST_HEAD(&tsk->cpu_timers[2]); } +#ifndef __HAVE_ARCH_POST_FORK +#define arch_post_fork(p) +#endif + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. 
@@ -1374,6 +1378,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); + arch_post_fork(p); proc_fork_connector(p); cgroup_post_fork(p); if (clone_flags & CLONE_THREAD) --
Powered by blists - more mailing lists
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.