kernel-hardening - Re: [PATCH 1/3] fork: dynamically allocate cache array for vmapped stacks using cpuhp

Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20170203153914.GI19325@dhcp22.suse.cz>
Date: Fri, 3 Feb 2017 16:39:14 +0100
From: Michal Hocko <mhocko@...nel.org>
To: Hoeun Ryu <hoeun.ryu@...il.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
	Ingo Molnar <mingo@...nel.org>, Andy Lutomirski <luto@...nel.org>,
	Kees Cook <keescook@...omium.org>,
	"Eric W. Biederman" <ebiederm@...ssion.com>,
	Mateusz Guzik <mguzik@...hat.com>, linux-kernel@...r.kernel.org,
	kernel-hardening@...ts.openwall.com
Subject: Re: [PATCH 1/3] fork: dynamically allocate cache array for vmapped
 stacks using cpuhp

On Sat 04-02-17 00:30:05, Hoeun Ryu wrote:
>  Using virtually mapped stack, kernel stacks are allocated via vmalloc.
> In the current implementation, two stacks per cpu can be cached when
> tasks are freed and the cached stacks are used again in task duplications.
> but the array for the cached stacks is statically allocated by per-cpu api.
>  In this new implementation, the array for the cached stacks are dynamically
> allocted and freed by cpu hotplug callbacks and the cached stacks are freed
> when cpu is down. setup for cpu hotplug is established in fork_init().

Why do we want this? I can see that the follow up patch makes the number
configurable but the changelog doesn't describe the motivation for that.
Which workload would benefit from a higher value?

> Signed-off-by: Hoeun Ryu <hoeun.ryu@...il.com>
> ---
>  kernel/fork.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 64 insertions(+), 17 deletions(-)
> 
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 61284d8..54421a9 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -167,26 +167,71 @@ void __weak arch_release_thread_stack(unsigned long *stack)
>   * flush.  Try to minimize the number of calls by caching stacks.
>   */
>  #define NR_CACHED_STACKS 2
> -static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
> +
> +struct vm_stack_cache {
> +	struct vm_struct **vm_stacks;
> +	int nr;
> +	int cur;
> +};
> +
> +static DEFINE_PER_CPU(struct vm_stack_cache, vm_stacks);
> +
> +static int alloc_vm_stack_cache(unsigned int cpu)
> +{
> +	struct vm_stack_cache *vm_stack_cache = &per_cpu(vm_stacks, cpu);
> +	struct vm_struct **vm_stacks = vm_stack_cache->vm_stacks;
> +	int i;
> +
> +	/* if free_vm_stack_cache() didn't free it */
> +	if (!vm_stacks) {
> +		vm_stacks =
> +			vzalloc(sizeof(struct vm_struct *) * NR_CACHED_STACKS);
> +		if (!vm_stacks)
> +			return -ENOMEM;
> +	}
> +
> +	vm_stack_cache->vm_stacks = vm_stacks;
> +	vm_stack_cache->cur = 0;
> +	vm_stack_cache->nr = 0;
> +
> +	return 0;
> +}
> +
> +static int free_vm_stack_cache(unsigned int cpu)
> +{
> +	struct vm_stack_cache *vm_stack_cache = &per_cpu(vm_stacks, cpu);
> +	struct vm_struct **vm_stacks = vm_stack_cache->vm_stacks;
> +	int i;
> +
> +	for (i = 0; i < vm_stack_cache->nr; i++) {
> +		vfree(vm_stacks[i]->addr);
> +		vm_stacks[i] = NULL;
> +	}
> +
> +	vm_stack_cache->nr = 0;
> +	vm_stack_cache->cur = 0;
> +	/* do not free vm_stack[cpu]->vm_stacks itself, reused in allocation */
> +
> +	return 0;
> +}
> +
>  #endif
>  
>  static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>  {
>  #ifdef CONFIG_VMAP_STACK
> +	struct vm_stack_cache *vm_stack_cache =
> +		&per_cpu(vm_stacks, smp_processor_id());
> +	struct vm_struct **vm_stacks = vm_stack_cache->vm_stacks;
>  	void *stack;
> -	int i;
>  
>  	local_irq_disable();
> -	for (i = 0; i < NR_CACHED_STACKS; i++) {
> -		struct vm_struct *s = this_cpu_read(cached_stacks[i]);
> -
> -		if (!s)
> -			continue;
> -		this_cpu_write(cached_stacks[i], NULL);
> -
> -		tsk->stack_vm_area = s;
> +	if (vm_stack_cache->cur > 0) {
> +		struct vm_struct *vm_stack = vm_stacks[--vm_stack_cache->cur];
> +		tsk->stack_vm_area = vm_stack;
>  		local_irq_enable();
> -		return s->addr;
> +
> +		return vm_stack->addr;
>  	}
>  	local_irq_enable();
>  
> @@ -216,15 +261,14 @@ static inline void free_thread_stack(struct task_struct *tsk)
>  {
>  #ifdef CONFIG_VMAP_STACK
>  	if (task_stack_vm_area(tsk)) {
> +		struct vm_stack_cache *vm_stack_cache =
> +			&per_cpu(vm_stacks, smp_processor_id());
> +		struct vm_struct **vm_stacks = vm_stack_cache->vm_stacks;
>  		unsigned long flags;
> -		int i;
>  
>  		local_irq_save(flags);
> -		for (i = 0; i < NR_CACHED_STACKS; i++) {
> -			if (this_cpu_read(cached_stacks[i]))
> -				continue;
> -
> -			this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
> +		if (vm_stack_cache->cur < vm_stack_cache->nr) {
> +			vm_stacks[vm_stack_cache->cur++] = tsk->stack_vm_area;
>  			local_irq_restore(flags);
>  			return;
>  		}
> @@ -456,6 +500,9 @@ void __init fork_init(void)
>  	for (i = 0; i < UCOUNT_COUNTS; i++) {
>  		init_user_ns.ucount_max[i] = max_threads/2;
>  	}
> +
> +	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "vm_stack_cache",
> +			  alloc_vm_stack_cache, free_vm_stack_cache);
>  }
>  
>  int __weak arch_dup_task_struct(struct task_struct *dst,
> -- 
> 2.7.4
> 

-- 
Michal Hocko
SUSE Labs
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.