Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Date: Wed, 4 Nov 2020 17:21:03 +0100
From: Alexey Gladkov <gladkov.alexey@...il.com>
To: Sargun Dhillon <sargun@...gun.me>
Cc: LKML <linux-kernel@...r.kernel.org>,
	Linux Containers <containers@...ts.linux-foundation.org>,
	Kernel Hardening <kernel-hardening@...ts.openwall.com>,
	"Eric W . Biederman" <ebiederm@...ssion.com>,
	Kees Cook <keescook@...omium.org>,
	Christian Brauner <christian@...uner.io>
Subject: Re: [RFC PATCH v1 4/4] Allow to change the user namespace in which
 user rlimits are counted

On Wed, Nov 04, 2020 at 10:03:05AM +0000, Sargun Dhillon wrote:
> On Mon, Nov 02, 2020 at 05:50:33PM +0100, Alexey Gladkov wrote:
> > Add a new prctl to change the user namespace in which the process
> > counter is located. A pointer to the user namespace is in cred struct
> > to be inherited by all child processes.
> > 
> > Signed-off-by: Alexey Gladkov <gladkov.alexey@...il.com>
> > ---
> >  fs/exec.c                  |  2 +-
> >  fs/io-wq.c                 | 13 ++++++++-----
> >  fs/io-wq.h                 |  1 +
> >  fs/io_uring.c              |  1 +
> >  include/linux/cred.h       |  8 ++++++++
> >  include/uapi/linux/prctl.h |  5 +++++
> >  kernel/cred.c              | 35 +++++++++++++++++++++++++++++------
> >  kernel/exit.c              |  2 +-
> >  kernel/fork.c              |  4 ++--
> >  kernel/sys.c               | 22 +++++++++++++++++++++-
> >  kernel/user_namespace.c    |  3 +++
> >  11 files changed, 80 insertions(+), 16 deletions(-)
> > 
> > diff --git a/fs/exec.c b/fs/exec.c
> > index c45dfc716394..574b1381276c 100644
> > --- a/fs/exec.c
> > +++ b/fs/exec.c
> > @@ -1837,7 +1837,7 @@ static int __do_execve_file(int fd, struct filename *filename,
> >  		goto out_ret;
> >  	}
> >  
> > -	processes = get_rlimit_counter(&init_user_ns, current_euid(), UCOUNT_RLIMIT_NPROC);
> > +	processes = get_rlimit_counter(current_rlimit_ns(), current_euid(), UCOUNT_RLIMIT_NPROC);
> >  
> >  	/*
> >  	 * We move the actual failure in case of RLIMIT_NPROC excess from
> > diff --git a/fs/io-wq.c b/fs/io-wq.c
> > index c3b0843abc9b..19e43ec115cb 100644
> > --- a/fs/io-wq.c
> > +++ b/fs/io-wq.c
> > @@ -116,6 +116,7 @@ struct io_wq {
> >  
> >  	struct task_struct *manager;
> >  	struct user_struct *user;
> It seems like user would be unused here, and you could use creds->user?

Yep. I think so.

> > +	const struct cred *creds;
> >  	refcount_t refs;
> >  	struct completion done;
> >  
> > @@ -217,7 +218,7 @@ static void io_worker_exit(struct io_worker *worker)
> >  	if (worker->flags & IO_WORKER_F_RUNNING)
> >  		atomic_dec(&acct->nr_running);
> >  	if (!(worker->flags & IO_WORKER_F_BOUND))
> > -		dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> > +		dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> >  	worker->flags = 0;
> >  	preempt_enable();
> >  
> > @@ -350,9 +351,9 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
> >  			worker->flags |= IO_WORKER_F_BOUND;
> >  			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
> >  			wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
> > -			dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> > +			dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> >  		} else {
> > -			if (!inc_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC))
> > +			if (!inc_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC))
> >  				return;
> >  			worker->flags &= ~IO_WORKER_F_BOUND;
> >  			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
> > @@ -662,7 +663,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
> >  	}
> >  
> >  	if (index == IO_WQ_ACCT_UNBOUND &&
> > -	    !inc_rlimit_counter(&init_user_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) {
> > +	    !inc_rlimit_counter(wq->creds->rlimit_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) {
> >  		kfree(worker);
> >  		return false;
> >  	}
> > @@ -772,7 +773,8 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
> >  	if (free_worker)
> >  		return true;
> >  
> > -	processes = get_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> > +	processes = get_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid,
> > +			UCOUNT_RLIMIT_NPROC);
> >  
> >  	if (processes >= acct->max_workers &&
> >  	    !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
> > @@ -1049,6 +1051,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
> >  
> >  	/* caller must already hold a reference to this */
> >  	wq->user = data->user;
> > +	wq->creds = data->creds;
> >  
> >  	for_each_node(node) {
> >  		struct io_wqe *wqe;
> > diff --git a/fs/io-wq.h b/fs/io-wq.h
> > index 071f1a997800..6acc3a04c38f 100644
> > --- a/fs/io-wq.h
> > +++ b/fs/io-wq.h
> > @@ -105,6 +105,7 @@ typedef void (io_wq_work_fn)(struct io_wq_work **);
> >  
> >  struct io_wq_data {
> >  	struct user_struct *user;
> > +	const struct cred *creds;
> >  
> >  	io_wq_work_fn *do_work;
> >  	free_work_fn *free_work;
> > diff --git a/fs/io_uring.c b/fs/io_uring.c
> > index 493e5047e67c..e419923968b3 100644
> > --- a/fs/io_uring.c
> > +++ b/fs/io_uring.c
> > @@ -6933,6 +6933,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
> >  	int ret = 0;
> >  
> >  	data.user = ctx->user;
> > +	data.creds = ctx->creds;
> >  	data.free_work = io_free_work;
> >  	data.do_work = io_wq_submit_work;
> >  
> > diff --git a/include/linux/cred.h b/include/linux/cred.h
> > index 18639c069263..43aee68d117f 100644
> > --- a/include/linux/cred.h
> > +++ b/include/linux/cred.h
> > @@ -144,6 +144,7 @@ struct cred {
> >  #endif
> >  	struct user_struct *user;	/* real user ID subscription */
> >  	struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
> > +	struct user_namespace *rlimit_ns; /* user_ns in which rlimits is tracked */
> >  	struct group_info *group_info;	/* supplementary groups for euid/fsgid */
> >  	/* RCU deletion */
> >  	union {
> > @@ -170,6 +171,7 @@ extern int set_security_override_from_ctx(struct cred *, const char *);
> >  extern int set_create_files_as(struct cred *, struct inode *);
> >  extern int cred_fscmp(const struct cred *, const struct cred *);
> >  extern void __init cred_init(void);
> > +extern int set_rlimit_ns(struct user_namespace *ns);
> >  
> >  /*
> >   * check for validity of credentials
> > @@ -370,6 +372,7 @@ static inline void put_cred(const struct cred *_cred)
> >  
> >  #define task_uid(task)		(task_cred_xxx((task), uid))
> >  #define task_euid(task)		(task_cred_xxx((task), euid))
> > +#define task_rlimit_ns(task)	(task_cred_xxx((task), rlimit_ns))
> >  
> >  #define current_cred_xxx(xxx)			\
> >  ({						\
> > @@ -390,11 +393,16 @@ static inline void put_cred(const struct cred *_cred)
> >  extern struct user_namespace init_user_ns;
> >  #ifdef CONFIG_USER_NS
> >  #define current_user_ns()	(current_cred_xxx(user_ns))
> > +#define current_rlimit_ns()	(current_cred_xxx(rlimit_ns))
> >  #else
> >  static inline struct user_namespace *current_user_ns(void)
> >  {
> >  	return &init_user_ns;
> >  }
> > +static inline struct user_namespace *current_rlimit_ns(void)
> > +{
> > +	return &init_user_ns;
> > +}
> >  #endif
> >  
> >  
> > diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> > index 07b4f8131e36..4f853f903415 100644
> > --- a/include/uapi/linux/prctl.h
> > +++ b/include/uapi/linux/prctl.h
> > @@ -238,4 +238,9 @@ struct prctl_mm_map {
> >  #define PR_SET_IO_FLUSHER		57
> >  #define PR_GET_IO_FLUSHER		58
> >  
> > +#define PR_SET_RLIMIT_USER_NAMESPACE	59
> > +#define PR_GET_RLIMIT_USER_NAMESPACE	60
> > +# define PR_RLIMIT_BIND_GLOBAL_USERNS	(1UL << 0)
> > +# define PR_RLIMIT_BIND_CURRENT_USERNS	(1UL << 1)
> > +
> >  #endif /* _LINUX_PRCTL_H */
> > diff --git a/kernel/cred.c b/kernel/cred.c
> > index 748704db1f6b..7b90e1ef9c9a 100644
> > --- a/kernel/cred.c
> > +++ b/kernel/cred.c
> > @@ -59,6 +59,7 @@ struct cred init_cred = {
> >  	.cap_bset		= CAP_FULL_SET,
> >  	.user			= INIT_USER,
> >  	.user_ns		= &init_user_ns,
> > +	.rlimit_ns		= &init_user_ns,
> >  	.group_info		= &init_groups,
> >  };
> >  
> > @@ -120,6 +121,7 @@ static void put_cred_rcu(struct rcu_head *rcu)
> >  		put_group_info(cred->group_info);
> >  	free_uid(cred->user);
> >  	put_user_ns(cred->user_ns);
> > +	put_user_ns(cred->rlimit_ns);
> >  	kmem_cache_free(cred_jar, cred);
> >  }
> >  
> > @@ -270,6 +272,7 @@ struct cred *prepare_creds(void)
> >  	get_group_info(new->group_info);
> >  	get_uid(new->user);
> >  	get_user_ns(new->user_ns);
> > +	get_user_ns(new->rlimit_ns);
> >  
> >  #ifdef CONFIG_KEYS
> >  	key_get(new->session_keyring);
> > @@ -345,7 +348,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
> >  #endif
> >  		clone_flags & CLONE_THREAD
> >  	    ) {
> > -		if (!inc_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC))
> > +		if (!inc_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC))
> >  			return -EACCES;
> >  		p->real_cred = get_cred(p->cred);
> >  		get_cred(p->cred);
> > @@ -385,7 +388,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
> >  	}
> >  #endif
> >  
> > -	if (!inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> > +	if (!inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> >  		return -EACCES;
> >  	p->cred = p->real_cred = get_cred(new);
> >  	alter_cred_subscribers(new, 2);
> > @@ -487,13 +490,13 @@ int commit_creds(struct cred *new)
> >  	 * perhaps this limit is exceeded in the parent user namespace.
> >  	 */
> >  	alter_cred_subscribers(new, 2);
> > -	if (new->user != old->user &&
> > -	    !inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> > +	if ((new->user != old->user || new->rlimit_ns != old->rlimit_ns) &&
> > +	    !inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> >  		task->flags |= PF_NPROC_UNS_EXCEEDED;
> >  	rcu_assign_pointer(task->real_cred, new);
> >  	rcu_assign_pointer(task->cred, new);
> > -	if (new->user != old->user)
> > -		dec_rlimit_counter(&init_user_ns, old->euid, UCOUNT_RLIMIT_NPROC);
> > +	if (new->user != old->user || new->rlimit_ns != old->rlimit_ns)
> > +		dec_rlimit_counter(old->rlimit_ns, old->euid, UCOUNT_RLIMIT_NPROC);
> >  	alter_cred_subscribers(old, -2);
> >  
> >  	/* send notifications */
> > @@ -789,6 +792,26 @@ int set_create_files_as(struct cred *new, struct inode *inode)
> >  }
> >  EXPORT_SYMBOL(set_create_files_as);
> >  
> > +/*
> > + * Change the rlimit user namespace of the current task, replacing the existing
> > + * one. If the given namespace is NULL, then initial user namespace will be
> > + * used.
> > + *
> > + * Return: 0 on success; -errno on failure.
> > + */
> > +int set_rlimit_ns(struct user_namespace *ns)
> > +{
> > +	struct cred *new;
> > +
> > +	new = prepare_creds();
> > +	if (!new)
> > +		return -ENOMEM;
> > +
> > +	new->rlimit_ns = ns ? ns : &init_user_ns;
> > +
> > +	return commit_creds(new);
> > +}
> > +
> >  #ifdef CONFIG_DEBUG_CREDENTIALS
> >  
> >  bool creds_are_invalid(const struct cred *cred)
> > diff --git a/kernel/exit.c b/kernel/exit.c
> > index 5a0d7dd1ad64..998436d32373 100644
> > --- a/kernel/exit.c
> > +++ b/kernel/exit.c
> > @@ -187,7 +187,7 @@ void release_task(struct task_struct *p)
> >  	/* don't need to get the RCU readlock here - the process is dead and
> >  	 * can't be modifying its own credentials. But shut RCU-lockdep up */
> >  	rcu_read_lock();
> > -	dec_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC);
> > +	dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC);
> >  	rcu_read_unlock();
> >  
> >  	cgroup_release(p);
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index d2b28634dc8f..43f3c54fe4c6 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -1963,7 +1963,7 @@ static __latent_entropy struct task_struct *copy_process(
> >  		current->flags &= ~PF_NPROC_UNS_EXCEEDED;
> >  		goto bad_fork_free;
> >  	}
> > -	processes = get_rlimit_counter(&init_user_ns, p->real_cred->euid,
> > +	processes = get_rlimit_counter(task_rlimit_ns(p), task_euid(p),
> >  			UCOUNT_RLIMIT_NPROC);
> >  	if (processes >= task_rlimit(p, RLIMIT_NPROC)) {
> >  		if (p->real_cred->user != INIT_USER &&
> > @@ -2366,7 +2366,7 @@ static __latent_entropy struct task_struct *copy_process(
> >  #endif
> >  	delayacct_tsk_free(p);
> >  bad_fork_cleanup_count:
> > -	dec_rlimit_counter(&init_user_ns, p->cred->euid, UCOUNT_RLIMIT_NPROC);
> > +	dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC);
> >  	exit_creds(p);
> >  bad_fork_free:
> >  	p->state = TASK_DEAD;
> > diff --git a/kernel/sys.c b/kernel/sys.c
> > index db780ec32d86..917cbd7fc674 100644
> > --- a/kernel/sys.c
> > +++ b/kernel/sys.c
> > @@ -467,7 +467,7 @@ static int set_user(struct cred *new)
> >  	if (!new_user)
> >  		return -EAGAIN;
> >  
> > -	processes = get_rlimit_counter(&init_user_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);
> > +	processes = get_rlimit_counter(new->rlimit_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);
> >  
> >  	/*
> >  	 * We don't fail in case of NPROC limit excess here because too many
> > @@ -2529,6 +2529,26 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
> >  
> >  		error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
> >  		break;
> > +	case PR_SET_RLIMIT_USER_NAMESPACE:
> > +		if (!capable(CAP_SYS_RESOURCE))
> > +			return -EPERM;
> Can you have CAP_SYS_RESOURCE in the init user ns while you're in a non-init 
> user ns? Shouldn't that only matter if you want to set yourself to 
> PR_RLIMIT_BIND_GLOBAL_USERNS anyways and you just want to make sure they have 
> CAP_SYS_RESOURCE in the current namespace for PR_RLIMIT_BIND_CURRENT_USERNS?

I think I was wrong to add the CAP_SYS_RESOURCE check because the user
still won't be able to exceed his limit. So since setrlimit does not
require CAP_SYS_RESOURCE, then it is not needed here either.

> > +
> > +		switch (arg2) {
> > +		case PR_RLIMIT_BIND_GLOBAL_USERNS:
> > +			error = set_rlimit_ns(&init_user_ns);
> > +			break;
> > +		case PR_RLIMIT_BIND_CURRENT_USERNS:
> > +			error = set_rlimit_ns(current_user_ns());
> > +			break;
> To some degree, this isn't so much "per user namespace rlimits", as much as it 
> is hierarchical rlimits. I'm not going to nitpick about names, but it might be 
> worth exploring. Especially because the way the patch is written, it would be 
> easy to introduce a third user namespace for rlimits with different mappings.

I agree that the naming is a bit confusing but I couldn't think of a
better one. This is probably because the discussion started with the topic
"per userns rlimits".

This change is more about the rlimit counter but not the rlimit itself.

> The downside I see with a sysctl is that changing it midway through the user 
> namespaces lifetime could be confusing, and difficult to get right.

sysctl requires procfs. I did prctl to avoid it.

> > +		default:
> > +			error = -EINVAL;
> > +		}
> > +		break;
> > +	case PR_GET_RLIMIT_USER_NAMESPACE:
> > +		error = current_rlimit_ns() == &init_user_ns
> > +			? PR_RLIMIT_BIND_GLOBAL_USERNS
> > +			: PR_RLIMIT_BIND_CURRENT_USERNS;
> > +		break;
> It would be nice to have this be a sysctl, so everyone who does
> setns() would get the behaviour, and if a process wants to 

What's so tricky about using prctl?

> >  	default:
> >  		error = -EINVAL;
> >  		break;
> > diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> > index 87804e0371fe..346df35ceba9 100644
> > --- a/kernel/user_namespace.c
> > +++ b/kernel/user_namespace.c
> > @@ -56,6 +56,8 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
> >  #endif
> >  	/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
> >  	cred->user_ns = user_ns;
> > +
> > +	cred->rlimit_ns = &init_user_ns;
> >  }
> >  
> >  /*
> > @@ -121,6 +123,7 @@ int create_user_ns(struct cred *new)
> >  	for (i = 0; i < UCOUNT_COUNTS; i++) {
> >  		ns->ucount_max[i] = INT_MAX;
> >  	}
> > +	ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
> >  	ns->ucounts = ucounts;
> >  
> >  	/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
> > -- 
> > 2.25.4
> > 
> 

-- 
Rgrds, legion

Powered by blists - more mailing lists

Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.