Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Wed, 6 Jul 2011 11:31:40 -0500
From: Serge Hallyn <serge.hallyn@...onical.com>
To: Vasiliy Kulikov <segoon@...nwall.com>
Cc: akpm@...ux-foundation.org, daniel.lezcano@...e.fr,
	ebiederm@...ssion.com, mingo@...e.hu, oleg@...hat.com,
	rdunlap@...otime.net, tj@...nel.org,
	kernel-hardening@...ts.openwall.com
Subject: Re: [PATCH] shm: handle separate PID namespaces case

Quoting Vasiliy Kulikov (segoon@...nwall.com):
> On Tue, Jul 05, 2011 at 10:57 -0500, Serge Hallyn wrote:
> > Do you have all of these patches applied in one git tree?  Could
> > you do a quick 'git diff master..' (assuming master is Linus' tree)
> > and send the result (or send a git url), and I'll take a last closer
> > look at the patch tomorrow?
> 
> These patches are against 3.0-rc2, it should be simple to rebase to -rc5
> since ipc/ is not heavily changed nowadays.  The patches are stored in
> -mm tree.  The blob against -rc2 is as follows:
> 

Thanks, Vasiliy.  I don't see anything wrong with it, though I do
wory (see below) about contention on task exit?  The rest of my
comments are just safe-to-ignore-of-you-like nits.

> ---
> 
>  Documentation/sysctl/kernel.txt |   22 ++++++++
>  include/linux/ipc_namespace.h   |    7 +++
>  include/linux/shm.h             |    7 +++
>  ipc/ipc_sysctl.c                |   36 +++++++++++++
>  ipc/shm.c                       |  105 +++++++++++++++++++++++++++++++++++++--
>  kernel/exit.c                   |    1 +
>  6 files changed, 174 insertions(+), 4 deletions(-)
> ---
> diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
> index 5e7cb39..4d7b866 100644
> --- a/Documentation/sysctl/kernel.txt
> +++ b/Documentation/sysctl/kernel.txt
> @@ -62,6 +62,7 @@ show up in /proc/sys/kernel:
>  - shmall
>  - shmmax                      [ sysv ipc ]
>  - shmmni
> +- shm_rmid_forced
>  - stop-a                      [ SPARC only ]
>  - sysrq                       ==> Documentation/sysrq.txt
>  - tainted
> @@ -475,6 +476,27 @@ kernel.  This value defaults to SHMMAX.
>  
>  ==============================================================
>  
> +shm_rmid_forced:
> +
> +Linux lets you set resource limits, including how much memory one
> +process can consume, via setrlimit(2).  Unfortunately, shared memory
> +segments are allowed to exist without association with any process, and
> +thus might not be counted against any resource limits.  If enabled,
> +shared memory segments are automatically destroyed when their attach
> +count becomes zero after a detach or a process termination.  It will
> +also destroy segments that were created, but never attached to, on exit
> +from the process.  The only use left for IPC_RMID is to immediately
> +destroy an unattached segment.  Of course, this breaks the way things are
> +defined, so some applications might stop working.  Note that this
> +feature will do you no good unless you also configure your resource
> +limits (in particular, RLIMIT_AS and RLIMIT_NPROC).  Most systems don't
> +need this.
> +
> +Note that if you change this from 0 to 1, already created segments
> +without users and with a dead originative process will be destroyed.
> +
> +==============================================================
> +
>  softlockup_thresh:
>  
>  This value can be used to lower the softlockup tolerance threshold.  The
> diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
> index a6d1655..8a297a5 100644
> --- a/include/linux/ipc_namespace.h
> +++ b/include/linux/ipc_namespace.h
> @@ -44,6 +44,11 @@ struct ipc_namespace {
>  	size_t		shm_ctlall;
>  	int		shm_ctlmni;
>  	int		shm_tot;
> +	/*
> +	 * Defines whether IPC_RMID is forced for _all_ shm segments regardless
> +	 * of shmctl()
> +	 */
> +	int		shm_rmid_forced;
>  
>  	struct notifier_block ipcns_nb;
>  
> @@ -72,6 +77,7 @@ extern int register_ipcns_notifier(struct ipc_namespace *);
>  extern int cond_register_ipcns_notifier(struct ipc_namespace *);
>  extern void unregister_ipcns_notifier(struct ipc_namespace *);
>  extern int ipcns_notify(unsigned long);
> +extern void shm_destroy_orphaned(struct ipc_namespace *ns);
>  #else /* CONFIG_SYSVIPC */
>  static inline int register_ipcns_notifier(struct ipc_namespace *ns)
>  { return 0; }
> @@ -79,6 +85,7 @@ static inline int cond_register_ipcns_notifier(struct ipc_namespace *ns)
>  { return 0; }
>  static inline void unregister_ipcns_notifier(struct ipc_namespace *ns) { }
>  static inline int ipcns_notify(unsigned long l) { return 0; }
> +static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {}
>  #endif /* CONFIG_SYSVIPC */
>  
>  #ifdef CONFIG_POSIX_MQUEUE
> diff --git a/include/linux/shm.h b/include/linux/shm.h
> index eca6235..92808b8 100644
> --- a/include/linux/shm.h
> +++ b/include/linux/shm.h
> @@ -95,6 +95,9 @@ struct shmid_kernel /* private to the kernel */
>  	pid_t			shm_cprid;
>  	pid_t			shm_lprid;
>  	struct user_struct	*mlock_user;
> +
> +	/* The task created the shm object.  NULL if the task is dead. */
> +	struct task_struct	*shm_creator;
>  };
>  
>  /* shm_mode upper byte flags */
> @@ -106,6 +109,7 @@ struct shmid_kernel /* private to the kernel */
>  #ifdef CONFIG_SYSVIPC
>  long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr);
>  extern int is_file_shm_hugepages(struct file *file);
> +extern void exit_shm(struct task_struct *task);
>  #else
>  static inline long do_shmat(int shmid, char __user *shmaddr,
>  				int shmflg, unsigned long *addr)
> @@ -116,6 +120,9 @@ static inline int is_file_shm_hugepages(struct file *file)
>  {
>  	return 0;
>  }
> +static inline void exit_shm(struct task_struct *task)
> +{
> +}
>  #endif
>  
>  #endif /* __KERNEL__ */
> diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
> index 56410fa..00fba2b 100644
> --- a/ipc/ipc_sysctl.c
> +++ b/ipc/ipc_sysctl.c
> @@ -31,12 +31,37 @@ static int proc_ipc_dointvec(ctl_table *table, int write,
>  	void __user *buffer, size_t *lenp, loff_t *ppos)
>  {
>  	struct ctl_table ipc_table;
> +
>  	memcpy(&ipc_table, table, sizeof(ipc_table));
>  	ipc_table.data = get_ipc(table);
>  
>  	return proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
>  }
>  
> +static int proc_ipc_dointvec_minmax(ctl_table *table, int write,
> +	void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> +	struct ctl_table ipc_table;
> +
> +	memcpy(&ipc_table, table, sizeof(ipc_table));
> +	ipc_table.data = get_ipc(table);
> +
> +	return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
> +}
> +
> +static int proc_ipc_dointvec_minmax_orphans(ctl_table *table, int write,
> +	void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> +	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
> +	int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos);
> +
> +	if (err < 0)
> +		return err;
> +	if (ns->shm_rmid_forced)
> +		shm_destroy_orphaned(ns);
> +	return err;
> +}
> +
>  static int proc_ipc_callback_dointvec(ctl_table *table, int write,
>  	void __user *buffer, size_t *lenp, loff_t *ppos)
>  {
> @@ -125,6 +150,8 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write,
>  #else
>  #define proc_ipc_doulongvec_minmax NULL
>  #define proc_ipc_dointvec	   NULL
> +#define proc_ipc_dointvec_minmax   NULL
> +#define proc_ipc_dointvec_minmax_orphans   NULL
>  #define proc_ipc_callback_dointvec NULL
>  #define proc_ipcauto_dointvec_minmax NULL
>  #endif
> @@ -155,6 +182,15 @@ static struct ctl_table ipc_kern_table[] = {
>  		.proc_handler	= proc_ipc_dointvec,
>  	},
>  	{
> +		.procname	= "shm_rmid_forced",
> +		.data		= &init_ipc_ns.shm_rmid_forced,
> +		.maxlen		= sizeof(init_ipc_ns.shm_rmid_forced),
> +		.mode		= 0644,
> +		.proc_handler	= proc_ipc_dointvec_minmax_orphans,
> +		.extra1		= &zero,
> +		.extra2		= &one,
> +	},
> +	{
>  		.procname	= "msgmax",
>  		.data		= &init_ipc_ns.msg_ctlmax,
>  		.maxlen		= sizeof (init_ipc_ns.msg_ctlmax),
> diff --git a/ipc/shm.c b/ipc/shm.c
> index ab3385a..bf46636 100644
> --- a/ipc/shm.c
> +++ b/ipc/shm.c
> @@ -74,6 +74,7 @@ void shm_init_ns(struct ipc_namespace *ns)
>  	ns->shm_ctlmax = SHMMAX;
>  	ns->shm_ctlall = SHMALL;
>  	ns->shm_ctlmni = SHMMNI;
> +	ns->shm_rmid_forced = 1;

Given the description in Documentation/sysctl/kernel.txt, shouldn't
this default to 0?

>  	ns->shm_tot = 0;
>  	ipc_init_ids(&shm_ids(ns));
>  }
> @@ -130,6 +131,12 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
>  	return container_of(ipcp, struct shmid_kernel, shm_perm);
>  }
>  
> +static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)
> +{
> +	rcu_read_lock();
> +	spin_lock(&ipcp->shm_perm.lock);
> +}
> +
>  static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns,
>  						int id)
>  {
> @@ -187,6 +194,23 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
>  }
>  
>  /*
> + * shm_may_destroy - identifies whether shm segment should be destroyed now
> + *
> + * Returns true if and only if there are no active users of the segment and
> + * one of the following is true:
> + *
> + * 1) shmctl(id, IPC_RMID, NULL) was called for this shp
> + *
> + * 2) sysctl kernel.shm_rmid_forced is set to 1.
> + */
> +static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)

'may' usually implies a permission check.  Would this be better named
'shm_should_destroy()'?

> +{
> +	return (shp->shm_nattch == 0) &&
> +	       (ns->shm_rmid_forced ||
> +		(shp->shm_perm.mode & SHM_DEST));
> +}
> +
> +/*
>   * remove the attach descriptor vma.
>   * free memory for segment if it is marked destroyed.
>   * The descriptor has already been removed from the current->mm->mmap list
> @@ -206,14 +230,87 @@ static void shm_close(struct vm_area_struct *vma)
>  	shp->shm_lprid = task_tgid_vnr(current);
>  	shp->shm_dtim = get_seconds();
>  	shp->shm_nattch--;
> -	if(shp->shm_nattch == 0 &&
> -	   shp->shm_perm.mode & SHM_DEST)
> +	if (shm_may_destroy(ns, shp))
>  		shm_destroy(ns, shp);
>  	else
>  		shm_unlock(shp);
>  	up_write(&shm_ids(ns).rw_mutex);
>  }
>  
> +/* Called with ns->shm_ids(ns).rw_mutex locked */
> +static int shm_try_destroy_current(int id, void *p, void *data)
> +{
> +	struct ipc_namespace *ns = data;
> +	struct kern_ipc_perm *ipcp = p;
> +	struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
> +
> +	if (shp->shm_creator != current)
> +		return 0;
> +
> +	/*
> +	 * Mark it as orphaned to destroy the segment when
> +	 * kernel.shm_rmid_forced is changed.
> +	 * It is noop if the following shm_may_destroy() returns true.
> +	 */
> +	shp->shm_creator = NULL;
> +
> +	/*
> +	 * Don't even try to destroy it.  If shm_rmid_forced=0 and IPC_RMID
> +	 * is not set, it shouldn't be deleted here.
> +	 */
> +	if (!ns->shm_rmid_forced)
> +		return 0;
> +
> +	if (shm_may_destroy(ns, shp)) {

This seems redundant.  Would it be better to just make this

	if (shp->shm_nattch == 0) {

here as we already know ns->shm_rmid_forced == 1?

> +		shm_lock_by_ptr(shp);
> +		shm_destroy(ns, shp);

Wish there were a clean way to document that the locks will be
released by shm_destroy().

> +	}
> +	return 0;
> +}
> +
> +/* Called with ns->shm_ids(ns).rw_mutex locked */
> +static int shm_try_destroy_orphaned(int id, void *p, void *data)
> +{
> +	struct ipc_namespace *ns = data;
> +	struct kern_ipc_perm *ipcp = p;
> +	struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
> +
> +	/*
> +	 * We want to destroy segments without users and with already
> +	 * exit'ed originating process.
> +	 *
> +	 * As shp->* are changed under rw_mutex, it's safe to skip shp locking.
> +	 */
> +	if (shp->shm_creator != NULL)
> +		return 0;
> +
> +	if (shm_may_destroy(ns, shp)) {
> +		shm_lock_by_ptr(shp);
> +		shm_destroy(ns, shp);
> +	}
> +	return 0;
> +}
> +
> +void shm_destroy_orphaned(struct ipc_namespace *ns)
> +{
> +	down_write(&shm_ids(ns).rw_mutex);
> +	if (&shm_ids(ns).in_use)
> +		idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
> +	up_write(&shm_ids(ns).rw_mutex);

Hm, is this going to cause contention when killing a lot of tasks?

> +}
> +
> +
> +void exit_shm(struct task_struct *task)
> +{
> +	struct ipc_namespace *ns = task->nsproxy->ipc_ns;
> +
> +	/* Destroy all already created segments, but not mapped yet */
> +	down_write(&shm_ids(ns).rw_mutex);
> +	if (&shm_ids(ns).in_use)
> +		idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns);
> +	up_write(&shm_ids(ns).rw_mutex);

Having exit_shm() call shm_destroy_orphaned(task->nsproxy->ipc_ns) seems
more future-proof?

> +}
> +
>  static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
>  	struct file *file = vma->vm_file;
> @@ -404,6 +501,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
>  	shp->shm_segsz = size;
>  	shp->shm_nattch = 0;
>  	shp->shm_file = file;
> +	shp->shm_creator = current;
>  	/*
>  	 * shmid gets reported as "inode#" in /proc/pid/maps.
>  	 * proc-ps tools use this. Changing this will break them.
> @@ -950,8 +1048,7 @@ out_nattch:
>  	shp = shm_lock(ns, shmid);
>  	BUG_ON(IS_ERR(shp));
>  	shp->shm_nattch--;
> -	if(shp->shm_nattch == 0 &&
> -	   shp->shm_perm.mode & SHM_DEST)
> +	if (shm_may_destroy(ns, shp))
>  		shm_destroy(ns, shp);
>  	else
>  		shm_unlock(shp);
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 20a4064..816c610 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -991,6 +991,7 @@ NORET_TYPE void do_exit(long code)
>  	trace_sched_process_exit(tsk);
>  
>  	exit_sem(tsk);
> +	exit_shm(tsk);
>  	exit_files(tsk);
>  	exit_fs(tsk);
>  	check_stack_usage();
> --

Powered by blists - more mailing lists

Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.