[PATCH] wait4() WIFSTOPPED starvation fix #1/2

David Howells (dhowells@redhat.com)
Fri, 15 Mar 2002 21:09:20 +0000


Hi Linus,

There appears to be a starvation bug in sys_wait4().

The case runs like this:

(1) If a process has a lot of threads, and a debugger like strace is running
over all the threads, then the threads will be sat in the stopped more
often than they're running.

(2) the children being traced are "reparented" and added to strace's children
list.

(4) strace uses wait4() to find an _unspecified_ stopped (T) or zombie (Z)
state process for it to deal with.

(5) sys_wait4() always walks the list of children in the same order
(effectively fork/clone order).

(6) children at the end of the list starve for attention because sys_wait4()
always takes the first process on the list that needs attention, and the
list isn't ever rearranged.

This patch (#1) just converts the task_struct to use struct list_head rather
than direct pointers for maintaining the children list.

David

diff -uNr linux-2.5.6-pre1/arch/i386/kernel/signal.c linux-wait-256p1/arch/i386/kernel/signal.c
--- linux-2.5.6-pre1/arch/i386/kernel/signal.c Wed Feb 27 08:27:56 2002
+++ linux-wait-256p1/arch/i386/kernel/signal.c Fri Mar 15 19:10:03 2002
@@ -630,8 +630,8 @@
info.si_signo = signr;
info.si_errno = 0;
info.si_code = SI_USER;
- info.si_pid = current->p_pptr->pid;
- info.si_uid = current->p_pptr->uid;
+ info.si_pid = current->parent->pid;
+ info.si_uid = current->parent->uid;
}

/* If the (new) signal is now blocked, requeue it. */
@@ -670,7 +670,7 @@
case SIGSTOP: {
struct signal_struct *sig;
current->exit_code = signr;
- sig = current->p_pptr->sig;
+ sig = current->parent->sig;
preempt_disable();
current->state = TASK_STOPPED;
if (sig && !(sig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP))
diff -uNr linux-2.5.6-pre1/fs/binfmt_elf.c linux-wait-256p1/fs/binfmt_elf.c
--- linux-2.5.6-pre1/fs/binfmt_elf.c Wed Feb 27 08:26:56 2002
+++ linux-wait-256p1/fs/binfmt_elf.c Fri Mar 15 19:11:37 2002
@@ -1094,7 +1094,7 @@
prstatus.pr_sigpend = current->pending.signal.sig[0];
prstatus.pr_sighold = current->blocked.sig[0];
psinfo.pr_pid = prstatus.pr_pid = current->pid;
- psinfo.pr_ppid = prstatus.pr_ppid = current->p_pptr->pid;
+ psinfo.pr_ppid = prstatus.pr_ppid = current->parent->pid;
psinfo.pr_pgrp = prstatus.pr_pgrp = current->pgrp;
psinfo.pr_sid = prstatus.pr_sid = current->session;
prstatus.pr_utime.tv_sec = CT_TO_SECS(current->times.tms_utime);
diff -uNr linux-2.5.6-pre1/fs/proc/array.c linux-wait-256p1/fs/proc/array.c
--- linux-2.5.6-pre1/fs/proc/array.c Wed Feb 27 08:26:55 2002
+++ linux-wait-256p1/fs/proc/array.c Fri Mar 15 19:09:08 2002
@@ -159,7 +159,7 @@
"Uid:\t%d\t%d\t%d\t%d\n"
"Gid:\t%d\t%d\t%d\t%d\n",
get_task_state(p), p->tgid,
- p->pid, p->pid ? p->p_opptr->pid : 0, 0,
+ p->pid, p->pid ? p->real_parent->pid : 0, 0,
p->uid, p->euid, p->suid, p->fsuid,
p->gid, p->egid, p->sgid, p->fsgid);
read_unlock(&tasklist_lock);
@@ -340,7 +340,7 @@
nice = task_nice(task);

read_lock(&tasklist_lock);
- ppid = task->pid ? task->p_opptr->pid : 0;
+ ppid = task->pid ? task->real_parent->pid : 0;
read_unlock(&tasklist_lock);
res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
%lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %lu %lu %ld %lu %lu %lu %lu %lu \
diff -uNr linux-2.5.6-pre1/fs/proc/base.c linux-wait-256p1/fs/proc/base.c
--- linux-2.5.6-pre1/fs/proc/base.c Wed Feb 27 08:26:55 2002
+++ linux-wait-256p1/fs/proc/base.c Fri Mar 15 19:09:19 2002
@@ -336,7 +336,7 @@
};

#define MAY_PTRACE(p) \
-(p==current||(p->p_pptr==current&&(p->ptrace & PT_PTRACED)&&p->state==TASK_STOPPED))
+(p==current||(p->parent==current&&(p->ptrace & PT_PTRACED)&&p->state==TASK_STOPPED))


static int mem_open(struct inode* inode, struct file* file)
diff -uNr linux-2.5.6-pre1/include/linux/init_task.h linux-wait-256p1/include/linux/init_task.h
--- linux-2.5.6-pre1/include/linux/init_task.h Wed Feb 27 08:27:00 2002
+++ linux-wait-256p1/include/linux/init_task.h Fri Mar 15 18:58:27 2002
@@ -55,8 +55,10 @@
time_slice: HZ, \
next_task: &tsk, \
prev_task: &tsk, \
- p_opptr: &tsk, \
- p_pptr: &tsk, \
+ real_parent: &tsk, \
+ parent: &tsk, \
+ children: LIST_HEAD_INIT(tsk.children), \
+ sibling: LIST_HEAD_INIT(tsk.sibling), \
thread_group: LIST_HEAD_INIT(tsk.thread_group), \
wait_chldexit: __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\
real_timer: { \
diff -uNr linux-2.5.6-pre1/include/linux/sched.h linux-wait-256p1/include/linux/sched.h
--- linux-2.5.6-pre1/include/linux/sched.h Wed Feb 27 08:29:20 2002
+++ linux-wait-256p1/include/linux/sched.h Fri Mar 15 18:52:29 2002
@@ -274,9 +274,12 @@
/*
* pointers to (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
- * p->p_pptr->pid)
+ * p->parent->pid)
*/
- struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
+ struct task_struct *real_parent; /* real parent process (when being debugged) */
+ struct task_struct *parent; /* parent process */
+ struct list_head children; /* list of my children */
+ struct list_head sibling; /* linkage in my parent's children list */
struct list_head thread_group;

/* PID hash table linkage. */
@@ -715,28 +718,44 @@
__ret; \
})

-#define REMOVE_LINKS(p) do { \
- (p)->next_task->prev_task = (p)->prev_task; \
- (p)->prev_task->next_task = (p)->next_task; \
- if ((p)->p_osptr) \
- (p)->p_osptr->p_ysptr = (p)->p_ysptr; \
- if ((p)->p_ysptr) \
- (p)->p_ysptr->p_osptr = (p)->p_osptr; \
- else \
- (p)->p_pptr->p_cptr = (p)->p_osptr; \
+#define REMOVE_LINKS(p) do { \
+ (p)->next_task->prev_task = (p)->prev_task; \
+ (p)->prev_task->next_task = (p)->next_task; \
+ list_del_init(&(p)->sibling); \
} while (0)

-#define SET_LINKS(p) do { \
- (p)->next_task = &init_task; \
- (p)->prev_task = init_task.prev_task; \
- init_task.prev_task->next_task = (p); \
- init_task.prev_task = (p); \
- (p)->p_ysptr = NULL; \
- if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \
- (p)->p_osptr->p_ysptr = p; \
- (p)->p_pptr->p_cptr = p; \
+#define SET_LINKS(p) do { \
+ (p)->next_task = &init_task; \
+ (p)->prev_task = init_task.prev_task; \
+ init_task.prev_task->next_task = (p); \
+ init_task.prev_task = (p); \
+ list_add_tail(&(p)->sibling,&(p)->parent->children); \
} while (0)

+static inline struct task_struct *eldest_child(struct task_struct *p)
+{
+ if (list_empty(&p->children)) return NULL;
+ return list_entry(p->children.next,struct task_struct,sibling);
+}
+
+static inline struct task_struct *youngest_child(struct task_struct *p)
+{
+ if (list_empty(&p->children)) return NULL;
+ return list_entry(p->children.prev,struct task_struct,sibling);
+}
+
+static inline struct task_struct *older_sibling(struct task_struct *p)
+{
+ if (p->sibling.prev==&p->parent->children) return NULL;
+ return list_entry(p->sibling.prev,struct task_struct,sibling);
+}
+
+static inline struct task_struct *younger_sibling(struct task_struct *p)
+{
+ if (p->sibling.next==&p->parent->children) return NULL;
+ return list_entry(p->sibling.next,struct task_struct,sibling);
+}
+
#define for_each_task(p) \
for (p = &init_task ; (p = p->next_task) != &init_task ; )

diff -uNr linux-2.5.6-pre1/kernel/exit.c linux-wait-256p1/kernel/exit.c
--- linux-2.5.6-pre1/kernel/exit.c Wed Feb 27 08:26:59 2002
+++ linux-wait-256p1/kernel/exit.c Fri Mar 15 19:05:47 2002
@@ -91,10 +91,10 @@
for_each_task(p) {
if ((p == ignored_task) || (p->pgrp != pgrp) ||
(p->state == TASK_ZOMBIE) ||
- (p->p_pptr->pid == 1))
+ (p->parent->pid == 1))
continue;
- if ((p->p_pptr->pgrp != pgrp) &&
- (p->p_pptr->session == p->session)) {
+ if ((p->parent->pgrp != pgrp) &&
+ (p->parent->session == p->session)) {
read_unlock(&tasklist_lock);
return 0;
}
@@ -144,8 +144,8 @@

/* Reparent to init */
REMOVE_LINKS(current);
- current->p_pptr = child_reaper;
- current->p_opptr = child_reaper;
+ current->parent = child_reaper;
+ current->real_parent = child_reaper;
SET_LINKS(current);

/* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -217,16 +217,16 @@
reaper = child_reaper;

for_each_task(p) {
- if (p->p_opptr == father) {
+ if (p->real_parent == father) {
/* We dont want people slaying init */
p->exit_signal = SIGCHLD;
p->self_exec_id++;

/* Make sure we're not reparenting to ourselves */
if (p == reaper)
- p->p_opptr = child_reaper;
+ p->real_parent = child_reaper;
else
- p->p_opptr = reaper;
+ p->real_parent = reaper;

if (p->pdeath_signal) send_sig(p->pdeath_signal, p, 0);
}
@@ -400,7 +400,7 @@
* is about to become orphaned.
*/

- t = current->p_pptr;
+ t = current->parent;

if ((t->pgrp != current->pgrp) &&
(t->session == current->session) &&
@@ -445,17 +445,12 @@
write_lock_irq(&tasklist_lock);
current->state = TASK_ZOMBIE;
do_notify_parent(current, current->exit_signal);
- while (current->p_cptr != NULL) {
- p = current->p_cptr;
- current->p_cptr = p->p_osptr;
- p->p_ysptr = NULL;
+ while ((p = eldest_child(current))) {
+ list_del_init(&p->sibling);
p->ptrace = 0;

- p->p_pptr = p->p_opptr;
- p->p_osptr = p->p_pptr->p_cptr;
- if (p->p_osptr)
- p->p_osptr->p_ysptr = p;
- p->p_pptr->p_cptr = p;
+ p->parent = p->real_parent;
+ list_add_tail(&p->sibling,&p->parent->children);
if (p->state == TASK_ZOMBIE)
do_notify_parent(p, p->exit_signal);
/*
@@ -568,7 +563,9 @@
tsk = current;
do {
struct task_struct *p;
- for (p = tsk->p_cptr ; p ; p = p->p_osptr) {
+ struct list_head *_p;
+ list_for_each(_p,&tsk->children) {
+ p = list_entry(_p,struct task_struct,sibling);
if (pid>0) {
if (p->pid != pid)
continue;
@@ -613,10 +610,10 @@
if (retval)
goto end_wait4;
retval = p->pid;
- if (p->p_opptr != p->p_pptr) {
+ if (p->real_parent != p->parent) {
write_lock_irq(&tasklist_lock);
REMOVE_LINKS(p);
- p->p_pptr = p->p_opptr;
+ p->parent = p->real_parent;
SET_LINKS(p);
do_notify_parent(p, SIGCHLD);
write_unlock_irq(&tasklist_lock);
diff -uNr linux-2.5.6-pre1/kernel/fork.c linux-wait-256p1/kernel/fork.c
--- linux-2.5.6-pre1/kernel/fork.c Wed Feb 27 08:29:20 2002
+++ linux-wait-256p1/kernel/fork.c Fri Mar 15 18:53:10 2002
@@ -666,7 +666,8 @@

INIT_LIST_HEAD(&p->run_list);

- p->p_cptr = NULL;
+ INIT_LIST_HEAD(&p->children);
+ INIT_LIST_HEAD(&p->sibling);
init_waitqueue_head(&p->wait_chldexit);
p->vfork_done = NULL;
if (clone_flags & CLONE_VFORK) {
@@ -766,12 +767,12 @@
write_lock_irq(&tasklist_lock);

/* CLONE_PARENT re-uses the old parent */
- p->p_opptr = current->p_opptr;
- p->p_pptr = current->p_pptr;
+ p->real_parent = current->real_parent;
+ p->parent = current->parent;
if (!(clone_flags & CLONE_PARENT)) {
- p->p_opptr = current;
+ p->real_parent = current;
if (!(p->ptrace & PT_PTRACED))
- p->p_pptr = current;
+ p->parent = current;
}

if (clone_flags & CLONE_THREAD) {
diff -uNr linux-2.5.6-pre1/kernel/ptrace.c linux-wait-256p1/kernel/ptrace.c
--- linux-2.5.6-pre1/kernel/ptrace.c Wed Feb 27 08:26:59 2002
+++ linux-wait-256p1/kernel/ptrace.c Fri Mar 15 19:06:07 2002
@@ -24,7 +24,7 @@
if (!(child->ptrace & PT_PTRACED))
return -ESRCH;

- if (child->p_pptr != current)
+ if (child->parent != current)
return -ESRCH;

if (!kill) {
@@ -70,9 +70,9 @@
task_unlock(task);

write_lock_irq(&tasklist_lock);
- if (task->p_pptr != current) {
+ if (task->parent != current) {
REMOVE_LINKS(task);
- task->p_pptr = current;
+ task->parent = current;
SET_LINKS(task);
}
write_unlock_irq(&tasklist_lock);
@@ -98,7 +98,7 @@
child->exit_code = data;
write_lock_irq(&tasklist_lock);
REMOVE_LINKS(child);
- child->p_pptr = child->p_opptr;
+ child->parent = child->real_parent;
SET_LINKS(child);
write_unlock_irq(&tasklist_lock);

diff -uNr linux-2.5.6-pre1/kernel/sched.c linux-wait-256p1/kernel/sched.c
--- linux-2.5.6-pre1/kernel/sched.c Wed Feb 27 08:29:20 2002
+++ linux-wait-256p1/kernel/sched.c Fri Mar 15 18:40:27 2002
@@ -1311,6 +1311,7 @@
static void show_task(task_t * p)
{
unsigned long free = 0;
+ task_t *relative;
int state;
static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };

@@ -1337,17 +1338,17 @@
n++;
free = (unsigned long) n - (unsigned long)(p+1);
}
- printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid);
- if (p->p_cptr)
- printk("%5d ", p->p_cptr->pid);
+ printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
+ if ((relative = eldest_child(p)))
+ printk("%5d ", relative->pid);
else
printk(" ");
- if (p->p_ysptr)
- printk("%7d", p->p_ysptr->pid);
+ if ((relative = younger_sibling(p)))
+ printk("%7d", relative->pid);
else
printk(" ");
- if (p->p_osptr)
- printk(" %5d", p->p_osptr->pid);
+ if ((relative = older_sibling(p)))
+ printk(" %5d", relative->pid);
else
printk(" ");
if (!p->mm)
diff -uNr linux-2.5.6-pre1/kernel/signal.c linux-wait-256p1/kernel/signal.c
--- linux-2.5.6-pre1/kernel/signal.c Wed Feb 27 08:26:59 2002
+++ linux-wait-256p1/kernel/signal.c Fri Mar 15 19:06:50 2002
@@ -804,8 +804,8 @@
info.si_code = why;
info.si_status = status;

- send_sig_info(sig, &info, tsk->p_pptr);
- wake_up_parent(tsk->p_pptr);
+ send_sig_info(sig, &info, tsk->parent);
+ wake_up_parent(tsk->parent);
}


diff -uNr linux-2.5.6-pre1/kernel/sys.c linux-wait-256p1/kernel/sys.c
--- linux-2.5.6-pre1/kernel/sys.c Wed Feb 27 08:26:59 2002
+++ linux-wait-256p1/kernel/sys.c Fri Mar 15 19:07:03 2002
@@ -839,7 +839,7 @@
if (!p)
goto out;

- if (p->p_pptr == current || p->p_opptr == current) {
+ if (p->parent == current || p->real_parent == current) {
err = -EPERM;
if (p->session != current->session)
goto out;
diff -uNr linux-2.5.6-pre1/kernel/timer.c linux-wait-256p1/kernel/timer.c
--- linux-2.5.6-pre1/kernel/timer.c Wed Feb 27 08:26:59 2002
+++ linux-wait-256p1/kernel/timer.c Fri Mar 15 19:06:19 2002
@@ -742,14 +742,14 @@
struct task_struct * me = current;
struct task_struct * parent;

- parent = me->p_opptr;
+ parent = me->real_parent;
for (;;) {
pid = parent->pid;
#if CONFIG_SMP
{
struct task_struct *old = parent;
mb();
- parent = me->p_opptr;
+ parent = me->real_parent;
if (old != parent)
continue;
}
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/