In response (albeit a week plus late) to the recent hubbub about the cpu
affinity patches, I'd like to throw a third contender in the ring.
Attached is a patch (against 2.4.16) which implements a /proc and a prctl()
interface to the cpus_allowed flag. The truly exciting (at least for me) part
of this patch is the launch_policy flag that it also introduces. The
launch_policy flag is used similarly to the cpus_allowed flag, but it controls
the cpus_allowed flags of any subsequent children of the process, instead of
the cpus_allowed of the process itself. Via this flag, there are no worries
about processes being able to fork children before a 'chaff' or 'echo' or
anything else for that matter can be executed. The child process is assigned
the desired cpus_allowed at fork/exec time. All this without having to bounce
the current process to different cpus to (hopefully) achieve the same results.
The launch_policy flag can actually be quite powerful. It allows for children
to be instantiated on the correct cpu/node with a minimum of memory footprint
on the wrong cpu/node. This can be taken advantage of via the /proc interface
(for smp/numa unaware programs) or through prctl() for more clueful programs.
You must have CAP_SYS_NICE or be the owner of the process to change *either*
cpus_allowed or launch_policy.
I will momentarily be posting this patch in its own thread for greater
exposure.
Feedback of any kind will be greatly appreciated!
Enjoy!
-matt
Linux maillist account wrote:
>
> At 11:49 PM 11/26/01 -0500, Robert Love wrote:
> >I can see the use for this, but you can also just do `echo whatever >
> >/proc/123/affinity' once it is running ... not a big deal.
>
> It isn't quite the same... the biggest difference is races. The cpuselect(1)
> tool would change the affinity mask before the fork & exec of the first
> child. To
> do this by hand via an `echo whatever >/proc/123/affinity' would miss all the
> children spun off by 123 before the echo could be executed. One could write
> cpuselect as a shell script I suppose, using within it an echo on
> /proc/self/affinity,
> though even as a shell script it would be better to have this tool be part
> of the standard
> Linux repertoire that everyone could depend upon as being there in all Linux
> distributions
> and having a well known and unchanging syntax and semantics, rather than
> have it
> remain something that each user creates ad-hoc as the need for the tool arises.
>
> Joe
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--------------4BDFD612D7DB15AA0D5BF73A
Content-Type: text/plain; charset=us-ascii;
name="launch_policy-2.4.16.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
filename="launch_policy-2.4.16.patch"
diff -Nur linux-2.4.10/fs/proc/array.c linux-2.4.10-launch_policy/fs/proc/array.c
--- linux-2.4.10/fs/proc/array.c Fri Oct 26 15:07:16 2001
+++ linux-2.4.10-launch_policy/fs/proc/array.c Wed Nov 28 13:59:58 2001
@@ -50,6 +50,10 @@
* Al Viro & Jeff Garzik : moved most of the thing into base.c and
* : proc_misc.c. The rest may eventually go into
* : base.c too.
+ *
+ * Andrew Morton : cpus_allowed
+ *
+ * Matthew Dobson : launch_policy (Thanks to Andrew Morton for inspiration)
*/
#include <linux/config.h>
@@ -344,7 +348,7 @@
read_unlock(&tasklist_lock);
res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
%lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %lu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d\n",
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n",
task->pid,
task->comm,
state,
@@ -387,7 +391,9 @@
task->nswap,
task->cnswap,
task->exit_signal,
- task->processor);
+ task->processor,
+ task->cpus_allowed,
+ task->launch_policy);
if(mm)
mmput(mm);
return res;
@@ -692,5 +698,59 @@
task->per_cpu_stime[cpu_logical_map(i)]);
return len;
+}
+/* Format *bitmask into buffer as hex, zero-padded to at least 8 digits, followed by a newline. */
+static inline int proc_pid_cpu_bitmask_read(char * buffer, unsigned long *bitmask)
+{
+ int len;
+ /* sprintf's return value (characters written) is what the caller gets back. */
+ len = sprintf(buffer, "%08lx\n", *bitmask);
+ return len;
+}
+/* Parse a hex CPU mask from the NUL-terminated buffer and store it in *bitmask; returns nbytes or -errno. */
+static inline int proc_pid_cpu_bitmask_write(char * buffer, unsigned long *bitmask,
+ size_t nbytes, struct task_struct *task) /* nbytes: bytes supplied; task: target of the permission check */
+{
+ unsigned long new_mask;
+ char *endp;
+ int ret;
+ unsigned long flags;
+ /* Writer's euid must equal the target's euid or uid, unless it holds CAP_SYS_NICE. */
+ ret = -EPERM;
+ if ((current->euid != task->euid) && (current->euid != task->uid) &&
+ (!capable(CAP_SYS_NICE)))
+ goto out;
+ /* Accept a hex number optionally followed by a newline; success consumes all nbytes so echo's "\n" is not re-submitted. */
+ new_mask = simple_strtoul(buffer, &endp, 16);
+ ret = (endp == buffer || (*endp != '\0' && *endp != '\n')) ? -EINVAL : (int)nbytes;
+ /* The mask must also include at least one online CPU, otherwise -EINVAL and *bitmask is left untouched. */
+ spin_lock_irqsave(&runqueue_lock, flags); /* token effort to not be racy */
+ if (ret < 0 || !(cpu_online_map & new_mask))
+ ret = -EINVAL;
+ else
+ *bitmask = new_mask;
+ spin_unlock_irqrestore(&runqueue_lock, flags);
+out:
+ return ret;
+}
+/* proc_read handler for the per-pid "cpus_allowed" entry: prints task->cpus_allowed as hex text. */
+int proc_pid_cpus_allowed_read(struct task_struct *task, char * buffer)
+{
+ return proc_pid_cpu_bitmask_read(buffer, &task->cpus_allowed);
+}
+/* proc_write handler for the per-pid "cpus_allowed" entry: parses buffer and updates task->cpus_allowed. */
+int proc_pid_cpus_allowed_write(struct task_struct *task, char * buffer, size_t nbytes)
+{
+ return proc_pid_cpu_bitmask_write(buffer, &task->cpus_allowed, nbytes, task);
+}
+/* proc_read handler for the per-pid "launch_policy" entry: prints task->launch_policy as hex text. */
+int proc_pid_launch_policy_read(struct task_struct *task, char * buffer)
+{
+ return proc_pid_cpu_bitmask_read(buffer, &task->launch_policy);
+}
+/* proc_write handler for the per-pid "launch_policy" entry: parses buffer and updates task->launch_policy. */
+int proc_pid_launch_policy_write(struct task_struct *task, char * buffer, size_t nbytes)
+{
+ return proc_pid_cpu_bitmask_write(buffer, &task->launch_policy, nbytes, task);
}
#endif
diff -Nur linux-2.4.10/fs/proc/base.c linux-2.4.10-launch_policy/fs/proc/base.c
--- linux-2.4.10/fs/proc/base.c Fri Oct 26 15:07:16 2001
+++ linux-2.4.10-launch_policy/fs/proc/base.c Wed Nov 28 14:00:20 2001
@@ -39,6 +39,10 @@
int proc_pid_status(struct task_struct*,char*);
int proc_pid_statm(struct task_struct*,char*);
int proc_pid_cpu(struct task_struct*,char*);
+int proc_pid_cpus_allowed_read(struct task_struct*, char*);
+int proc_pid_cpus_allowed_write(struct task_struct*, char*, size_t);
+int proc_pid_launch_policy_read(struct task_struct*, char*);
+int proc_pid_launch_policy_write(struct task_struct*, char*, size_t);
static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
{
@@ -282,8 +286,44 @@
return count;
}
+static ssize_t proc_info_write(struct file * file, const char * buf,
+ size_t count, loff_t *ppos)
+{
+ struct inode * inode = file->f_dentry->d_inode;
+ unsigned long page;
+ ssize_t ret;
+ struct task_struct *task = inode->u.proc_i.task;
+ /* Write path for per-pid info files: copy the user data into a scratch page and hand it to the entry's proc_write. */
+ ret = -EINVAL;
+ if (inode->u.proc_i.op.proc_write == NULL)
+ goto out;
+ if (count > PAGE_SIZE - 1)
+ goto out;
+ /* A single page holds the data plus the terminating NUL, hence the PAGE_SIZE - 1 limit above. */
+ ret = -ENOMEM;
+ if (!(page = __get_free_page(GFP_KERNEL)))
+ goto out;
+
+ ret = -EFAULT;
+ if (copy_from_user((char *)page, buf, count))
+ goto out_free_page;
+ /* NUL-terminate so the handler can parse the page as a C string. */
+ ((char *)page)[count] = '\0';
+ ret = inode->u.proc_i.op.proc_write(task, (char*)page, count);
+ if (ret < 0)
+ goto out_free_page;
+ /* A non-negative handler result is added to the file position and returned to the caller. */
+ *ppos += ret;
+
+out_free_page:
+ free_page(page);
+out:
+ return ret;
+}
+
static struct file_operations proc_info_file_operations = {
read: proc_info_read,
+ write: proc_info_write,
};
#define MAY_PTRACE(p) \
@@ -497,25 +537,29 @@
PROC_PID_STATM,
PROC_PID_MAPS,
PROC_PID_CPU,
+ PROC_PID_CPUS_ALLOWED,
+ PROC_PID_LAUNCH_POLICY,
PROC_PID_FD_DIR = 0x8000, /* 0x8000-0xffff */
};
#define E(type,name,mode) {(type),sizeof(name)-1,(name),(mode)}
static struct pid_entry base_stuff[] = {
- E(PROC_PID_FD, "fd", S_IFDIR|S_IRUSR|S_IXUSR),
- E(PROC_PID_ENVIRON, "environ", S_IFREG|S_IRUSR),
- E(PROC_PID_STATUS, "status", S_IFREG|S_IRUGO),
- E(PROC_PID_CMDLINE, "cmdline", S_IFREG|S_IRUGO),
- E(PROC_PID_STAT, "stat", S_IFREG|S_IRUGO),
- E(PROC_PID_STATM, "statm", S_IFREG|S_IRUGO),
+ E(PROC_PID_FD, "fd", S_IFDIR|S_IRUSR|S_IXUSR),
+ E(PROC_PID_ENVIRON, "environ", S_IFREG|S_IRUSR),
+ E(PROC_PID_STATUS, "status", S_IFREG|S_IRUGO),
+ E(PROC_PID_CMDLINE, "cmdline", S_IFREG|S_IRUGO),
+ E(PROC_PID_STAT, "stat", S_IFREG|S_IRUGO),
+ E(PROC_PID_STATM, "statm", S_IFREG|S_IRUGO),
#ifdef CONFIG_SMP
- E(PROC_PID_CPU, "cpu", S_IFREG|S_IRUGO),
+ E(PROC_PID_CPU, "cpu", S_IFREG|S_IRUGO),
+ E(PROC_PID_CPUS_ALLOWED, "cpus_allowed", S_IFREG|S_IRUGO|S_IWUSR),
+ E(PROC_PID_LAUNCH_POLICY, "launch_policy",S_IFREG|S_IRUGO|S_IWUSR),
#endif
- E(PROC_PID_MAPS, "maps", S_IFREG|S_IRUGO),
- E(PROC_PID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR),
- E(PROC_PID_CWD, "cwd", S_IFLNK|S_IRWXUGO),
- E(PROC_PID_ROOT, "root", S_IFLNK|S_IRWXUGO),
- E(PROC_PID_EXE, "exe", S_IFLNK|S_IRWXUGO),
+ E(PROC_PID_MAPS, "maps", S_IFREG|S_IRUGO),
+ E(PROC_PID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR),
+ E(PROC_PID_CWD, "cwd", S_IFLNK|S_IRWXUGO),
+ E(PROC_PID_ROOT, "root", S_IFLNK|S_IRWXUGO),
+ E(PROC_PID_EXE, "exe", S_IFLNK|S_IRWXUGO),
{0,0,NULL,0}
};
#undef E
@@ -869,6 +913,16 @@
case PROC_PID_CPU:
inode->i_fop = &proc_info_file_operations;
inode->u.proc_i.op.proc_read = proc_pid_cpu;
+ break;
+ case PROC_PID_CPUS_ALLOWED:
+ inode->i_fop = &proc_info_file_operations;
+ inode->u.proc_i.op.proc_read = proc_pid_cpus_allowed_read;
+ inode->u.proc_i.op.proc_write = proc_pid_cpus_allowed_write;
+ break;
+ case PROC_PID_LAUNCH_POLICY:
+ inode->i_fop = &proc_info_file_operations;
+ inode->u.proc_i.op.proc_read = proc_pid_launch_policy_read;
+ inode->u.proc_i.op.proc_write = proc_pid_launch_policy_write;
break;
#endif
case PROC_PID_MEM:
diff -Nur linux-2.4.10/include/linux/capability.h linux-2.4.10-launch_policy/include/linux/capability.h
--- linux-2.4.10/include/linux/capability.h Mon Nov 19 22:57:29 2001
+++ linux-2.4.10-launch_policy/include/linux/capability.h Wed Nov 28 13:49:59 2001
@@ -243,6 +243,8 @@
/* Allow use of FIFO and round-robin (realtime) scheduling on own
processes and setting the scheduling algorithm used by another
process. */
+/* Allow binding of tasks to CPUs */
+/* Allow setting of launch policies */
#define CAP_SYS_NICE 23
diff -Nur linux-2.4.10/include/linux/prctl.h linux-2.4.10-launch_policy/include/linux/prctl.h
--- linux-2.4.10/include/linux/prctl.h Thu Jul 19 20:39:57 2001
+++ linux-2.4.10-launch_policy/include/linux/prctl.h Mon Nov 19 15:24:10 2001
@@ -20,4 +20,12 @@
#define PR_GET_KEEPCAPS 7
#define PR_SET_KEEPCAPS 8
+/* Get/set cpus allowed */
+#define PR_GET_CPUS_ALLOWED 13
+#define PR_SET_CPUS_ALLOWED 14
+
+/* Get/set launch policies */
+#define PR_GET_LAUNCH_POLICY 15
+#define PR_SET_LAUNCH_POLICY 16
+
#endif /* _LINUX_PRCTL_H */
diff -Nur linux-2.4.10/include/linux/proc_fs_i.h linux-2.4.10-launch_policy/include/linux/proc_fs_i.h
--- linux-2.4.10/include/linux/proc_fs_i.h Fri Oct 26 15:07:16 2001
+++ linux-2.4.10-launch_policy/include/linux/proc_fs_i.h Wed Oct 17 14:18:53 2001
@@ -1,9 +1,10 @@
struct proc_inode_info {
struct task_struct *task;
int type;
- union {
+ struct {
int (*proc_get_link)(struct inode *, struct dentry **, struct vfsmount **);
int (*proc_read)(struct task_struct *task, char *page);
+ int (*proc_write)(struct task_struct *task, char *page, size_t nbytes);
} op;
struct file *file;
};
diff -Nur linux-2.4.10/include/linux/sched.h linux-2.4.10-launch_policy/include/linux/sched.h
--- linux-2.4.10/include/linux/sched.h Mon Nov 19 22:57:29 2001
+++ linux-2.4.10-launch_policy/include/linux/sched.h Mon Nov 19 15:27:40 2001
@@ -352,6 +352,7 @@
struct task_struct *pidhash_next;
struct task_struct **pidhash_pprev;
+ unsigned long launch_policy; /* for *fork*() & exec() */
wait_queue_head_t wait_chldexit; /* for wait4() */
struct completion *vfork_done; /* for vfork() */
unsigned long rt_priority;
@@ -480,6 +481,7 @@
p_opptr: &tsk, \
p_pptr: &tsk, \
thread_group: LIST_HEAD_INIT(tsk.thread_group), \
+ launch_policy: -1, \
wait_chldexit: __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\
real_timer: { \
function: it_real_fn \
diff -Nur linux-2.4.10/kernel/fork.c linux-2.4.10-launch_policy/kernel/fork.c
--- linux-2.4.10/kernel/fork.c Mon Sep 17 21:46:04 2001
+++ linux-2.4.10-launch_policy/kernel/fork.c Wed Oct 24 15:55:55 2001
@@ -646,6 +646,7 @@
spin_lock_init(&p->sigmask_lock);
}
#endif
+ p->cpus_allowed = p->launch_policy; /* launch_policy is inherited from parent */
p->lock_depth = -1; /* -1 = no lock */
p->start_time = jiffies;
diff -Nur linux-2.4.10/kernel/sys.c linux-2.4.10-launch_policy/kernel/sys.c
--- linux-2.4.10/kernel/sys.c Tue Sep 18 14:10:43 2001
+++ linux-2.4.10-launch_policy/kernel/sys.c Wed Nov 28 14:13:20 2001
@@ -1256,6 +1256,27 @@
}
current->keep_capabilities = arg2;
break;
+ case PR_GET_CPUS_ALLOWED:
+ error = put_user(current->cpus_allowed, (long *)arg2);
+ break;
+ case PR_SET_CPUS_ALLOWED:
+ if (!(cpu_online_map & arg2))
+ error = -EINVAL;
+ else {
+ current->cpus_allowed = arg2;
+ if (!((1 << smp_processor_id()) & arg2))
+ current->need_resched = 1;
+ }
+ break;
+ case PR_GET_LAUNCH_POLICY:
+ error = put_user(current->launch_policy, (long *)arg2);
+ break;
+ case PR_SET_LAUNCH_POLICY:
+ if (!(cpu_online_map & arg2))
+ error = -EINVAL;
+ else
+ current->launch_policy = arg2;
+ break;
default:
error = -EINVAL;
break;
--------------4BDFD612D7DB15AA0D5BF73A--
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/