My apologies: the previously included patch was created on top of
Ingo's J9 scheduler patch. The patch below is built on K3.
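In short: each runqueue is pointed at a cpu_set; set boundaries are currently
hard-coded placeholders in numa_sched_init(), pending real NUMA topology
discovery. load_balance() only pulls work from CPUs whose bits are in the
set's cs_local_cpus mask and keeps a per-set load average; balance_cpu_sets()
temporarily repoints the current runqueue at a busier set so load_balance()
can steal from it (when going idle, and optionally from the idle loop or the
busy tick). sched_exec_migrate() is called from do_execve() and moves the
exec'ing task to the least loaded set, since the old image is thrown away
anyway.

To illustrate the cpu-to-set mapping and the cs_local_cpus masks that the
placeholder topology produces, here is a user-space sketch. It is not part
of the patch; the 8-CPU machine is just an assumed example, and the
4-CPUs-per-set value is the hard-coded default from numa_sched_init():

#include <stdio.h>

/* Placeholder value hard-coded in numa_sched_init() in the patch below */
#define NUMA_CPUS_PER_LOCAL_SET 4

/* Same mapping as the cpu_to_set_id() macro in the patch */
static int cpu_to_set_id(int cpu)
{
        return cpu / NUMA_CPUS_PER_LOCAL_SET;
}

int main(void)
{
        /* Assumed example machine: 8 CPUs, i.e. two sets of four */
        int i, ncpus = 8;
        unsigned long local_mask = (1UL << NUMA_CPUS_PER_LOCAL_SET) - 1;

        for (i = 0; i < ncpus; i++) {
                int set = cpu_to_set_id(i);
                printf("cpu %d -> set %d, cs_local_cpus = 0x%02lx\n",
                       i, set, local_mask << (set * NUMA_CPUS_PER_LOCAL_SET));
        }
        return 0;
}

This prints cpus 0-3 in set 0 (mask 0x0f) and cpus 4-7 in set 1 (mask 0xf0),
which matches what numa_sched_init() installs with its default values.
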
-- Mike

diff -Naur linux-2.4.17-K3/arch/i386/kernel/smpboot.c linux-2.4.17-K3-ns/arch/i386/kernel/smpboot.c
--- linux-2.4.17-K3/arch/i386/kernel/smpboot.c Thu Feb 14 18:42:53 2002
+++ linux-2.4.17-K3-ns/arch/i386/kernel/smpboot.c Thu Feb 14 17:38:51 2002
@@ -1198,6 +1198,11 @@
                         }
                 }
         }
+
+        /*
+         * Hack to get cpu sets initialized on NUMA architectures
+         */
+        numa_sched_init();
 
 #ifndef CONFIG_VISWS
         /*
diff -Naur linux-2.4.17-K3/fs/exec.c linux-2.4.17-K3-ns/fs/exec.c
--- linux-2.4.17-K3/fs/exec.c Fri Dec 21 17:41:55 2001
+++ linux-2.4.17-K3-ns/fs/exec.c Thu Feb 14 17:38:51 2002
@@ -860,6 +860,10 @@
         int retval;
         int i;
 
+#if CONFIG_SMP
+        sched_exec_migrate();
+#endif
+
         file = open_exec(filename);
 
         retval = PTR_ERR(file);
diff -Naur linux-2.4.17-K3/include/linux/sched.h linux-2.4.17-K3-ns/include/linux/sched.h
--- linux-2.4.17-K3/include/linux/sched.h Thu Feb 14 18:46:54 2002
+++ linux-2.4.17-K3-ns/include/linux/sched.h Thu Feb 14 17:52:18 2002
@@ -152,6 +152,7 @@
 extern void sched_task_migrated(task_t *p);
 extern void smp_migrate_task(int cpu, task_t *task);
 extern unsigned long cache_decay_ticks;
+extern void sched_exec_migrate(void);
 
 #define MAX_SCHEDULE_TIMEOUT LONG_MAX
 extern signed long FASTCALL(schedule_timeout(signed long timeout));
diff -Naur linux-2.4.17-K3/kernel/sched.c linux-2.4.17-K3-ns/kernel/sched.c
--- linux-2.4.17-K3/kernel/sched.c Thu Feb 14 18:42:52 2002
+++ linux-2.4.17-K3-ns/kernel/sched.c Thu Feb 14 17:40:47 2002
@@ -22,6 +22,53 @@
 #include <linux/kernel_stat.h>
 
 /*
+ * Definitions that depend on/should be part of NUMA topology discovery
+ */
+/*
+ * Periodically poll for new (off cpu set) work as part of the idle loop
+#define IDLE_LOOP_REBALANCE
+ */
+/*
+ * Look for off cpu set work before going idle
+ */
+#define GOING_IDLE_REBALANCE
+/*
+ * Attempt to balance load between cpu sets (even when busy)
+#define BUSY_REBALANCE
+ */
+/*
+ * Send tasks to least loaded CPU at exec time
+ */
+#define EXEC_BALANCE
+
+#define NR_CPU_SETS NR_CPUS
+
+union cpu_set {
+        struct cpu_set_data {
+                spinlock_t lock;
+                unsigned long local_cpus;
+                unsigned long load_avg;
+        } csd;
+        char __pad [SMP_CACHE_BYTES];
+};
+typedef union cpu_set cpu_set_t;
+#define cs_lock csd.lock
+#define cs_local_cpus csd.local_cpus
+#define cs_load_avg csd.load_avg
+
+static int numa_num_cpu_sets = 0;
+static int numa_cpus_per_local_set = NR_CPUS;
+static int numa_cpu_set_distance = 1;
+static cpu_set_t cpu_sets[NR_CPU_SETS] __cacheline_aligned;
+
+#define cpu_to_set_id(c) ((c) / numa_cpus_per_local_set)
+#define next_cs_id(c) ((c) + 1 == numa_num_cpu_sets ? 0 : (c) + 1)
+#define cpu_to_set(c) (cpu_sets + ((c) / numa_cpus_per_local_set))
+/*
+ * End of NUMA definitions
+ */
+
+/*
  * Priority of a process goes from 0 to 139. The 0-99
  * priority range is allocated to RT tasks, the 100-139
  * range is for SCHED_OTHER tasks. Priority values are
@@ -140,6 +187,7 @@
  */
 struct runqueue {
         spinlock_t lock;
+        cpu_set_t *cpu_set;
         unsigned long nr_running, nr_switches, expired_timestamp;
         task_t *curr, *idle;
         prio_array_t *active, *expired, arrays[2];
@@ -484,11 +532,13 @@
 static void load_balance(runqueue_t *this_rq, int idle)
 {
         int imbalance, nr_running, load, max_load,
-                idx, i, this_cpu = smp_processor_id();
+                idx, i, this_cpu = smp_processor_id(),
+                total_set_load = 0, cpus_in_set = 0;
         task_t *next = this_rq->idle, *tmp;
         runqueue_t *busiest, *rq_src;
         prio_array_t *array;
         list_t *head, *curr;
+        unsigned long target_cpu_set;
 
         /*
          * We search all runqueues to find the most busy one.
@@ -519,7 +569,11 @@
 
         busiest = NULL;
         max_load = 1;
+        target_cpu_set = this_rq->cpu_set->cs_local_cpus;
         for (i = 0; i < smp_num_cpus; i++) {
+                /* Skip CPUs not in the target set */
+                if (!(target_cpu_set & (1UL << cpu_logical_map(i))))
+                        continue;
                 rq_src = cpu_rq(cpu_logical_map(i));
                 if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i]))
                         load = rq_src->nr_running;
@@ -527,6 +581,9 @@
                         load = this_rq->prev_nr_running[i];
                 this_rq->prev_nr_running[i] = rq_src->nr_running;
 
+                total_set_load += load;
+                cpus_in_set++;
+
                 if ((load > max_load) && (rq_src != this_rq)) {
                         busiest = rq_src;
                         max_load = load;
@@ -547,10 +604,20 @@
          * Make sure nothing changed since we checked the
          * runqueue length.
          */
-        if (busiest->nr_running <= this_rq->nr_running + 1)
+        if (busiest->nr_running <= nr_running + 1)
                 goto out_unlock;
 
         /*
+         * Update cpu set load average
+         */
+        total_set_load = total_set_load / cpus_in_set;
+        if (total_set_load != this_rq->cpu_set->cs_load_avg) {
+                spin_lock(&this_rq->cpu_set->cs_lock);
+                this_rq->cpu_set->cs_load_avg = total_set_load;
+                spin_unlock(&this_rq->cpu_set->cs_lock);
+        }
+
+        /*
          * We first consider expired tasks. Those will likely not be
          * executed in the near future, and they are most likely to
          * be cache-cold, thus switching CPUs has the least effect
@@ -623,6 +690,49 @@
                 spin_unlock(&busiest->lock);
 }
 
+#if defined(IDLE_LOOP_REBALANCE) || defined(GOING_IDLE_REBALANCE) || \
+        defined(BUSY_REBALANCE)
+/*
+ * Load balance CPU sets
+ */
+static void balance_cpu_sets(runqueue_t *this_rq, int idle)
+{
+        int i, this_load, max_load;
+        cpu_set_t *this_set, *target_set = NULL;
+
+        this_set = cpu_to_set(smp_processor_id());
+        this_load = this_set->cs_load_avg;
+
+        if (this_load > 1)
+                max_load = this_load;
+        else {
+                if (idle)
+                        max_load = 0;   /* Looking for anything! */
+                else
+                        max_load = 1;
+        }
+
+        for (i = 0; i < numa_num_cpu_sets; i++) {
+                if (cpu_sets[i].cs_load_avg > max_load) {
+                        max_load = cpu_sets[i].cs_load_avg;
+                        target_set = &cpu_sets[i];
+                }
+        }
+
+        if (!target_set || (max_load <= this_load))
+                return;
+
+        /*
+         * We point the current CPU at the target cpu_set. This is safe
+         * because the caller ensures the current CPU's runqueue
+         * is locked.
+         */
+        this_rq()->cpu_set = target_set;
+        load_balance(this_rq, idle);
+        this_rq()->cpu_set = this_set;
+}
+#endif
+
 /*
  * One of the idle_cpu_tick() or the busy_cpu_tick() function will
  * gets called every timer tick, on every CPU. Our balancing action
@@ -634,16 +744,104 @@
  */
 #define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
+#define CS_IDLE_REBALANCE_TICK (IDLE_REBALANCE_TICK * numa_cpu_set_distance)
+#define CS_BUSY_REBALANCE_TICK (BUSY_REBALANCE_TICK * numa_cpu_set_distance)
 
 static inline void idle_tick(void)
 {
-        if (jiffies % IDLE_REBALANCE_TICK)
+        unsigned long now = jiffies;
+
+        if (now % IDLE_REBALANCE_TICK)
                 return;
         spin_lock(&this_rq()->lock);
         load_balance(this_rq(), 1);
+
+#ifdef IDLE_LOOP_REBALANCE
+        /*
+         * If there are no waiting tasks on the local cpu set, then
+         * at least take a look at other sets.
+         */
+        if (!this_rq()->nr_running && !(now % CS_IDLE_REBALANCE_TICK))
+                balance_cpu_sets(this_rq(), 1);
+#endif
+
         spin_unlock(&this_rq()->lock);
 }
 
+/*
+ * Code to be executed as part of the exec path on NUMA architectures.
+ * Since exec throws away the old process image, there is little
+ * advantage to keeping the task on the same cpu set. Therefore, we
+ * consider moving the task to the least loaded cpu set.
+ */
+void sched_exec_migrate(void)
+{
+#ifdef EXEC_BALANCE
+        int this_cpu, this_cs_id, this_load_avg, min_load_avg,
+                i, target_cs_id, target_cpu;
+        unsigned long target_cs_mask,
+                cpus_allowed = current->cpus_allowed;
+        runqueue_t *rq;
+
+        /*
+         * Only consider migration if other tasks are waiting for
+         * this CPU, and the load average for this cpu set is
+         * greater than 1. Also, make sure the task is allowed
+         * to run on another cpu set.
+         */
+        this_cpu = smp_processor_id();
+        if (cpu_rq(this_cpu)->nr_running < 2)
+                return;
+
+        this_cs_id = cpu_to_set_id(this_cpu);
+        this_load_avg = cpu_sets[this_cs_id].cs_load_avg;
+        if (this_load_avg < 1)
+                return;
+
+        if (!(cpus_allowed & ~(cpu_sets[this_cs_id].cs_local_cpus)))
+                return;
+
+        /*
+         * Look for another cpu set with a lower load average.
+         */
+        min_load_avg = this_load_avg;
+        target_cs_id = this_cs_id;
+        for (i = next_cs_id(this_cs_id); i != this_cs_id; i = next_cs_id(i)) {
+                if (!(cpus_allowed & cpu_sets[i].cs_local_cpus))
+                        continue;
+                if (cpu_sets[i].cs_load_avg < min_load_avg) {
+                        min_load_avg = cpu_sets[i].cs_load_avg;
+                        target_cs_id = i;
+                }
+        }
+        if (!(min_load_avg < this_load_avg))
+                return;
+
+        /*
+         * Find shortest runqueue on target cpu set.
+         */
+        min_load_avg = INT_MAX;
+        target_cs_mask = cpu_sets[target_cs_id].cs_local_cpus;
+        target_cpu = this_cpu;
+        for (i = 0; i < smp_num_cpus; i++) {
+                if (!(cpus_allowed & target_cs_mask & (1UL << i)))
+                        continue;
+                rq = cpu_rq(cpu_logical_map(i));
+                if (rq->nr_running < min_load_avg) {
+                        min_load_avg = rq->nr_running;
+                        target_cpu = cpu_logical_map(i);
+                }
+        }
+
+        /*
+         * Migrate task
+         */
+        current->state = TASK_UNINTERRUPTIBLE;
+        smp_migrate_task(target_cpu, current);
+        schedule();
+#endif
+}
+
 #endif
 
 /*
@@ -732,6 +930,11 @@
 #if CONFIG_SMP
         if (!(jiffies % BUSY_REBALANCE_TICK))
                 load_balance(rq, 0);
+
+#ifdef BUSY_REBALANCE
+        if (!(now % CS_BUSY_REBALANCE_TICK))
+                balance_cpu_sets(rq, 0);
+#endif
 #endif
         spin_unlock(&rq->lock);
 }
@@ -772,6 +975,10 @@
         if (unlikely(!rq->nr_running)) {
 #if CONFIG_SMP
                 load_balance(rq, 1);
+#ifdef GOING_IDLE_REBALANCE
+                if (!rq->nr_running)
+                        balance_cpu_sets(rq, 1);
+#endif
                 if (rq->nr_running)
                         goto pick_next_task;
 #endif
@@ -1488,10 +1695,20 @@
         runqueue_t *rq;
         int i, j, k;
 
+        /*
+         * Default is to have a single cpu set containing all CPUs
+         */
+        numa_num_cpu_sets = 0;
+        cpu_sets[0].cs_lock = SPIN_LOCK_UNLOCKED;
+        cpu_sets[0].cs_local_cpus = -1;
+        cpu_sets[0].cs_load_avg = 0;
+
         for (i = 0; i < NR_CPUS; i++) {
                 runqueue_t *rq = cpu_rq(i);
                 prio_array_t *array;
 
+                rq->cpu_set = &cpu_sets[0];
+
                 rq->active = rq->arrays + 0;
                 rq->expired = rq->arrays + 1;
                 spin_lock_init(&rq->lock);
@@ -1528,3 +1745,42 @@
         atomic_inc(&init_mm.mm_count);
         enter_lazy_tlb(&init_mm, current, smp_processor_id());
 }
+
+void numa_sched_init(void)
+{
+        int i, cpu_set_num;
+        unsigned long numa_local_cpu_set_mask;
+
+#define NUMA_CPUS_PER_LOCAL_SET 4
+#define NUMA_CPU_SET_DISTANCE 2
+
+        numa_cpus_per_local_set = NUMA_CPUS_PER_LOCAL_SET;
+        numa_cpu_set_distance = NUMA_CPU_SET_DISTANCE;
+
+        numa_local_cpu_set_mask = 0UL;
+        for (i = 0; i < numa_cpus_per_local_set; i++) {
+                numa_local_cpu_set_mask = numa_local_cpu_set_mask << 1;
+                numa_local_cpu_set_mask |= 0x1UL;
+        }
+
+        /*
+         * Make sure all runqueues point to the correct cpu_set
+         */
+        cpu_set_num = -1;
+        numa_num_cpu_sets = 0;
+        for (i = 0; i < smp_num_cpus; i++) {
+                if (cpu_to_set_id(i) != cpu_set_num) {
+                        cpu_set_num = cpu_to_set_id(i);
+
+                        cpu_sets[cpu_set_num].cs_lock = SPIN_LOCK_UNLOCKED;
+                        cpu_sets[cpu_set_num].cs_local_cpus =
                                numa_local_cpu_set_mask <<
+                                (cpu_set_num * numa_cpus_per_local_set);
+                        cpu_sets[cpu_set_num].cs_load_avg = 0;
+
+                        numa_num_cpu_sets++;
+                }
+
+                runqueues[i].cpu_set = &cpu_sets[cpu_set_num];
+        }
+}
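
A note on the rebalance intervals: cross-set balancing is deliberately
numa_cpu_set_distance times rarer than in-set balancing. For illustration
only, assuming HZ=100 (the patch does not set HZ) and the placeholder
distance of 2, the periods work out as in this user-space sketch (the ?:
form is GNU C, as in the kernel, so build it with gcc):

#include <stdio.h>

/* Assumptions for illustration: HZ=100 and the placeholder distance of 2 */
#define HZ 100
#define NUMA_CPU_SET_DISTANCE 2

/* Same formulas as in the patch */
#define BUSY_REBALANCE_TICK     (HZ/4 ?: 1)
#define IDLE_REBALANCE_TICK     (HZ/1000 ?: 1)
#define CS_BUSY_REBALANCE_TICK  (BUSY_REBALANCE_TICK * NUMA_CPU_SET_DISTANCE)
#define CS_IDLE_REBALANCE_TICK  (IDLE_REBALANCE_TICK * NUMA_CPU_SET_DISTANCE)

int main(void)
{
        /* At HZ=100 this prints 1, 2, 25 and 50 ticks respectively */
        printf("idle, in-set:    every %d tick(s)\n", IDLE_REBALANCE_TICK);
        printf("idle, cross-set: every %d tick(s)\n", CS_IDLE_REBALANCE_TICK);
        printf("busy, in-set:    every %d tick(s)\n", BUSY_REBALANCE_TICK);
        printf("busy, cross-set: every %d tick(s)\n", CS_BUSY_REBALANCE_TICK);
        return 0;
}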