Re: a quest for a better scheduler

Fabio Riccardi (fabio@chromium.com)
Tue, 03 Apr 2001 17:18:03 -0700


This is a multi-part message in MIME format.
--------------B46085350AE53D8B14855445
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

Dear all,

I've spent my afternoon running some benchmarks to see if MQ patches would
degrade performance in the "normal case".

To measure performance I've used the latest lmbench and I have mesured the
kernel compile times on a dual pentium III box runing at 1GHz with an 133MHz
bus.

Results (attached) show that there is no measurable difference in performace
between a vanilla scheduler and a multiqueue scheduler when running only few
processes (the compilation benchmark runs essentially two processes, one per
CPU).

I have measured the HP and not the "scalability" patch because the two do more
or less the same thing and give me the same performance advantages, but the
former is a lot simpler and I could port it with no effort on any recent
kernel. It is indeed interesting to see that this patch was originally designed
for a 2.4.0-test10 kernel, and still works fine on the latest kernels, only a
minor change (one line) was required. A version of the patch for the 2.4.2-ac26
kernel is attached to this message.

Given the zero impact on "normal case" performance and the huge positive impact
(> 20%) in the heavy load case (70-80 runnable processess on a load of about
1400 total) I don't see why such a thing shouldn't be accepted in the
mainstream scheduler.

- Fabio

--------------B46085350AE53D8B14855445
Content-Type: text/plain; charset=us-ascii;
name="mqpatch242ac26"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
filename="mqpatch242ac26"

--- sched.c.orig Tue Mar 27 17:30:58 2001
+++ sched.c Tue Apr 3 16:45:21 2001
@@ -34,6 +34,7 @@
extern void timer_bh(void);
extern void tqueue_bh(void);
extern void immediate_bh(void);
+static inline void hop_queues(struct task_struct *, int);

/*
* scheduler variables
@@ -90,7 +91,8 @@
spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */
rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */

-static LIST_HEAD(runqueue_head);
+static struct list_head runqueue_head[NR_CPUS] = { LIST_HEAD_INIT((runqueue_head[0]))};
+static LIST_HEAD(rt_queue_head);

/*
* We align per-CPU scheduling data on cacheline boundaries,
@@ -100,12 +102,15 @@
struct schedule_data {
struct task_struct * curr;
cycles_t last_schedule;
+ struct list_head runqueue_head;
} schedule_data;
char __pad [SMP_CACHE_BYTES];
-} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
+} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0,
+ LIST_HEAD_INIT((aligned_data[0].schedule_data.runqueue_head))}}};

#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
+#define cpu_rq(cpu) (aligned_data[(cpu)].schedule_data.runqueue_head)

struct kernel_stat kstat;

@@ -199,6 +204,33 @@
return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
}

+
+static inline int other_goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
+{
+ int weight;
+
+ /*
+ * select the current process after every other
+ * runnable process, but before the idle thread.
+ * Also, dont trigger a counter recalculation.
+ *
+ * Give the process a first-approximation goodness value
+ * according to the number of clock-ticks it has left.
+ *
+ * Don't do any other calculations if the time slice is
+ * over..
+ */
+ weight = p->counter;
+ if (!weight)
+ goto out2;
+
+ /* .. and a slight advantage to the current MM */
+ if (p->mm == this_mm || !p->mm)
+ weight += 1;
+ weight += 20 - p->nice;
+out2:
+ return weight;
+}
/*
* This is ugly, but reschedule_idle() is very timing-critical.
* We are called with the runqueue spinlock held and we must
@@ -266,6 +298,10 @@
} else {
if (oldest_idle == -1ULL) {
int prio = preemption_goodness(tsk, p, cpu);
+ /*
+ * this will never be true for < 400 HZ non
+ * realtime. optimize this? SAR
+ */

if (prio > max_prio) {
max_prio = prio;
@@ -277,6 +313,10 @@
tsk = target_tsk;
if (tsk) {
if (oldest_idle != -1ULL) {
+ /* push onto best queue */
+ if (p->policy == SCHED_OTHER){
+ hop_queues(p, tsk->processor);
+ }
best_cpu = tsk->processor;
goto send_now_idle;
}
@@ -306,20 +346,28 @@
*/
static inline void add_to_runqueue(struct task_struct * p)
{
- list_add(&p->run_list, &runqueue_head);
+ if (p->policy == SCHED_OTHER){
+ list_add(&p->run_list, &cpu_rq(p->processor));
+ } else list_add(&p->run_list, &rt_queue_head);
nr_running++;
}

-static inline void move_last_runqueue(struct task_struct * p)
+static inline void move_last_rt_queue(struct task_struct * p)
+{
+ list_del(&p->run_list);
+ list_add_tail(&p->run_list, &rt_queue_head);
+}
+
+static inline void move_first_rt_queue(struct task_struct * p)
{
list_del(&p->run_list);
- list_add_tail(&p->run_list, &runqueue_head);
+ list_add(&p->run_list, &rt_queue_head);
}

-static inline void move_first_runqueue(struct task_struct * p)
+static inline void hop_queues(struct task_struct * p, int this_cpu)
{
list_del(&p->run_list);
- list_add(&p->run_list, &runqueue_head);
+ list_add(&p->run_list, &cpu_rq(this_cpu));
}

/*
@@ -343,6 +391,7 @@
if (task_on_runqueue(p))
goto out;
add_to_runqueue(p);
+ /* LATER : make an effort to choose rq before add ? */
if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
reschedule_idle(p);
success = 1;
@@ -531,9 +580,9 @@
asmlinkage void schedule(void)
{
struct schedule_data * sched_data;
- struct task_struct *prev, *next, *p;
+ struct task_struct *prev, *next, *p = NULL; /* LATER fix this */
struct list_head *tmp;
- int this_cpu, c;
+ int this_cpu, c, i;


spin_lock_prefetch(&runqueue_lock);
@@ -592,18 +641,63 @@
goto still_running;

still_running_back:
- list_for_each(tmp, &runqueue_head) {
+ /*
+ * we unrolled the original combined runqueue in to two separate
+ * checks, first the real time and then then policy OTHER for own
+ * cpu, and finally theft from another cpu.
+ */
+ list_for_each(tmp, &rt_queue_head) {
p = list_entry(tmp, struct task_struct, run_list);
- if (can_schedule(p, this_cpu)) {
- int weight = goodness(p, this_cpu, prev->active_mm);
+ if (can_schedule(p, this_cpu) && !(p->policy & SCHED_YIELD)) {
+ int weight = 1000 + p->rt_priority;
if (weight > c)
c = weight, next = p;
}
}

+ if (c >= 1000)
+ goto choice_made;
+
+ list_for_each(tmp, &cpu_rq(this_cpu)) {
+ p = list_entry(tmp, struct task_struct, run_list);
+ if (can_schedule(p, this_cpu) && !(p->policy & SCHED_YIELD)) {
+ int weight = other_goodness(p, this_cpu, prev->active_mm);
+ if (weight > c)
+ c = weight, next = p;
+ }
+ }
+
+#ifdef CONFIG_SMP
+ if (c > 0)
+ goto choice_made;
+
+ /*
+ * try to steal from another CPU's queue. since we don't have to
+ * worry about real time or CPU preference, pick anything available.
+ * this is an area for detailed policy.
+ */
+ for (i = 0; i < smp_num_cpus; i++) {
+ int cpu = cpu_logical_map(i);
+ if (cpu == this_cpu)
+ continue;
+
+ list_for_each(tmp, &cpu_rq(cpu)) {
+ p = list_entry(tmp, struct task_struct, run_list);
+ if (can_schedule(p, this_cpu) && !(p->policy & SCHED_YIELD)) {
+ int weight = other_goodness(p, cpu, prev->active_mm);
+ if (weight > c)
+ c = weight, next = p;
+ }
+ }
+ }
+
+ if (c > 0)
+ hop_queues(next, this_cpu); /* pull onto mine */
+#endif
/* Do we need to re-calculate counters? */
if (!c)
goto recalculate;
+choice_made:
/*
* from this point on nothing can prevent us from
* switching to the next task, save this fact in
@@ -705,7 +799,7 @@
move_rr_last:
if (!prev->counter) {
prev->counter = NICE_TO_TICKS(prev->nice);
- move_last_runqueue(prev);
+ move_last_rt_queue(prev);
}
goto move_rr_back;

@@ -938,8 +1032,14 @@
retval = 0;
p->policy = policy;
p->rt_priority = lp.sched_priority;
- if (task_on_runqueue(p))
- move_first_runqueue(p);
+ if (task_on_runqueue(p)){
+ if (policy != SCHED_OTHER)
+ move_first_rt_queue(p);
+ else {
+ /* push onto appropriate non-rt queue */
+ hop_queues(p, p->processor);
+ }
+ }

current->need_resched = 1;

@@ -1251,9 +1351,12 @@
* process right in SMP mode.
*/
int cpu = smp_processor_id();
- int nr;
+ int nr, i;

init_task.processor = cpu;
+ for (i=1; i<NR_CPUS; i++){
+ INIT_LIST_HEAD(&cpu_rq(i));
+ }

for(nr = 0; nr < PIDHASH_SZ; nr++)
pidhash[nr] = NULL;

--------------B46085350AE53D8B14855445
Content-Type: text/plain; charset=us-ascii;
name="compile_times"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
filename="compile_times"

Kernel compilation times
========================

three runs of "time make -j2"

--

Linux 2.4.2-ac26 + HP Multiqueue patch

216.34user 14.36system 2:00.03elapsed 192%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (482269major+691020minor)pagefaults 0swaps

216.53user 14.23system 1:57.91elapsed 195%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (482269major+691020minor)pagefaults 0swaps

217.65user 13.46system 1:58.05elapsed 195%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (482269major+691020minor)pagefaults 0swaps

--

Linux 2.4.2-ac26

220.07user 14.88system 2:02.67elapsed 191%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (482269major+691019minor)pagefaults 0swaps

220.31user 14.90system 2:00.64elapsed 194%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (482269major+691018minor)pagefaults 0swaps

220.58user 14.84system 2:00.57elapsed 195%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (482269major+691018minor)pagefaults 0swaps

LMBENCH-2BETA1 ==============

First two: Linux 2.4.2-ac26 + HP Multiqueue patch

Second Two: Linux 2.4.2-ac26

Processor, Processes - times in microseconds - smaller is better ---------------------------------------------------------------- Host OS Mhz null null open selct sig sig fork exec sh call I/O stat clos TCP inst hndl proc proc proc --------- ------------- ---- ---- ---- ---- ---- ----- ---- ---- ---- ---- ---- skinny Linux 2.4.2-a 997 0.34 0.56 3.96 5.04 27 0.89 2.72 245 1128 4044 skinny Linux 2.4.2-a 997 0.34 0.57 4.19 5.32 25 0.89 2.71 247 1150 4067 skinny Linux 2.4.2-a 997 0.34 0.58 3.90 5.00 25 0.89 2.69 249 1121 3968 skinny Linux 2.4.2-a 997 0.34 0.57 3.90 5.01 25 0.87 2.70 246 1126 4018

Context switching - times in microseconds - smaller is better ------------------------------------------------------------- Host OS 2p/0K 2p/16K 2p/64K 8p/16K 8p/64K 16p/16K 16p/64K ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw --------- ------------- ----- ------ ------ ------ ------ ------- ------- skinny Linux 2.4.2-a 1.820 4.7700 12 8.0800 109 27 110 skinny Linux 2.4.2-a 1.890 4.7300 20 6.6500 109 27 110 skinny Linux 2.4.2-a 1.620 4.5900 12 6.7000 109 24 109 skinny Linux 2.4.2-a 1.700 4.6400 12 7.0600 109 26 109

*Local* Communication latencies in microseconds - smaller is better ------------------------------------------------------------------- Host OS 2p/0K Pipe AF UDP RPC/ TCP RPC/ TCP ctxsw UNIX UDP TCP conn --------- ------------- ----- ----- ---- ----- ----- ----- ----- ---- skinny Linux 2.4.2-a 1.820 7.390 15 16 52 23 91 55 skinny Linux 2.4.2-a 1.890 7.185 14 16 41 23 56 54 skinny Linux 2.4.2-a 1.620 6.793 15 16 40 21 56 54 skinny Linux 2.4.2-a 1.700 6.801 15 16 40 21 55 54

File & VM system latencies in microseconds - smaller is better -------------------------------------------------------------- Host OS 0K File 10K File Mmap Prot Page Create Delete Create Delete Latency Fault Fault --------- ------------- ------ ------ ------ ------ ------- ----- ----- skinny Linux 2.4.2-a 6.2054 0.7192 12 1.6973 451 0.672 2.00000 skinny Linux 2.4.2-a 6.2469 0.7360 12 1.7142 465 0.668 2.00000 skinny Linux 2.4.2-a 3.1992 0.7182 12 1.5857 449 0.680 2.00000 skinny Linux 2.4.2-a 6.1576 0.7119 12 1.5817 448 0.669 2.00000

*Local* Communication bandwidths in MB/s - bigger is better ----------------------------------------------------------- Host OS Pipe AF TCP File Mmap Bcopy Bcopy Mem Mem UNIX reread reread (libc) (hand) read write --------- ------------- ---- ---- ---- ------ ------ ------ ------ ---- ----- skinny Linux 2.4.2-a 823 233 162 434 558 271 218 558 282 skinny Linux 2.4.2-a 828 289 243 408 558 269 216 558 281 skinny Linux 2.4.2-a 839 285 249 445 558 222 147 558 219 skinny Linux 2.4.2-a 811 284 242 445 558 222 147 558 219

Memory latencies in nanoseconds - smaller is better (WARNING - may not be correct, check graphs) --------------------------------------------------- Host OS Mhz L1 $ L2 $ Main mem Guesses --------- ------------- ---- ----- ------ -------- ------- skinny Linux 2.4.2-a 997 3.009 7.0230 101 skinny Linux 2.4.2-a 997 3.010 7.0220 101 skinny Linux 2.4.2-a 997 3.010 7.0280 101 skinny Linux 2.4.2-a 997 3.009 7.0290 101

--------------B46085350AE53D8B14855445--

- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/