Dear all,
I've spent my afternoon running some benchmarks to see if MQ patches would
degrade performance in the "normal case".
To measure performance I've used the latest lmbench and I have mesured the
kernel compile times on a dual pentium III box runing at 1GHz with an 133MHz
bus.
Results (attached) show that there is no measurable difference in performace
between a vanilla scheduler and a multiqueue scheduler when running only few
processes (the compilation benchmark runs essentially two processes, one per
CPU).
I have measured the HP and not the "scalability" patch because the two do more
or less the same thing and give me the same performance advantages, but the
former is a lot simpler and I could port it with no effort on any recent
kernel. It is indeed interesting to see that this patch was originally designed
for a 2.4.0-test10 kernel, and still works fine on the latest kernels, only a
minor change (one line) was required. A version of the patch for the 2.4.2-ac26
kernel is attached to this message.
Given the zero impact on "normal case" performance and the huge positive impact
(> 20%) in the heavy load case (70-80 runnable processess on a load of about
1400 total) I don't see why such a thing shouldn't be accepted in the
mainstream scheduler.
- Fabio
--------------B46085350AE53D8B14855445
Content-Type: text/plain; charset=us-ascii;
name="mqpatch242ac26"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
filename="mqpatch242ac26"
--- sched.c.orig Tue Mar 27 17:30:58 2001
+++ sched.c Tue Apr 3 16:45:21 2001
@@ -34,6 +34,7 @@
extern void timer_bh(void);
extern void tqueue_bh(void);
extern void immediate_bh(void);
+static inline void hop_queues(struct task_struct *, int);
/*
* scheduler variables
@@ -90,7 +91,8 @@
spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */
rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
-static LIST_HEAD(runqueue_head);
+static struct list_head runqueue_head[NR_CPUS] = { LIST_HEAD_INIT((runqueue_head[0]))};
+static LIST_HEAD(rt_queue_head);
/*
* We align per-CPU scheduling data on cacheline boundaries,
@@ -100,12 +102,15 @@
struct schedule_data {
struct task_struct * curr;
cycles_t last_schedule;
+ struct list_head runqueue_head;
} schedule_data;
char __pad [SMP_CACHE_BYTES];
-} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
+} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0,
+ LIST_HEAD_INIT((aligned_data[0].schedule_data.runqueue_head))}}};
#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
+#define cpu_rq(cpu) (aligned_data[(cpu)].schedule_data.runqueue_head)
struct kernel_stat kstat;
@@ -199,6 +204,33 @@
return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
}
+
+static inline int other_goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
+{
+ int weight;
+
+ /*
+ * select the current process after every other
+ * runnable process, but before the idle thread.
+ * Also, dont trigger a counter recalculation.
+ *
+ * Give the process a first-approximation goodness value
+ * according to the number of clock-ticks it has left.
+ *
+ * Don't do any other calculations if the time slice is
+ * over..
+ */
+ weight = p->counter;
+ if (!weight)
+ goto out2;
+
+ /* .. and a slight advantage to the current MM */
+ if (p->mm == this_mm || !p->mm)
+ weight += 1;
+ weight += 20 - p->nice;
+out2:
+ return weight;
+}
/*
* This is ugly, but reschedule_idle() is very timing-critical.
* We are called with the runqueue spinlock held and we must
@@ -266,6 +298,10 @@
} else {
if (oldest_idle == -1ULL) {
int prio = preemption_goodness(tsk, p, cpu);
+ /*
+ * this will never be true for < 400 HZ non
+ * realtime. optimize this? SAR
+ */
if (prio > max_prio) {
max_prio = prio;
@@ -277,6 +313,10 @@
tsk = target_tsk;
if (tsk) {
if (oldest_idle != -1ULL) {
+ /* push onto best queue */
+ if (p->policy == SCHED_OTHER){
+ hop_queues(p, tsk->processor);
+ }
best_cpu = tsk->processor;
goto send_now_idle;
}
@@ -306,20 +346,28 @@
*/
static inline void add_to_runqueue(struct task_struct * p)
{
- list_add(&p->run_list, &runqueue_head);
+ if (p->policy == SCHED_OTHER){
+ list_add(&p->run_list, &cpu_rq(p->processor));
+ } else list_add(&p->run_list, &rt_queue_head);
nr_running++;
}
-static inline void move_last_runqueue(struct task_struct * p)
+static inline void move_last_rt_queue(struct task_struct * p)
+{
+ list_del(&p->run_list);
+ list_add_tail(&p->run_list, &rt_queue_head);
+}
+
+static inline void move_first_rt_queue(struct task_struct * p)
{
list_del(&p->run_list);
- list_add_tail(&p->run_list, &runqueue_head);
+ list_add(&p->run_list, &rt_queue_head);
}
-static inline void move_first_runqueue(struct task_struct * p)
+static inline void hop_queues(struct task_struct * p, int this_cpu)
{
list_del(&p->run_list);
- list_add(&p->run_list, &runqueue_head);
+ list_add(&p->run_list, &cpu_rq(this_cpu));
}
/*
@@ -343,6 +391,7 @@
if (task_on_runqueue(p))
goto out;
add_to_runqueue(p);
+ /* LATER : make an effort to choose rq before add ? */
if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
reschedule_idle(p);
success = 1;
@@ -531,9 +580,9 @@
asmlinkage void schedule(void)
{
struct schedule_data * sched_data;
- struct task_struct *prev, *next, *p;
+ struct task_struct *prev, *next, *p = NULL; /* LATER fix this */
struct list_head *tmp;
- int this_cpu, c;
+ int this_cpu, c, i;
spin_lock_prefetch(&runqueue_lock);
@@ -592,18 +641,63 @@
goto still_running;
still_running_back:
- list_for_each(tmp, &runqueue_head) {
+ /*
+ * we unrolled the original combined runqueue in to two separate
+ * checks, first the real time and then then policy OTHER for own
+ * cpu, and finally theft from another cpu.
+ */
+ list_for_each(tmp, &rt_queue_head) {
p = list_entry(tmp, struct task_struct, run_list);
- if (can_schedule(p, this_cpu)) {
- int weight = goodness(p, this_cpu, prev->active_mm);
+ if (can_schedule(p, this_cpu) && !(p->policy & SCHED_YIELD)) {
+ int weight = 1000 + p->rt_priority;
if (weight > c)
c = weight, next = p;
}
}
+ if (c >= 1000)
+ goto choice_made;
+
+ list_for_each(tmp, &cpu_rq(this_cpu)) {
+ p = list_entry(tmp, struct task_struct, run_list);
+ if (can_schedule(p, this_cpu) && !(p->policy & SCHED_YIELD)) {
+ int weight = other_goodness(p, this_cpu, prev->active_mm);
+ if (weight > c)
+ c = weight, next = p;
+ }
+ }
+
+#ifdef CONFIG_SMP
+ if (c > 0)
+ goto choice_made;
+
+ /*
+ * try to steal from another CPU's queue. since we don't have to
+ * worry about real time or CPU preference, pick anything available.
+ * this is an area for detailed policy.
+ */
+ for (i = 0; i < smp_num_cpus; i++) {
+ int cpu = cpu_logical_map(i);
+ if (cpu == this_cpu)
+ continue;
+
+ list_for_each(tmp, &cpu_rq(cpu)) {
+ p = list_entry(tmp, struct task_struct, run_list);
+ if (can_schedule(p, this_cpu) && !(p->policy & SCHED_YIELD)) {
+ int weight = other_goodness(p, cpu, prev->active_mm);
+ if (weight > c)
+ c = weight, next = p;
+ }
+ }
+ }
+
+ if (c > 0)
+ hop_queues(next, this_cpu); /* pull onto mine */
+#endif
/* Do we need to re-calculate counters? */
if (!c)
goto recalculate;
+choice_made:
/*
* from this point on nothing can prevent us from
* switching to the next task, save this fact in
@@ -705,7 +799,7 @@
move_rr_last:
if (!prev->counter) {
prev->counter = NICE_TO_TICKS(prev->nice);
- move_last_runqueue(prev);
+ move_last_rt_queue(prev);
}
goto move_rr_back;
@@ -938,8 +1032,14 @@
retval = 0;
p->policy = policy;
p->rt_priority = lp.sched_priority;
- if (task_on_runqueue(p))
- move_first_runqueue(p);
+ if (task_on_runqueue(p)){
+ if (policy != SCHED_OTHER)
+ move_first_rt_queue(p);
+ else {
+ /* push onto appropriate non-rt queue */
+ hop_queues(p, p->processor);
+ }
+ }
current->need_resched = 1;
@@ -1251,9 +1351,12 @@
* process right in SMP mode.
*/
int cpu = smp_processor_id();
- int nr;
+ int nr, i;
init_task.processor = cpu;
+ for (i=1; i<NR_CPUS; i++){
+ INIT_LIST_HEAD(&cpu_rq(i));
+ }
for(nr = 0; nr < PIDHASH_SZ; nr++)
pidhash[nr] = NULL;
--------------B46085350AE53D8B14855445
Content-Type: text/plain; charset=us-ascii;
name="compile_times"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
filename="compile_times"
Kernel compilation times
========================
three runs of "time make -j2"
--Linux 2.4.2-ac26 + HP Multiqueue patch
216.34user 14.36system 2:00.03elapsed 192%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (482269major+691020minor)pagefaults 0swaps
216.53user 14.23system 1:57.91elapsed 195%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (482269major+691020minor)pagefaults 0swaps
217.65user 13.46system 1:58.05elapsed 195%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (482269major+691020minor)pagefaults 0swaps
--Linux 2.4.2-ac26
220.07user 14.88system 2:02.67elapsed 191%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (482269major+691019minor)pagefaults 0swaps
220.31user 14.90system 2:00.64elapsed 194%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (482269major+691018minor)pagefaults 0swaps
220.58user 14.84system 2:00.57elapsed 195%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (482269major+691018minor)pagefaults 0swaps
LMBENCH-2BETA1 ==============
First two: Linux 2.4.2-ac26 + HP Multiqueue patch
Second Two: Linux 2.4.2-ac26
Processor, Processes - times in microseconds - smaller is better ---------------------------------------------------------------- Host OS Mhz null null open selct sig sig fork exec sh call I/O stat clos TCP inst hndl proc proc proc --------- ------------- ---- ---- ---- ---- ---- ----- ---- ---- ---- ---- ---- skinny Linux 2.4.2-a 997 0.34 0.56 3.96 5.04 27 0.89 2.72 245 1128 4044 skinny Linux 2.4.2-a 997 0.34 0.57 4.19 5.32 25 0.89 2.71 247 1150 4067 skinny Linux 2.4.2-a 997 0.34 0.58 3.90 5.00 25 0.89 2.69 249 1121 3968 skinny Linux 2.4.2-a 997 0.34 0.57 3.90 5.01 25 0.87 2.70 246 1126 4018
Context switching - times in microseconds - smaller is better ------------------------------------------------------------- Host OS 2p/0K 2p/16K 2p/64K 8p/16K 8p/64K 16p/16K 16p/64K ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw --------- ------------- ----- ------ ------ ------ ------ ------- ------- skinny Linux 2.4.2-a 1.820 4.7700 12 8.0800 109 27 110 skinny Linux 2.4.2-a 1.890 4.7300 20 6.6500 109 27 110 skinny Linux 2.4.2-a 1.620 4.5900 12 6.7000 109 24 109 skinny Linux 2.4.2-a 1.700 4.6400 12 7.0600 109 26 109
*Local* Communication latencies in microseconds - smaller is better ------------------------------------------------------------------- Host OS 2p/0K Pipe AF UDP RPC/ TCP RPC/ TCP ctxsw UNIX UDP TCP conn --------- ------------- ----- ----- ---- ----- ----- ----- ----- ---- skinny Linux 2.4.2-a 1.820 7.390 15 16 52 23 91 55 skinny Linux 2.4.2-a 1.890 7.185 14 16 41 23 56 54 skinny Linux 2.4.2-a 1.620 6.793 15 16 40 21 56 54 skinny Linux 2.4.2-a 1.700 6.801 15 16 40 21 55 54
File & VM system latencies in microseconds - smaller is better -------------------------------------------------------------- Host OS 0K File 10K File Mmap Prot Page Create Delete Create Delete Latency Fault Fault --------- ------------- ------ ------ ------ ------ ------- ----- ----- skinny Linux 2.4.2-a 6.2054 0.7192 12 1.6973 451 0.672 2.00000 skinny Linux 2.4.2-a 6.2469 0.7360 12 1.7142 465 0.668 2.00000 skinny Linux 2.4.2-a 3.1992 0.7182 12 1.5857 449 0.680 2.00000 skinny Linux 2.4.2-a 6.1576 0.7119 12 1.5817 448 0.669 2.00000
*Local* Communication bandwidths in MB/s - bigger is better ----------------------------------------------------------- Host OS Pipe AF TCP File Mmap Bcopy Bcopy Mem Mem UNIX reread reread (libc) (hand) read write --------- ------------- ---- ---- ---- ------ ------ ------ ------ ---- ----- skinny Linux 2.4.2-a 823 233 162 434 558 271 218 558 282 skinny Linux 2.4.2-a 828 289 243 408 558 269 216 558 281 skinny Linux 2.4.2-a 839 285 249 445 558 222 147 558 219 skinny Linux 2.4.2-a 811 284 242 445 558 222 147 558 219
Memory latencies in nanoseconds - smaller is better (WARNING - may not be correct, check graphs) --------------------------------------------------- Host OS Mhz L1 $ L2 $ Main mem Guesses --------- ------------- ---- ----- ------ -------- ------- skinny Linux 2.4.2-a 997 3.009 7.0230 101 skinny Linux 2.4.2-a 997 3.010 7.0220 101 skinny Linux 2.4.2-a 997 3.010 7.0280 101 skinny Linux 2.4.2-a 997 3.009 7.0290 101
--------------B46085350AE53D8B14855445--
- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/