[patch] mini-lowlatency

Andrew Morton (akpm@zip.com.au)
Sun, 06 Jan 2002 21:32:21 -0800


This is a patch which improves the kernel's scheduling latency under
heavy filesystem activity. I am proposing this for the 2.4 series.
Parts of it are from the -aa patchset.

It adds ten rescheduling points and drops-and-reacquires spinlocks in two
places. It's all pretty obvious and safe, apart from the ext3 chunk, which
required some thought.
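
For reference, the pattern at each new point is roughly the sketch below.
This is an illustration only, not code from the patch itself (the lock and
list names are made up; the real hunks are further down, and
conditional_schedule() is defined in the new include/linux/condsched.h):

/*
 * Sketch of the lock-break pattern used at the new rescheduling points.
 * "example_lock" and the list walk are illustrative; see the real hunks
 * in fs/buffer.c and fs/jbd/commit.c below.
 */
#include <linux/sched.h>        /* current, schedule() */
#include <linux/spinlock.h>
#include <linux/condsched.h>    /* conditional_schedule(), added by this patch */

static spinlock_t example_lock = SPIN_LOCK_UNLOCKED;

static void walk_long_list(void)
{
	spin_lock(&example_lock);
repeat:
	/* ... walk part of a long list under example_lock ... */
	if (current->need_resched) {
		/* Drop the lock so a woken task can run, then retake it. */
		spin_unlock(&example_lock);
		conditional_schedule();
		spin_lock(&example_lock);
		/* The list may have changed while the lock was dropped. */
		goto repeat;
	}
	/* ... finish the walk ... */
	spin_unlock(&example_lock);
}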

Testing was on an 850MHz PIII with 768 megabytes of RAM. The workload
executed was:

cd /usr/src/linux
make -j3 bzImage
make clean
cd /mnt/hda7
tar xfz /nfs-mountpoint/linux-2.4.10.tar.gz
rm -rf linux
dd if=/dev/zero of=foo bs=1M count=400
rm foo
cvs co dbench
cd dbench
make
./dbench 40
sync
cd ..
rm -rf dbench

In conjunction with this, the `realfeel2' app from
http://www.zip.com.au/~akpm/linux/amlat.tar.gz was used to accumulate a
histogram of the latency between the occurrence of an interrupt and the
scheduling of the userspace process woken by that interrupt.
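
(For the curious, the measurement idea is roughly the sketch below. This is
not the realfeel2 source, just an illustration under the assumption of a
/dev/rtc that supports periodic interrupts; higher interrupt rates may need
/proc/sys/dev/rtc/max-user-freq raised.)

/*
 * Rough sketch of the measurement idea (not the realfeel2 source).
 * Arm /dev/rtc for periodic interrupts, block in read(), and record
 * how much later than the expected period each wakeup arrives.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <linux/rtc.h>

int main(void)
{
	unsigned long data, hz = 1024;	/* periodic interrupt rate */
	struct timeval prev, now;
	long usec, late;
	int fd = open("/dev/rtc", O_RDONLY);

	if (fd < 0 || ioctl(fd, RTC_IRQP_SET, hz) < 0 ||
	    ioctl(fd, RTC_PIE_ON, 0) < 0) {
		perror("rtc");
		return 1;
	}
	gettimeofday(&prev, NULL);
	for (;;) {
		if (read(fd, &data, sizeof(data)) < 0)	/* blocks until the next RTC interrupt */
			break;
		gettimeofday(&now, NULL);
		usec = (now.tv_sec - prev.tv_sec) * 1000000L +
		       (now.tv_usec - prev.tv_usec);
		late = usec - 1000000L / hz;	/* delay beyond the expected period */
		if (late > 0)
			printf("%ld usec late\n", late);  /* the real tool bins these into a histogram */
		prev = now;
	}
	return 0;
}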

Testing was against 2.4.18-pre1. The table shows how many times each
latency range was encountered with this workload, for various patchsets.
The filesystem was ext3; ext2 does better.

                                    Latency (milliseconds)
Kernel                 0-1    1-2   2-4   4-8  8-16  16-32  32-64  64-128  128-256

Stock                1.2e6    365    32    33   417     11     11       4

rml-preempt          5.0e6    119    26    22    11      8     13       3        1

mini-ll              1.8e6    217    15    15     8      9      1

mini-ll+rml-preempt  1.4e6     96    14     7     4      2      5       1

low-latency          1.3e6     18     1

The 5.0e6 count in the second row (rml-preempt) is due to extra idle time
(I hadn't scripted the test).

The latency bins double in width, so this is a logarithmic representation
and the mini-ll improvements do not look dramatic, but they are in fact
considerable. Worst-case latency is 36 milliseconds (versus 84 milliseconds
for the stock kernel).

Bear in mind that the intent here is a non-jerky mouse and fewer audio
dropouts. If the requirement is a very low worst case, you'll still need
the full low-latency patch. Here's the full histogram for the latest
low-latency patch:

msec     count
0.0    1271218
0.1       4554
0.2        472
0.3         78
0.4         10
0.5          5
0.6         24
0.7         27
0.8         20
0.9         10
1.0         15
1.1          1
1.2          1
1.7          1
3.2          1

Which is butt-kicking. It can be improved further by minimising the amount
of online swapspace: the remaining worst case comes from scan_swap_map().

--- linux-2.4.18-pre1/fs/buffer.c Fri Dec 21 11:19:14 2001
+++ linux-akpm/fs/buffer.c Sun Jan 6 01:06:41 2002
@@ -249,12 +249,19 @@ static int wait_for_buffers(kdev_t dev,
struct buffer_head * next;
int nr;

- next = lru_list[index];
nr = nr_buffers_type[index];
+repeat:
+ next = lru_list[index];
while (next && --nr >= 0) {
struct buffer_head *bh = next;
next = bh->b_next_free;

+ if (dev == NODEV && current->need_resched) {
+ spin_unlock(&lru_list_lock);
+ conditional_schedule();
+ spin_lock(&lru_list_lock);
+ goto repeat;
+ }
if (!buffer_locked(bh)) {
if (refile)
__refile_buffer(bh);
@@ -1174,8 +1181,10 @@ struct buffer_head * bread(kdev_t dev, i

bh = getblk(dev, block, size);
touch_buffer(bh);
- if (buffer_uptodate(bh))
+ if (buffer_uptodate(bh)) {
+ conditional_schedule();
return bh;
+ }
ll_rw_block(READ, 1, &bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
--- linux-2.4.18-pre1/fs/dcache.c Fri Dec 21 11:19:14 2001
+++ linux-akpm/fs/dcache.c Sat Jan 5 23:49:00 2002
@@ -71,7 +71,7 @@ static inline void d_free(struct dentry
* d_iput() operation if defined.
* Called with dcache_lock held, drops it.
*/
-static inline void dentry_iput(struct dentry * dentry)
+static void dentry_iput(struct dentry * dentry)
{
struct inode *inode = dentry->d_inode;
if (inode) {
@@ -84,6 +84,7 @@ static inline void dentry_iput(struct de
iput(inode);
} else
spin_unlock(&dcache_lock);
+ conditional_schedule();
}

/*
--- linux-2.4.18-pre1/fs/jbd/commit.c Fri Dec 21 11:19:14 2001
+++ linux-akpm/fs/jbd/commit.c Sun Jan 6 02:43:43 2002
@@ -212,6 +212,16 @@ write_out_data_locked:
__journal_remove_journal_head(bh);
refile_buffer(bh);
__brelse(bh);
+ if (current->need_resched) {
+ if (commit_transaction->t_sync_datalist)
+ commit_transaction->t_sync_datalist =
+ next_jh;
+ if (bufs)
+ break;
+ spin_unlock(&journal_datalist_lock);
+ conditional_schedule();
+ goto write_out_data;
+ }
}
}
if (bufs == ARRAY_SIZE(wbuf)) {
--- linux-2.4.18-pre1/fs/proc/array.c Thu Oct 11 09:00:01 2001
+++ linux-akpm/fs/proc/array.c Sat Jan 5 23:49:00 2002
@@ -415,6 +415,8 @@ static inline void statm_pte_range(pmd_t
pte_t page = *pte;
struct page *ptpage;

+ conditional_schedule();
+
address += PAGE_SIZE;
pte++;
if (pte_none(page))
--- linux-2.4.18-pre1/fs/proc/generic.c Fri Sep 7 10:53:59 2001
+++ linux-akpm/fs/proc/generic.c Sat Jan 5 23:49:00 2002
@@ -98,7 +98,9 @@ proc_file_read(struct file * file, char
retval = n;
break;
}
-
+
+ conditional_schedule();
+
/* This is a hack to allow mangling of file pos independent
* of actual bytes read. Simply place the data at page,
* return the bytes, and set `start' to the desired offset
--- linux-2.4.18-pre1/include/linux/condsched.h Thu Jan 1 00:00:00 1970
+++ linux-akpm/include/linux/condsched.h Sat Jan 5 23:49:00 2002
@@ -0,0 +1,18 @@
+#ifndef _LINUX_CONDSCHED_H
+#define _LINUX_CONDSCHED_H
+
+#ifndef __LINUX_COMPILER_H
+#include <linux/compiler.h>
+#endif
+
+#ifndef __ASSEMBLY__
+#define conditional_schedule() \
+do { \
+ if (unlikely(current->need_resched)) { \
+ __set_current_state(TASK_RUNNING); \
+ schedule(); \
+ } \
+} while(0)
+#endif
+
+#endif
--- linux-2.4.18-pre1/include/linux/sched.h Fri Dec 21 11:19:23 2001
+++ linux-akpm/include/linux/sched.h Sun Jan 6 02:44:06 2002
@@ -13,6 +13,7 @@ extern unsigned long event;
#include <linux/times.h>
#include <linux/timex.h>
#include <linux/rbtree.h>
+#include <linux/condsched.h>

#include <asm/system.h>
#include <asm/semaphore.h>
--- linux-2.4.18-pre1/mm/filemap.c Wed Dec 26 11:47:41 2001
+++ linux-akpm/mm/filemap.c Sat Jan 5 23:49:00 2002
@@ -296,10 +296,7 @@ static int truncate_list_pages(struct li

page_cache_release(page);

- if (current->need_resched) {
- __set_current_state(TASK_RUNNING);
- schedule();
- }
+ conditional_schedule();

spin_lock(&pagecache_lock);
goto restart;
@@ -609,6 +606,7 @@ void filemap_fdatasync(struct address_sp
UnlockPage(page);

page_cache_release(page);
+ conditional_schedule();
spin_lock(&pagecache_lock);
}
spin_unlock(&pagecache_lock);
@@ -1392,6 +1390,9 @@ page_ok:
offset &= ~PAGE_CACHE_MASK;

page_cache_release(page);
+
+ conditional_schedule();
+
if (ret == nr && desc->count)
continue;
break;
@@ -3025,6 +3026,8 @@ unlock:
SetPageReferenced(page);
UnlockPage(page);
page_cache_release(page);
+
+ conditional_schedule();

if (status < 0)
break;
--- linux-2.4.18-pre1/drivers/block/ll_rw_blk.c Mon Nov 5 21:01:11 2001
+++ linux-akpm/drivers/block/ll_rw_blk.c Sat Jan 5 23:49:00 2002
@@ -913,6 +913,7 @@ void submit_bh(int rw, struct buffer_hea
kstat.pgpgin += count;
break;
}
+ conditional_schedule();
}

/**