I've been using the patch below from Andrew for some weeks now, sometimes
under quite heavy load, and find it quite stable.
Just wanted to say ...
roy
> > I don't think Andrew is ready to submit this yet ... before anything
> > gets merged back, it'd be very worthwhile testing the relative
> > performance of both solutions ... the more testers we have the
> > better ;-)
>
> Cripes no. It's pretty experimental. Andrea spotted a bug, too. Fixed
> version is below.
>
> It's possible that keeping the number of buffers as low as possible
> will give improved performance over Andrea's approach because it
> leaves more ZONE_NORMAL for other things. It's also possible that
> it'll give worse performance because more get_block's need to be
> done for file overwriting.
>
>
> --- 2.4.19-pre8/include/linux/pagemap.h~nuke-buffers Fri May 24 12:24:56
> 2002 +++ 2.4.19-pre8-akpm/include/linux/pagemap.h Fri May 24 12:26:30 2002
> @@ -89,13 +89,7 @@ extern void add_to_page_cache(struct pag
> extern void add_to_page_cache_locked(struct page * page, struct
> address_space *mapping, unsigned long index); extern int
> add_to_page_cache_unique(struct page * page, struct address_space *mapping,
> unsigned long index, struct page **hash);
>
> -extern void ___wait_on_page(struct page *);
> -
> -static inline void wait_on_page(struct page * page)
> -{
> - if (PageLocked(page))
> - ___wait_on_page(page);
> -}
> +extern void wait_on_page(struct page *);
>
> extern struct page * grab_cache_page (struct address_space *, unsigned
> long); extern struct page * grab_cache_page_nowait (struct address_space *,
> unsigned long); --- 2.4.19-pre8/mm/filemap.c~nuke-buffers Fri May 24
> 12:24:56 2002 +++ 2.4.19-pre8-akpm/mm/filemap.c Fri May 24 12:24:56 2002
> @@ -608,7 +608,7 @@ int filemap_fdatawait(struct address_spa
> page_cache_get(page);
> spin_unlock(&pagecache_lock);
>
> - ___wait_on_page(page);
> + wait_on_page(page);
> if (PageError(page))
> ret = -EIO;
>
> @@ -805,33 +805,29 @@ static inline wait_queue_head_t *page_wa
> return &wait[hash];
> }
>
> -/*
> - * Wait for a page to get unlocked.
> +static void kill_buffers(struct page *page)
> +{
> + if (!PageLocked(page))
> + BUG();
> + if (page->buffers)
> + try_to_release_page(page, GFP_NOIO);
> +}
> +
> +/*
> + * Wait for a page to come unlocked. Then try to ditch its buffer_heads.
> *
> - * This must be called with the caller "holding" the page,
> - * ie with increased "page->count" so that the page won't
> - * go away during the wait..
> + * FIXME: Make the ditching dependent on CONFIG_MONSTER_BOX or something.
> */
> -void ___wait_on_page(struct page *page)
> +void wait_on_page(struct page *page)
> {
> - wait_queue_head_t *waitqueue = page_waitqueue(page);
> - struct task_struct *tsk = current;
> - DECLARE_WAITQUEUE(wait, tsk);
> -
> - add_wait_queue(waitqueue, &wait);
> - do {
> - set_task_state(tsk, TASK_UNINTERRUPTIBLE);
> - if (!PageLocked(page))
> - break;
> - sync_page(page);
> - schedule();
> - } while (PageLocked(page));
> - __set_task_state(tsk, TASK_RUNNING);
> - remove_wait_queue(waitqueue, &wait);
> + lock_page(page);
> + kill_buffers(page);
> + unlock_page(page);
> }
> +EXPORT_SYMBOL(wait_on_page);
>
> /*
> - * Unlock the page and wake up sleepers in ___wait_on_page.
> + * Unlock the page and wake up sleepers in lock_page.
> */
> void unlock_page(struct page *page)
> {
> @@ -1400,6 +1396,11 @@ found_page:
>
> if (!Page_Uptodate(page))
> goto page_not_up_to_date;
> + if (page->buffers) {
> + lock_page(page);
> + kill_buffers(page);
> + unlock_page(page);
> + }
> generic_file_readahead(reada_ok, filp, inode, page);
> page_ok:
> /* If users can be writing to this page using arbitrary
> @@ -1457,6 +1458,7 @@ page_not_up_to_date:
>
> /* Did somebody else fill it already? */
> if (Page_Uptodate(page)) {
> + kill_buffers(page);
> UnlockPage(page);
> goto page_ok;
> }
> @@ -1948,6 +1950,11 @@ retry_find:
> */
> if (!Page_Uptodate(page))
> goto page_not_uptodate;
> + if (page->buffers) {
> + lock_page(page);
> + kill_buffers(page);
> + unlock_page(page);
> + }
>
> success:
> /*
> @@ -2006,6 +2013,7 @@ page_not_uptodate:
>
> /* Did somebody else get it up-to-date? */
> if (Page_Uptodate(page)) {
> + kill_buffers(page);
> UnlockPage(page);
> goto success;
> }
> @@ -2033,6 +2041,7 @@ page_not_uptodate:
>
> /* Somebody else successfully read it in? */
> if (Page_Uptodate(page)) {
> + kill_buffers(page);
> UnlockPage(page);
> goto success;
> }
> @@ -2850,6 +2859,7 @@ retry:
> goto retry;
> }
> if (Page_Uptodate(page)) {
> + kill_buffers(page);
> UnlockPage(page);
> goto out;
> }
> --- 2.4.19-pre8/kernel/ksyms.c~nuke-buffers Fri May 24 12:24:56 2002
> +++ 2.4.19-pre8-akpm/kernel/ksyms.c Fri May 24 12:24:56 2002
> @@ -202,7 +202,6 @@ EXPORT_SYMBOL(ll_rw_block);
> EXPORT_SYMBOL(submit_bh);
> EXPORT_SYMBOL(unlock_buffer);
> EXPORT_SYMBOL(__wait_on_buffer);
> -EXPORT_SYMBOL(___wait_on_page);
> EXPORT_SYMBOL(generic_direct_IO);
> EXPORT_SYMBOL(discard_bh_page);
> EXPORT_SYMBOL(block_write_full_page);
> --- 2.4.19-pre8/mm/vmscan.c~nuke-buffers Fri May 24 12:24:56 2002
> +++ 2.4.19-pre8-akpm/mm/vmscan.c Fri May 24 12:24:56 2002
> @@ -365,8 +365,13 @@ static int shrink_cache(int nr_pages, zo
> if (unlikely(!page_count(page)))
> continue;
>
> - if (!memclass(page_zone(page), classzone))
> + if (!memclass(page_zone(page), classzone)) {
> + if (page->buffers && !TryLockPage(page)) {
> + try_to_release_page(page, GFP_NOIO);
> + unlock_page(page);
> + }
> continue;
> + }
>
> /* Racy check to avoid trylocking when not worthwhile */
> if (!page->buffers && (page_count(page) != 1 || !page->mapping))
> @@ -562,6 +567,11 @@ static int shrink_caches(zone_t * classz
> nr_pages -= kmem_cache_reap(gfp_mask);
> if (nr_pages <= 0)
> return 0;
> + if ((gfp_mask & __GFP_WAIT) && (shrink_buffer_cache() > 16)) {
> + nr_pages -= kmem_cache_reap(gfp_mask);
> + if (nr_pages <= 0)
> + return 0;
> + }
>
> nr_pages = chunk_size;
> /* try to keep the active list 2/3 of the size of the cache */
> --- 2.4.19-pre8/fs/buffer.c~nuke-buffers Fri May 24 12:24:56 2002
> +++ 2.4.19-pre8-akpm/fs/buffer.c Fri May 24 12:26:28 2002
> @@ -1500,6 +1500,10 @@ static int __block_write_full_page(struc
> /* Stage 3: submit the IO */
> do {
> struct buffer_head *next = bh->b_this_page;
> + /*
> + * Stick it on BUF_LOCKED so shrink_buffer_cache() can nail it.
> + */
> + refile_buffer(bh);
> submit_bh(WRITE, bh);
> bh = next;
> } while (bh != head);
> @@ -2615,6 +2619,25 @@ static int sync_page_buffers(struct buff
> int try_to_free_buffers(struct page * page, unsigned int gfp_mask)
> {
> struct buffer_head * tmp, * bh = page->buffers;
> + int was_uptodate = 1;
> +
> + if (!PageLocked(page))
> + BUG();
> +
> + if (!bh)
> + return 1;
> + /*
> + * Quick check for freeable buffers before we go take three
> + * global locks.
> + */
> + if (!(gfp_mask & __GFP_IO)) {
> + tmp = bh;
> + do {
> + if (buffer_busy(tmp))
> + return 0;
> + tmp = tmp->b_this_page;
> + } while (tmp != bh);
> + }
>
> cleaned_buffers_try_again:
> spin_lock(&lru_list_lock);
> @@ -2637,7 +2660,8 @@ cleaned_buffers_try_again:
> tmp = tmp->b_this_page;
>
> if (p->b_dev == B_FREE) BUG();
> -
> + if (!buffer_uptodate(p))
> + was_uptodate = 0;
> remove_inode_queue(p);
> __remove_from_queues(p);
> __put_unused_buffer_head(p);
> @@ -2645,7 +2669,15 @@ cleaned_buffers_try_again:
> spin_unlock(&unused_list_lock);
>
> /* Wake up anyone waiting for buffer heads */
> - wake_up(&buffer_wait);
> + smp_mb();
> + if (waitqueue_active(&buffer_wait))
> + wake_up(&buffer_wait);
> +
> + /*
> + * Make sure we don't read buffers again when they are reattached
> + */
> + if (was_uptodate)
> + SetPageUptodate(page);
>
> /* And free the page */
> page->buffers = NULL;
> @@ -2674,6 +2706,62 @@ busy_buffer_page:
> }
> EXPORT_SYMBOL(try_to_free_buffers);
>
> +/*
> + * Returns the number of pages which might have become freeable
> + */
> +int shrink_buffer_cache(void)
> +{
> + struct buffer_head *bh;
> + int nr_todo;
> + int nr_shrunk = 0;
> +
> + /*
> + * Move any clean unlocked buffers from BUF_LOCKED onto BUF_CLEAN
> + */
> + spin_lock(&lru_list_lock);
> + for ( ; ; ) {
> + bh = lru_list[BUF_LOCKED];
> + if (!bh || buffer_locked(bh))
> + break;
> + __refile_buffer(bh);
> + }
> +
> + /*
> + * Now start liberating buffers
> + */
> + nr_todo = nr_buffers_type[BUF_CLEAN];
> + while (nr_todo--) {
> + struct page *page;
> +
> + bh = lru_list[BUF_CLEAN];
> + if (!bh)
> + break;
> +
> + /*
> + * Park the buffer on BUF_LOCKED so we don't revisit it on
> + * this pass.
> + */
> + __remove_from_lru_list(bh);
> + bh->b_list = BUF_LOCKED;
> + __insert_into_lru_list(bh, BUF_LOCKED);
> + page = bh->b_page;
> + if (TryLockPage(page))
> + continue;
> +
> + page_cache_get(page);
> + spin_unlock(&lru_list_lock);
> + if (try_to_release_page(page, GFP_NOIO))
> + nr_shrunk++;
> + unlock_page(page);
> + page_cache_release(page);
> + spin_lock(&lru_list_lock);
> + }
> + spin_unlock(&lru_list_lock);
> +// printk("%s: liberated %d page's worth of buffer_heads\n",
> +// __FUNCTION__, nr_shrunk);
> + return (nr_shrunk * sizeof(struct buffer_head)) / PAGE_CACHE_SIZE;
> +}
> +
> /* ================== Debugging =================== */
>
> void show_buffers(void)
> @@ -2988,6 +3076,7 @@ int kupdate(void *startup)
> #ifdef DEBUG
> printk(KERN_DEBUG "kupdate() activated...\n");
> #endif
> + shrink_buffer_cache();
> sync_old_buffers();
> run_task_queue(&tq_disk);
> }
> --- 2.4.19-pre8/include/linux/fs.h~nuke-buffers Fri May 24 12:24:56 2002
> +++ 2.4.19-pre8-akpm/include/linux/fs.h Fri May 24 12:24:56 2002
> @@ -1116,6 +1116,7 @@ extern int FASTCALL(try_to_free_buffers(
> extern void refile_buffer(struct buffer_head * buf);
> extern void create_empty_buffers(struct page *, kdev_t, unsigned long);
> extern void end_buffer_io_sync(struct buffer_head *bh, int uptodate);
> +extern int shrink_buffer_cache(void);
>
> /* reiserfs_writepage needs this */
> extern void set_buffer_async_io(struct buffer_head *bh) ;
>
>
> -
-- Roy Sigurd Karlsbakk, DatavaktmesterComputers are like air conditioners. They stop working when you open Windows.
- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/