<!-- received="Sun Apr 23 05:16:36 2000 EET DST" -->
<!-- sent="Sat, 22 Apr 2000 23:08:35 -0300 (BRST)" -->
<!-- name="Rik van Riel" -->
<!-- email="riel@conectiva.com.br" -->
<!-- subject="[PATCH] 2.3.99-pre6-3+  VM rebalancing" -->
<!-- id="" -->
<!-- inreplyto="" -->
<title>Linux-kernel mailing list archive 2000-17,: [PATCH] 2.3.99-pre6-3+  VM rebalancing</title>
<body bgcolor="#FFFFFF"><font face="Arial,Helvetica">
<h1>[PATCH] 2.3.99-pre6-3+  VM rebalancing</h1>
<b>Rik van Riel</b> (<a href="mailto:riel@conectiva.com.br"><i>riel@conectiva.com.br</i></a>)<br>
<i>Sat, 22 Apr 2000 23:08:35 -0300 (BRST)</i>
<p>
<ul>
<li> <b>Messages sorted by:</b> <a href="date.html#22">[ date ]</a><a href="index.html#22">[ thread ]</a><a href="subject.html#22">[ subject ]</a><a href="author.html#22">[ author ]</a>
<!-- next="start" -->
<li> <b>Next message:</b> <a href="0023.html">Alexander Viro: "Re: [PATCH] f_op-&gt;poll() without lock_kernel()"</a>
<li> <b>Previous message:</b> <a href="0021.html">Daniel Stone: "RE: [-pre6-3] Oops on shutdown"</a>
<!-- nextthread="start" -->
<li> <b>Next in thread:</b> <a href="0394.html">Jeff Garzik: "Re: [PATCH] 2.3.99-pre6-3+  VM rebalancing"</a>
<li> <b>Reply:</b> <a href="0394.html">Jeff Garzik: "Re: [PATCH] 2.3.99-pre6-3+  VM rebalancing"</a>
<li> <b>Reply:</b> <a href="0473.html">David S. Miller: "Re: [PATCH] 2.3.99-pre6-3+  VM rebalancing"</a>
<li> <b>Reply:</b> <a href="0482.html">David S. Miller: "Re: [PATCH] 2.3.99-pre6-3+  VM rebalancing"</a>
<li> <b>Reply:</b> <a href="0486.html">David S. Miller: "Re: [PATCH] 2.3.99-pre6-3+  VM rebalancing"</a>
<li> <b>Reply:</b> <a href="0501.html">David S. Miller: "Re: [PATCH] 2.3.99-pre6-3+  VM rebalancing"</a>
<li> <b>Reply:</b> <a href="0505.html">David S. Miller: "Re: [PATCH] 2.3.99-pre6-3+  VM rebalancing"</a>
<!-- reply="end" -->
</ul>
<hr>
<!-- body="start" -->
Hi,<br>
<p>
the following patch makes the VM in 2.3.99-pre6+ behave more nicely<br>
than in previous versions. It does that by:<br>
<p>
- having a global lru queue for shrink_mmap()<br>
- slightly improving the lru scanning<br>
- being less aggressive with lru scanning, so we'll have<br>
  more pages in the lru queue and will do better page<br>
  aging (this also gives us a bigger buffer of clean pages,<br>
  so big memory hogs have less impact on the rest of<br>
  the system)<br>
- freeing some pages from the "wrong" zone when freeing<br>
  from one particular zone ... this keeps memory balanced<br>
  because __alloc_pages() will allocate most pages from<br>
  the least busy zone<br>
<p>
It has done some amazing things in test situations on my<br>
machine, but I have no idea what it'll do to kswapd cpu<br>
usage on &gt;1GB machines. I think that the extra freedom in<br>
allocation will offset the slightly more expensive freeing<br>
code almost all of the time.<br>
<p>
regards,<br>
<p>
Rik<br>
<pre>
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.
<p>
Wanna talk about the kernel?  irc.openprojects.net / #kernelnewbies
<a href="http://www.conectiva.com/">http://www.conectiva.com/</a>		http://www.surriel.com/
<p>
<p>
<p>
--- linux-2.3.99-pre6-3/mm/filemap.c.orig	Mon Apr 17 12:21:46 2000
+++ linux-2.3.99-pre6-3/mm/filemap.c	Sat Apr 22 22:14:10 2000
@@ -44,6 +44,7 @@
 atomic_t page_cache_size = ATOMIC_INIT(0);
 unsigned int page_hash_bits;
 struct page **page_hash_table;
+struct list_head lru_cache;
 
 spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
 /*
@@ -149,11 +150,16 @@
 
 		/* page wholly truncated - free it */
 		if (offset &gt;= start) {
+			if (TryLockPage(page)) {
+				spin_unlock(&amp;pagecache_lock);
+				get_page(page);
+				wait_on_page(page);
+				put_page(page);
+				goto repeat;
+			}
 			get_page(page);
 			spin_unlock(&amp;pagecache_lock);
 
-			lock_page(page);
-
 			if (!page-&gt;buffers || block_flushpage(page, 0))
 				lru_cache_del(page);
 
@@ -191,11 +197,13 @@
 			continue;
 
 		/* partial truncate, clear end of page */
+		if (TryLockPage(page)) {
+			spin_unlock(&amp;pagecache_lock);
+			goto repeat;
+		}
 		get_page(page);
 		spin_unlock(&amp;pagecache_lock);
 
-		lock_page(page);
-
 		memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
 		if (page-&gt;buffers)
 			block_flushpage(page, partial);
@@ -208,6 +216,9 @@
 		 */
 		UnlockPage(page);
 		page_cache_release(page);
+		get_page(page);
+		wait_on_page(page);
+		put_page(page);
 		goto repeat;
 	}
 	spin_unlock(&amp;pagecache_lock);
@@ -215,46 +226,56 @@
 
 int shrink_mmap(int priority, int gfp_mask, zone_t *zone)
 {
-	int ret = 0, count;
+	int ret = 0, loop = 0, count;
 	LIST_HEAD(young);
 	LIST_HEAD(old);
 	LIST_HEAD(forget);
 	struct list_head * page_lru, * dispose;
-	struct page * page;
-
+	struct page * page = NULL;
+	struct zone_struct * p_zone;
+	
 	if (!zone)
 		BUG();
 
-	count = nr_lru_pages / (priority+1);
+	count = nr_lru_pages &gt;&gt; priority;
+	if (!count)
+		return ret;
 
 	spin_lock(&amp;pagemap_lru_lock);
-
-	while (count &gt; 0 &amp;&amp; (page_lru = zone-&gt;lru_cache.prev) != &amp;zone-&gt;lru_cache) {
+again:
+	/* we need pagemap_lru_lock for list_del() ... subtle code below */
+	while (count &gt; 0 &amp;&amp; (page_lru = lru_cache.prev) != &amp;lru_cache) {
 		page = list_entry(page_lru, struct page, lru);
 		list_del(page_lru);
+		p_zone = page-&gt;zone;
 
-		dispose = &amp;zone-&gt;lru_cache;
-		if (test_and_clear_bit(PG_referenced, &amp;page-&gt;flags))
-			/* Roll the page at the top of the lru list,
-			 * we could also be more aggressive putting
-			 * the page in the young-dispose-list, so
-			 * avoiding to free young pages in each pass.
-			 */
-			goto dispose_continue;
-
+		/*
+		 * These two tests are there to make sure we don't free too
+		 * many pages from the "wrong" zone. We free some anyway,
+		 * they are the least recently used pages in the system.
+		 * When we don't free them, leave them in &amp;old.
+		 */
 		dispose = &amp;old;
-		/* don't account passes over not DMA pages */
-		if (zone &amp;&amp; (!memclass(page-&gt;zone, zone)))
+		if (p_zone-&gt;free_pages &gt; p_zone-&gt;pages_high)
 			goto dispose_continue;
 
-		count--;
-
+		if (loop &gt; 5 &amp;&amp; page-&gt;zone != zone)
+			goto dispose_continue;
+		
+		/* The page is in use, or was used very recently, put it in
+		 * &amp;young to make sure that we won't try to free it the next
+		 * time */
 		dispose = &amp;young;
-
-		/* avoid unscalable SMP locking */
 		if (!page-&gt;buffers &amp;&amp; page_count(page) &gt; 1)
 			goto dispose_continue;
 
+		/* Only count pages that have a chance of being freeable */
+		count--;
+		if (test_and_clear_bit(PG_referenced, &amp;page-&gt;flags))
+			goto dispose_continue;
+
+		/* Page not used -&gt; free it; if that fails -&gt; &amp;old */
+		dispose = &amp;old;
 		if (TryLockPage(page))
 			goto dispose_continue;
 
@@ -327,6 +348,7 @@
 		list_add(page_lru, dispose);
 		continue;
 
+		/* we're holding pagemap_lru_lock, so we can just loop again */
 dispose_continue:
 		list_add(page_lru, dispose);
 	}
@@ -342,9 +364,14 @@
 	/* nr_lru_pages needs the spinlock */
 	nr_lru_pages--;
 
+	loop++;
+	/* wrong zone?  not looped too often?    roll again... */
+	if (page-&gt;zone != zone &amp;&amp; loop &lt; (128 &gt;&gt; priority))
+		goto again;
+
 out:
-	list_splice(&amp;young, &amp;zone-&gt;lru_cache);
-	list_splice(&amp;old, zone-&gt;lru_cache.prev);
+	list_splice(&amp;young, &amp;lru_cache);
+	list_splice(&amp;old, lru_cache.prev);
 
 	spin_unlock(&amp;pagemap_lru_lock);
 
--- linux-2.3.99-pre6-3/mm/page_alloc.c.orig	Mon Apr 17 12:21:46 2000
+++ linux-2.3.99-pre6-3/mm/page_alloc.c	Sat Apr 22 17:28:31 2000
@@ -25,7 +25,7 @@
 #endif
 
 int nr_swap_pages = 0;
-int nr_lru_pages;
+int nr_lru_pages = 0;
 pg_data_t *pgdat_list = (pg_data_t *)0;
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -530,6 +530,7 @@
 	freepages.min += i;
 	freepages.low += i * 2;
 	freepages.high += i * 3;
+	memlist_init(&amp;lru_cache);
 
 	/*
 	 * Some architectures (with lots of mem and discontinous memory
@@ -609,7 +610,6 @@
 			unsigned long bitmap_size;
 
 			memlist_init(&amp;zone-&gt;free_area[i].free_list);
-			memlist_init(&amp;zone-&gt;lru_cache);
 			mask += mask;
 			size = (size + ~mask) &amp; mask;
 			bitmap_size = size &gt;&gt; i;
--- linux-2.3.99-pre6-3/include/linux/mm.h.orig	Mon Apr 17 12:22:22 2000
+++ linux-2.3.99-pre6-3/include/linux/mm.h	Sat Apr 22 16:13:15 2000
@@ -15,6 +15,7 @@
 extern unsigned long num_physpages;
 extern void * high_memory;
 extern int page_cluster;
+extern struct list_head lru_cache;
 
 #include &lt;asm/page.h&gt;
 #include &lt;asm/pgtable.h&gt;
--- linux-2.3.99-pre6-3/include/linux/mmzone.h.orig	Mon Apr 17 12:22:22 2000
+++ linux-2.3.99-pre6-3/include/linux/mmzone.h	Sat Apr 22 16:13:02 2000
@@ -31,7 +31,6 @@
 	char			low_on_memory;
 	char			zone_wake_kswapd;
 	unsigned long		pages_min, pages_low, pages_high;
-	struct list_head	lru_cache;
 
 	/*
 	 * free areas of different sizes
--- linux-2.3.99-pre6-3/include/linux/swap.h.orig	Mon Apr 17 12:22:23 2000
+++ linux-2.3.99-pre6-3/include/linux/swap.h	Sat Apr 22 16:19:38 2000
@@ -166,7 +166,7 @@
 #define	lru_cache_add(page)			\
 do {						\
 	spin_lock(&amp;pagemap_lru_lock);		\
-	list_add(&amp;(page)-&gt;lru, &amp;page-&gt;zone-&gt;lru_cache);	\
+	list_add(&amp;(page)-&gt;lru, &amp;lru_cache);	\
 	nr_lru_pages++;				\
 	spin_unlock(&amp;pagemap_lru_lock);		\
 } while (0)
<p>
<p>
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at <a href="http://www.tux.org/lkml/">http://www.tux.org/lkml/</a>
</pre>
<!-- body="end" -->
<hr>
<p>
<ul>
<!-- next="start" -->
<li> <b>Next message:</b> <a href="0023.html">Alexander Viro: "Re: [PATCH] f_op-&gt;poll() without lock_kernel()"</a>
<li> <b>Previous message:</b> <a href="0021.html">Daniel Stone: "RE: [-pre6-3] Oops on shutdown"</a>
<!-- nextthread="start" -->
<li> <b>Next in thread:</b> <a href="0394.html">Jeff Garzik: "Re: [PATCH] 2.3.99-pre6-3+  VM rebalancing"</a>
<li> <b>Reply:</b> <a href="0394.html">Jeff Garzik: "Re: [PATCH] 2.3.99-pre6-3+  VM rebalancing"</a>
<li> <b>Reply:</b> <a href="0473.html">David S. Miller: "Re: [PATCH] 2.3.99-pre6-3+  VM rebalancing"</a>
<li> <b>Reply:</b> <a href="0482.html">David S. Miller: "Re: [PATCH] 2.3.99-pre6-3+  VM rebalancing"</a>
<li> <b>Reply:</b> <a href="0486.html">David S. Miller: "Re: [PATCH] 2.3.99-pre6-3+  VM rebalancing"</a>
<li> <b>Reply:</b> <a href="0501.html">David S. Miller: "Re: [PATCH] 2.3.99-pre6-3+  VM rebalancing"</a>
<li> <b>Reply:</b> <a href="0505.html">David S. Miller: "Re: [PATCH] 2.3.99-pre6-3+  VM rebalancing"</a>
<!-- reply="end" -->
</ul>
</font></body>
