<!-- received="Wed Apr 26 16:43:30 2000 EET DST" -->
<!-- sent="Wed, 26 Apr 2000 10:36:10 -0300 (BRST)" -->
<!-- name="Rik van Riel" -->
<!-- email="riel@conectiva.com.br" -->
<!-- subject="[patch] 2.3.99-pre6-3 VM fixed" -->
<!-- id="" -->
<!-- inreplyto="" -->
<title>Linux-kernel mailing list archive 2000-17: [patch] 2.3.99-pre6-3 VM fixed</title>
<body bgcolor="#FFFFFF"><font face="Arial,Helvetica">
<h1>[patch] 2.3.99-pre6-3 VM fixed</h1>
<b>Rik van Riel</b> (<a href="mailto:riel@conectiva.com.br"><i>riel@conectiva.com.br</i></a>)<br>
<i>Wed, 26 Apr 2000 10:36:10 -0300 (BRST)</i>
<p>
<ul>
<li> <b>Messages sorted by:</b> <a href="date.html#489">[ date ]</a><a href="index.html#489">[ thread ]</a><a href="subject.html#489">[ subject ]</a><a href="author.html#489">[ author ]</a>
<!-- next="start" -->
<li> <b>Next message:</b> <a href="0490.html">Amit S. Kale: "Re: kernel debugger"</a>
<li> <b>Previous message:</b> <a href="0488.html">Catilina: "Re: PROBLEM: umountfs and shutting down linux 2.3.99-pre5"</a>
<!-- nextthread="start" -->
<!-- reply="end" -->
</ul>
<hr>
<!-- body="start" -->
Hi,<br>
<p>
The attached patch should fix most of the VM performance problems<br>
2.3 was having. It does the following things:<br>
<p>
- have a global lru queue for shrink_mmap(), so balancing<br>
  between zones is achieved<br>
- protection against memory hogs, by scanning memory hogs<br>
  more aggressively than other processes in swap_out()<br>
	- aggressiveness (A:B) = sqrt (size A: size B)<br>
              [very rough approximation used in the code]<br>
	- if there is memory pressure, the biggest processes<br>
	  will call swap_out() before doing a memory allocation,<br>
          this will keep enough freeable pages in the LRU queue<br>
	  to make life for kswapd easy and let small processes<br>
	  run fast<br>
- since the memory of memory hogs is scanned more aggressively<br>
  and more of the hog's pages end up on the lru queue, page<br>
  aging for the memory hog is better ... this often results in<br>
  better performance for the memory hog too<br>
- the LRU queue aging in shrink_mmap() is improved a bit<br>
<p>
<p>
The patch runs great in a variety of workloads I've tested here,<br>
but of course I'm not sure if it works as well as it should in<br>
*your* workload, so testing is wanted/needed/appreciated...<br>
<p>
TODO:<br>
- make the "anti hog" code sysctl switchable if it turns out<br>
  that performance of some memory hogs gets worse because of<br>
  the anti hog measures<br>
<p>
regards,<br>
<p>
Rik<br>
<pre>
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.
<p>
Wanna talk about the kernel?  irc.openprojects.net / #kernelnewbies
<a href="http://www.conectiva.com/">http://www.conectiva.com/</a>		http://www.surriel.com/
<p>
<p>
<p>
--- linux-2.3.99-pre6-3/mm/filemap.c.orig	Mon Apr 17 12:21:46 2000
+++ linux-2.3.99-pre6-3/mm/filemap.c	Tue Apr 25 18:39:29 2000
@@ -44,6 +44,7 @@
 atomic_t page_cache_size = ATOMIC_INIT(0);
 unsigned int page_hash_bits;
 struct page **page_hash_table;
+struct list_head lru_cache;
 
 spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
 /*
@@ -149,11 +150,16 @@
 
 		/* page wholly truncated - free it */
 		if (offset &gt;= start) {
+			if (TryLockPage(page)) {
+				spin_unlock(&amp;pagecache_lock);
+				get_page(page);
+				wait_on_page(page);
+				put_page(page);
+				goto repeat;
+			}
 			get_page(page);
 			spin_unlock(&amp;pagecache_lock);
 
-			lock_page(page);
-
 			if (!page-&gt;buffers || block_flushpage(page, 0))
 				lru_cache_del(page);
 
@@ -191,11 +197,13 @@
 			continue;
 
 		/* partial truncate, clear end of page */
+		if (TryLockPage(page)) {
+			spin_unlock(&amp;pagecache_lock);
+			goto repeat;
+		}
 		get_page(page);
 		spin_unlock(&amp;pagecache_lock);
 
-		lock_page(page);
-
 		memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
 		if (page-&gt;buffers)
 			block_flushpage(page, partial);
@@ -208,6 +216,9 @@
 		 */
 		UnlockPage(page);
 		page_cache_release(page);
+		get_page(page);
+		wait_on_page(page);
+		put_page(page);
 		goto repeat;
 	}
 	spin_unlock(&amp;pagecache_lock);
@@ -215,46 +226,61 @@
 
 int shrink_mmap(int priority, int gfp_mask, zone_t *zone)
 {
-	int ret = 0, count;
+	int ret = 0, loop = 0, count;
 	LIST_HEAD(young);
 	LIST_HEAD(old);
 	LIST_HEAD(forget);
 	struct list_head * page_lru, * dispose;
-	struct page * page;
-
+	struct page * page = NULL;
+	struct zone_struct * p_zone;
+	int maxloop = 256 &gt;&gt; priority;
+	
 	if (!zone)
 		BUG();
 
-	count = nr_lru_pages / (priority+1);
+	/* the first term should be very small when nr_lru_pages is small */
+	/*
+	count = (10 * nr_lru_pages * nr_lru_pages) / num_physpages;
+	count += nr_lru_pages;
+	count &gt;&gt;= priority;
+	*/
+	count = nr_lru_pages &gt;&gt; priority;
+	if (!count)
+		return ret;
 
 	spin_lock(&amp;pagemap_lru_lock);
-
-	while (count &gt; 0 &amp;&amp; (page_lru = zone-&gt;lru_cache.prev) != &amp;zone-&gt;lru_cache) {
+again:
+	/* we need pagemap_lru_lock for list_del() ... subtle code below */
+	while (count &gt; 0 &amp;&amp; (page_lru = lru_cache.prev) != &amp;lru_cache) {
 		page = list_entry(page_lru, struct page, lru);
 		list_del(page_lru);
+		p_zone = page-&gt;zone;
 
-		dispose = &amp;zone-&gt;lru_cache;
-		if (test_and_clear_bit(PG_referenced, &amp;page-&gt;flags))
-			/* Roll the page at the top of the lru list,
-			 * we could also be more aggressive putting
-			 * the page in the young-dispose-list, so
-			 * avoiding to free young pages in each pass.
-			 */
-			goto dispose_continue;
-
+		/*
+		 * These two tests are there to make sure we don't free too
+		 * many pages from the "wrong" zone. We free some anyway,
+		 * they are the least recently used pages in the system.
+		 * When we don't free them, leave them in &amp;old.
+		 */
 		dispose = &amp;old;
-		/* don't account passes over not DMA pages */
-		if (zone &amp;&amp; (!memclass(page-&gt;zone, zone)))
+		if (p_zone != zone &amp;&amp; (loop &gt; (maxloop / 4) ||
+				p_zone-&gt;free_pages &gt; p_zone-&gt;pages_high))
 			goto dispose_continue;
 
+		/* The page is in use, or was used very recently, put it in
+		 * &amp;young to make sure that we won't try to free it the next
+		 * time */
 		count--;
-
 		dispose = &amp;young;
-
-		/* avoid unscalable SMP locking */
 		if (!page-&gt;buffers &amp;&amp; page_count(page) &gt; 1)
 			goto dispose_continue;
 
+		/* Only count pages that have a chance of being freeable */
+		if (test_and_clear_bit(PG_referenced, &amp;page-&gt;flags))
+			goto dispose_continue;
+
+		/* Page not used -&gt; free it; if that fails -&gt; &amp;old */
+		dispose = &amp;old;
 		if (TryLockPage(page))
 			goto dispose_continue;
 
@@ -327,6 +353,7 @@
 		list_add(page_lru, dispose);
 		continue;
 
+		/* we're holding pagemap_lru_lock, so we can just loop again */
 dispose_continue:
 		list_add(page_lru, dispose);
 	}
@@ -342,9 +369,14 @@
 	/* nr_lru_pages needs the spinlock */
 	nr_lru_pages--;
 
+	loop++;
+	/* wrong zone?  not looped too often?    roll again... */
+	if (page-&gt;zone != zone &amp;&amp; loop &lt; maxloop)
+		goto again;
+
 out:
-	list_splice(&amp;young, &amp;zone-&gt;lru_cache);
-	list_splice(&amp;old, zone-&gt;lru_cache.prev);
+	list_splice(&amp;young, &amp;lru_cache);
+	list_splice(&amp;old, lru_cache.prev);
 
 	spin_unlock(&amp;pagemap_lru_lock);
 
--- linux-2.3.99-pre6-3/mm/page_alloc.c.orig	Mon Apr 17 12:21:46 2000
+++ linux-2.3.99-pre6-3/mm/page_alloc.c	Wed Apr 26 08:35:01 2000
@@ -25,7 +25,7 @@
 #endif
 
 int nr_swap_pages = 0;
-int nr_lru_pages;
+int nr_lru_pages = 0;
 pg_data_t *pgdat_list = (pg_data_t *)0;
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -33,6 +33,7 @@
 static int zone_balance_min[MAX_NR_ZONES] = { 10 , 10, 10, };
 static int zone_balance_max[MAX_NR_ZONES] = { 255 , 255, 255, };
 
+extern int swap_out(unsigned int, int);
 /*
  * Free_page() adds the page to the free lists. This is optimized for
  * fast normal cases (no error jumps taken normally).
@@ -273,6 +274,7 @@
 struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
 {
 	zone_t **zone = zonelist-&gt;zones;
+	int gfp_mask = zonelist-&gt;gfp_mask;
 
 	/*
 	 * If this is a recursive call, we'd better
@@ -282,6 +284,13 @@
 	if (current-&gt;flags &amp; PF_MEMALLOC)
 		goto allocate_ok;
 
+	/* If we're a memory hog, unmap some pages */
+	if (current-&gt;hog &amp;&amp; (gfp_mask &amp; __GFP_WAIT)) {
+		zone_t *z = *zone;
+	       	if (z-&gt;zone_wake_kswapd)
+			swap_out(6, gfp_mask);
+	}
+
 	/*
 	 * (If anyone calls gfp from interrupts nonatomically then it
 	 * will sooner or later tripped up by a schedule().)
@@ -530,6 +539,7 @@
 	freepages.min += i;
 	freepages.low += i * 2;
 	freepages.high += i * 3;
+	memlist_init(&amp;lru_cache);
 
 	/*
 	 * Some architectures (with lots of mem and discontinous memory
@@ -609,7 +619,6 @@
 			unsigned long bitmap_size;
 
 			memlist_init(&amp;zone-&gt;free_area[i].free_list);
-			memlist_init(&amp;zone-&gt;lru_cache);
 			mask += mask;
 			size = (size + ~mask) &amp; mask;
 			bitmap_size = size &gt;&gt; i;
--- linux-2.3.99-pre6-3/mm/vmscan.c.orig	Mon Apr 17 12:21:46 2000
+++ linux-2.3.99-pre6-3/mm/vmscan.c	Wed Apr 26 07:39:53 2000
@@ -34,7 +34,7 @@
  * using a process that no longer actually exists (it might
  * have died while we slept).
  */
-static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
+static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
 {
 	pte_t pte;
 	swp_entry_t entry;
@@ -48,6 +48,7 @@
 	if ((page-mem_map &gt;= max_mapnr) || PageReserved(page))
 		goto out_failed;
 
+	mm-&gt;swap_cnt--;
 	/* Don't look at this pte if it's been accessed recently. */
 	if (pte_young(pte)) {
 		/*
@@ -194,7 +195,7 @@
  * (C) 1993 Kai Petzke, <a href="mailto:wpp@marie.physik.tu-berlin.de">wpp@marie.physik.tu-berlin.de</a>
  */
 
-static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 {
 	pte_t * pte;
 	unsigned long pmd_end;
@@ -216,16 +217,18 @@
 	do {
 		int result;
 		vma-&gt;vm_mm-&gt;swap_address = address + PAGE_SIZE;
-		result = try_to_swap_out(vma, address, pte, gfp_mask);
+		result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
 		if (result)
 			return result;
+		if (!mm-&gt;swap_cnt)
+			return 0;
 		address += PAGE_SIZE;
 		pte++;
 	} while (address &amp;&amp; (address &lt; end));
 	return 0;
 }
 
-static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 {
 	pmd_t * pmd;
 	unsigned long pgd_end;
@@ -245,16 +248,18 @@
 		end = pgd_end;
 	
 	do {
-		int result = swap_out_pmd(vma, pmd, address, end, gfp_mask);
+		int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
 		if (result)
 			return result;
+		if (!mm-&gt;swap_cnt)
+			return 0;
 		address = (address + PMD_SIZE) &amp; PMD_MASK;
 		pmd++;
 	} while (address &amp;&amp; (address &lt; end));
 	return 0;
 }
 
-static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int gfp_mask)
+static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
 {
 	pgd_t *pgdir;
 	unsigned long end;
@@ -269,9 +274,11 @@
 	if (address &gt;= end)
 		BUG();
 	do {
-		int result = swap_out_pgd(vma, pgdir, address, end, gfp_mask);
+		int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
 		if (result)
 			return result;
+		if (!mm-&gt;swap_cnt)
+			return 0;
 		address = (address + PGDIR_SIZE) &amp; PGDIR_MASK;
 		pgdir++;
 	} while (address &amp;&amp; (address &lt; end));
@@ -299,7 +306,7 @@
 			address = vma-&gt;vm_start;
 
 		for (;;) {
-			int result = swap_out_vma(vma, address, gfp_mask);
+			int result = swap_out_vma(mm, vma, address, gfp_mask);
 			if (result)
 				return result;
 			vma = vma-&gt;vm_next;
@@ -321,7 +328,7 @@
  * N.B. This function returns only 0 or 1.  Return values != 1 from
  * the lower level routines result in continued processing.
  */
-static int swap_out(unsigned int priority, int gfp_mask)
+int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p;
 	int counter;
@@ -369,9 +376,28 @@
 				pid = p-&gt;pid;
 			}
 		}
-		read_unlock(&amp;tasklist_lock);
-		if (assign == 1)
+		if (assign == 1) {
+			/* we just assigned swap_cnt, normalise values */
 			assign = 2;
+			p = init_task.next_task;
+			for (; p != &amp;init_task; p = p-&gt;next_task) {
+				int i = 0;
+				struct mm_struct *mm = p-&gt;mm;
+				if (!p-&gt;swappable || !mm || mm-&gt;rss &lt;= 0)
+					continue;
+				/* small processes are swapped out less */
+				while ((mm-&gt;swap_cnt &lt;&lt; 2 * (i + 1) &lt; max_cnt))
+					i++;
+				mm-&gt;swap_cnt &gt;&gt;= i;
+				mm-&gt;swap_cnt += i; /* in case we reach 0 */
+				/* we're big -&gt; hog treatment */
+				if (!i)
+					p-&gt;hog = 1;
+				else
+					p-&gt;hog = 0;
+			}
+		}
+		read_unlock(&amp;tasklist_lock);
 		if (!best) {
 			if (!assign) {
 				assign = 1;
@@ -412,13 +438,16 @@
 {
 	int priority;
 	int count = SWAP_CLUSTER_MAX;
+	int swapcount = SWAP_CLUSTER_MAX;
+	int ret;
 
 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);
 
 	priority = 6;
 	do {
-		while (shrink_mmap(priority, gfp_mask, zone)) {
+free_more:
+		while ((ret = shrink_mmap(priority, gfp_mask, zone))) {
 			if (!--count)
 				goto done;
 		}
@@ -441,9 +470,13 @@
 			}
 		}
 
-		/* Then, try to page stuff out.. */
+		/* Then, try to page stuff out..
+		 * We use swapcount here because this doesn't actually
+		 * free pages */
 		while (swap_out(priority, gfp_mask)) {
-			if (!--count)
+			if (!--swapcount)
+				if (count)
+					goto free_more;
 				goto done;
 		}
 	} while (--priority &gt;= 0);
--- linux-2.3.99-pre6-3/include/linux/mm.h.orig	Mon Apr 17 12:22:22 2000
+++ linux-2.3.99-pre6-3/include/linux/mm.h	Wed Apr 26 07:40:34 2000
@@ -15,6 +15,7 @@
 extern unsigned long num_physpages;
 extern void * high_memory;
 extern int page_cluster;
+extern struct list_head lru_cache;
 
 #include &lt;asm/page.h&gt;
 #include &lt;asm/pgtable.h&gt;
--- linux-2.3.99-pre6-3/include/linux/mmzone.h.orig	Mon Apr 17 12:22:22 2000
+++ linux-2.3.99-pre6-3/include/linux/mmzone.h	Sat Apr 22 16:13:02 2000
@@ -31,7 +31,6 @@
 	char			low_on_memory;
 	char			zone_wake_kswapd;
 	unsigned long		pages_min, pages_low, pages_high;
-	struct list_head	lru_cache;
 
 	/*
 	 * free areas of different sizes
--- linux-2.3.99-pre6-3/include/linux/sched.h.orig	Mon Apr 17 12:22:23 2000
+++ linux-2.3.99-pre6-3/include/linux/sched.h	Wed Apr 26 07:26:57 2000
@@ -321,6 +321,7 @@
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
 	int swappable:1;
+	int hog:1;
 /* process credentials */
 	uid_t uid,euid,suid,fsuid;
 	gid_t gid,egid,sgid,fsgid;
--- linux-2.3.99-pre6-3/include/linux/swap.h.orig	Mon Apr 17 12:22:23 2000
+++ linux-2.3.99-pre6-3/include/linux/swap.h	Sat Apr 22 16:19:38 2000
@@ -166,7 +166,7 @@
 #define	lru_cache_add(page)			\
 do {						\
 	spin_lock(&amp;pagemap_lru_lock);		\
-	list_add(&amp;(page)-&gt;lru, &amp;page-&gt;zone-&gt;lru_cache);	\
+	list_add(&amp;(page)-&gt;lru, &amp;lru_cache);	\
 	nr_lru_pages++;				\
 	spin_unlock(&amp;pagemap_lru_lock);		\
 } while (0)
<p>
<p>
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at <a href="http://www.tux.org/lkml/">http://www.tux.org/lkml/</a>
</pre>
<!-- body="end" -->
<hr>
<p>
<ul>
<!-- next="start" -->
<li> <b>Next message:</b> <a href="0490.html">Amit S. Kale: "Re: kernel debugger"</a>
<li> <b>Previous message:</b> <a href="0488.html">Catilina: "Re: PROBLEM: umountfs and shutting down linux 2.3.99-pre5"</a>
<!-- nextthread="start" -->
<!-- reply="end" -->
</ul>
</font></body>
