[patch] Re: 2.4.3-pre6: agpgart.o causes arch/i386/mm/ioremap.c hang

Andrew Morton (andrewm@uow.edu.au)
Fri, 23 Mar 2001 23:44:31 +1100


"Adam J. Richter" wrote:
>
> In case anyone is interested, the conflicting lock of
> init_mm.page_table_lock was acquired in line 1318 of mm/memory.c,
> in pte_alloc.

You can sorta blame me for that. I reviewed the locking in
the mmap_sem patch and said it was correct :(

I only checked that the new locking was correct, rather than
checking that the new locking *rules* were being complied
with kernel-wide.

pmd_alloc() has the same problem: it requires ->page_table_lock
to be held, and there are callers which don't hold it. Fixed in this patch.
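
In other words, the calling convention I'm assuming (and which the
mm.h and vmalloc.c hunks below document) is roughly the following.
This is an illustrative sketch only, not a hunk from the patch, and
the error handling is simplified:

	/* caller must provide mm, pgd and address; sketch only */
	spin_lock(&mm->page_table_lock);
	pmd = pmd_alloc(mm, pgd, address);		/* needs the lock */
	pte = pmd ? pte_alloc(mm, pmd, address) : NULL;	/* ditto */
	spin_unlock(&mm->page_table_lock);
	if (!pte)
		return -ENOMEM;		/* either allocation failed */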

Linus, this patch includes the mm->rss locking stuff which
I sent yesterday.

Also, is the comment over remap_page_range() true? Should the caller
be doing a down_write(mmap_sem)? If so, there are about thirty
callers which don't. I've looked at each one, and it would be safe
to do the down_write() *within* remap_page_range() instead.
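
To illustrate, taking the semaphore inside the function would look
something like this (a sketch only, not part of the patch below;
the existing body of the function is elided):

	int remap_page_range(unsigned long from, unsigned long phys_addr,
			     unsigned long size, pgprot_t prot)
	{
		struct mm_struct *mm = current->mm;
		int error = -ENOMEM;

		down_write(&mm->mmap_sem);
		/*
		 * ... the existing pgd/pmd walk goes here, setting `error'
		 * and taking mm->page_table_lock around the allocations ...
		 */
		up_write(&mm->mmap_sem);
		return error;
	}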

--- linux-2.4.3-pre6/fs/exec.c Thu Mar 22 18:52:52 2001
+++ linux-akpm/fs/exec.c Fri Mar 23 22:08:48 2001
@@ -252,6 +252,8 @@
/*
* This routine is used to map in a page into an address space: needed by
* execve() for the initial stack and environment pages.
+ *
+ * tsk->mmap_sem is held for writing.
*/
void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address)
{
@@ -291,6 +293,7 @@
unsigned long stack_base;
struct vm_area_struct *mpnt;
int i;
+ unsigned long rss_increment = 0;

stack_base = STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE;

@@ -322,11 +325,14 @@
struct page *page = bprm->page[i];
if (page) {
bprm->page[i] = NULL;
- current->mm->rss++;
+ rss_increment++;
put_dirty_page(current,page,stack_base);
}
stack_base += PAGE_SIZE;
}
+ spin_lock(&current->mm->page_table_lock);
+ current->mm->rss += rss_increment;
+ spin_unlock(&current->mm->page_table_lock);
up_write(&current->mm->mmap_sem);

return 0;
--- linux-2.4.3-pre6/include/linux/mm.h Thu Mar 22 18:52:52 2001
+++ linux-akpm/include/linux/mm.h Fri Mar 23 23:32:00 2001
@@ -407,6 +407,8 @@
* On a two-level page table, this ends up being trivial. Thus the
* inlining and the symmetry break with pte_alloc() that does all
* of this out-of-line.
+ *
+ * __pmd_alloc() requires that mm->page_table_lock be held, so we do too.
*/
static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
--- linux-2.4.3-pre6/include/linux/sched.h Thu Mar 22 18:52:52 2001
+++ linux-akpm/include/linux/sched.h Fri Mar 23 22:08:48 2001
@@ -209,9 +209,12 @@
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
int map_count; /* number of VMAs */
struct rw_semaphore mmap_sem;
- spinlock_t page_table_lock;
+ spinlock_t page_table_lock; /* Protects task page tables and mm->rss */

- struct list_head mmlist; /* List of all active mm's */
+ struct list_head mmlist; /* List of all active mm's. These are globally strung
+ * together off init_mm.mmlist, and are protected
+ * by mmlist_lock
+ */

unsigned long start_code, end_code, start_data, end_data;
unsigned long start_brk, brk, start_stack;
--- linux-2.4.3-pre6/mm/memory.c Fri Mar 23 22:16:55 2001
+++ linux-akpm/mm/memory.c Fri Mar 23 23:32:05 2001
@@ -374,7 +374,6 @@
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
} while (address && (address < end));
- spin_unlock(&mm->page_table_lock);
/*
* Update rss for the mm_struct (not necessarily current->mm)
* Notice that rss is an unsigned long.
@@ -383,6 +382,7 @@
mm->rss -= freed;
else
mm->rss = 0;
+ spin_unlock(&mm->page_table_lock);
}


@@ -655,6 +655,7 @@
} while (address && (address < end));
}

+/* mm->page_table_lock must be held */
static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address,
unsigned long size, pgprot_t prot)
{
@@ -734,6 +735,7 @@
} while (address && (address < end));
}

+/* mm->page_table_lock must be held */
static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size,
unsigned long phys_addr, pgprot_t prot)
{
@@ -792,6 +794,8 @@
* - flush the old one
* - update the page tables
* - inform the TLB about the new one
+ *
+ * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
*/
static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry)
{
@@ -800,6 +804,9 @@
update_mmu_cache(vma, address, entry);
}

+/*
+ * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
+ */
static inline void break_cow(struct vm_area_struct * vma, struct page * old_page, struct page * new_page, unsigned long address,
pte_t *page_table)
{
@@ -1024,8 +1031,7 @@
}

/*
- * We hold the mm semaphore and the page_table_lock on entry
- * and exit.
+ * We hold the mm semaphore and the page_table_lock on entry and exit.
*/
static int do_swap_page(struct mm_struct * mm,
struct vm_area_struct * vma, unsigned long address,
--- linux-2.4.3-pre6/mm/mmap.c Thu Mar 22 18:52:52 2001
+++ linux-akpm/mm/mmap.c Fri Mar 23 22:08:48 2001
@@ -889,8 +889,8 @@
spin_lock(&mm->page_table_lock);
mpnt = mm->mmap;
mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL;
- spin_unlock(&mm->page_table_lock);
mm->rss = 0;
+ spin_unlock(&mm->page_table_lock);
mm->total_vm = 0;
mm->locked_vm = 0;

--- linux-2.4.3-pre6/mm/mremap.c Thu Mar 22 18:52:52 2001
+++ linux-akpm/mm/mremap.c Fri Mar 23 22:22:04 2001
@@ -15,6 +15,11 @@

extern int vm_enough_memory(long pages);

+/*
+ * Throughout all the following functions, mm->mmap_sem must be held for
+ * writing, and mm->page_table_lock must be held
+ */
+
static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr)
{
pgd_t * pgd;
--- linux-2.4.3-pre6/mm/swapfile.c Sun Feb 25 17:37:14 2001
+++ linux-akpm/mm/swapfile.c Fri Mar 23 22:08:48 2001
@@ -209,6 +209,7 @@
* share this swap entry, so be cautious and let do_wp_page work out
* what to do if a write is requested later.
*/
+/* tasklist_lock and vma->vm_mm->page_table_lock are held */
static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
pte_t *dir, swp_entry_t entry, struct page* page)
{
@@ -234,6 +235,7 @@
++vma->vm_mm->rss;
}

+/* tasklist_lock and vma->vm_mm->page_table_lock are held */
static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
unsigned long address, unsigned long size, unsigned long offset,
swp_entry_t entry, struct page* page)
@@ -261,6 +263,7 @@
} while (address && (address < end));
}

+/* tasklist_lock and vma->vm_mm->page_table_lock are held */
static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
unsigned long address, unsigned long size,
swp_entry_t entry, struct page* page)
@@ -291,6 +294,7 @@
} while (address && (address < end));
}

+/* tasklist_lock and vma->vm_mm->page_table_lock are held */
static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
swp_entry_t entry, struct page* page)
{
--- linux-2.4.3-pre6/mm/vmalloc.c Thu Mar 22 18:52:52 2001
+++ linux-akpm/mm/vmalloc.c Fri Mar 23 22:38:07 2001
@@ -123,7 +123,11 @@
if (end > PGDIR_SIZE)
end = PGDIR_SIZE;
do {
- pte_t * pte = pte_alloc(&init_mm, pmd, address);
+ pte_t * pte;
+
+ spin_lock(&init_mm.page_table_lock); /* pte_alloc requires this */
+ pte = pte_alloc(&init_mm, pmd, address);
+ spin_unlock(&init_mm.page_table_lock);
if (!pte)
return -ENOMEM;
if (alloc_area_pte(pte, address, end - address, gfp_mask, prot))
@@ -147,7 +151,9 @@
do {
pmd_t *pmd;

+ spin_lock(&init_mm.page_table_lock); /* pmd_alloc requires this */
pmd = pmd_alloc(&init_mm, dir, address);
+ spin_unlock(&init_mm.page_table_lock);
ret = -ENOMEM;
if (!pmd)
break;
--- linux-2.4.3-pre6/mm/vmscan.c Tue Jan 16 07:36:49 2001
+++ linux-akpm/mm/vmscan.c Fri Mar 23 22:08:48 2001
@@ -25,16 +25,15 @@
#include <asm/pgalloc.h>

/*
- * The swap-out functions return 1 if they successfully
- * threw something out, and we got a free page. It returns
- * zero if it couldn't do anything, and any other value
- * indicates it decreased rss, but the page was shared.
+ * The swap-out function returns 1 if it successfully
+ * scanned all the pages it was asked to (`count').
+ * It returns zero if it couldn't do anything.
*
- * NOTE! If it sleeps, it *must* return 1 to make sure we
- * don't continue with the swap-out. Otherwise we may be
- * using a process that no longer actually exists (it might
- * have died while we slept).
+ * rss may decrease because pages are shared, but this
+ * doesn't count as having freed a page.
*/
+
+/* mm->page_table_lock is held. mmap_sem is not held */
static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page)
{
pte_t pte;
@@ -129,6 +128,7 @@
return;
}

+/* mm->page_table_lock is held. mmap_sem is not held */
static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count)
{
pte_t * pte;
@@ -165,6 +165,7 @@
return count;
}

+/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count)
{
pmd_t * pmd;
@@ -194,6 +195,7 @@
return count;
}

+/* mm->page_table_lock is held. mmap_sem is not held */
static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count)
{
pgd_t *pgdir;
@@ -218,6 +220,9 @@
return count;
}

+/*
+ * Returns non-zero if we scanned all `count' pages
+ */
static int swap_out_mm(struct mm_struct * mm, int count)
{
unsigned long address;
@@ -879,6 +884,7 @@
* If there are applications that are active memory-allocators
* (most normal use), this basically shouldn't matter.
*/
+
int kswapd(void *unused)
{
struct task_struct *tsk = current;