[RFC] Page table sharing

Daniel Phillips (phillips@bonn-fries.net)
Sat, 16 Feb 2002 19:07:46 +0100


Hi,

I think this patch is ready to look at now. It's been pretty stable, though
I haven't gone as far as booting with it - page table sharing is still
restricted to uid 9999. I'm running it on a 2-way SMP machine under moderate
load without apparent problems. The speedup on forking from a parent with a
large vm is *way* more than I expected.

I haven't fully analyzed the locking yet, but I'm beginning to suspect it
just works as is, i.e., I haven't exposed any new critical regions. I'd be
happy to be corrected on that though.

Changed from the previous version:

- Debug tracing is in macros now, and off by default
- TLB flushing in zap_pte_range is hopefully correct now

--
Daniel

--- ../2.4.17.clean/fs/exec.c Fri Dec 21 12:41:55 2001 +++ ./fs/exec.c Sat Feb 16 17:41:52 2002 @@ -860,6 +860,7 @@ int retval; int i; + ptab(printk(">>> execve %s\n", filename)); file = open_exec(filename); retval = PTR_ERR(file); --- ../2.4.17.clean/include/linux/mm.h Fri Dec 21 12:42:03 2001 +++ ./include/linux/mm.h Sat Feb 16 17:41:52 2002 @@ -411,7 +411,7 @@ extern int vmtruncate(struct inode * inode, loff_t offset); extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +extern pte_t *FASTCALL(__pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address, int write)); extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); @@ -424,6 +424,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); + +static inline pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + return __pte_alloc(mm, pmd, address, 1); +} + +#define nil do { } while (0) + +#if 0 +# define ptab(cmd) cmd +#else +# define ptab(cmd) nil +#endif /* * On a two-level page table, this ends up being trivial. 
Thus the --- ../2.4.17.clean/include/linux/sched.h Fri Dec 21 12:42:03 2001 +++ ./include/linux/sched.h Sat Feb 16 17:41:52 2002 @@ -427,7 +427,7 @@ #define PF_MEMDIE 0x00001000 /* Killed for out-of-memory */ #define PF_FREE_PAGES 0x00002000 /* per process page freeing */ #define PF_NOIO 0x00004000 /* avoid generating further I/O */ - +#define PF_SHARE_TABLES 0x00008000 /* share page tables (testing) */ #define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */ /* --- ../2.4.17.clean/kernel/fork.c Wed Nov 21 13:18:42 2001 +++ ./kernel/fork.c Sat Feb 16 17:41:52 2002 @@ -566,9 +566,10 @@ struct task_struct *p; struct completion vfork; + ptab(printk(">>> fork, stack=%li\n", stack_start)); retval = -EPERM; - /* + /* * CLONE_PID is only allowed for the initial SMP swapper * calls */ --- ../2.4.17.clean/kernel/sys.c Tue Sep 18 17:10:43 2001 +++ ./kernel/sys.c Sat Feb 16 17:41:52 2002 @@ -514,6 +514,11 @@ current->uid = new_ruid; current->user = new_user; free_uid(old_user); + + if (current->uid == 9999) + current->flags |= PF_SHARE_TABLES; + printk(">>> user: uid=%i pid=%i pf=%x\n", current->uid, current->pid, current->flags); + return 0; } --- ../2.4.17.clean/mm/memory.c Fri Dec 21 12:42:05 2001 +++ ./mm/memory.c Sat Feb 16 17:41:52 2002 @@ -34,6 +34,9 @@ * * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG * (Gerhard.Wichert@pdb.siemens.de) + * + * Feb 2002 - Shared page tables added by Daniel Phillips + * (phillips@nl.linux.org) */ #include <linux/mm.h> @@ -100,8 +103,12 @@ return; } pte = pte_offset(dir, 0); - pmd_clear(dir); - pte_free(pte); + if (current->uid == 9999 || page_count(virt_to_page(pte)) > 1) + ptab(printk(">>> free page table %p (%i)\n", pte, page_count(virt_to_page(pte)))); + if (put_page_testzero(virt_to_page(pte))) { + pmd_clear(dir); + pte_free(pte); + } } static inline void free_one_pgd(pgd_t * dir) @@ -143,8 +150,8 @@ */ void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr) { - pgd_t * page_dir = 
mm->pgd; - + pgd_t *page_dir = mm->pgd; + ptab(printk(">>> clear_page_tables\n")); spin_lock(&mm->page_table_lock); page_dir += first; do { @@ -171,13 +178,21 @@ * dst->page_table_lock is held on entry and exit, * but may be dropped within pmd_alloc() and pte_alloc(). */ -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) +int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) { - pgd_t * src_pgd, * dst_pgd; + pgd_t *src_pgd, *dst_pgd; unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE; + int share_page_tables = !!(current->flags & PF_SHARE_TABLES); + +#if 0 + static int teststart = 0, testcount = 999, tests = 0; + if (share_page_tables && (tests++ < teststart || tests > teststart + testcount)) + share_page_tables = 0; + if (share_page_tables) + printk(">>> copy_page_range test %i\n", tests - 1); +#endif src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; @@ -186,15 +201,15 @@ pmd_t * src_pmd, * dst_pmd; src_pgd++; dst_pgd++; - + /* copy_pmd_range */ - + if (pgd_none(*src_pgd)) - goto skip_copy_pmd_range; + goto skip_pmd_range; if (pgd_bad(*src_pgd)) { pgd_ERROR(*src_pgd); pgd_clear(src_pgd); -skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; +skip_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; if (!address || (address >= end)) goto out; continue; @@ -204,34 +219,56 @@ dst_pmd = pmd_alloc(dst, dst_pgd, address); if (!dst_pmd) goto nomem; - do { - pte_t * src_pte, * dst_pte; - - /* copy_pte_range */ - + pte_t *src_ptb, *dst_ptb; + if (pmd_none(*src_pmd)) - goto skip_copy_pte_range; + goto skip_ptb_range; if (pmd_bad(*src_pmd)) { pmd_ERROR(*src_pmd); pmd_clear(src_pmd); -skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; +skip_ptb_range: address = (address + PMD_SIZE) & PMD_MASK; if (address >= end) goto out; - goto 
cont_copy_pmd_range; + goto cont_pmd_range; } - src_pte = pte_offset(src_pmd, address); - dst_pte = pte_alloc(dst, dst_pmd, address); - if (!dst_pte) + src_ptb = pte_offset(src_pmd, address); + + if (!share_page_tables) goto no_share; + + if (pmd_none(*dst_pmd)) { + get_page(virt_to_page(src_ptb)); + pmd_populate(dst, dst_pmd, ((unsigned long) src_ptb & PAGE_MASK)); + ptab(printk(">>> share %p @ %p (%i)\n", src_ptb, address, + page_count(virt_to_page(src_ptb)))); + } else if (page_count(virt_to_page(src_ptb)) == 1) // should test for ptbs != + goto no_share; + + spin_lock(&src->page_table_lock); + do { + pte_t pte = *src_ptb; + if (!pte_none(pte) && pte_present(pte)) { + struct page *page = pte_page(pte); + if (VALID_PAGE(page) && !PageReserved(page) && cow) + ptep_set_wrprotect(src_ptb); + } + if ((address += PAGE_SIZE) >= end) + goto out_unlock; + src_ptb++; + } while ((unsigned) src_ptb & PTE_TABLE_MASK); + spin_unlock(&src->page_table_lock); + + goto cont_pmd_range; +no_share: + dst_ptb = __pte_alloc(dst, dst_pmd, address, 0); + if (!dst_ptb) goto nomem; - spin_lock(&src->page_table_lock); + spin_lock(&src->page_table_lock); do { - pte_t pte = *src_pte; + pte_t pte = *src_ptb; struct page *ptepage; - - /* copy_one_pte */ if (pte_none(pte)) goto cont_copy_pte_range_noset; @@ -240,14 +277,14 @@ goto cont_copy_pte_range; } ptepage = pte_page(pte); - if ((!VALID_PAGE(ptepage)) || + if ((!VALID_PAGE(ptepage)) || PageReserved(ptepage)) goto cont_copy_pte_range; /* If it's a COW mapping, write protect it both in the parent and the child */ if (cow) { - ptep_set_wrprotect(src_pte); - pte = *src_pte; + ptep_set_wrprotect(src_ptb); + pte = *src_ptb; } /* If it's a shared mapping, mark it clean in the child */ @@ -257,16 +294,16 @@ get_page(ptepage); dst->rss++; -cont_copy_pte_range: set_pte(dst_pte, pte); +cont_copy_pte_range: set_pte(dst_ptb, pte); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) goto out_unlock; - src_pte++; - dst_pte++; - } while 
((unsigned long)src_pte & PTE_TABLE_MASK); + src_ptb++; + dst_ptb++; + } while ((unsigned) src_ptb & PTE_TABLE_MASK); spin_unlock(&src->page_table_lock); - -cont_copy_pmd_range: src_pmd++; + +cont_pmd_range: src_pmd++; dst_pmd++; } while ((unsigned long)src_pmd & PMD_TABLE_MASK); } @@ -302,7 +339,18 @@ pmd_clear(pmd); return 0; } + ptep = pte_offset(pmd, address); + + if (page_count(virt_to_page(ptep)) > 1) { + ptab(printk(">>> zap table!!! %p (%i)\n", + ptep, page_count(virt_to_page(ptep)))); + // pmd_clear(pmd); + // put_page(virt_to_page(ptep)); + tlb_remove_page(tlb, (pte_t *) pmd, pmd_val(*pmd)); + return 0; + } + offset = address & ~PMD_MASK; if (offset + size > PMD_SIZE) size = PMD_SIZE - offset; @@ -346,12 +394,30 @@ freed = 0; do { freed += zap_pte_range(tlb, pmd, address, end - address); - address = (address + PMD_SIZE) & PMD_MASK; + address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address < end); return freed; } +static int unshare_one(struct mm_struct *mm, pgd_t *dir, unsigned long address) +{ + if ((address & PMD_MASK)) { + address &= PMD_MASK; + if (!pgd_none(*dir)) { + pmd_t *pmd = pmd_offset(dir, address); + if (!pmd_none(*pmd)) { + pte_t *ptb = pte_offset(pmd, address); + if (page_count(virt_to_page(ptb)) > 1) { + __pte_alloc(mm, pmd, address, 1); + return 1; + } + } + } + } + return 0; +} + /* * remove user pages in a given range. 
*/ @@ -361,6 +427,7 @@ pgd_t * dir; unsigned long start = address, end = address + size; int freed = 0; + ptab(printk(">>> zap_page_range %lx+%lx\n", address, size)); dir = pgd_offset(mm, address); @@ -374,6 +441,14 @@ if (address >= end) BUG(); spin_lock(&mm->page_table_lock); + + /* + * Ensure first and last partial page tables are unshared + */ + do { + unshare_one(mm, dir, address); + } while (unshare_one(mm, pgd_offset(mm, end & PMD_MASK), end)); + flush_cache_range(mm, address, end); tlb = tlb_gather_mmu(mm); @@ -1348,7 +1423,8 @@ pmd = pmd_alloc(mm, pgd, address); if (pmd) { - pte_t * pte = pte_alloc(mm, pmd, address); + pte_t *pte = __pte_alloc(mm, pmd, address, + write_access /*&& !(vma->vm_flags & VM_SHARED)*/); if (pte) return handle_pte_fault(mm, vma, address, write_access, pte); } @@ -1398,28 +1474,50 @@ * We've already handled the fast-path in-line, and we own the * page table lock. */ -pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +pte_t *__pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address, int write) { - if (pmd_none(*pmd)) { - pte_t *new; - - /* "fast" allocation can happen without dropping the lock.. */ - new = pte_alloc_one_fast(mm, address); + if (pmd_none(*pmd) || (write && page_count(virt_to_page(pmd_page(*pmd))) > 1)) { + pte_t *new = pte_alloc_one_fast(mm, address); if (!new) { spin_unlock(&mm->page_table_lock); new = pte_alloc_one(mm, address); spin_lock(&mm->page_table_lock); if (!new) return NULL; - - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. - */ - if (!pmd_none(*pmd)) { + /* Recheck in case populated while unlocked */ + if (!pmd_none(*pmd) && !(write && page_count(virt_to_page(pmd_page(*pmd))) > 1)) { pte_free(new); goto out; } + } + ptab(printk(">>> make page table %p @ %p %s\n", new, address, + write == 2? "write fault": + write == 1? "unshared": + write == 0? 
"sharable": + "bogus")); + if (!page_count(virt_to_page(new)) == 1) BUG(); + if (!pmd_none(*pmd)) { + pte_t *src_ptb = pte_offset(pmd, 0); + pte_t *dst_ptb = new; + ptab(printk(">>> unshare %p (%i--)\n", *pmd, page_count(virt_to_page(pmd_page(*pmd))))); + do { + pte_t pte = *src_ptb; + if (!pte_none(pte)) { + if (pte_present(pte)) { + struct page *page = pte_page(pte); + if (VALID_PAGE(page) && !PageReserved(page)) { + get_page(page); + pte = pte_mkold(pte_mkclean(pte)); + mm->rss++; + } + } else + swap_duplicate(pte_to_swp_entry(pte)); + set_pte(dst_ptb, pte); + } + src_ptb++; + dst_ptb++; + } while ((unsigned) dst_ptb & PTE_TABLE_MASK); + put_page(virt_to_page(pmd_page(*pmd))); } pmd_populate(mm, pmd, new); } --- ../2.4.17.clean/mm/mremap.c Thu Sep 20 23:31:26 2001 +++ ./mm/mremap.c Sat Feb 16 17:41:52 2002 @@ -92,6 +92,7 @@ { unsigned long offset = len; + ptab(printk(">>> mremap\n")); flush_cache_range(mm, old_addr, old_addr + len); /* - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/