Attached is my NUMA-aware slab allocator, rediffed against 2.5.46.
It makes the objects returned by kmem_cache_alloc strictly node local.
Unfortunately, this means that kmem_cache_free must return objects to
their home node, which is expensive. (The return is batched, but it's
still expensive.)
I'm not sure whether the patch will improve performance - benchmarks
are needed now.
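To make the alloc/free split above concrete, here is a minimal userspace
sketch of the free path: each cpu keeps two small object caches, one for
objects that belong to the local node ("native" in the patch) and one for
wrong-node objects ("alien"), and wrong-node frees are only handed back to
their home node in batches. The names obj_to_node(), flush(), cache_free()
and the constants are made up for the sketch; the real code is in mm/slab.c
in the patch below, and the node lookup just mirrors the dummy ptr_to_node()
there.

    #include <stdio.h>

    #define MAX_NUMNODES  4
    #define BATCH         8       /* batch size, arbitrary for the sketch */

    struct array_cache {
        unsigned int avail;
        unsigned int limit;
        void *entry[BATCH];
    };

    struct cpu_cache {
        int node;                     /* node this cpu sits on */
        struct array_cache native;    /* objects from the local node */
        struct array_cache alien;     /* objects that belong elsewhere */
    };

    /* node lookup - same dummy scheme as ptr_to_node() in the patch:
     * pretend every 4MB of address space belongs to the next node */
    static int obj_to_node(void *obj)
    {
        return (int)((((unsigned long)obj) / (4UL*1024*1024)) % MAX_NUMNODES);
    }

    /* hand a full batch back to the slab lists it came from (stub) */
    static void flush(struct array_cache *ac, const char *target)
    {
        printf("returning %u objects to %s\n", ac->avail, target);
        ac->avail = 0;
    }

    /* free path: wrong-node objects never enter the native per-cpu
     * cache; they are collected separately and returned in one go */
    static void cache_free(struct cpu_cache *cc, void *obj)
    {
        int wrong_node = obj_to_node(obj) != cc->node;
        struct array_cache *ac = wrong_node ? &cc->alien : &cc->native;

        if (ac->avail == ac->limit)
            flush(ac, wrong_node ? "the home node" : "the local node");
        ac->entry[ac->avail++] = obj;
    }

    int main(void)
    {
        struct cpu_cache cc = {
            .node   = 0,
            .native = { .limit = BATCH },
            .alien  = { .limit = BATCH },
        };
        unsigned long addr;

        /* frees scattered across the (fake) 4MB node stripes */
        for (addr = 0; addr < 64UL*1024*1024; addr += 3UL*1024*1024)
            cache_free(&cc, (void *)addr);
        return 0;
    }

The point of the separate alien cache is that alloc never hands out
wrong-node memory, while free only pays the cross-node cost once per
batch instead of once per object.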
TODO:
- implement ptr_to_nodeid() for all archs. The current implementation is
  a dummy, to test the code on non-NUMA systems.
- switch from MAX_NUMNODES to numnodes - Anton proposed that.
- improve the handling of nodes without cpus or without memory.
- add a kmem_cache_alloc_fromnode() function
- replace the kmem_list3 array with an array of pointers, and allocate
  the storage from the right node (see the sketch after this list).
- allocate the head arrays from the node that is local to the cpu that
accesses the head array.
- check for regressions - I was careful not to undo any cleanups that
happened between 2.5.42 and 46, but it's possible that I missed some.
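The kmem_cache_alloc_fromnode() and kmem_list3-pointer items above are
related; here is a rough sketch of where they are headed. Nothing below
exists in the patch yet - alloc_fromnode() is only a malloc() stand-in
for the planned node-pinned allocator, and the structs are trimmed to
the bare minimum needed to show the layout change.

    #include <stdlib.h>
    #include <string.h>

    #define MAX_NUMNODES  4

    struct kmem_list3 {
        /* slabs_full/slabs_partial/slabs_free lists omitted */
        unsigned long free_objects;
    };

    struct kmem_cache {
        /* current patch: an embedded array, so every node's list
         * lives on whichever node holds the cache descriptor */
        /* struct kmem_list3 lists[MAX_NUMNODES]; */

        /* planned: pointers only, each kmem_list3 allocated on
         * the node it describes */
        struct kmem_list3 *nodelists[MAX_NUMNODES];
    };

    /* stand-in for the planned kmem_cache_alloc_fromnode() */
    static void *alloc_fromnode(size_t size, int nodeid)
    {
        (void)nodeid;          /* the real thing would pin the node */
        return malloc(size);
    }

    /* allocate the per-node lists from "the right node" */
    static int cache_init_nodelists(struct kmem_cache *cachep)
    {
        int i;

        for (i = 0; i < MAX_NUMNODES; i++) {
            struct kmem_list3 *l3;

            l3 = alloc_fromnode(sizeof(*l3), i);
            if (!l3)
                return -1;     /* unwinding omitted in the sketch */
            memset(l3, 0, sizeof(*l3));
            cachep->nodelists[i] = l3;
        }
        return 0;
    }

    int main(void)
    {
        struct kmem_cache cache;
        return cache_init_nodelists(&cache);
    }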
--
	Manfred

[patch-slab-numa]
--- 2.5/mm/slab.c 2002-11-09 00:45:37.000000000 +0100 +++ build-2.5/mm/slab.c 2002-11-09 15:25:05.000000000 +0100 @@ -10,6 +10,8 @@ * * Cleanup, make the head arrays unconditional, preparation for NUMA * (c) 2002 Manfred Spraul + * Initial NUMA implementation + * (c) 2002 Manfred Spraul * * An implementation of the Slab Allocator as described in outline in; * UNIX Internals: The New Frontiers by Uresh Vahalia @@ -85,6 +87,29 @@ #include <asm/uaccess.h> /* + * Enable the NUMA mode for slab + * This is a separate define from CONFIG_DISCONTIGMEM, because it only + * applies if ZONE_NORMAL allocations are possible on all zones. + * TODO: + * - move ptr_to_nodeid into include/asm- + * - make the cache structures themselves node local + * - is it possible to use the cpu alloc interface? + * - the behaviour is bad if get_free_pages returns returns + * memory from the another node: + * The page is used just for one refill, then left on the + * other node's partial list. + * Is that acceptable? + * - determine the optimal placement for the chache spinlock: + * node local or global? + * - which additional statistic counters would be interesting? + * - disable object return for the hopeless caches [journal head, + * buffer head, dentry - we'll trash cachelines anyway] + */ +#define CONFIG_SLAB_NUMA +#undef MAX_NUMNODES +#define MAX_NUMNODES 4 + +/* * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, * SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). @@ -174,6 +199,10 @@ * * The limit is stored in the per-cpu structure to reduce the data cache * footprint. + * On NUMA systems, 2 per-cpu structures exist: one for the current + * node, one for wrong node free calls. + * Memory from the wrong node is never returned by alloc, it's returned + * to the home node as soon as the cpu cache is filled * */ struct array_cache { @@ -183,8 +212,17 @@ unsigned int touched; }; +struct cpucache_wrapper { + struct array_cache *native; +#ifdef CONFIG_SLAB_NUMA + struct array_cache *alien; +#endif +}; /* bootstrap: The caches do not work without cpuarrays anymore, * but the cpuarrays are allocated from the generic caches... + * + * sizeof(struct arraycache_init) must be <= the size of the first + * kmalloc general cache, otherwise the bootstrap will crash. 
*/ #define BOOT_CPUCACHE_ENTRIES 1 struct arraycache_init { @@ -206,20 +244,31 @@ unsigned long free_objects; int free_touched; unsigned long next_reap; +#if STATS + unsigned long num_allocations; + + unsigned long grown; + unsigned long high_mark; + unsigned long num_active; +#endif }; -#define LIST3_INIT(parent) \ - { \ - .slabs_full = LIST_HEAD_INIT(parent.slabs_full), \ - .slabs_partial = LIST_HEAD_INIT(parent.slabs_partial), \ - .slabs_free = LIST_HEAD_INIT(parent.slabs_free) \ - } -#define list3_data(cachep) \ - (&(cachep)->lists) +#if STATS +#define STATS_INC_GROWN(x) ((x)->grown++) +#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) +#define STATS_INC_ACTIVE(x) do { (x)->num_active++; \ + if ((x)->num_active > (x)->high_mark) \ + (x)->high_mark = (x)->num_active; \ + } while (0) +#define STATS_DEC_ACTIVE(x) ((x)->num_active--) +#else +#define STATS_INC_GROWN(x) do { } while (0) +#define STATS_INC_ALLOCED(x) do { } while (0) +#define STATS_INC_ACTIVE(x) do { } while (0) -/* NUMA: per-node */ -#define list3_data_ptr(cachep, ptr) \ - list3_data(cachep) +#define STATS_DEC_ACTIVE(x) do { } while (0) + +#endif /* * kmem_cache_t @@ -229,12 +278,11 @@ struct kmem_cache_s { /* 1) per-cpu data, touched during every alloc/free */ - struct array_cache *array[NR_CPUS]; + struct cpucache_wrapper cpudata[NR_CPUS]; unsigned int batchcount; unsigned int limit; /* 2) touched by every alloc & free from the backend */ - struct kmem_list3 lists; - /* NUMA: kmem_3list_t *nodelists[NR_NODES] */ + struct kmem_list3 lists[MAX_NUMNODES]; /* NUMA: pointers would be better */ unsigned int objsize; unsigned int flags; /* constant flags */ unsigned int num; /* # of objs per slab */ @@ -252,7 +300,6 @@ unsigned int colour_off; /* colour offset */ unsigned int colour_next; /* cache colouring */ kmem_cache_t *slabp_cache; - unsigned int dflags; /* dynamic flags */ /* constructor func */ void (*ctor)(void *, kmem_cache_t *, unsigned long); @@ -266,17 +313,15 @@ /* 5) statistics */ #if STATS - unsigned long num_active; - unsigned long num_allocations; - unsigned long high_mark; - unsigned long grown; - unsigned long reaped; - unsigned long errors; - unsigned long max_freeable; - atomic_t allochit; - atomic_t allocmiss; - atomic_t freehit; - atomic_t freemiss; + atomic_t errors; + + atomic_t allochit[NR_CPUS]; + atomic_t allocmiss[NR_CPUS]; + atomic_t freehit[NR_CPUS]; + atomic_t freemiss[NR_CPUS]; +#ifdef CONFIG_SLAB_NUMA + atomic_t foreign[NR_CPUS]; +#endif #endif }; @@ -296,39 +341,21 @@ #define REAPTIMEOUT_LIST3 (4*HZ) #if STATS -#define STATS_INC_ACTIVE(x) ((x)->num_active++) -#define STATS_DEC_ACTIVE(x) ((x)->num_active--) -#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) -#define STATS_INC_GROWN(x) ((x)->grown++) -#define STATS_INC_REAPED(x) ((x)->reaped++) -#define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ - (x)->high_mark = (x)->num_active; \ - } while (0) -#define STATS_INC_ERR(x) ((x)->errors++) -#define STATS_SET_FREEABLE(x, i) \ - do { if ((x)->max_freeable < i) \ - (x)->max_freeable = i; \ - } while (0) +#define STATS_INC_ERR(x) atomic_inc(&(x)->errors) -#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) -#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) -#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) -#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) +#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit[smp_processor_id()]) +#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss[smp_processor_id()]) +#define STATS_INC_FREEHIT(x) 
atomic_inc(&(x)->freehit[smp_processor_id()]) +#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss[smp_processor_id()]) +#define STATS_INC_FOREIGN(x) atomic_inc(&(x)->foreign[smp_processor_id()]) #else -#define STATS_INC_ACTIVE(x) do { } while (0) -#define STATS_DEC_ACTIVE(x) do { } while (0) -#define STATS_INC_ALLOCED(x) do { } while (0) -#define STATS_INC_GROWN(x) do { } while (0) -#define STATS_INC_REAPED(x) do { } while (0) -#define STATS_SET_HIGH(x) do { } while (0) -#define STATS_INC_ERR(x) do { } while (0) -#define STATS_SET_FREEABLE(x, i) \ - do { } while (0) +#define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_ALLOCHIT(x) do { } while (0) #define STATS_INC_ALLOCMISS(x) do { } while (0) #define STATS_INC_FREEHIT(x) do { } while (0) #define STATS_INC_FREEMISS(x) do { } while (0) +#define STATS_INC_FOREIGN(x) do { } while (0) #endif #if DEBUG @@ -436,8 +463,6 @@ /* internal cache of cache description objs */ static kmem_cache_t cache_cache = { - .lists = LIST3_INIT(cache_cache.lists), - .array = { [0] = &initarray_cache.cache }, .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .objsize = sizeof(kmem_cache_t), @@ -514,6 +539,23 @@ } } +static struct array_cache *alloc_acdata(int limit, int batchcount) +{ + int memsize; + struct array_cache *nc; + + memsize = sizeof(void*)*limit+sizeof(struct array_cache); + nc = kmalloc(memsize, GFP_KERNEL); + if (!nc) + return NULL; + nc->avail = 0; + nc->limit = limit; + nc->batchcount = batchcount; + nc->touched = 0; + + return nc; +} + /* * Note: if someone calls kmem_cache_alloc() on the new * cpu before the cpuup callback had a chance to allocate @@ -531,25 +573,27 @@ case CPU_UP_PREPARE: down(&cache_chain_sem); list_for_each(p, &cache_chain) { - int memsize; struct array_cache *nc; kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next); - memsize = sizeof(void*)*cachep->limit+sizeof(struct array_cache); - nc = kmalloc(memsize, GFP_KERNEL); + nc = alloc_acdata(cachep->limit, cachep->batchcount); if (!nc) goto bad; - nc->avail = 0; - nc->limit = cachep->limit; - nc->batchcount = cachep->batchcount; - nc->touched = 0; spin_lock_irq(&cachep->spinlock); - cachep->array[cpu] = nc; + cachep->cpudata[cpu].native = nc; cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num; spin_unlock_irq(&cachep->spinlock); +#ifdef CONFIG_SLAB_NUMA + nc = alloc_acdata(cachep->limit, cachep->limit); + if (!nc) + goto bad; + spin_lock_irq(&cachep->spinlock); + cachep->cpudata[cpu].alien = nc; + spin_unlock_irq(&cachep->spinlock); +#endif } up(&cache_chain_sem); break; @@ -564,9 +608,14 @@ struct array_cache *nc; kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next); - nc = cachep->array[cpu]; - cachep->array[cpu] = NULL; + nc = cachep->cpudata[cpu].native; + cachep->cpudata[cpu].native = NULL; kfree(nc); +#ifdef CONFIG_SLAB_NUMA + nc = cachep->cpudata[cpu].alien; + cachep->cpudata[cpu].alien = NULL; + kfree(nc); +#endif } up(&cache_chain_sem); break; @@ -584,20 +633,74 @@ return (void**)(ac+1); } -static inline struct array_cache *ac_data(kmem_cache_t *cachep) +/* + * Helper functions/macros to access the per-cpu + * and per-node structures + */ + +#define ac_data(cachep) \ + ((cachep)->cpudata[smp_processor_id()].native) + +#define list3_data(cachep) \ + (&(cachep)->lists[__cpu_to_node(smp_processor_id())]) + +#ifdef CONFIG_SLAB_NUMA +/* + * NUMA: check where ptr points, and select the appropriate storage + * for the object. 
+ */ +/* FIXME - this function must be somewhere in include/asm- */ +static inline int ptr_to_node(void *obj) { - return cachep->array[smp_processor_id()]; + return (((unsigned long)obj)/4/1024/1024)%MAX_NUMNODES; } +static inline struct array_cache * ac_data_ptr(kmem_cache_t *cachep, void *objp) +{ + if (ptr_to_node(objp) == __cpu_to_node(smp_processor_id())) + return cachep->cpudata[smp_processor_id()].native; + STATS_INC_FOREIGN(cachep); + return cachep->cpudata[smp_processor_id()].alien; +} +#define DEFINE_NUMALIST_PTR(x) \ + struct kmem_list3 *x + +#define set_numalist_ptr(x, cachep, objp) \ + do { x = &cachep->lists[ptr_to_node(objp)]; } while(0) +#define set_numalist_cur(x, cachep) \ + do { x = &cachep->lists[__cpu_to_node(smp_processor_id())]; } while(0) +#define access_numalist_ptr(cachep, x) \ + (x) + +#else + +#define ac_data_ptr(cachep, ptr) ac_data(cachep) + +#define DEFINE_NUMALIST_PTR(x) +#define set_numalist_ptr(x, cachep, objp) do { } while(0) +#define set_numalist_cur(x, cachep) do { } while(0) + +#define access_numalist_ptr(cachep, x) (&(cachep->lists[0])) + +#endif + /* Initialisation - setup the `cache' cache. */ void __init kmem_cache_init(void) { size_t left_over; + int i; init_MUTEX(&cache_chain_sem); INIT_LIST_HEAD(&cache_chain); list_add(&cache_cache.next, &cache_chain); + for (i=0;i<MAX_NUMNODES;i++) { + INIT_LIST_HEAD(&cache_cache.lists[i].slabs_full); + INIT_LIST_HEAD(&cache_cache.lists[i].slabs_partial); + INIT_LIST_HEAD(&cache_cache.lists[i].slabs_free); + } + ac_data(&cache_cache) = &initarray_cache.cache; + cache_estimate(0, cache_cache.objsize, 0, &left_over, &cache_cache.num); if (!cache_cache.num) @@ -657,20 +760,33 @@ */ { void * ptr; +#ifdef CONFIG_SLAB_NUMA + void * ptr2; +#endif - ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + ptr = alloc_acdata(1, 1); +#ifdef CONFIG_SLAB_NUMA + ptr2 = alloc_acdata(1, 1); +#endif local_irq_disable(); - BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); - memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init)); - cache_cache.array[smp_processor_id()] = ptr; + BUG_ON(cache_cache.cpudata[smp_processor_id()].native != &initarray_cache.cache); + cache_cache.cpudata[smp_processor_id()].native = ptr; +#ifdef CONFIG_SLAB_NUMA + cache_cache.cpudata[smp_processor_id()].alien = ptr2; +#endif local_irq_enable(); - ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + ptr = alloc_acdata(1, 1); +#ifdef CONFIG_SLAB_NUMA + ptr2 = alloc_acdata(1, 1); +#endif local_irq_disable(); - BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache); - memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep), - sizeof(struct arraycache_init)); - malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr; + BUG_ON(malloc_sizes[0].cs_cachep->cpudata[smp_processor_id()].native != + &initarray_generic.cache); + malloc_sizes[0].cs_cachep->cpudata[smp_processor_id()].native = ptr; +#ifdef CONFIG_SLAB_NUMA + malloc_sizes[0].cs_cachep->cpudata[smp_processor_id()].alien = ptr2; +#endif local_irq_enable(); } } @@ -850,6 +966,7 @@ const char *func_nm = KERN_ERR "kmem_create: "; size_t left_over, align, slab_size; kmem_cache_t *cachep = NULL; + int i; /* * Sanity checks... these are all serious usage bugs. 
@@ -1000,10 +1117,11 @@ cachep->gfpflags |= GFP_DMA; spin_lock_init(&cachep->spinlock); cachep->objsize = size; - /* NUMA */ - INIT_LIST_HEAD(&cachep->lists.slabs_full); - INIT_LIST_HEAD(&cachep->lists.slabs_partial); - INIT_LIST_HEAD(&cachep->lists.slabs_free); + for (i=0;i<MAX_NUMNODES;i++) { + INIT_LIST_HEAD(&cachep->lists[i].slabs_full); + INIT_LIST_HEAD(&cachep->lists[i].slabs_partial); + INIT_LIST_HEAD(&cachep->lists[i].slabs_free); + } if (flags & CFLGS_OFF_SLAB) cachep->slabp_cache = kmem_find_general_cachep(slab_size,0); @@ -1019,24 +1137,26 @@ * the cache that's used by kmalloc(24), otherwise * the creation of further caches will BUG(). */ - cachep->array[smp_processor_id()] = &initarray_generic.cache; + ac_data(cachep) = &initarray_generic.cache; g_cpucache_up = PARTIAL; } else { - cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL); + ac_data(cachep) = alloc_acdata(1,1); +#ifdef CONFIG_SLAB_NUMA + cachep->cpudata[smp_processor_id()].alien = + alloc_acdata(1,1); +#endif } - BUG_ON(!ac_data(cachep)); - ac_data(cachep)->avail = 0; - ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES; - ac_data(cachep)->batchcount = 1; - ac_data(cachep)->touched = 0; cachep->batchcount = 1; cachep->limit = BOOT_CPUCACHE_ENTRIES; cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num; } - cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep)%REAPTIMEOUT_LIST3; + for (i=0;i< MAX_NUMNODES;i++) { + cachep->lists[i].next_reap = jiffies + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep)%REAPTIMEOUT_LIST3 + + i*HZ/10; + } /* Need the semaphore to access the chain. */ down(&cache_chain_sem); @@ -1128,38 +1248,41 @@ } -/* NUMA shrink all list3s */ static int __cache_shrink(kmem_cache_t *cachep) { struct slab *slabp; int ret; + int i; drain_cpu_caches(cachep); check_irq_on(); spin_lock_irq(&cachep->spinlock); - for(;;) { - struct list_head *p; + ret = 0; + for (i=0;i<MAX_NUMNODES;i++) { + for(;;) { + struct list_head *p; - p = cachep->lists.slabs_free.prev; - if (p == &cachep->lists.slabs_free) - break; + p = cachep->lists[i].slabs_free.prev; + if (p == &cachep->lists[i].slabs_free) + break; - slabp = list_entry(cachep->lists.slabs_free.prev, struct slab, list); + slabp = list_entry(cachep->lists[i].slabs_free.prev, struct slab, list); #if DEBUG - if (slabp->inuse) - BUG(); + if (slabp->inuse) + BUG(); #endif - list_del(&slabp->list); + list_del(&slabp->list); - cachep->lists.free_objects -= cachep->num; - spin_unlock_irq(&cachep->spinlock); - slab_destroy(cachep, slabp); - spin_lock_irq(&cachep->spinlock); + cachep->lists[i].free_objects -= cachep->num; + spin_unlock_irq(&cachep->spinlock); + slab_destroy(cachep, slabp); + spin_lock_irq(&cachep->spinlock); + } + ret |= !list_empty(&cachep->lists[i].slabs_full); + ret |= !list_empty(&cachep->lists[i].slabs_partial); } - ret = !list_empty(&cachep->lists.slabs_full) || - !list_empty(&cachep->lists.slabs_partial); spin_unlock_irq(&cachep->spinlock); return ret; } @@ -1217,9 +1340,12 @@ } { int i; - for (i = 0; i < NR_CPUS; i++) - kfree(cachep->array[i]); - /* NUMA: free the list3 structures */ + for (i = 0; i < NR_CPUS; i++) { + kfree(cachep->cpudata[i].native); +#ifdef CONFIG_SLAB_NUMA + kfree(cachep->cpudata[i].alien); +#endif + } } kmem_cache_free(&cache_cache, cachep); @@ -1316,7 +1442,7 @@ * Grow (by 1) the number of slabs within a cache. This is called by * kmem_cache_alloc() when there are no active objs left in a cache. 
*/ -static int cache_grow (kmem_cache_t * cachep, int flags) +static struct kmem_list3 *cache_grow (kmem_cache_t * cachep, int flags) { struct slab *slabp; struct page *page; @@ -1324,6 +1450,7 @@ size_t offset; unsigned int i, local_flags; unsigned long ctor_flags; + DEFINE_NUMALIST_PTR(l3); /* Be lazy and only check for valid flags here, * keeping it out of the critical path in kmem_cache_alloc(). @@ -1394,15 +1521,17 @@ spin_lock(&cachep->spinlock); /* Make slab active. */ - list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free)); - STATS_INC_GROWN(cachep); - list3_data(cachep)->free_objects += cachep->num; + set_numalist_ptr(l3, cachep, slabp->s_mem); + list_add_tail(&slabp->list, &(access_numalist_ptr(cachep, l3)->slabs_free)); + STATS_INC_GROWN(access_numalist_ptr(cachep, l3)); + access_numalist_ptr(cachep, l3)->free_objects += cachep->num; spin_unlock(&cachep->spinlock); - return 1; + return access_numalist_ptr(cachep, l3); opps1: kmem_freepages(cachep, objp); failed: - return 0; + STATS_INC_ERR(cachep); + return NULL; } /* @@ -1502,25 +1631,6 @@ #endif } -static inline void * cache_alloc_one_tail (kmem_cache_t *cachep, - struct slab *slabp) -{ - void *objp; - - check_spinlock_acquired(cachep); - - STATS_INC_ALLOCED(cachep); - STATS_INC_ACTIVE(cachep); - STATS_SET_HIGH(cachep); - - /* get obj pointer */ - slabp->inuse++; - objp = slabp->s_mem + slabp->free*cachep->objsize; - slabp->free=slab_bufctl(slabp)[slabp->free]; - - return objp; -} - static inline void cache_alloc_listfixup(struct kmem_list3 *l3, struct slab *slabp) { list_del(&slabp->list); @@ -1539,6 +1649,7 @@ check_irq_off(); ac = ac_data(cachep); + l3 = list3_data(cachep); retry: batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -1548,7 +1659,6 @@ */ batchcount = BATCHREFILL_LIMIT; } - l3 = list3_data(cachep); BUG_ON(ac->avail > 0); spin_lock(&cachep->spinlock); @@ -1566,9 +1676,16 @@ slabp = list_entry(entry, struct slab, list); check_slabp(cachep, slabp); - while (slabp->inuse < cachep->num && batchcount--) + while (slabp->inuse < cachep->num && batchcount--) { + STATS_INC_ALLOCED(l3); + STATS_INC_ACTIVE(l3); + + slabp->inuse++; + /* get obj pointer */ ac_entry(ac)[ac->avail++] = - cache_alloc_one_tail(cachep, slabp); + slabp->s_mem + slabp->free*cachep->objsize; + slabp->free=slab_bufctl(slabp)[slabp->free]; + } check_slabp(cachep, slabp); cache_alloc_listfixup(l3, slabp); } @@ -1578,12 +1695,11 @@ spin_unlock(&cachep->spinlock); if (unlikely(!ac->avail)) { - int x; - x = cache_grow(cachep, flags); + l3 = cache_grow(cachep, flags); // cache_grow can reenable interrupts, then ac could change. ac = ac_data(cachep); - if (!x && ac->avail == 0) // no objects in sight? abort + if (!l3 && ac->avail == 0) // no objects in sight? abort return NULL; if (!ac->avail) // objects refilled by interrupt? 
@@ -1654,51 +1770,48 @@ return objp; } -/* - * NUMA: different approach needed if the spinlock is moved into - * the l3 structure - */ - -static inline void -__free_block(kmem_cache_t *cachep, void **objpp, int nr_objects) +static inline void __free_block (kmem_cache_t* cachep, void** objpp, int len) { - int i; - check_irq_off(); spin_lock(&cachep->spinlock); +#ifndef CONFIG_SLAB_NUMA + cachep->lists[0].free_objects += len; +#endif - /* NUMA: move add into loop */ - cachep->lists.free_objects += nr_objects; - - for (i = 0; i < nr_objects; i++) { - void *objp = objpp[i]; - struct slab *slabp; - unsigned int objnr; + for ( ; len > 0; len--, objpp++) { + struct slab* slabp; + void *objp = *objpp; + DEFINE_NUMALIST_PTR(l3); slabp = GET_PAGE_SLAB(virt_to_page(objp)); list_del(&slabp->list); - objnr = (objp - slabp->s_mem) / cachep->objsize; - slab_bufctl(slabp)[objnr] = slabp->free; - slabp->free = objnr; - STATS_DEC_ACTIVE(cachep); - slabp->inuse--; + { + unsigned int objnr = (objp-slabp->s_mem)/cachep->objsize; + slab_bufctl(slabp)[objnr] = slabp->free; + slabp->free = objnr; + } + + set_numalist_ptr(l3, cachep, objp); + STATS_DEC_ACTIVE(access_numalist_ptr(cachep, l3)); +#ifdef CONFIG_SLAB_NUMA + l3->free_objects++; +#endif /* fixup slab chains */ - if (slabp->inuse == 0) { - if (cachep->lists.free_objects > cachep->free_limit) { - cachep->lists.free_objects -= cachep->num; + if (unlikely(!--slabp->inuse)) { + if (access_numalist_ptr(cachep, l3)->free_objects > cachep->free_limit) { + access_numalist_ptr(cachep, l3)->free_objects -= cachep->num; slab_destroy(cachep, slabp); } else { list_add(&slabp->list, - &list3_data_ptr(cachep, objp)->slabs_free); + &(access_numalist_ptr(cachep, l3)->slabs_free)); } } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. 
*/ - list_add_tail(&slabp->list, - &list3_data_ptr(cachep, objp)->slabs_partial); + list_add_tail(&slabp->list, &(access_numalist_ptr(cachep, l3)->slabs_partial)); } } spin_unlock(&cachep->spinlock); @@ -1720,26 +1833,6 @@ check_irq_off(); __free_block(cachep, &ac_entry(ac)[0], batchcount); -#if STATS - { - int i = 0; - struct list_head *p; - - spin_lock(&cachep->spinlock); - p = list3_data(cachep)->slabs_free.next; - while (p != &(list3_data(cachep)->slabs_free)) { - struct slab *slabp; - - slabp = list_entry(p, struct slab, list); - BUG_ON(slabp->inuse); - - i++; - p = p->next; - } - STATS_SET_FREEABLE(cachep, i); - spin_unlock(&cachep->spinlock); - } -#endif ac->avail -= batchcount; memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount], sizeof(void*)*ac->avail); @@ -1754,7 +1847,7 @@ */ static inline void __cache_free (kmem_cache_t *cachep, void* objp) { - struct array_cache *ac = ac_data(cachep); + struct array_cache *ac = ac_data_ptr(cachep, objp); check_irq_off(); objp = cache_free_debugcheck(cachep, objp); @@ -1890,6 +1983,9 @@ struct ccupdate_struct { kmem_cache_t *cachep; struct array_cache *new[NR_CPUS]; +#ifdef CONFIG_SLAB_NUMA + struct array_cache *new_alien[NR_CPUS]; +#endif }; static void do_ccupdate_local(void *info) @@ -1898,10 +1994,15 @@ struct array_cache *old; check_irq_off(); - old = ac_data(new->cachep); - - new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; + old = new->cachep->cpudata[smp_processor_id()].native; + new->cachep->cpudata[smp_processor_id()].native = new->new[smp_processor_id()]; new->new[smp_processor_id()] = old; + +#ifdef CONFIG_SLAB_NUMA + old = new->cachep->cpudata[smp_processor_id()].alien; + new->cachep->cpudata[smp_processor_id()].alien = new->new_alien[smp_processor_id()]; + new->new_alien[smp_processor_id()] = old; +#endif } @@ -1909,22 +2010,22 @@ { struct ccupdate_struct new; int i; + int ret; memset(&new.new,0,sizeof(new.new)); for (i = 0; i < NR_CPUS; i++) { - struct array_cache *ccnew; - - ccnew = kmalloc(sizeof(void*)*limit+ - sizeof(struct array_cache), GFP_KERNEL); - if (!ccnew) { - for (i--; i >= 0; i--) kfree(new.new[i]); - return -ENOMEM; - } - ccnew->avail = 0; - ccnew->limit = limit; - ccnew->batchcount = batchcount; - ccnew->touched = 0; - new.new[i] = ccnew; + new.new[i] = alloc_acdata(limit, batchcount); + if (!new.new[i]) { + ret = -ENOMEM; + goto out; + } +#ifdef CONFIG_SLAB_NUMA + new.new_alien[i] = alloc_acdata(limit, limit); + if (!new.new_alien[i]) { + ret = -ENOMEM; + goto out; + } +#endif } new.cachep = cachep; @@ -1936,17 +2037,30 @@ cachep->limit = limit; cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num; spin_unlock_irq(&cachep->spinlock); - + + ret = 0; +out: for (i = 0; i < NR_CPUS; i++) { - struct array_cache *ccold = new.new[i]; - if (!ccold) - continue; - local_irq_disable(); - free_block(cachep, ac_entry(ccold), ccold->avail); - local_irq_enable(); - kfree(ccold); + struct array_cache* ccold; + + ccold = new.new[i]; + if (ccold) { + local_irq_disable(); + free_block(cachep, ac_entry(ccold), ccold->avail); + local_irq_enable(); + kfree(ccold); + } +#ifdef CONFIG_SLAB_NUMA + ccold = new.new_alien[i]; + if (ccold) { + local_irq_disable(); + free_block(cachep, ac_entry(ccold), ccold->avail); + local_irq_enable(); + kfree(ccold); + } +#endif } - return 0; + return ret; } @@ -1998,6 +2112,7 @@ int tofree; struct array_cache *ac; struct slab *slabp; + DEFINE_NUMALIST_PTR(l3); searchp = list_entry(walk, kmem_cache_t, next); @@ -2019,36 +2134,41 @@ memmove(&ac_entry(ac)[0], 
&ac_entry(ac)[tofree], sizeof(void*)*ac->avail); } - if(time_after(searchp->lists.next_reap, jiffies)) +#ifdef CONFIG_SLAB_NUMA + ac = searchp->cpudata[smp_processor_id()].alien; + free_block(searchp, ac_entry(ac), ac->avail); + ac->avail = 0; +#endif + set_numalist_cur(l3, searchp); + if(time_after(access_numalist_ptr(searchp, l3)->next_reap, jiffies)) goto next_irqon; spin_lock(&searchp->spinlock); - if(time_after(searchp->lists.next_reap, jiffies)) { + if(time_after(access_numalist_ptr(searchp, l3)->next_reap, jiffies)) { goto next_unlock; } - searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3; - if (searchp->lists.free_touched) { - searchp->lists.free_touched = 0; + access_numalist_ptr(searchp, l3)->next_reap = jiffies + REAPTIMEOUT_LIST3; + if (access_numalist_ptr(searchp, l3)->free_touched) { + access_numalist_ptr(searchp, l3)->free_touched = 0; goto next_unlock; } tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num); do { - p = list3_data(searchp)->slabs_free.next; - if (p == &(list3_data(searchp)->slabs_free)) + p = access_numalist_ptr(searchp, l3)->slabs_free.next; + if (p == &(access_numalist_ptr(searchp, l3)->slabs_free)) break; slabp = list_entry(p, struct slab, list); BUG_ON(slabp->inuse); list_del(&slabp->list); - STATS_INC_REAPED(searchp); /* Safe to drop the lock. The slab is no longer * linked to the cache. * searchp cannot disappear, we hold * cache_chain_lock */ - searchp->lists.free_objects -= searchp->num; + access_numalist_ptr(searchp, l3)->free_objects -= searchp->num; spin_unlock_irq(&searchp->spinlock); slab_destroy(searchp, slabp); spin_lock_irq(&searchp->spinlock); @@ -2075,7 +2195,7 @@ struct timer_list *rt = &reap_timers[cpu]; cache_reap(); - mod_timer(rt, jiffies + REAPTIMEOUT_CPUC + cpu); + mod_timer(rt, jiffies + REAPTIMEOUT_CPUC); } #ifdef CONFIG_PROC_FS @@ -2116,19 +2236,16 @@ { kmem_cache_t *cachep = p; struct list_head *q; - struct slab *slabp; - unsigned long active_objs; - unsigned long num_objs; - unsigned long active_slabs = 0; - unsigned long num_slabs; + struct slab *slabp; const char *name; + int i; if (p == (void*)1) { /* * Output format version, so at least we can change it * without _too_ many complaints. 
*/ - seq_puts(m, "slabinfo - version: 1.2" + seq_puts(m, "slabinfo - version: 2.0" #if STATS " (statistics)" #endif @@ -2136,33 +2253,7 @@ return 0; } - check_irq_on(); - spin_lock_irq(&cachep->spinlock); - active_objs = 0; - num_slabs = 0; - list_for_each(q,&cachep->lists.slabs_full) { - slabp = list_entry(q, struct slab, list); - if (slabp->inuse != cachep->num) - BUG(); - active_objs += cachep->num; - active_slabs++; - } - list_for_each(q,&cachep->lists.slabs_partial) { - slabp = list_entry(q, struct slab, list); - BUG_ON(slabp->inuse == cachep->num || !slabp->inuse); - active_objs += slabp->inuse; - active_slabs++; - } - list_for_each(q,&cachep->lists.slabs_free) { - slabp = list_entry(q, struct slab, list); - if (slabp->inuse) - BUG(); - num_slabs++; - } - num_slabs+=active_slabs; - num_objs = num_slabs*cachep->num; - BUG_ON(num_objs - active_objs != cachep->lists.free_objects); - + /* line 1: global stats */ name = cachep->name; { char tmp; @@ -2175,33 +2266,76 @@ set_fs(old_fs); } - seq_printf(m, "%-17s %6lu %6lu %6u %4lu %4lu %4u", - name, active_objs, num_objs, cachep->objsize, - active_slabs, num_slabs, (1<<cachep->gfporder)); + seq_printf(m, "%-17s : %6u %6u %4u 0x%04x %6u %4u %4u", + name, cachep->objsize, cachep->num, (1<<cachep->gfporder), + cachep->flags, cachep->free_limit, cachep->limit, cachep->batchcount); +#if STATS + seq_printf(m, " %4u", atomic_read(&cachep->errors)); +#endif + + seq_putc(m, '\n'); + + + check_irq_on(); + /* block 2: list3 data */ + spin_lock_irq(&cachep->spinlock); + for (i=0;i<MAX_NUMNODES;i++) { + struct kmem_list3 *l3 = &cachep->lists[i]; + unsigned long active_objs = 0; + unsigned long num_objs = 0; + unsigned long active_slabs = 0; + unsigned long num_slabs = 0; + + list_for_each(q,&l3->slabs_full) { + slabp = list_entry(q, struct slab, list); + if (slabp->inuse != cachep->num) + BUG(); + active_objs += cachep->num; + active_slabs++; + } + list_for_each(q,&l3->slabs_partial) { + slabp = list_entry(q, struct slab, list); + BUG_ON(slabp->inuse == cachep->num || !slabp->inuse); + active_objs += slabp->inuse; + active_slabs++; + } + list_for_each(q,&l3->slabs_free) { + slabp = list_entry(q, struct slab, list); + if (slabp->inuse) + BUG(); + num_slabs++; + } + num_slabs+=active_slabs; + num_objs = num_slabs*cachep->num; + + BUG_ON(num_objs - active_objs != l3->free_objects); + seq_printf(m, "# Node %2u : %6lu %6lu %8lu %8lu", + i, active_slabs, num_slabs, active_objs, num_objs); +#if STATS + BUG_ON(active_objs != l3->num_active); - seq_printf(m, " : %4u %4u", cachep->limit, cachep->batchcount); + seq_printf(m, " %8lu %8lu %6lu", l3->num_allocations, + l3->high_mark, l3->grown); +#endif + seq_putc(m, '\n'); + } + /* block 3: array data */ #if STATS - { // list3 stats - unsigned long high = cachep->high_mark; - unsigned long allocs = cachep->num_allocations; - unsigned long grown = cachep->grown; - unsigned long reaped = cachep->reaped; - unsigned long errors = cachep->errors; - unsigned long max_freeable = cachep->max_freeable; - unsigned long free_limit = cachep->free_limit; - - seq_printf(m, " : %6lu %7lu %5lu %4lu %4lu %4lu %4lu", - high, allocs, grown, reaped, errors, - max_freeable, free_limit); - } - { // cpucache stats - unsigned long allochit = atomic_read(&cachep->allochit); - unsigned long allocmiss = atomic_read(&cachep->allocmiss); - unsigned long freehit = atomic_read(&cachep->freehit); - unsigned long freemiss = atomic_read(&cachep->freemiss); + for (i=0;i<NR_CPUS;i++) { + if (!cpu_online(i)) + continue; - seq_printf(m, " : %6lu %6lu 
%6lu %6lu", - allochit, allocmiss, freehit, freemiss); + seq_printf(m, "# Cpu %2i : %6u %6u %6u %6u", + i, + atomic_read(&cachep->allochit[i]), + atomic_read(&cachep->allocmiss[i]), + atomic_read(&cachep->freehit[i]), + atomic_read(&cachep->freemiss[i])); +#ifdef CONFIG_SLAB_NUMA + seq_printf(m, " %6u", + atomic_read(&cachep->foreign[i])); +#endif + seq_putc(m, '\n'); } #endif spin_unlock_irq(&cachep->spinlock);