Here is a Memory Binding API. This allows processes to bind themselves to
particular blocks of memory in a multi-memory block system. This patch is
based on top of the Simple Topology API I posted a few moments ago. The calls
take a simple bitmask of memblocks.
This is a scaled-down version of a full NUMA API I posted several times in the
past, to a distinct lack of fanfare. I hope that this smaller patch holds more
interest for the community...
Enjoy!
-Matt
--------------000108060308080408010305
Content-Type: text/plain;
name="2.5.25-mem_api.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
filename="2.5.25-mem_api.patch"
diff -Nur linux-2.5.25-test/arch/i386/config.in linux-2.5.25-api/arch/i386/config.in
--- linux-2.5.25-test/arch/i386/config.in Fri Jul 5 16:42:20 2002
+++ linux-2.5.25-api/arch/i386/config.in Fri Jul 12 14:11:40 2002
@@ -165,6 +165,10 @@
fi
else
bool 'Multiquad NUMA system' CONFIG_MULTIQUAD
+ if [ "$CONFIG_MULTIQUAD" = y ]; then
+ bool 'Memory Binding API Support' CONFIG_MEMBIND
+ bool 'IBM/Sequent NUMA-Q Hardware Support' CONFIG_IBMNUMAQ
+ fi
fi
bool 'Machine Check Exception' CONFIG_X86_MCE
diff -Nur linux-2.5.25-test/include/asm-i386/smp.h linux-2.5.25-api/include/asm-i386/smp.h
--- linux-2.5.25-test/include/asm-i386/smp.h Fri Jul 5 16:42:02 2002
+++ linux-2.5.25-api/include/asm-i386/smp.h Fri Jul 12 16:09:16 2002
@@ -55,6 +55,7 @@
extern void smp_alloc_memory(void);
extern unsigned long phys_cpu_present_map;
extern unsigned long cpu_online_map;
+extern unsigned long memblk_online_map;
extern volatile unsigned long smp_invalidate_needed;
extern int pic_mode;
extern int smp_num_siblings;
@@ -99,6 +100,11 @@
return hweight32(cpu_online_map);
}
+extern inline unsigned int num_online_memblks(void)
+{
+ return hweight32(memblk_online_map);
+}
+
extern inline int any_online_cpu(unsigned int mask)
{
if (mask & cpu_online_map)
diff -Nur linux-2.5.25-test/include/linux/init_task.h linux-2.5.25-api/include/linux/init_task.h
--- linux-2.5.25-test/include/linux/init_task.h Fri Jul 5 16:42:04 2002
+++ linux-2.5.25-api/include/linux/init_task.h Fri Jul 12 16:32:19 2002
@@ -59,6 +59,7 @@
children: LIST_HEAD_INIT(tsk.children), \
sibling: LIST_HEAD_INIT(tsk.sibling), \
thread_group: LIST_HEAD_INIT(tsk.thread_group), \
+ memblk_binding: { MEMBLK_NO_BINDING, MPOL_STRICT, RW_LOCK_UNLOCKED /* lock must start out unlocked */ }, \
wait_chldexit: __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\
real_timer: { \
function: it_real_fn \
diff -Nur linux-2.5.25-test/include/linux/membind.h linux-2.5.25-api/include/linux/membind.h
--- linux-2.5.25-test/include/linux/membind.h Fri Jul 12 16:53:46 2002
+++ linux-2.5.25-api/include/linux/membind.h Fri Jul 12 16:31:30 2002
@@ -27,6 +27,35 @@
#ifndef _LINUX_MEMBIND_H_
#define _LINUX_MEMBIND_H_
+#include <linux/types.h>
+
+#ifdef CONFIG_MEMBIND
+#define NR_MEMBLKS 32 /* Max number of Memory Blocks */
+#else
+#define NR_MEMBLKS 1
+#endif
+
+typedef unsigned long memblk_bitmask_t;
+#define MEMBLK_NO_BINDING ((memblk_bitmask_t) 0) /* A '0' means use the memblk
+ and a '1' means *don't* */
+
+#define MPOL_STRICT 0 /* Memory MUST be allocated according to binding */
+#define MPOL_LOOSE 1 /* Memory will be allocated according to binding,
+ but can fall back to other memory blocks if necessary. */
+#define MPOL_FIRST 2 /* UNUSED FOR NOW */
+#define MPOL_STRIPE 4 /* UNUSED FOR NOW */
+#define MPOL_RR 8 /* UNUSED FOR NOW */
+
+
+typedef struct memblk_list {
+ memblk_bitmask_t bitmask;
+ int behavior;
+ rwlock_t lock;
+} memblk_list_t;
+
+
+int set_memblk_binding(memblk_bitmask_t, int);
+memblk_bitmask_t get_memblk_binding(void);
int cpu_to_node(int);
int memblk_to_node(int);
int node_to_node(int);
diff -Nur linux-2.5.25-test/include/linux/mmzone.h linux-2.5.25-api/include/linux/mmzone.h
--- linux-2.5.25-test/include/linux/mmzone.h Fri Jul 5 16:42:02 2002
+++ linux-2.5.25-api/include/linux/mmzone.h Thu Jul 11 14:00:12 2002
@@ -136,6 +136,7 @@
unsigned long node_start_mapnr;
unsigned long node_size;
int node_id;
+ int memblk_id; /* A unique ID for each memory block */
struct pglist_data *node_next;
} pg_data_t;
@@ -169,14 +170,15 @@
#define NODE_MEM_MAP(nid) mem_map
#define MAX_NR_NODES 1
-#else /* !CONFIG_DISCONTIGMEM */
+#endif /* !CONFIG_DISCONTIGMEM */
-#include <asm/mmzone.h>
+#if defined (CONFIG_DISCONTIGMEM) || defined (CONFIG_MEMBIND)
+#include <asm/mmzone.h>
/* page->zone is currently 8 bits ... */
#define MAX_NR_NODES (255 / MAX_NR_ZONES)
-#endif /* !CONFIG_DISCONTIGMEM */
+#endif /* CONFIG_DISCONTIGMEM || CONFIG_MEMBIND */
#define MAP_ALIGN(x) ((((x) % sizeof(struct page)) == 0) ? (x) : ((x) + \
sizeof(struct page) - ((x) % sizeof(struct page))))
diff -Nur linux-2.5.25-test/include/linux/prctl.h linux-2.5.25-api/include/linux/prctl.h
--- linux-2.5.25-test/include/linux/prctl.h Fri Jul 12 16:53:46 2002
+++ linux-2.5.25-api/include/linux/prctl.h Wed Jul 10 13:58:17 2002
@@ -26,6 +26,10 @@
# define PR_FPEMU_NOPRINT 1 /* silently emulate fp operations accesses */
# define PR_FPEMU_SIGFPE 2 /* don't emulate fp operations, send SIGFPE instead */
+/* Get/Set MemBlk Binding */
+#define PR_SET_MEMBLK_BINDING 11
+#define PR_GET_MEMBLK_BINDING 12
+
/* Get CPU/Node */
#define PR_GET_CURR_CPU 13
#define PR_GET_CURR_NODE 14
diff -Nur linux-2.5.25-test/include/linux/sched.h linux-2.5.25-api/include/linux/sched.h
--- linux-2.5.25-test/include/linux/sched.h Fri Jul 5 16:42:04 2002
+++ linux-2.5.25-api/include/linux/sched.h Wed Jul 10 13:43:48 2002
@@ -27,6 +27,7 @@
#include <linux/securebits.h>
#include <linux/fs_struct.h>
#include <linux/compiler.h>
+#include <linux/membind.h>
struct exec_domain;
@@ -302,6 +303,9 @@
struct task_struct *pidhash_next;
struct task_struct **pidhash_pprev;
+ /* additional Memory Binding stuff */
+ memblk_list_t memblk_binding;
+
wait_queue_head_t wait_chldexit; /* for wait4() */
struct completion *vfork_done; /* for vfork() */
diff -Nur linux-2.5.25-test/include/linux/smp.h linux-2.5.25-api/include/linux/smp.h
--- linux-2.5.25-test/include/linux/smp.h Fri Jul 5 16:42:28 2002
+++ linux-2.5.25-api/include/linux/smp.h Wed Jul 10 13:43:48 2002
@@ -86,6 +86,7 @@
#define smp_call_function(func,info,retry,wait) ({ 0; })
static inline void smp_send_reschedule(int cpu) { }
static inline void smp_send_reschedule_all(void) { }
+#define memblk_online_map 1
#define cpu_online_map 1
#define cpu_online(cpu) 1
#define num_online_cpus() 1
diff -Nur linux-2.5.25-test/kernel/Makefile linux-2.5.25-api/kernel/Makefile
--- linux-2.5.25-test/kernel/Makefile Fri Jul 5 16:42:18 2002
+++ linux-2.5.25-api/kernel/Makefile Wed Jul 10 13:43:48 2002
@@ -15,7 +15,7 @@
obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \
module.o exit.o itimer.o time.o softirq.o resource.o \
sysctl.o capability.o ptrace.o timer.o user.o \
- signal.o sys.o kmod.o context.o futex.o platform.o
+ signal.o sys.o kmod.o context.o futex.o platform.o membind.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += ksyms.o
diff -Nur linux-2.5.25-test/kernel/membind.c linux-2.5.25-api/kernel/membind.c
--- linux-2.5.25-test/kernel/membind.c Fri Jul 12 16:53:46 2002
+++ linux-2.5.25-api/kernel/membind.c Fri Jul 12 16:13:17 2002
@@ -33,9 +33,70 @@
#include <linux/errno.h>
#include <linux/smp.h>
+#define is_valid_memblk_behavior(x) (1)
+
+#define is_memblk_subset(x, y) (!(~(x) & (y))) /* test whether x is a subset of y */
+
extern unsigned long memblk_online_map;
/*
+ * set_memblk_binding(): Sets up a new MemBlk Binding for the current task
+ *
+ * memblks:  bitmask of memory blocks for the new binding
+ * behavior: binding policy, stored as-is in the task's binding
+ *
+ * Returns 0 on success, -ENODEV if none of the requested memblks is online,
+ * -EPERM if the new binding expands upon the old one and the caller lacks
+ * CAP_SYS_NICE, or -EINVAL if is_valid_memblk_behavior() rejects 'behavior'.
+ */
+int set_memblk_binding(memblk_bitmask_t memblks, int behavior)
+{
+ int ret;
+ unsigned long flags;
+
+ ret = -ENODEV;
+ /* Make sure that at least one of the memblks in the new binding set is online. */
+ if (!(memblks & memblk_online_map))
+ goto out;
+
+ read_lock_irqsave(&current->memblk_binding.lock, flags);
+
+ ret = -EPERM;
+ /* If the new binding expands upon the old binding, the caller
+ must have CAP_SYS_NICE. */
+ if ((!is_memblk_subset(memblks, current->memblk_binding.bitmask)) &&
+ (!capable(CAP_SYS_NICE)))
+ goto out_unlock;
+ read_unlock_irqrestore(&current->memblk_binding.lock, flags);
+
+ ret = -EINVAL;
+ /* Test to make sure the behavior argument is valid. */
+ if (!is_valid_memblk_behavior(behavior))
+ goto out;
+
+ /* All checks passed: publish the new mask and policy under the write lock. */
+ write_lock_irqsave(&current->memblk_binding.lock, flags);
+ current->memblk_binding.bitmask = memblks;
+ current->memblk_binding.behavior = behavior;
+ write_unlock_irqrestore(&current->memblk_binding.lock, flags);
+
+ ret = 0;
+ goto out;
+
+ out_unlock:
+ /* Only reached from the permission check, while the read lock is still held. */
+ read_unlock_irqrestore(&current->memblk_binding.lock, flags);
+ out:
+ return ret;
+}
+
+/*
+ * get_memblk_binding(): Returns the current task's MemBlk Binding bitmask.
+ * The bitmask is copied out while holding the binding's read lock.
+ */
+inline memblk_bitmask_t get_memblk_binding(void)
+{
+ unsigned long flags;
+ memblk_bitmask_t memblk_binding;
+
+ read_lock_irqsave(&current->memblk_binding.lock, flags);
+ memblk_binding = current->memblk_binding.bitmask;
+ read_unlock_irqrestore(&current->memblk_binding.lock, flags);
+
+ return memblk_binding;
+}
+
+/*
* cpu_to_node(cpu): Returns the number of the most specific Node
* containing CPU 'cpu'.
*/
diff -Nur linux-2.5.25-test/kernel/sys.c linux-2.5.25-api/kernel/sys.c
--- linux-2.5.25-test/kernel/sys.c Fri Jul 12 16:53:46 2002
+++ linux-2.5.25-api/kernel/sys.c Fri Jul 12 16:11:16 2002
@@ -1292,6 +1292,12 @@
}
current->keep_capabilities = arg2;
break;
+ case PR_SET_MEMBLK_BINDING:
+ error = (long) set_memblk_binding((memblk_bitmask_t)arg2, (int)arg3);
+ break;
+ case PR_GET_MEMBLK_BINDING:
+ error = (long) get_memblk_binding();
+ break;
case PR_GET_CURR_CPU:
error = (long) get_curr_cpu();
break;
diff -Nur linux-2.5.25-test/mm/numa.c linux-2.5.25-api/mm/numa.c
--- linux-2.5.25-test/mm/numa.c Fri Jul 5 16:42:20 2002
+++ linux-2.5.25-api/mm/numa.c Fri Jul 12 16:35:19 2002
@@ -8,8 +8,10 @@
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/spinlock.h>
+#include <linux/membind.h>
int numnodes = 1; /* Initialized for UMA platforms */
+unsigned long memblk_online_map = 0UL; /* Similar to cpu_online_map, but for memory blocks */
static bootmem_data_t contig_bootmem_data;
pg_data_t contig_page_data = { bdata: &contig_bootmem_data };
@@ -27,6 +29,9 @@
{
free_area_init_core(0, &contig_page_data, &mem_map, zones_size,
zone_start_paddr, zholes_size, pmap);
+ contig_page_data.node_id = 0;
+ contig_page_data.memblk_id = 0;
+ memblk_online_map = 1UL;
}
#endif /* !CONFIG_DISCONTIGMEM */
@@ -71,6 +76,11 @@
free_area_init_core(nid, pgdat, &discard, zones_size, zone_start_paddr,
zholes_size, pmap);
pgdat->node_id = nid;
+ pgdat->memblk_id = num_online_memblks();
+ /* Mark this memblk online using the bit that matches its own ID, so that
+ bit N of memblk_online_map always corresponds to memblk_id N. */
+ if (test_and_set_bit(pgdat->memblk_id, &memblk_online_map)){
+ printk("memblk already counted?!?!\n");
+ BUG();
+ }
/*
* Get space for the valid bitmap.
@@ -88,6 +98,8 @@
return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK));
}
+#ifdef CONFIG_NUMA
+
/*
* This can be refined. Currently, tries to do round robin, instead
* should do concentratic circle search, starting from current node.
@@ -96,30 +110,66 @@
{
struct page *ret = 0;
pg_data_t *start, *temp;
-#ifndef CONFIG_NUMA
+ int search_twice = 0;
+ memblk_bitmask_t memblk_bitmask;
unsigned long flags;
- static pg_data_t *next = 0;
-#endif
if (order >= MAX_ORDER)
return NULL;
-#ifdef CONFIG_NUMA
+
+ /* Snapshot the caller's binding under its lock; the search below
+ works on the local copy only. */
+ read_lock_irqsave(&current->memblk_binding.lock, flags);
+ memblk_bitmask = current->memblk_binding.bitmask;
+ /* if it is a loose binding, remember to search other memblks */
+ if ((current->memblk_binding.behavior == MPOL_LOOSE) &&
+ (current->memblk_binding.bitmask != MEMBLK_NO_BINDING))
+ search_twice = 1;
+ /* Release the same lock that was taken above. */
+ read_unlock_irqrestore(&current->memblk_binding.lock, flags);
+
+search_through_memblks:
temp = NODE_DATA(numa_node_id());
-#else
- spin_lock_irqsave(&node_lock, flags);
- if (!next) next = pgdat_list;
- temp = next;
- next = next->node_next;
- spin_unlock_irqrestore(&node_lock, flags);
-#endif
start = temp;
while (temp) {
- if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
- return(ret);
+ if (memblk_bitmask & (1 << temp->memblk_id))
+ if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
+ return(ret);
temp = temp->node_next;
}
temp = pgdat_list;
while (temp != start) {
+ if (!(memblk_bitmask & (1 << temp->memblk_id)))
+ if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
+ return(ret);
+ temp = temp->node_next;
+ }
+
+ if (search_twice) {
+ /*
+ * If we failed to find a "preferred" memblk, try again
+ * looking for anything we haven't checked yet.
+ */
+ search_twice = 0; /* no infinite loops, please */
+ memblk_bitmask = ~memblk_bitmask;
+ goto search_through_memblks;
+ }
+ return(0);
+}
+
+#else /* !CONFIG_NUMA */
+
+struct page * _alloc_pages(unsigned int gfp_mask, unsigned int order)
+{
+ struct page *ret = 0;
+ pg_data_t *temp;
+ unsigned long flags;
+
+ if (order >= MAX_ORDER)
+ return NULL;
+
+ spin_lock_irqsave(&node_lock, flags);
+ temp = pgdat_list;
+ spin_unlock_irqrestore(&node_lock, flags);
+
+ while (temp) {
if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
return(ret);
temp = temp->node_next;
@@ -127,4 +179,6 @@
return(0);
}
+#endif /* CONFIG_NUMA */
+
#endif /* CONFIG_DISCONTIGMEM */
diff -Nur linux-2.5.25-test/mm/page_alloc.c linux-2.5.25-api/mm/page_alloc.c
--- linux-2.5.25-test/mm/page_alloc.c Fri Jul 5 16:42:03 2002
+++ linux-2.5.25-api/mm/page_alloc.c Fri Jul 12 16:00:40 2002
@@ -41,6 +41,8 @@
static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
+extern unsigned long memblk_online_map;
+
/*
* Temporary debugging check for pages not lying within a given zone.
*/
@@ -921,6 +923,9 @@
void __init free_area_init(unsigned long *zones_size)
{
free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
+ contig_page_data.node_id = 0;
+ contig_page_data.memblk_id = 0;
+ memblk_online_map = 1UL;
}
static int __init setup_mem_frac(char *str)
--------------000108060308080408010305--
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/