Just a few things:
1. Are patches of this type welcome? i.e. patches that don't change any
code but try to document how it works. If they are welcome, is there
anything about the style I should bear in mind before continuing with
what will eventually cover most of the VM?
2. I have marked some parts with a QUERY: these are questions I had,
such as whether a particular part is dead code. I'd appreciate it if
someone could take a look at them.
Thanks
Mel Gorman
--- linux-2.4.19pre7.orig/mm/page_alloc.c Tue Apr 16 20:36:49 2002
+++ linux-2.4.19pre7.mel/mm/page_alloc.c Thu Apr 18 03:01:34 2002
@@ -29,7 +29,8 @@
struct list_head active_list;
pg_data_t *pgdat_list;
-/* Used to look up the address of the struct zone encoded in page->zone */
+/* Used to look up the address of the struct zone previously encoded in
+ * page->zone */
zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
EXPORT_SYMBOL(zone_table);
@@ -86,8 +87,17 @@
* storing an extra bit, utilizing entry point information.
*
* -- wli
+ *
+ * There is a brief explanation of how a buddy algorithm works at
+ * http://www.memorymanagement.org/articles/alloc.html . A better idea
+ * is to read the explanation in a book such as UNIX Internals by
+ * Uresh Vahalia.
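+ *
+ * In brief: the allocator deals in blocks of 2^order pages. Every
+ * block has a unique "buddy" of the same size. When a block and its
+ * buddy are both free, they are merged into a block twice the size;
+ * when only a smaller block is needed, a larger one is split in two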
+ *
*/
+/* __free_pages_ok - Return pages allocated by __alloc_pages to the free
+ * lists, merging buddies back together where possible
+ */
static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));
static void __free_pages_ok (struct page *page, unsigned int order)
{
@@ -98,6 +108,10 @@
/* Yes, think what happens when other parts of the kernel take
* a reference to a page in order to pin it for io. -ben
+ *
+ * QUERY: Out of curiosity, why would a page allocated by the buddy
+ * algorithm be on an LRU list if kernel pages are not meant to be swapped
+ * out? Pretty serious bug, no? --mel
*/
if (PageLRU(page))
lru_cache_del(page);
@@ -118,30 +132,56 @@
BUG();
page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
+ /* If it is balance_classzone that is doing the freeing, the pages are
+ * kept on a local list for the process doing the work of freeing them
+ */
if (current->flags & PF_FREE_PAGES)
goto local_freelist;
+
back_local_freelist:
+ /* page_zone returns the zone a page belongs to */
zone = page_zone(page);
+ /* mask has multiple uses; one identity to remember is
+ * -mask == number of pages that will be freed
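+ *
+ * e.g. on a 32-bit machine with order == 2, mask == 0xfffffffc,
+ * so -mask == 4, the size of the block in pages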
+ */
mask = (~0UL) << order;
+
+ /* base is the first page in the current zone */
base = zone->zone_mem_map;
+
+ /* page_idx is the index of this page within the zone's mem_map */
page_idx = page - base;
+
+ /* If the page is not aligned to the order size, it's a bug */
if (page_idx & ~mask)
BUG();
- index = page_idx >> (1 + order);
+ /* index is the bit number inside the free_area_t bitmap stored in
+ * area->map; each bit covers a pair of buddies at this order
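+ *
+ * e.g. with order == 0 and page_idx == 12, index == 12 >> 1 == 6,
+ * the bit covering the buddy pair of pages 12 and 13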
+ */
+ index = page_idx >> (1 + order);
area = zone->free_area + order;
spin_lock_irqsave(&zone->lock, flags);
+ /* -mask == number of pages being freed */
zone->free_pages -= mask;
+ /* No matter what order the function started at, this expression
+ * will reach 0 once mask has been shifted up to the maximum order
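+ *
+ * e.g. with MAX_ORDER == 10, mask at order 9 is -512 and
+ * -512 + (1 << 9) == 0, so merging stops at the largest order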
+ */
while (mask + (1 << (MAX_ORDER-1))) {
struct page *buddy1, *buddy2;
+ /* QUERY: Bogus check, doesn't the loop condition check
+ * for this?
+ */
if (area >= zone->free_area + MAX_ORDER)
BUG();
+
+ /* QUERY: Can someone explain this to me? */
if (!__test_and_change_bit(index, area->map))
/*
* the buddy page is still allocated.
@@ -151,6 +191,9 @@
* Move the buddy up one level.
* This code is taking advantage of the identity:
* -mask = 1+~mask
+ *
+ * remember page_idx is the page's index relative to the
+ * beginning of the zone
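+ *
+ * e.g. with order == 2, -mask == 4, so the buddy of the block at
+ * page_idx 12 is at 12 ^ 4 == 8 and the buddy of the block at
+ * page_idx 8 is at 8 ^ 4 == 12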
*/
buddy1 = base + (page_idx ^ -mask);
buddy2 = base + page_idx;
@@ -159,7 +202,10 @@
if (BAD_RANGE(zone,buddy2))
BUG();
+ /* buddy2 is the block being freed and is on no free list, so
+ * only buddy1 needs to be removed from its list before the merge
+ */
memlist_del(&buddy1->list);
+
+ /* Prepare to try to merge at the next higher order */
mask <<= 1;
area++;
index >>= 1;
@@ -171,11 +217,22 @@
return;
local_freelist:
+ /* If the process has already freed pages for itself, don't give it
+ * more */
if (current->nr_local_pages)
goto back_local_freelist;
+
+ /* An interrupt doesn't have a current process to store pages on.
+ *
+ * QUERY: is this not a dead check? An interrupt could only get here
+ * if alloc_pages took the slow path through balance_classzone. If an
+ * interrupt got there, aren't we already dead?
+ */
if (in_interrupt())
goto back_local_freelist;
+ /* Add the page onto the local list, update the page information
+ * and return */
list_add(&page->list, &current->local_pages);
page->index = order;
current->nr_local_pages++;
@@ -184,19 +241,46 @@
#define MARK_USED(index, order, area) \
__change_bit((index) >> (1+(order)), (area)->map)
+/* expand - Shuffle pages around the free lists
+ *
+ * This function will break up higher-order blocks of pages as necessary
+ * and update the bitmaps as it goes along
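+ *
+ * e.g. a request for order 0 satisfied from an order-2 block starting
+ * at page_idx 8: pages 8-9 go back on the order-1 free list, page 10
+ * goes on the order-0 free list and page 11 is returned to the caller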
+ */
static inline struct page * expand (zone_t *zone, struct page *page,
unsigned long index, int low, int high, free_area_t * area)
{
unsigned long size = 1 << high;
+ /* low is the order originally requested
+ * high is the order we had to go to to find a free block
+ *
+ * If it turned out there was a free block at the right order to begin
+ * with, no splitting will take place
+ */
while (high > low) {
if (BAD_RANGE(zone,page))
BUG();
+
+ /* Move down to the free_area for the next lower order */
area--;
high--;
+
+ /* Size is now half as big because the order dropped by 1 */
size >>= 1;
+
+ /* Add the page to the free list for the "lower" area.
+ * Note that the lower buddy is put on the free list and the
+ * higher buddy is kept for the allocation, or for splitting
+ * further if necessary
+ */
memlist_add_head(&(page)->list, &(area)->free_list);
MARK_USED(index, high, area);
+
+ /* index is the page's number inside this zone, page is the
+ * actual struct page; advance both to the higher buddy
+ */
index += size;
page += size;
}
@@ -205,9 +289,23 @@
return page;
}
+/* rmqueue - Allocate pages for the Buddy Allocator
+ *
+ * This function is responsible for finding out what order of pages we
+ * have to go to to satisfy the request. For example, if there are no
+ * page blocks free to satisfy order=0 (1 page), then see if there is
+ * a free block of order=1 that can be split into two order=0 pages
+ */
static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
static struct page * rmqueue(zone_t *zone, unsigned int order)
{
+ /* A free_area_t exists for each order of pages that can be allocated.
+ * The struct stores a list of free page blocks and the bitmap that
+ * describes whether each block's buddy is allocated or not.
+ *
+ * Here area is set to the free_area_t that represents this order of pages.
+ * If necessary, the next higher order of free blocks will be examined.
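+ *
+ * For reference, free_area_t is defined in include/linux/mmzone.h as
+ *
+ *   typedef struct free_area_struct {
+ *           struct list_head free_list;
+ *           unsigned long    *map;
+ *   } free_area_t;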
+ */
free_area_t * area = zone->free_area + order;
unsigned int curr_order = order;
struct list_head *head, *curr;
@@ -216,24 +314,48 @@
spin_lock_irqsave(&zone->lock, flags);
do {
+ /* Get the first block of pages free in this area */
head = &area->free_list;
curr = memlist_next(head);
+ /* If there is a free block available, split it up if necessary
+ * until we get the order we want, then allocate it
+ */
if (curr != head) {
unsigned int index;
+ /* Get the page for this block */
page = memlist_entry(curr, struct page, list);
if (BAD_RANGE(zone,page))
BUG();
+
+ /* The block is no longer free so remove it from
+ * the list
+ */
memlist_del(curr);
+
+ /* zone_mem_map is the first page in this zone, so subtracting it
+ * gives the index of this page within the zone. MARK_USED toggles
+ * the bit in the map that covers this block and its buddy
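+ *
+ * e.g. MARK_USED(12, 2, area) expands to
+ * __change_bit(12 >> (1 + 2), area->map), toggling bit 1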
+ */
index = page - zone->zone_mem_map;
if (curr_order != MAX_ORDER-1)
MARK_USED(index, curr_order, area);
+
+ /* Subtract the number of pages being split or assigned
+ * from the count of free pages
+ */
zone->free_pages -= 1UL << order;
+ /* expand is responsible for splitting blocks of higher
+ * orders (if necessary) until we get a block of the
+ * order we are interested in.
+ */
page = expand(zone, page, index, order, curr_order, area);
spin_unlock_irqrestore(&zone->lock, flags);
+ /* Mark the page as used and do some checks */
set_page_count(page, 1);
if (BAD_RANGE(zone,page))
BUG();
@@ -241,10 +363,16 @@
BUG();
if (PageActive(page))
BUG();
+
return page;
}
+
+ /* There are no pages free at this order so examine the
+ * next higher order
+ */
curr_order++;
area++;
+
} while (curr_order < MAX_ORDER);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -252,6 +380,15 @@
}
#ifndef CONFIG_DISCONTIGMEM
+/* _alloc_pages - Allocate a contiguous block of pages
+ * @gfp_mask - Flags that determine the behaviour of the allocator
+ * @order - 2^order number of pages will be allocated in a block
+ *
+ * This is called directly by alloc_pages. Its only task is to identify the
+ * preferred set of zones to allocate from. The zones currently are
+ * ZONE_DMA, ZONE_NORMAL and ZONE_HIGHMEM
+ *
+ */
struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
{
return __alloc_pages(gfp_mask, order,
@@ -259,6 +396,9 @@
}
#endif
+/* In a nutshell, this function does some of the work of kswapd in a
+ * synchronous fashion when there is simply too little memory to wait around
+ */
static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
{
@@ -271,6 +411,10 @@
BUG();
current->allocation_order = order;
+
+ /* These flags are set so that __free_pages_ok knows to return the
+ * pages directly to the process
+ */
current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
__freed = try_to_free_pages(classzone, gfp_mask, order);
@@ -278,6 +422,7 @@
current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
if (current->nr_local_pages) {
+ /* If pages were freed */
struct list_head * entry, * local_pages;
struct page * tmp;
int nr_pages;
@@ -290,7 +435,14 @@
do {
tmp = list_entry(entry, struct page, list);
if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
+ /* This is a block of pages that is of
+ * the correct size so remember it
+ */
list_del(entry);
+
+ /* QUERY: if order > 0, wouldn't
+ * nr_local_pages drop by more than 1?
+ */
current->nr_local_pages--;
set_page_count(tmp, 1);
page = tmp;
@@ -318,7 +470,10 @@
}
nr_pages = current->nr_local_pages;
- /* free in reverse order so that the global order will be lifo */
+ /* free in reverse order so that the global order will be lifo.
+ * The pages freed here are the ones that were not of the order
+ * we are interested in at the moment
+ */
while ((entry = local_pages->prev) != local_pages) {
list_del(entry);
tmp = list_entry(entry, struct page, list);
@@ -333,8 +488,19 @@
return page;
}
-/*
+/* __alloc_pages - Allocate pages in a block
+ * @gfp_mask - Flags for this allocation
+ * @order - 2^order number of pages to allocate
+ * @zonelist - The preferred set of zones to allocate from
+ *
* This is the 'heart' of the zoned buddy allocator:
+ *
+ * There are a few paths this will take to try to allocate the pages. Which
+ * one it takes depends on what pages are available and what flags in gfp_mask
+ * are set. For instance, if the allocation is for an interrupt handler,
+ * __alloc_pages won't do anything that would block. Each attempt made gets
+ * progressively slower as the function executes.
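+ *
+ * In rough order of increasing cost, the attempts are:
+ *  1. scan the zones allowing free pages to dip to pages_low
+ *  2. wake kswapd, then scan again allowing free pages to dip
+ *     towards pages_min (quartered for atomic allocations)
+ *  3. for PF_MEMALLOC or PF_MEMDIE callers, take any page available
+ *  4. for callers that can sleep (__GFP_WAIT), free pages directly
+ *     with balance_classzone and scan one last time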
+ *
*/
struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
{
@@ -343,15 +509,40 @@
struct page * page;
int freed;
+ /* zonelist is the preferred, ordered set of zones to try for this
+ * allocation, built around one of ZONE_DMA, ZONE_NORMAL or
+ * ZONE_HIGHMEM. zone walks through the entries in that list
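+ *
+ * e.g. an allocation with __GFP_HIGHMEM set is given the list
+ * { ZONE_HIGHMEM, ZONE_NORMAL, ZONE_DMA } so it falls back to
+ * lower zones when highmem is exhausted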
+ */
zone = zonelist->zones;
+
+ /* classzone is the first zone in the list. It is treated specially:
+ * its need_balance flag records whether this set of zones needs
+ * to be balanced by kswapd
+ */
classzone = *zone;
+
+ /* min is the number of pages that have to be allocated */
min = 1UL << order;
+
+ /* Cycle through all the zones available */
for (;;) {
zone_t *z = *(zone++);
if (!z)
break;
+ /* Increase min by pages_low so that too many pages from a zone
+ * are not allocated. If pages_low is reached, kswapd needs to
+ * begin work
+ *
+ * QUERY: If there were more than one zone in ZONE_NORMAL and each
+ * zone had a pages_low value of 10, wouldn't the second
+ * zone have a min value of 20, the third of 30 and so on?
+ * Wouldn't this possibly wake kswapd before it was really
+ * needed? Is this the expected behaviour?
+ */
min += z->pages_low;
+
+ /* There are enough free pages, so allocate the block (rmqueue)
+ * and return the page */
if (z->free_pages > min) {
page = rmqueue(z, order);
if (page)
@@ -359,11 +550,18 @@
}
}
+ /* pages_low has been reached. Mark the zone set as needing balancing and
+ * wake up kswapd which will start work freeing pages in this zone
+ */
classzone->need_balance = 1;
mb();
if (waitqueue_active(&kswapd_wait))
wake_up_interruptible(&kswapd_wait);
+ /* Start again moving through the zones. This time it will allow the zone
+ * to reach pages_min number of free pages. It is hoped that kswapd will
+ * bring the number of pages above the watermarks again later
+ */
zone = zonelist->zones;
min = 1UL << order;
for (;;) {
@@ -373,9 +571,19 @@
break;
local_min = z->pages_min;
+
+ /* If the process requesting this cannot discard other pages or
+ * wait, allow the zone to be pushed into a tighter memory position.
+ */
if (!(gfp_mask & __GFP_WAIT))
local_min >>= 2;
+
+ /* QUERY: same as above, does it not artificially inflate min
+ * depending on the number of zones there are?
+ */
min += local_min;
+
+ /* If we are safe to allocate this, allocate the page */
if (z->free_pages > min) {
page = rmqueue(z, order);
if (page)
@@ -387,7 +595,17 @@
rebalance:
if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
+ /* PF_MEMALLOC is set if the calling process wants to be
+ * treated as a memory allocator, kswapd for example. This
+ * process is high priority and should be served if at
+ * all possible. PF_MEMDIE is set by the OOM killer. The
+ * calling process is going to die no matter what but
+ * needs a bit of memory to die cleanly, hence give what
+ * it needs because we'll get it back soon. */
+
zone = zonelist->zones;
+
+ /* Cycle through the zones and try to allocate if at all possible */
for (;;) {
zone_t *z = *(zone++);
if (!z)
@@ -404,10 +622,17 @@
if (!(gfp_mask & __GFP_WAIT))
return NULL;
+ /* Basically do the work of kswapd in a synchronous fashion and return
+ * the pages if enough were freed
+ */
page = balance_classzone(classzone, gfp_mask, order, &freed);
if (page)
return page;
+ /* if balance_classzone returned no pages, it might be because the
+ * pages it freed up were of a higher or lower order than the one we
+ * were interested in, so search through all the zones one last time
+ */
zone = zonelist->zones;
min = 1UL << order;
for (;;) {
-