From clameter@sgi.com Mon Jun 18 11:53:38 2007
Message-Id: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:26 -0700
From: clameter@sgi.com
Subject: [patch 00/10] Memoryless Node support

-- 

From clameter@sgi.com Mon Jun 18 11:53:40 2007
Message-Id: <20070618185339.844731179@sgi.com>
References: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:27 -0700
From: clameter@sgi.com
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Subject: [patch 01/10] Memoryless nodes: Fix GFP_THISNODE behavior
Content-Disposition: inline; filename=memless_thisnode_fix

GFP_THISNODE checks that the zone selected is within the pgdat (node) of the
first zone of a nodelist. That only works if the node has memory. A
memoryless node will have its first zone on another pgdat (node).

GFP_THISNODE currently will simply return memory on the first pgdat.
Thus it is returning memory on other nodes. GFP_THISNODE should fail
if there is no local memory on a node.


Add a new set of zonelists for each node that only contain the zones
that belong to the node itself so that no fallback is possible.

Then modify gfp_zone to pick up the right zonelist based on the presence
of __GFP_THISNODE.

Drop the existing GFP_THISNODE checks from the page allocator's hot path.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Nishanth Aravamudan <nacc@us.ibm.com>

Index: linux-2.6.22-rc4-mm2/include/linux/gfp.h
===================================================================
--- linux-2.6.22-rc4-mm2.orig/include/linux/gfp.h	2007-06-18 11:46:24.000000000 -0700
+++ linux-2.6.22-rc4-mm2/include/linux/gfp.h	2007-06-18 11:47:11.000000000 -0700
@@ -116,22 +116,28 @@ static inline int allocflags_to_migratet
 
 static inline enum zone_type gfp_zone(gfp_t flags)
 {
+	int base = 0;
+
+#ifdef CONFIG_NUMA
+	if (flags & __GFP_THISNODE)
+		base = MAX_NR_ZONES;
+#endif
 #ifdef CONFIG_ZONE_DMA
 	if (flags & __GFP_DMA)
-		return ZONE_DMA;
+		return base + ZONE_DMA;
 #endif
 #ifdef CONFIG_ZONE_DMA32
 	if (flags & __GFP_DMA32)
-		return ZONE_DMA32;
+		return base + ZONE_DMA32;
 #endif
 	if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) ==
 			(__GFP_HIGHMEM | __GFP_MOVABLE))
-		return ZONE_MOVABLE;
+		return base + ZONE_MOVABLE;
 #ifdef CONFIG_HIGHMEM
 	if (flags & __GFP_HIGHMEM)
-		return ZONE_HIGHMEM;
+		return base + ZONE_HIGHMEM;
 #endif
-	return ZONE_NORMAL;
+	return base + ZONE_NORMAL;
 }
 
 static inline gfp_t set_migrateflags(gfp_t gfp, gfp_t migrate_flags)
Index: linux-2.6.22-rc4-mm2/mm/page_alloc.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/page_alloc.c	2007-06-18 11:46:26.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/page_alloc.c	2007-06-18 11:48:32.000000000 -0700
@@ -1430,9 +1430,6 @@ zonelist_scan:
 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
 				continue;
 		zone = *z;
-		if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
-			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
-				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
 			!cpuset_zone_allowed_softwall(zone, gfp_mask))
 				goto try_next_zone;
@@ -1553,7 +1550,10 @@ restart:
 	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 
 	if (unlikely(*z == NULL)) {
-		/* Should this ever happen?? */
+		/*
+		 * Happens if we have an empty zonelist as a result of
+		 * GFP_THISNODE being used on a memoryless node
+		 */
 		return NULL;
 	}
 
@@ -2151,6 +2151,22 @@ static void build_zonelists_in_node_orde
 }
 
 /*
+ * Build gfp_thisnode zonelists
+ */
+static void build_thisnode_zonelists(pg_data_t *pgdat)
+{
+	enum zone_type i;
+	int j;
+	struct zonelist *zonelist;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i;
+ 		j = build_zonelists_node(pgdat, zonelist, 0, i);
+		zonelist->zones[j] = NULL;
+	}
+}
+
+/*
  * Build zonelists ordered by zone and nodes within zones.
  * This results in conserving DMA zone[s] until all Normal memory is
  * exhausted, but results in overflowing to remote node while memory
@@ -2254,7 +2270,7 @@ static void build_zonelists(pg_data_t *p
 	int order = current_zonelist_order;
 
 	/* initialize zonelists */
-	for (i = 0; i < MAX_NR_ZONES; i++) {
+	for (i = 0; i < 2 * MAX_NR_ZONES; i++) {
 		zonelist = pgdat->node_zonelists + i;
 		zonelist->zones[0] = NULL;
 	}
@@ -2299,6 +2315,8 @@ static void build_zonelists(pg_data_t *p
 		/* calculate node order -- i.e., DMA last! */
 		build_zonelists_in_zone_order(pgdat, j);
 	}
+
+	build_thisnode_zonelists(pgdat);
 }
 
 /* Construct the zonelist performance cache - see further mmzone.h */
Index: linux-2.6.22-rc4-mm2/include/linux/mmzone.h
===================================================================
--- linux-2.6.22-rc4-mm2.orig/include/linux/mmzone.h	2007-06-18 11:46:24.000000000 -0700
+++ linux-2.6.22-rc4-mm2/include/linux/mmzone.h	2007-06-18 11:47:11.000000000 -0700
@@ -356,6 +356,7 @@ struct zone {
 #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
 
 #ifdef CONFIG_NUMA
+#define MAX_ZONELISTS (2 * MAX_NR_ZONES)
 /*
  * We cache key information from each zonelist for smaller cache
  * footprint when scanning for free pages in get_page_from_freelist().
@@ -421,6 +422,7 @@ struct zonelist_cache {
 	unsigned long last_full_zap;		/* when last zap'd (jiffies) */
 };
 #else
+#define MAX_ZONELISTS MAX_NR_ZONES
 struct zonelist_cache;
 #endif
 
@@ -469,7 +471,7 @@ extern struct page *mem_map;
 struct bootmem_data;
 typedef struct pglist_data {
 	struct zone node_zones[MAX_NR_ZONES];
-	struct zonelist node_zonelists[MAX_NR_ZONES];
+	struct zonelist node_zonelists[MAX_ZONELISTS];
 	int nr_zones;
 #ifdef CONFIG_FLAT_NODE_MEM_MAP
 	struct page *node_mem_map;

-- 

From clameter@sgi.com Mon Jun 18 11:53:41 2007
Message-Id: <20070618185340.882920672@sgi.com>
References: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:28 -0700
From: clameter@sgi.com
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>,
 Nishanth Aravamudan <nacc@us.ibm.com>
Subject: [patch 02/10] NUMA: Introduce node_memory_map
Content-Disposition: inline; filename=memless_memory_map

It is necessary to know if nodes have memory since we have recently
begun to add support for memoryless nodes. For that purpose we introduce
a new bitmap called

node_memory_map

A node has its bit in node_memory_map set if it has memory. If a node
has memory then it has at least one zone defined in its pgdat structure
that is located in the pgdat itself.

The node_memory_map can then be used in various places to ensure that we
do the right thing when we encounter a memoryless node.

Signed-off-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: linux-2.6.22-rc4-mm2/include/linux/nodemask.h
===================================================================
--- linux-2.6.22-rc4-mm2.orig/include/linux/nodemask.h	2007-06-18 11:46:26.000000000 -0700
+++ linux-2.6.22-rc4-mm2/include/linux/nodemask.h	2007-06-18 11:48:42.000000000 -0700
@@ -64,12 +64,16 @@
  *
  * int node_online(node)		Is some node online?
  * int node_possible(node)		Is some node possible?
+ * int node_memory(node)		Does a node have memory?
  *
  * int any_online_node(mask)		First online node in mask
  *
  * node_set_online(node)		set bit 'node' in node_online_map
  * node_set_offline(node)		clear bit 'node' in node_online_map
  *
+ * node_set_has_memory(node)		set bit 'node' in node_memory_map
+ * node_set_no_memory(node)		clear bit 'node' in node_memory_map
+ *
  * for_each_node(node)			for-loop node over node_possible_map
  * for_each_online_node(node)		for-loop node over node_online_map
  *
@@ -344,12 +348,14 @@ static inline void __nodes_remap(nodemas
 
 extern nodemask_t node_online_map;
 extern nodemask_t node_possible_map;
+extern nodemask_t node_memory_map;
 
 #if MAX_NUMNODES > 1
 #define num_online_nodes()	nodes_weight(node_online_map)
 #define num_possible_nodes()	nodes_weight(node_possible_map)
 #define node_online(node)	node_isset((node), node_online_map)
 #define node_possible(node)	node_isset((node), node_possible_map)
+#define node_memory(node)	node_isset((node), node_memory_map)
 #define first_online_node	first_node(node_online_map)
 #define next_online_node(nid)	next_node((nid), node_online_map)
 extern int nr_node_ids;
@@ -358,6 +364,8 @@ extern int nr_node_ids;
 #define num_possible_nodes()	1
 #define node_online(node)	((node) == 0)
 #define node_possible(node)	((node) == 0)
+#define node_memory(node)	((node) == 0)
+#define node_populated(node)	((node) == 0)
 #define first_online_node	0
 #define next_online_node(nid)	(MAX_NUMNODES)
 #define nr_node_ids		1
@@ -375,7 +383,11 @@ extern int nr_node_ids;
 #define node_set_online(node)	   set_bit((node), node_online_map.bits)
 #define node_set_offline(node)	   clear_bit((node), node_online_map.bits)
 
+#define node_set_has_memory(node)  set_bit((node), node_memory_map.bits)
+#define node_set_no_memory(node)   clear_bit((node), node_memory_map.bits)
+
 #define for_each_node(node)	   for_each_node_mask((node), node_possible_map)
 #define for_each_online_node(node) for_each_node_mask((node), node_online_map)
+#define for_each_memory_node(node) for_each_node_mask((node), node_memory_map)
 
 #endif /* __LINUX_NODEMASK_H */
Index: linux-2.6.22-rc4-mm2/mm/page_alloc.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/page_alloc.c	2007-06-18 11:48:32.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/page_alloc.c	2007-06-18 11:49:34.000000000 -0700
@@ -54,6 +54,9 @@ nodemask_t node_online_map __read_mostly
 EXPORT_SYMBOL(node_online_map);
 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
 EXPORT_SYMBOL(node_possible_map);
+nodemask_t node_memory_map __read_mostly = NODE_MASK_NONE;
+EXPORT_SYMBOL(node_memory_map);
+
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 long nr_swap_pages;
@@ -2317,6 +2320,9 @@ static void build_zonelists(pg_data_t *p
 	}
 
 	build_thisnode_zonelists(pgdat);
+
+	if (pgdat->node_present_pages)
+		node_set_has_memory(local_node);
 }
 
 /* Construct the zonelist performance cache - see further mmzone.h */

-- 

From clameter@sgi.com Mon Jun 18 11:53:42 2007
Message-Id: <20070618185341.507010532@sgi.com>
References: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:29 -0700
From: clameter@sgi.com
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Subject: [patch 03/10] Fix MPOL_INTERLEAVE behavior for memoryless nodes
Content-Disposition: inline; filename=memless_fix_interleave

MPOL_INTERLEAVE currently simply loops over all nodes. Allocations on
memoryless nodes will be redirected to nodes with memory. This results in
an imbalance because the neighbors of memoryless nodes will get
significantly more interleave hits than the rest of the nodes on the system.

We can avoid this imbalance by clearing the nodes in the interleave node
set that have no memory.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

Index: linux-2.6.22-rc4-mm2/mm/mempolicy.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/mempolicy.c	2007-06-13 23:06:14.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/mempolicy.c	2007-06-14 00:49:43.000000000 -0700
@@ -185,7 +185,8 @@ static struct mempolicy *mpol_new(int mo
 	switch (mode) {
 	case MPOL_INTERLEAVE:
 		policy->v.nodes = *nodes;
-		if (nodes_weight(*nodes) == 0) {
+		nodes_and(policy->v.nodes, policy->v.nodes, node_memory_map);
+		if (nodes_weight(policy->v.nodes) == 0) {
 			kmem_cache_free(policy_cache, policy);
 			return ERR_PTR(-EINVAL);
 		}

-- 

From clameter@sgi.com Mon Jun 18 11:53:42 2007
Message-Id: <20070618185342.355249020@sgi.com>
References: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:30 -0700
From: clameter@sgi.com
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Subject: [patch 04/10] OOM: use the node_memory_map instead of constructing one on the fly
Content-Disposition: inline; filename=memless_oom_kill

constrained_alloc() builds its own memory map for nodes with memory.
We have that available in node_memory_map now. So simplify the code.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Nishanth Aravamudan <nacc@us.ibm.com>

Index: linux-2.6.22-rc4-mm2/mm/oom_kill.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/oom_kill.c	2007-06-13 23:11:32.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/oom_kill.c	2007-06-13 23:12:39.000000000 -0700
@@ -176,14 +176,7 @@ static inline int constrained_alloc(stru
 {
 #ifdef CONFIG_NUMA
 	struct zone **z;
-	nodemask_t nodes;
-	int node;
-
-	nodes_clear(nodes);
-	/* node has memory ? */
-	for_each_online_node(node)
-		if (NODE_DATA(node)->node_present_pages)
-			node_set(node, nodes);
+	nodemask_t nodes = node_memory_map;
 
 	for (z = zonelist->zones; *z; z++)
 		if (cpuset_zone_allowed_softwall(*z, gfp_mask))

-- 

From clameter@sgi.com Mon Jun 18 11:53:43 2007
Message-Id: <20070618185343.030964065@sgi.com>
References: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:31 -0700
From: clameter@sgi.com
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Subject: [patch 05/10] Memoryless Nodes: No need for kswapd
Content-Disposition: inline; filename=memless_no_kswapd

A node without memory does not need a kswapd. So use the memory map instead
of the online map when starting kswapd.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Nishanth Aravamudan <nacc@us.ibm.com>

Index: linux-2.6.22-rc4-mm2/mm/vmscan.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/vmscan.c	2007-06-18 11:46:25.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/vmscan.c	2007-06-18 11:49:47.000000000 -0700
@@ -1735,7 +1735,7 @@ static int __init kswapd_init(void)
 	int nid;
 
 	swap_setup();
-	for_each_online_node(nid)
+	for_each_memory_node(nid)
  		kswapd_run(nid);
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;

-- 

From clameter@sgi.com Mon Jun 18 11:53:44 2007
Message-Id: <20070618185343.559364733@sgi.com>
References: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:32 -0700
From: clameter@sgi.com
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Subject: [patch 06/10] Memoryless Node: Slab support
Content-Disposition: inline; filename=memless_slab

Slab should not allocate control structures for nodes without memory. This may
seem to work right now but it's unreliable since not all allocations can fall
back due to the use of GFP_THISNODE.

Switching a few for_each_online_node's to for_each_memory_node will allow us to
only allocate for nodes that actually have memory.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Nishanth Aravamudan <nacc@us.ibm.com>

Index: linux-2.6.22-rc4-mm2/mm/slab.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/slab.c	2007-06-18 11:46:25.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/slab.c	2007-06-18 11:49:53.000000000 -0700
@@ -1564,7 +1564,7 @@ void __init kmem_cache_init(void)
 		/* Replace the static kmem_list3 structures for the boot cpu */
 		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node);
 
-		for_each_online_node(nid) {
+		for_each_memory_node(nid) {
 			init_list(malloc_sizes[INDEX_AC].cs_cachep,
 				  &initkmem_list3[SIZE_AC + nid], nid);
 
@@ -1942,7 +1942,7 @@ static void __init set_up_list3s(struct 
 {
 	int node;
 
-	for_each_online_node(node) {
+	for_each_memory_node(node) {
 		cachep->nodelists[node] = &initkmem_list3[index + node];
 		cachep->nodelists[node]->next_reap = jiffies +
 		    REAPTIMEOUT_LIST3 +
@@ -2073,7 +2073,7 @@ static int __init_refok setup_cpu_cache(
 			g_cpucache_up = PARTIAL_L3;
 		} else {
 			int node;
-			for_each_online_node(node) {
+			for_each_memory_node(node) {
 				cachep->nodelists[node] =
 				    kmalloc_node(sizeof(struct kmem_list3),
 						GFP_KERNEL, node);
@@ -3787,7 +3787,7 @@ static int alloc_kmemlist(struct kmem_ca
 	struct array_cache *new_shared;
 	struct array_cache **new_alien = NULL;
 
-	for_each_online_node(node) {
+	for_each_memory_node(node) {
 
                 if (use_alien_caches) {
                         new_alien = alloc_alien_cache(node, cachep->limit);

-- 

From clameter@sgi.com Mon Jun 18 11:53:45 2007
Message-Id: <20070618185344.626942398@sgi.com>
References: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:33 -0700
From: clameter@sgi.com
Subject: [patch 07/10] Memoryless nodes: SLUB support
Content-Disposition: inline; filename=memless_slub

Simply switch all for_each_online_node to for_each_memory_node. That way
SLUB only operates on nodes with memory. Any allocation attempt on a
memoryless node will fail, whereupon SLUB will fetch memory from a nearby
node (depending on how memory policies and cpusets describe fallback).

Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: linux-2.6.22-rc4-mm2/mm/slub.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/slub.c	2007-06-18 11:16:15.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/slub.c	2007-06-18 11:28:50.000000000 -0700
@@ -2086,7 +2086,7 @@ static void free_kmem_cache_nodes(struct
 {
 	int node;
 
-	for_each_online_node(node) {
+	for_each_memory_node(node) {
 		struct kmem_cache_node *n = s->node[node];
 		if (n && n != &s->local_node)
 			kmem_cache_free(kmalloc_caches, n);
@@ -2104,7 +2104,7 @@ static int init_kmem_cache_nodes(struct 
 	else
 		local_node = 0;
 
-	for_each_online_node(node) {
+	for_each_memory_node(node) {
 		struct kmem_cache_node *n;
 
 		if (local_node == node)
@@ -2366,7 +2366,7 @@ static inline int kmem_cache_close(struc
 	/* Attempt to free all objects */
 	free_kmem_cache_cpus(s);
 
-	for_each_online_node(node) {
+	for_each_memory_node(node) {
 		struct kmem_cache_node *n = get_node(s, node);
 
 		n->nr_partial -= free_list(s, n, &n->partial);
@@ -2937,7 +2937,7 @@ int kmem_cache_shrink(struct kmem_cache 
 	if (!scratch)
 		return -ENOMEM;
 
-	for_each_online_node(node)
+	for_each_memory_node(node)
 		__kmem_cache_shrink(s, get_node(s, node), scratch);
 
 	kfree(scratch);
@@ -3008,7 +3008,7 @@ int kmem_cache_defrag(int percent, int n
 		scratch = kmalloc(sizeof(struct list_head) * s->objects,
 								GFP_KERNEL);
 		if (node == -1) {
-			for_each_online_node(node)
+			for_each_memory_node(node)
 				pages += __kmem_cache_defrag(s, percent,
 							node, scratch);
 		} else
@@ -3392,7 +3392,7 @@ static unsigned long validate_slab_cache
 	unsigned long count = 0;
 
 	flush_all(s);
-	for_each_online_node(node) {
+	for_each_memory_node(node) {
 		struct kmem_cache_node *n = get_node(s, node);
 
 		count += validate_slab_node(s, n);
@@ -3611,7 +3611,7 @@ static int list_locations(struct kmem_ca
 	/* Push back cpu slabs */
 	flush_all(s);
 
-	for_each_online_node(node) {
+	for_each_memory_node(node) {
 		struct kmem_cache_node *n = get_node(s, node);
 		unsigned long flags;
 		struct page *page;
@@ -3723,7 +3723,7 @@ static unsigned long slab_objects(struct
 		}
 	}
 
-	for_each_online_node(node) {
+	for_each_memory_node(node) {
 		struct kmem_cache_node *n = get_node(s, node);
 
 		if (flags & SO_PARTIAL) {
@@ -3751,7 +3751,7 @@ static unsigned long slab_objects(struct
 
 	x = sprintf(buf, "%lu", total);
 #ifdef CONFIG_NUMA
-	for_each_online_node(node)
+	for_each_memory_node(node)
 		if (nodes[node])
 			x += sprintf(buf + x, " N%d=%lu",
 					node, nodes[node]);
@@ -3772,7 +3772,7 @@ static int any_slab_objects(struct kmem_
 			return 1;
 	}
 
-	for_each_online_node(node) {
+	for_each_memory_node(node) {
 		struct kmem_cache_node *n = get_node(s, node);
 
 		if (n && (n->nr_partial || atomic_read(&n->nr_slabs)))

-- 

From clameter@sgi.com Mon Jun 18 11:53:46 2007
Message-Id: <20070618185345.388646330@sgi.com>
References: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:34 -0700
From: clameter@sgi.com
Cc: jes@sgi.com
Subject: [patch 08/10] Uncached allocator: Handle memoryless nodes
Content-Disposition: inline; filename=memless_mspec

The checks for node_online in the uncached allocator are there to ensure
that memory is available on those nodes. Thus switch all the checks to use
the node_memory and for_each_memory_node functions.

Cc: jes@sgi.com
Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: linux-2.6.22-rc4-mm2/arch/ia64/kernel/uncached.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/arch/ia64/kernel/uncached.c	2007-06-13 23:29:58.000000000 -0700
+++ linux-2.6.22-rc4-mm2/arch/ia64/kernel/uncached.c	2007-06-13 23:32:35.000000000 -0700
@@ -196,7 +196,7 @@ unsigned long uncached_alloc_page(int st
 	nid = starting_nid;
 
 	do {
-		if (!node_online(nid))
+		if (!node_memory(nid))
 			continue;
 		uc_pool = &uncached_pools[nid];
 		if (uc_pool->pool == NULL)
@@ -268,7 +268,7 @@ static int __init uncached_init(void)
 {
 	int nid;
 
-	for_each_online_node(nid) {
+	for_each_memory_node(nid) {
 		uncached_pools[nid].pool = gen_pool_create(PAGE_SHIFT, nid);
 		mutex_init(&uncached_pools[nid].add_chunk_mutex);
 	}
Index: linux-2.6.22-rc4-mm2/drivers/char/mspec.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/drivers/char/mspec.c	2007-06-13 23:28:15.000000000 -0700
+++ linux-2.6.22-rc4-mm2/drivers/char/mspec.c	2007-06-13 23:29:35.000000000 -0700
@@ -353,7 +353,7 @@ mspec_init(void)
 		is_sn2 = 1;
 		if (is_shub2()) {
 			ret = -ENOMEM;
-			for_each_online_node(nid) {
+			for_each_memory_node(nid) {
 				int actual_nid;
 				int nasid;
 				unsigned long phys;

-- 

From clameter@sgi.com Mon Jun 18 11:53:47 2007
Message-Id: <20070618185346.513959832@sgi.com>
References: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:35 -0700
From: clameter@sgi.com
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Subject: [patch 09/10] Memoryless node: Allow profiling data to fall back to other nodes
Content-Disposition: inline; filename=memless_profile

Processors on memoryless nodes must be able to fall back to remote nodes
in order to get a profiling buffer. This may lead to excessive NUMA traffic
but I think we should allow this rather than failing.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Nishanth Aravamudan <nacc@us.ibm.com>

Index: linux-2.6.22-rc4-mm2/kernel/profile.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/kernel/profile.c	2007-06-13 23:36:42.000000000 -0700
+++ linux-2.6.22-rc4-mm2/kernel/profile.c	2007-06-13 23:36:55.000000000 -0700
@@ -346,7 +346,7 @@ static int __devinit profile_cpu_callbac
 		per_cpu(cpu_profile_flip, cpu) = 0;
 		if (!per_cpu(cpu_profile_hits, cpu)[1]) {
 			page = alloc_pages_node(node,
-					GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+					GFP_KERNEL | __GFP_ZERO,
 					0);
 			if (!page)
 				return NOTIFY_BAD;
@@ -354,7 +354,7 @@ static int __devinit profile_cpu_callbac
 		}
 		if (!per_cpu(cpu_profile_hits, cpu)[0]) {
 			page = alloc_pages_node(node,
-					GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+					GFP_KERNEL | __GFP_ZERO,
 					0);
 			if (!page)
 				goto out_free;

-- 

From clameter@sgi.com Mon Jun 18 11:53:47 2007
Message-Id: <20070618185347.299689839@sgi.com>
References: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:36 -0700
From: clameter@sgi.com
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Subject: [patch 10/10] Memoryless nodes: Update memory policy and page migration
Content-Disposition: inline; filename=memless_migrate

Online nodes now may have no memory. The checks and initialization must therefore
be changed to no longer use the online functions.

This will correctly initialize the interleave on bootup to only target
nodes with memory and will make sys_move_pages return an error when a page
is to be moved to a memoryless node. Similarly we will get an error if
MPOL_BIND and MPOL_INTERLEAVE is used on a memoryless node.

These are somewhat new semantics. So far one could specify memoryless nodes
and we would maybe do the right thing and just ignore the node (or we'd do
something strange like with MPOL_INTERLEAVE). If we want to allow the
specification of memoryless nodes via memory policies then we need to keep
checking for online nodes.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Nishanth Aravamudan <nacc@us.ibm.com>

Index: linux-2.6.22-rc4-mm2/mm/migrate.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/migrate.c	2007-06-14 00:49:43.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/migrate.c	2007-06-18 11:30:53.000000000 -0700
@@ -963,7 +963,7 @@ asmlinkage long sys_move_pages(pid_t pid
 				goto out;
 
 			err = -ENODEV;
-			if (!node_online(node))
+			if (!node_memory(node))
 				goto out;
 
 			err = -EACCES;
Index: linux-2.6.22-rc4-mm2/mm/mempolicy.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/mempolicy.c	2007-06-18 11:24:51.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/mempolicy.c	2007-06-18 11:30:53.000000000 -0700
@@ -130,7 +130,7 @@ static int mpol_check_policy(int mode, n
 			return -EINVAL;
 		break;
 	}
-	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
+	return nodes_subset(*nodes, node_memory_map) ? 0 : -EINVAL;
 }
 
 /* Generate a custom zonelist for the BIND policy. */
@@ -495,9 +495,9 @@ static void get_zonemask(struct mempolic
 		*nodes = p->v.nodes;
 		break;
 	case MPOL_PREFERRED:
-		/* or use current node instead of online map? */
+		/* or use current node instead of memory_map? */
 		if (p->v.preferred_node < 0)
-			*nodes = node_online_map;
+			*nodes = node_memory_map;
 		else
 			node_set(p->v.preferred_node, *nodes);
 		break;
@@ -1606,7 +1606,7 @@ int mpol_parse_options(char *value, int 
 		*nodelist++ = '\0';
 		if (nodelist_parse(nodelist, *policy_nodes))
 			goto out;
-		if (!nodes_subset(*policy_nodes, node_online_map))
+		if (!nodes_subset(*policy_nodes, node_memory_map))
 			goto out;
 	}
 	if (!strcmp(value, "default")) {
@@ -1631,9 +1631,9 @@ int mpol_parse_options(char *value, int 
 			err = 0;
 	} else if (!strcmp(value, "interleave")) {
 		*policy = MPOL_INTERLEAVE;
-		/* Default to nodes online if no nodelist */
+		/* Default to nodes memory map if no nodelist */
 		if (!nodelist)
-			*policy_nodes = node_online_map;
+			*policy_nodes = node_memory_map;
 		err = 0;
 	}
 out:
@@ -1674,14 +1674,14 @@ void __init numa_policy_init(void)
 
 	/*
 	 * Use the specified nodemask for init, or fall back to
-	 * node_online_map.
+	 * node_memory_map.
 	 */
 	if (policy_sysinit == MPOL_DEFAULT)
 		nmask = NULL;
 	else if (!nodes_empty(nmask_sysinit))
 		nmask = &nmask_sysinit;
 	else
-		nmask = &node_online_map;
+		nmask = &node_memory_map;
 
 	if (do_set_mempolicy(policy_sysinit, nmask))
 		printk("numa_policy_init: setting init policy failed\n");
@@ -1945,7 +1945,7 @@ int show_numa_map(struct seq_file *m, vo
 		seq_printf(m, " huge");
 	} else {
 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
-				&node_online_map, MPOL_MF_STATS, md);
+				&node_memory_map, MPOL_MF_STATS, md);
 	}
 
 	if (!md->pages)
@@ -1972,7 +1972,7 @@ int show_numa_map(struct seq_file *m, vo
 	if (md->writeback)
 		seq_printf(m," writeback=%lu", md->writeback);
 
-	for_each_online_node(n)
+	for_each_memory_node(n)
 		if (md->node[n])
 			seq_printf(m, " N%d=%lu", n, md->node[n]);
 out:

-- 

