ZVC: Make the counters scale

The cachelines holding vm_stat and the per zone vm_stat are potential
hotspots in a large scale system. They are only written on every 32nd
update, but that is per counter, and the system may increment 5 or more
VM counters for every page fault, which cuts the effectiveness down
somewhat. Let's say we use 5 counters: then on roughly every 6th page
fault we need to write to the global counter cachelines. If we run that
on a system with 100 or more processors we get noticeable cacheline
contention (I have not yet had a chance to try this with more than 160
processors, but I expect it to get worse).

One way to reduce this is to update all counters whenever a single
counter overflows. At the point of overflow we have to acquire the
global vm_stat cacheline and the per zone cacheline for exclusive
access anyway, so we should do as much with them as we can. This
reduces the number of exclusive acquisitions of those cachelines: we
now only touch them when a single counter exceeds the threshold, and we
then fold in the pending updates from all the other counters as well.
With 5 counters we will likely end up transferring 32 + 4 * 16 = 96
counts in one go (assuming a 50% chance that each of the other counters
was incremented), i.e. around one touch of the global cachelines per
100 increments. (A standalone userspace sketch of this consolidation
scheme, and of the prefetch hint it relies on, follows the patch.)

On larger systems we may want to reduce the number of global updates
further by increasing the threshold. Smaller systems have less memory,
so we want to track memory usage more closely there; the threshold is
therefore reduced for them.

Tests show unstable results at 8p!

Tested and verified to scale up to 160 processors on IA64 NUMA.

Signed-off-by: Christoph Lameter

Index: linux-2.6.17-mm3/mm/vmstat.c
===================================================================
--- linux-2.6.17-mm3.orig/mm/vmstat.c	2006-06-27 17:12:49.625167407 -0700
+++ linux-2.6.17-mm3/mm/vmstat.c	2006-06-27 18:13:24.082680422 -0700
@@ -112,42 +112,82 @@ atomic_long_t vm_stat[NR_VM_ZONE_STAT_IT
 
 #ifdef CONFIG_SMP
 
+/*
+ * A higher cpu count means a higher possibility for contention.
+ * We increase the threshold as the number of processors increases.
+ */
+#if NR_CPUS <= 2
+#define STAT_THRESHOLD 8
+#elif NR_CPUS <= 4
+#define STAT_THRESHOLD 16
+#elif NR_CPUS <= 16
 #define STAT_THRESHOLD 32
+#elif NR_CPUS <= 64
+#define STAT_THRESHOLD 64
+#else
+#define STAT_THRESHOLD 126
+#endif
 
 /*
- * Determine pointer to currently valid differential byte given a zone and
- * the item number.
+ * Bring the global and zone counters up to date from counters in a pcp.
+ * This function is called when we know that some counters have to be
+ * updated.
  *
- * Preemption must be off
+ * Preemption must be disabled for the sake of the pcp pointer.
+ * Interrupts must be disabled since we rely on the
+ * differential count not changing while we update
+ * the zone and global counters.
  */
-static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item)
+static void zone_page_state_consolidate(struct zone *z, struct per_cpu_pageset *pcp)
 {
-	return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item];
+	/*
+	 * The global cachelines may be heavily contended.
+	 * If we get exclusive access to the cacheline then we need to
+	 * make best use of it by updating all counters.
+	 */
+	int i;
+	s8 *p;
+
+	/* Get exclusive access to the global cachelines ASAP */
+	prefetchw(vm_stat);
+	prefetchw(z->vm_stat);
+	p = pcp->vm_stat_diff;
+
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+		if (p[i]) {
+			zone_page_state_add(p[i], z, i);
+			p[i] = 0;
+		}
 }
 
 /*
  * For use when we know that interrupts are disabled.
+ *
+ * Interrupts must be disabled since we rely on the differential
+ * not changing between initial retrieval and final store.
  */
 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 				int delta)
 {
-	s8 *p;
-	long x;
-
-	p = diff_pointer(zone, item);
-	x = delta + *p;
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *p = &pcp->vm_stat_diff[item];
+	long x = delta + *p;
 
 	if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) {
-		zone_page_state_add(x, zone, item);
-		x = 0;
-	}
-
-	*p = x;
+		/*
+		 * We cannot update the differential since the
+		 * result is beyond the threshold.
+		 * Consolidate all counters first.
+		 */
+		zone_page_state_consolidate(zone, pcp);
+		zone_page_state_add(delta, zone, item);
+	} else
+		*p = x;
 }
 EXPORT_SYMBOL(__mod_zone_page_state);
 
 /*
- * For an unknown interrupt state
+ * When running with interrupts enabled
  */
 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 					int delta)
@@ -174,23 +214,17 @@ EXPORT_SYMBOL(mod_zone_page_state);
  * The increment or decrement is known and therefore one boundary check can
  * be omitted.
  *
- * Some processors have inc/dec instructions that are atomic vs an interrupt.
- * However, the code must first determine the differential location in a zone
- * based on the processor number and then inc/dec the counter. There is no
- * guarantee without disabling preemption that the processor will not change
- * in between and therefore the atomicity vs. interrupt cannot be exploited
- * in a useful way here.
+ * Note that interrupts must be DISABLED, since there cannot be any other
+ * counter operation between the increment and the check of the threshold
+ * condition.
  */
 static void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-	s8 *p = diff_pointer(zone, item);
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *p = &pcp->vm_stat_diff[item];
 
 	(*p)++;
-
-	if (unlikely(*p > STAT_THRESHOLD)) {
-		zone_page_state_add(*p, zone, item);
-		*p = 0;
-	}
+	if (unlikely(*p > STAT_THRESHOLD))
+		zone_page_state_consolidate(zone, pcp);
 }
 
 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -202,14 +236,12 @@ EXPORT_SYMBOL(__inc_zone_page_state);
 
 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
 {
 	struct zone *zone = page_zone(page);
-	s8 *p = diff_pointer(zone, item);
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *p = &pcp->vm_stat_diff[item];
 
 	(*p)--;
-
-	if (unlikely(*p < -STAT_THRESHOLD)) {
-		zone_page_state_add(*p, zone, item);
-		*p = 0;
-	}
+	if (unlikely(*p < -STAT_THRESHOLD))
+		zone_page_state_consolidate(zone, pcp);
 }
 EXPORT_SYMBOL(__dec_zone_page_state);
 
@@ -224,58 +256,45 @@ void inc_zone_state(struct zone *zone, e
 
 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
 {
-	unsigned long flags;
-	struct zone *zone;
-
-	zone = page_zone(page);
-	local_irq_save(flags);
-	__inc_zone_state(zone, item);
-	local_irq_restore(flags);
+	inc_zone_state(page_zone(page), item);
 }
 EXPORT_SYMBOL(inc_zone_page_state);
 
 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 {
 	unsigned long flags;
-	struct zone *zone;
-	s8 *p;
 
-	zone = page_zone(page);
 	local_irq_save(flags);
-	p = diff_pointer(zone, item);
-
-	(*p)--;
-
-	if (unlikely(*p < -STAT_THRESHOLD)) {
-		zone_page_state_add(*p, zone, item);
-		*p = 0;
-	}
+	__dec_zone_page_state(page, item);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(dec_zone_page_state);
 
 /*
  * Update the zone counters for one cpu.
+ * Preemption must be disabled.
  */
 void refresh_cpu_vm_stats(int cpu)
 {
 	struct zone *zone;
 	int i;
-	unsigned long flags;
 
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pcp;
 
 		pcp = zone_pcp(zone, cpu);
+		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+			s8 *p = pcp->vm_stat_diff + i;
+
+			if (*p) {
+				unsigned long flags;
 
-		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-			if (pcp->vm_stat_diff[i]) {
 				local_irq_save(flags);
-				zone_page_state_add(pcp->vm_stat_diff[i],
-					zone, i);
-				pcp->vm_stat_diff[i] = 0;
+				zone_page_state_add(*p, zone, i);
+				*p = 0;
 				local_irq_restore(flags);
 			}
+		}
 	}
 }
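
For anyone who wants to experiment with the consolidation policy
outside the kernel, here is a minimal userspace sketch of the scheme
(all names are invented for the illustration; this is not kernel code).
One counter is always incremented per simulated fault and the other
four with 50% probability, mirroring the estimate in the description:

#include <stdio.h>
#include <stdlib.h>

#define NR_ITEMS	5
#define STAT_THRESHOLD	32

static long global_stat[NR_ITEMS];	/* stand-in for the global vm_stat */
static signed char diff[NR_ITEMS];	/* stand-in for pcp->vm_stat_diff */

static unsigned long flushes;		/* touches of the "global cacheline" */
static unsigned long increments;	/* counter updates issued */

/* Fold every non-zero differential into the global counters in one go. */
static void consolidate(void)
{
	int i;

	flushes++;
	for (i = 0; i < NR_ITEMS; i++)
		if (diff[i]) {
			global_stat[i] += diff[i];
			diff[i] = 0;
		}
}

/* Toy equivalent of __mod_zone_page_state(). */
static void mod_state(int item, int delta)
{
	long x = diff[item] + delta;

	increments++;
	if (x > STAT_THRESHOLD || x < -STAT_THRESHOLD) {
		consolidate();
		global_stat[item] += delta;
	} else
		diff[item] = x;
}

int main(void)
{
	int fault, item;

	for (fault = 0; fault < 100000; fault++) {
		mod_state(0, 1);		/* one counter every fault... */
		for (item = 1; item < NR_ITEMS; item++)
			if (rand() & 1)		/* ...the rest 50% of the time */
				mod_state(item, 1);
	}
	printf("%lu increments, %lu flushes (~%lu increments per flush)\n",
	       increments, flushes, increments / flushes);
	return 0;
}

A typical run reports on the order of 100 increments per flush, which
matches the estimate above of one global cacheline touch per roughly
100 increments.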
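
The prefetchw() calls in zone_page_state_consolidate() request the
global cachelines in a writable state before the flush loop starts
storing into them, so ownership is already exclusive when the first
write lands. In portable userspace code the closest analogue is GCC's
__builtin_prefetch() with its write argument set; a hedged sketch (the
helper name is invented for the illustration):

#include <stddef.h>

/*
 * Fold differentials into global counters, hinting first that the
 * destination line is about to be written so it can be fetched for
 * ownership (exclusive state) rather than upgraded from shared on the
 * first store. Builtin arguments: address, rw (1 = write), temporal
 * locality (3 = keep in all cache levels).
 */
static void fold_diffs(long *global, signed char *diff, size_t n)
{
	size_t i;

	__builtin_prefetch(global, 1, 3);

	for (i = 0; i < n; i++)
		if (diff[i]) {
			global[i] += diff[i];
			diff[i] = 0;
		}
}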