Use huge pages for the virtual memory map

Using huge pages for the virtual memory map reduces TLB pressure. If the
kernel is configured without HUGETLB support then we fall back to the
regular page size. We simply place the virtual memory map in the huge
page region.

Note that this has the strange effect on IA64 that the huge page size
affects the TLB pressure from the memory map: the larger the huge page
size, the less TLB pressure and the more fragmentation effects. With 1MB
of page structs we can map 16k pages, which is 256 megabytes. So if we
have 4GB of RAM per node (hopefully properly aligned) then we can map the
complete memory of a node with a single 16MB huge page, using one TLB
entry instead of the 1024 needed right now. Since the address space is
sparse there is no additional effect (apart from wasting memory) if the
huge page size is increased beyond that limit.

Signed-off-by: Christoph Lameter
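
Not part of the patch, just a user-space sketch of the arithmetic in the
changelog above. It assumes 16KB base pages, a 64-byte struct page
(1 << STRUCT_PAGE_ORDER) and a 16MB huge page; the 4GB node size is the
example value from the text.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long page_size   = 16ULL << 10;	/* assumed base page size */
		unsigned long long struct_page = 1ULL << 6;	/* 1 << STRUCT_PAGE_ORDER */
		unsigned long long hpage_size  = 16ULL << 20;	/* assumed huge page size */
		unsigned long long node_mem    = 4ULL << 30;	/* example node memory */

		/* bytes of page structs needed to describe one node's memory */
		unsigned long long memmap = node_mem / page_size * struct_page;

		/* TLB entries needed to map that part of the memory map */
		unsigned long long base_entries = memmap / page_size;
		unsigned long long huge_entries = (memmap + hpage_size - 1) / hpage_size;

		printf("memmap per node: %llu MB\n", memmap >> 20);
		printf("TLB entries: %llu with base pages, %llu with huge pages\n",
		       base_entries, huge_entries);
		return 0;
	}

This prints a 16MB memmap per 4GB node, mapped by 1024 base-page TLB
entries versus a single huge page entry, matching the numbers above.
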
Index: linux-2.6.18-mm3/arch/ia64/mm/init.c
===================================================================
--- linux-2.6.18-mm3.orig/arch/ia64/mm/init.c	2006-10-06 17:19:28.640655301 -0500
+++ linux-2.6.18-mm3/arch/ia64/mm/init.c	2006-10-06 18:40:28.479684738 -0500
@@ -463,6 +463,12 @@ retry_pte:
 	return hole_next_pfn - pgdat->node_start_pfn;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+#define VMEM_MAP_PAGE_SIZE	(1UL << hpage_shift)
+#else
+#define VMEM_MAP_PAGE_SIZE	PAGE_SIZE
+#endif
+
 int __init
 create_mem_map_page_table (u64 start, u64 end, void *arg)
 {
@@ -483,11 +489,11 @@ create_mem_map_page_table (u64 start, u6
 			~(MAX_ORDER_NR_PAGES - 1));
 	map_end = pfn_to_page(ALIGN(__pa(end) >> PAGE_SHIFT, MAX_ORDER_NR_PAGES));
 
-	start_page = (unsigned long) map_start & PAGE_MASK;
-	end_page = PAGE_ALIGN((unsigned long) map_end);
+	start_page = (unsigned long) map_start & ~(VMEM_MAP_PAGE_SIZE - 1);
+	end_page = ALIGN((unsigned long) map_end, VMEM_MAP_PAGE_SIZE);
 	node = paddr_to_nid(__pa(start));
 
-	for (address = start_page; address < end_page; address += PAGE_SIZE) {
+	for (address = start_page; address < end_page; address += VMEM_MAP_PAGE_SIZE) {
 		pgd = pgd_offset_k(address);
 		if (pgd_none(*pgd))
 			pgd_populate(&init_mm, pgd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
@@ -501,9 +507,23 @@ create_mem_map_page_table (u64 start, u6
 			pmd_populate_kernel(&init_mm, pmd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
 		pte = pte_offset_kernel(pmd, address);
 
-		if (pte_none(*pte))
-			set_pte(pte, pfn_pte(__pa(alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)) >> PAGE_SHIFT,
-				 PAGE_KERNEL));
+		if (pte_none(*pte)) {
+			unsigned long addr;
+
+			addr = __pa(__alloc_bootmem_node(NODE_DATA(node),
+				VMEM_MAP_PAGE_SIZE,
+				VMEM_MAP_PAGE_SIZE,
+				__pa(MAX_DMA_ADDRESS)));
+#ifdef CONFIG_HUGETLB_PAGE
+			printk(KERN_CRIT "Huge virtual mmap range %lx-%lx page @%lx:%lx size=%lu node=%d\n", start, end, address, addr, VMEM_MAP_PAGE_SIZE, node);
+#endif
+			set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
+		}
+#ifdef CONFIG_HUGETLB_PAGE
+		else
+			printk(KERN_CRIT "Huge virtual mmap %lx-%lx @%lx node %d already present.\n",
+				start, end, address, node);
+#endif
 	}
 	return 0;
 }
Index: linux-2.6.18-mm3/include/asm-ia64/page.h
===================================================================
--- linux-2.6.18-mm3.orig/include/asm-ia64/page.h	2006-10-06 17:19:28.582056647 -0500
+++ linux-2.6.18-mm3/include/asm-ia64/page.h	2006-10-06 19:40:56.731070296 -0500
@@ -99,23 +99,41 @@ do { \
 
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 
+#ifdef CONFIG_VIRTUAL_MEM_MAP
 /*
  * STRUCT_PAGE_ORDER is needed to approximate the size of struct page
  * that is unknown at this point. struct page must be smaller than
  * 1 << STRUCT_PAGE_ORDER.
  */
 #define STRUCT_PAGE_ORDER	6
-
-#define VIRTUAL_MEM_MAP		(RGN_BASE(RGN_GATE) + 0x200000000UL)
 #define VIRTUAL_MEM_MAP_SIZE	(1UL << (IA64_MAX_PHYS_BITS - PAGE_SHIFT +\
 					STRUCT_PAGE_ORDER))
+#ifdef CONFIG_HUGETLB_PAGE
+
+/*
+ * Use huge pages for the virtual memory map. Since we have separate
+ * address space for the kernel we can just use the whole 1 Petabyte
+ * range. Hmmm.... Are there context switch issues?
+ */
+#define VIRTUAL_MEM_MAP_REGION	RGN_HPAGE
+#define VIRTUAL_MEM_MAP		(RGN_BASE(VIRTUAL_MEM_MAP_REGION) + 0x0UL)
+#define VMALLOC_START		(RGN_BASE(RGN_GATE) + 0x200000000UL)
+#else
+
+/*
+ * Place the virtual memory map in the VMALLOC area reducing the
+ * available address space of 128 TB by 8 TB.
+ */
+#define VIRTUAL_MEM_MAP_REGION	RGN_GATE
+#define VIRTUAL_MEM_MAP		(RGN_BASE(VIRTUAL_MEM_MAP_REGION) + 0x200000000UL)
 #define VMALLOC_START		(VIRTUAL_MEM_MAP + VIRTUAL_MEM_MAP_SIZE)
+#endif
+
 #define VMALLOC_END		(RGN_BASE(RGN_GATE) + (1UL << (4*PAGE_SHIFT - 9)))
 
 #define virt_addr_valid(kaddr)	pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
 
-#ifdef CONFIG_VIRTUAL_MEM_MAP
 extern int ia64_pfn_valid (unsigned long pfn);
 #elif defined(CONFIG_FLATMEM)
 # define ia64_pfn_valid(pfn) 1
Index: linux-2.6.18-mm3/arch/ia64/mm/fault.c
===================================================================
--- linux-2.6.18-mm3.orig/arch/ia64/mm/fault.c	2006-10-06 17:17:43.000000000 -0500
+++ linux-2.6.18-mm3/arch/ia64/mm/fault.c	2006-10-06 17:22:51.052138046 -0500
@@ -105,13 +105,14 @@ ia64_do_page_fault (unsigned long addres
 
 #ifdef CONFIG_VIRTUAL_MEM_MAP
 	/*
-	 * If fault is in region 5 and we are in the kernel, we may already
-	 * have the mmap_sem (pfn_valid macro is called during mmap). There
-	 * is no vma for region 5 addr's anyway, so skip getting the semaphore
-	 * and go directly to the exception handling code.
+	 * If fault is in the VIRTUAL_MEM_MAP region and we are in the kernel,
+	 * we may already have the mmap_sem (pfn_valid macro is called during
+	 * mmap). There is no vma for VIRTUAL_MEM_MAP's region anyway, so skip
+	 * getting the semaphore and go directly to the exception handling
+	 * code.
 	 */
 
-	if ((REGION_NUMBER(address) == 5) && !user_mode(regs))
+	if ((REGION_NUMBER(address) == VIRTUAL_MEM_MAP_REGION) && !user_mode(regs))
 		goto bad_area_no_up;
 #endif