VM statistics: framework in page allocator The current statistics for slab, pagecache and pagetable use are kept in a per cpu array in order to avoid having to use locks when updating these counters. However, this means that the counters only have meaning when added up for all processors. The counter for slab allocations just means that this processor has performed some slab allocations. The counter cannot be used to see how many slab pages were allocated on a specific node or in a zone. The same is true for the page cache and pagetables. The following patch moves the keeping of the statistics into the page allocator. Pages are requested by specifying a type (slab, pagetable, pagecache) and the page allocator will count the pages during allocation and freeing of pages. That is possible since the page allocator already needs to take the zone lock in order to access the free lists. We can now have accurate statistics per zone for slab cache, page table and page cache use since the counters are now protected by a lock. A nice side effect is that code size shrinks since we do not need these calls to inc_page_state() anymore. Only i386, x86_64 and ia64 are supported right now. The other arches would need to add __GFP_PAGETABLE to page table allocations in order to get the correct statistics. 
Signed-off-by: Christoph Lameter Index: linux-2.6.15-rc3/include/linux/gfp.h =================================================================== --- linux-2.6.15-rc3.orig/include/linux/gfp.h 2005-11-28 19:51:27.000000000 -0800 +++ linux-2.6.15-rc3/include/linux/gfp.h 2005-11-30 18:29:45.000000000 -0800 @@ -47,6 +47,9 @@ struct vm_area_struct; #define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */ #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */ #define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */ +#define __GFP_SLAB ((__force gfp_t)0x40000u) /* Request a slab page */ +#define __GFP_PAGETABLE ((__force gfp_t)0x80000u) /* Request pagetable page */ +#define __GFP_PAGECACHE ((__force gfp_t)0x100000u) /* Request pagecache page */ -#define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) Index: linux-2.6.15-rc3/include/linux/mmzone.h =================================================================== --- linux-2.6.15-rc3.orig/include/linux/mmzone.h 2005-11-28 19:51:27.000000000 -0800 +++ linux-2.6.15-rc3/include/linux/mmzone.h 2005-11-30 18:29:45.000000000 -0800 @@ -147,6 +147,9 @@ struct zone { unsigned long nr_scan_inactive; unsigned long nr_active; unsigned long nr_inactive; + unsigned long nr_slab; + unsigned long nr_pagecache; + unsigned long nr_pagetable; unsigned long pages_scanned; /* since last reclaim */ int all_unreclaimable; /* All pages pinned */ @@ -315,9 +318,12 @@ typedef struct pglist_data { extern struct pglist_data *pgdat_list; void __get_zone_counts(unsigned long *active, unsigned long *inactive, - unsigned long *free, struct pglist_data *pgdat); + unsigned long *free, unsigned long *pagetable, + unsigned long *slab, unsigned long *pagecache, + struct pglist_data *pgdat); void get_zone_counts(unsigned long *active, unsigned long *inactive, - unsigned long *free); + unsigned long 
*free, unsigned long *pagetable, + unsigned long *slab, unsigned long *pagecache); void build_all_zonelists(void); void wakeup_kswapd(struct zone *zone, int order); int zone_watermark_ok(struct zone *z, int order, unsigned long mark, Index: linux-2.6.15-rc3/include/linux/page-flags.h =================================================================== --- linux-2.6.15-rc3.orig/include/linux/page-flags.h 2005-11-28 19:51:27.000000000 -0800 +++ linux-2.6.15-rc3/include/linux/page-flags.h 2005-11-30 18:29:45.000000000 -0800 @@ -59,7 +59,7 @@ #define PG_dirty 4 #define PG_lru 5 #define PG_active 6 -#define PG_slab 7 /* slab debug (Suparna wants this) */ +#define PG_slab 7 /* Page is used for slab */ #define PG_checked 8 /* kill me in 2.5.. */ #define PG_arch_1 9 @@ -76,6 +76,9 @@ #define PG_nosave_free 18 /* Free, should not be written */ #define PG_uncached 19 /* Page has been mapped as uncached */ +#define PG_pagecache 20 /* Page is used for page cache */ +#define PG_pagetable 21 /* Page is used for page tables */ + /* * Global page accounting. One instance per CPU. Only unsigned longs are * allowed. @@ -84,10 +87,8 @@ struct page_state { unsigned long nr_dirty; /* Dirty writeable pages */ unsigned long nr_writeback; /* Pages under writeback */ unsigned long nr_unstable; /* NFS unstable pages */ - unsigned long nr_page_table_pages;/* Pages used for pagetables */ unsigned long nr_mapped; /* mapped into pagetables */ - unsigned long nr_slab; /* In slab */ -#define GET_PAGE_STATE_LAST nr_slab +#define GET_PAGE_STATE_LAST nr_mapped /* * The below are zeroed by get_page_state(). 
Use get_full_page_state() @@ -215,6 +216,18 @@ extern void __mod_page_state(unsigned lo #define TestClearPageSlab(page) test_and_clear_bit(PG_slab, &(page)->flags) #define TestSetPageSlab(page) test_and_set_bit(PG_slab, &(page)->flags) +#define PageCache(page) test_bit(PG_pagecache, &(page)->flags) +#define SetPageCache(page) set_bit(PG_pagecache, &(page)->flags) +#define ClearPageCache(page) clear_bit(PG_pagecache, &(page)->flags) +#define TestClearPageCache(page) test_and_clear_bit(PG_pagecache, &(page)->flags) +#define TestSetPageCache(page) test_and_set_bit(PG_pagecache, &(page)->flags) + +#define PageTable(page) test_bit(PG_pagetable, &(page)->flags) +#define SetPageTable(page) set_bit(PG_pagetable, &(page)->flags) +#define ClearPageTable(page) clear_bit(PG_pagetable, &(page)->flags) +#define TestClearPageTable(page) test_and_clear_bit(PG_pagetable, &(page)->flags) +#define TestSetPageTable(page) test_and_set_bit(PG_pagetable, &(page)->flags) + #ifdef CONFIG_HIGHMEM #define PageHighMem(page) is_highmem(page_zone(page)) #else Index: linux-2.6.15-rc3/mm/page_alloc.c =================================================================== --- linux-2.6.15-rc3.orig/mm/page_alloc.c 2005-11-30 18:29:00.000000000 -0800 +++ linux-2.6.15-rc3/mm/page_alloc.c 2005-11-30 18:32:14.000000000 -0800 @@ -300,6 +300,13 @@ static inline void __free_pages_bulk (st unsigned long page_idx; int order_size = 1 << order; + if (PageSlab(page)) + zone->nr_slab -= order_size; + if (PageCache(page)) + zone->nr_pagecache -= order_size; + if (PageTable(page)) + zone->nr_pagetable -= order_size; + if (unlikely(order)) destroy_compound_page(page, order); @@ -345,7 +352,6 @@ static inline int free_pages_check(const 1 << PG_locked | 1 << PG_active | 1 << PG_reclaim | - 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved ))) @@ -489,7 +495,6 @@ static int prep_new_page(struct page *pa 1 << PG_active | 1 << PG_dirty | 1 << PG_reclaim | - 1 << PG_slab | 1 << PG_swapcache | 1 << 
PG_writeback | 1 << PG_reserved ))) @@ -504,7 +509,9 @@ static int prep_new_page(struct page *pa page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_checked | 1 << PG_mappedtodisk); + 1 << PG_slab | 1 << PG_pagecache | + 1 << PG_pagetable | 1 << PG_checked | + 1 << PG_mappedtodisk); set_page_private(page, 0); set_page_refs(page, order); kernel_map_pages(page, 1 << order, 1); @@ -770,11 +777,28 @@ again: } if (page != NULL) { + int pages = 1 << order; + BUG_ON(bad_range(zone, page)); - mod_page_state_zone(zone, pgalloc, 1 << order); + mod_page_state_zone(zone, pgalloc, pages); if (prep_new_page(page, order)) goto again; + if (gfp_flags & __GFP_SLAB) { + SetPageSlab(page); + zone->nr_slab += pages; + } + + if (gfp_flags & __GFP_PAGECACHE) { + SetPageCache(page); + zone->nr_pagecache += pages; + } + + if (gfp_flags & __GFP_PAGETABLE) { + SetPageTable(page); + zone->nr_pagetable += pages; + } + if (gfp_flags & __GFP_ZERO) prep_zero_page(page, order, gfp_flags); @@ -1264,7 +1288,9 @@ void __mod_page_state(unsigned long offs EXPORT_SYMBOL(__mod_page_state); void __get_zone_counts(unsigned long *active, unsigned long *inactive, - unsigned long *free, struct pglist_data *pgdat) + unsigned long *free, unsigned long *pagetable, + unsigned long *slab, unsigned long *pagecache, + struct pglist_data *pgdat) { struct zone *zones = pgdat->node_zones; int i; @@ -1272,27 +1298,40 @@ void __get_zone_counts(unsigned long *ac *active = 0; *inactive = 0; *free = 0; + *pagetable = 0; + *slab = 0; + *pagecache = 0; for (i = 0; i < MAX_NR_ZONES; i++) { *active += zones[i].nr_active; *inactive += zones[i].nr_inactive; *free += zones[i].free_pages; + *pagetable += zones[i].nr_pagetable; + *slab += zones[i].nr_slab; + *pagecache += zones[i].nr_pagecache; } } -void get_zone_counts(unsigned long *active, - unsigned long *inactive, unsigned long *free) +void get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free, 
unsigned long *pagetable, + unsigned long *slab, unsigned long *pagecache) { struct pglist_data *pgdat; *active = 0; *inactive = 0; *free = 0; + *pagetable = 0; + *slab = 0; + *pagecache = 0; for_each_pgdat(pgdat) { - unsigned long l, m, n; - __get_zone_counts(&l, &m, &n, pgdat); + unsigned long l, m, n, o, p, q; + __get_zone_counts(&l, &m, &n, &o, &p, &q, pgdat); *active += l; *inactive += m; *free += n; + *pagetable += o; + *slab += p; + *pagecache += q; } } @@ -1341,6 +1380,9 @@ void show_free_areas(void) unsigned long active; unsigned long inactive; unsigned long free; + unsigned long pagecache; + unsigned long pagetable; + unsigned long slab; struct zone *zone; for_each_zone(zone) { @@ -1370,23 +1412,24 @@ void show_free_areas(void) } get_page_state(&ps); - get_zone_counts(&active, &inactive, &free); + get_zone_counts(&active, &inactive, &free, &pagetable, &slab, &pagecache); printk("Free pages: %11ukB (%ukB HighMem)\n", K(nr_free_pages()), K(nr_free_highpages())); printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " - "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", + "unstable:%lu free:%u slab:%lu pagecache:%lu mapped:%lu pagetables:%lu\n", active, inactive, ps.nr_dirty, ps.nr_writeback, ps.nr_unstable, nr_free_pages(), - ps.nr_slab, + slab, + pagecache, ps.nr_mapped, - ps.nr_page_table_pages); + pagetable); for_each_zone(zone) { int i; @@ -1397,6 +1440,9 @@ void show_free_areas(void) " min:%lukB" " low:%lukB" " high:%lukB" + " pagecache:%lukB" + " slab: %lukB" + " pagetables:%luKB" " active:%lukB" " inactive:%lukB" " present:%lukB" @@ -1408,6 +1454,9 @@ void show_free_areas(void) K(zone->pages_min), K(zone->pages_low), K(zone->pages_high), + K(zone->nr_pagecache), + K(zone->nr_slab), + K(zone->nr_pagetable), K(zone->nr_active), K(zone->nr_inactive), K(zone->present_pages), @@ -2025,6 +2074,9 @@ static void __init free_area_init_core(s zone->nr_scan_inactive = 0; zone->nr_active = 0; zone->nr_inactive = 0; + zone->nr_slab = 0; + 
zone->nr_pagecache = 0; + zone->nr_pagetable = 0; atomic_set(&zone->reclaim_in_progress, 0); if (!size) continue; Index: linux-2.6.15-rc3/drivers/base/node.c =================================================================== --- linux-2.6.15-rc3.orig/drivers/base/node.c 2005-11-28 19:51:27.000000000 -0800 +++ linux-2.6.15-rc3/drivers/base/node.c 2005-11-30 18:29:45.000000000 -0800 @@ -43,10 +43,13 @@ static ssize_t node_read_meminfo(struct unsigned long inactive; unsigned long active; unsigned long free; + unsigned long pagetable; + unsigned long slab; + unsigned long pagecache; si_meminfo_node(&i, nid); get_page_state_node(&ps, nid); - __get_zone_counts(&active, &inactive, &free, NODE_DATA(nid)); + __get_zone_counts(&active, &inactive, &free, &pagetable, &slab, &pagecache, NODE_DATA(nid)); /* Check for negative values in these approximate counters */ if ((long)ps.nr_dirty < 0) @@ -55,8 +58,6 @@ static ssize_t node_read_meminfo(struct ps.nr_writeback = 0; if ((long)ps.nr_mapped < 0) ps.nr_mapped = 0; - if ((long)ps.nr_slab < 0) - ps.nr_slab = 0; n = sprintf(buf, "\n" "Node %d MemTotal: %8lu kB\n" @@ -70,7 +71,9 @@ static ssize_t node_read_meminfo(struct "Node %d LowFree: %8lu kB\n" "Node %d Dirty: %8lu kB\n" "Node %d Writeback: %8lu kB\n" + "Node %d Pagecache: %8lu kB\n" "Node %d Mapped: %8lu kB\n" + "Node %d Pagetable: %8lu kB\n" "Node %d Slab: %8lu kB\n", nid, K(i.totalram), nid, K(i.freeram), @@ -83,8 +86,10 @@ static ssize_t node_read_meminfo(struct nid, K(i.freeram - i.freehigh), nid, K(ps.nr_dirty), nid, K(ps.nr_writeback), + nid, K(pagecache), nid, K(ps.nr_mapped), - nid, K(ps.nr_slab)); + nid, K(pagetable), + nid, K(slab)); n += hugetlb_report_node_meminfo(nid, buf + n); return n; } Index: linux-2.6.15-rc3/fs/proc/proc_misc.c =================================================================== --- linux-2.6.15-rc3.orig/fs/proc/proc_misc.c 2005-11-28 19:51:27.000000000 -0800 +++ linux-2.6.15-rc3/fs/proc/proc_misc.c 2005-11-30 18:29:45.000000000 -0800 @@ 
-124,13 +124,16 @@ static int meminfo_read_proc(char *page, unsigned long inactive; unsigned long active; unsigned long free; + unsigned long pagetable; + unsigned long slab; + unsigned long pagecache; unsigned long committed; unsigned long allowed; struct vmalloc_info vmi; long cached; get_page_state(&ps); - get_zone_counts(&active, &inactive, &free); + get_zone_counts(&active, &inactive, &free, &pagetable, &slab, &pagecache); /* * display in kilobytes. @@ -142,7 +145,7 @@ static int meminfo_read_proc(char *page, allowed = ((totalram_pages - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100) + total_swap_pages; - cached = get_page_cache_size() - total_swapcache_pages - i.bufferram; + cached = pagecache - total_swapcache_pages - i.bufferram; if (cached < 0) cached = 0; @@ -191,10 +194,10 @@ static int meminfo_read_proc(char *page, K(ps.nr_dirty), K(ps.nr_writeback), K(ps.nr_mapped), - K(ps.nr_slab), + K(slab), K(allowed), K(committed), - K(ps.nr_page_table_pages), + K(pagetable), (unsigned long)VMALLOC_TOTAL >> 10, vmi.used >> 10, vmi.largest_chunk >> 10 Index: linux-2.6.15-rc3/mm/readahead.c =================================================================== --- linux-2.6.15-rc3.orig/mm/readahead.c 2005-11-28 19:51:27.000000000 -0800 +++ linux-2.6.15-rc3/mm/readahead.c 2005-11-30 18:29:45.000000000 -0800 @@ -565,7 +565,8 @@ unsigned long max_sane_readahead(unsigne unsigned long active; unsigned long inactive; unsigned long free; + unsigned long a,b,c; - __get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id())); + __get_zone_counts(&active, &inactive, &free, &a, &b, &c, NODE_DATA(numa_node_id())); return min(nr, (inactive + free) / 2); } Index: linux-2.6.15-rc3/arch/i386/mm/pgtable.c =================================================================== --- linux-2.6.15-rc3.orig/arch/i386/mm/pgtable.c 2005-11-28 19:51:27.000000000 -0800 +++ linux-2.6.15-rc3/arch/i386/mm/pgtable.c 2005-11-30 18:29:45.000000000 -0800 @@ -27,6 +27,7 @@ void 
show_mem(void) int total = 0, reserved = 0; int shared = 0, cached = 0; int highmem = 0; + int slab = 0, pagetable = 0; struct page *page; pg_data_t *pgdat; unsigned long i; @@ -43,6 +44,10 @@ void show_mem(void) total++; if (PageHighMem(page)) highmem++; + if (PageSlab(page)) + slab++; + if (PageTable(page)) + pagetable++; if (PageReserved(page)) reserved++; else if (PageSwapCache(page)) @@ -62,8 +67,8 @@ void show_mem(void) printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty); printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback); printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped); - printk(KERN_INFO "%lu pages slab\n", ps.nr_slab); - printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages); + printk(KERN_INFO "%d pages slab\n", slab); + printk(KERN_INFO "%d pages pagetables\n", pagetable); } /*