Revise page migration: 1. Make page migration independent of swap. 2. Faster migration of anonymous pages. 3. Migrate VM_LOCKED pages. 4. Add VM_DONTMOVE and MAP_DONTMOVE to allow the allocation of memory segments that are not migratable. TODO: - Limit swap code so that it does not use the last swap file. Signed-off-by: Christoph Lameter Index: linux-2.6.16-mm2/mm/swap_state.c =================================================================== --- linux-2.6.16-mm2.orig/mm/swap_state.c 2006-03-30 21:21:06.000000000 -0800 +++ linux-2.6.16-mm2/mm/swap_state.c 2006-03-30 21:21:16.000000000 -0800 @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -300,6 +301,13 @@ struct page * lookup_swap_cache(swp_entr { struct page *page; + /* + * If the swap type is SWP_TYPE_MIGRATION then the + * swap entry contains the pfn of a page. + */ + if (swp_type(entry) == SWP_TYPE_MIGRATION) + return pfn_to_page(swp_offset(entry)); + page = find_get_page(&swapper_space, entry.val); if (page) Index: linux-2.6.16-mm2/mm/memory.c =================================================================== --- linux-2.6.16-mm2.orig/mm/memory.c 2006-03-30 21:21:06.000000000 -0800 +++ linux-2.6.16-mm2/mm/memory.c 2006-03-30 21:21:16.000000000 -0800 @@ -1879,7 +1879,6 @@ static int do_swap_page(struct mm_struct goto out; entry = pte_to_swp_entry(orig_pte); -again: page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); @@ -1906,8 +1905,7 @@ again: if (!PageSwapCache(page)) { /* Page migration has occured */ unlock_page(page); - page_cache_release(page); - goto again; + goto out; } /* Index: linux-2.6.16-mm2/mm/migrate.c =================================================================== --- linux-2.6.16-mm2.orig/mm/migrate.c 2006-03-30 21:21:06.000000000 -0800 +++ linux-2.6.16-mm2/mm/migrate.c 2006-03-30 21:44:23.000000000 -0800 @@ -16,8 +16,7 @@ #include #include #include -#include /* for try_to_release_page(), - buffer_heads_over_limit */ 
+#include #include #include #include @@ -28,8 +27,6 @@ #include "internal.h" -#include "internal.h" - /* The maximum number of pages to take off the LRU for migration */ #define MIGRATE_CHUNK_SIZE 256 @@ -73,10 +70,6 @@ int isolate_lru_page(struct page *page, */ int migrate_prep(void) { - /* Must have swap device for migration */ - if (nr_swap_pages <= 0) - return -ENODEV; - /* * Clear the LRU lists so pages can be isolated. * Note that pages may be moved off the LRU after we have @@ -84,7 +77,6 @@ int migrate_prep(void) * pages that may be busy. */ lru_add_drain_all(); - return 0; } @@ -193,8 +185,9 @@ int migrate_page_remove_references(struc * indicates that the page is in use or truncate has removed * the page. */ - if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) - return -EAGAIN; + if (!page->mapping || + page_mapcount(page) + nr_refs + !!mapping != page_count(page)) + return -EAGAIN; /* * Establish swap ptes for anonymous pages or destroy pte @@ -216,25 +209,36 @@ int migrate_page_remove_references(struc * is still set and the operation may continue. */ if (try_to_unmap(page, 1) == SWAP_FAIL) - /* A vma has VM_LOCKED set -> permanent failure */ + /* A vma has VM_DONTMOVE set -> permanent failure */ return -EPERM; + if (!mapping) + /* + * An anonymous page that is not on swap. We have removed all + * ptes and therefore are the only ones accessing the page. + */ + return 0; + /* * Give up if we were unable to remove all mappings. */ if (page_mapcount(page)) return -EAGAIN; + /* + * This page is backed by a mapping. So we have no special migration ptes + * to worry about. 
+ */ write_lock_irq(&mapping->tree_lock); radix_pointer = (struct page **)radix_tree_lookup_slot( &mapping->page_tree, page_index(page)); - if (!page_mapping(page) || page_count(page) != nr_refs || + if (!page_mapping(page) || page_count(page) != nr_refs + 1 || *radix_pointer != page) { write_unlock_irq(&mapping->tree_lock); - return 1; + return -EAGAIN; } /* @@ -256,7 +260,6 @@ int migrate_page_remove_references(struc *radix_pointer = newpage; __put_page(page); write_unlock_irq(&mapping->tree_lock); - return 0; } EXPORT_SYMBOL(migrate_page_remove_references); @@ -266,6 +269,8 @@ EXPORT_SYMBOL(migrate_page_remove_refere */ void migrate_page_copy(struct page *newpage, struct page *page) { + int mapcount; + copy_highpage(newpage, page); if (PageError(page)) @@ -286,12 +291,29 @@ void migrate_page_copy(struct page *newp set_page_dirty(newpage); } + newpage->index = page->index; + newpage->mapping = page->mapping; + + /* + * Anonymous pages preserve the mapcount but have removed the ptes. + * These need to be transferred to the new page. + */ + mapcount = page_mapcount(page); + + set_page_count(newpage, page_count(newpage) + mapcount); + set_page_count(page, page_count(page) - mapcount); + + reset_page_mapcount(page); + reset_page_mapcount(newpage); + atomic_add(mapcount, &newpage->_mapcount); + ClearPageSwapCache(page); ClearPageActive(page); ClearPagePrivate(page); set_page_private(page, 0); page->mapping = NULL; + /* * If any waiters have accumulated on the new page then * wake them up. 
@@ -313,10 +335,12 @@ int migrate_page(struct page *newpage, s BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_remove_references(newpage, page, 2); + rc = migrate_page_remove_references(newpage, page, 1); - if (rc) + if (rc) { + remove_migration_ptes(page, page); return rc; + } migrate_page_copy(newpage, page); @@ -328,7 +352,7 @@ int migrate_page(struct page *newpage, s * waiting on the page lock to use the new page via the page tables * before the new page is unlocked. */ - remove_from_swap(newpage); + remove_migration_ptes(page, newpage); return 0; } EXPORT_SYMBOL(migrate_page); @@ -396,25 +420,13 @@ redo: * Only wait on writeback if we have already done a pass where * we we may have triggered writeouts for lots of pages. */ - if (pass > 0) { + if (pass > 0) wait_on_page_writeback(page); - } else { + else { if (PageWriteback(page)) goto unlock_page; } - /* - * Anonymous pages must have swap cache references otherwise - * the information contained in the page maps cannot be - * preserved. - */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!add_to_swap(page, GFP_KERNEL)) { - rc = -ENOMEM; - goto unlock_page; - } - } - if (!to) { rc = swap_page(page); goto next; @@ -428,9 +440,12 @@ redo: * Try to migrate the page. */ mapping = page_mapping(page); - if (!mapping) + if (!mapping) { + + rc = migrate_page(newpage, page); goto unlock_both; + } else if (mapping->a_ops->migratepage) { /* * Most pages have a mapping and most filesystems @@ -473,24 +488,6 @@ redo: goto unlock_both; } - /* - * On early passes with mapped pages simply - * retry. There may be a lock held for some - * buffers that may go away. Later - * swap them out. - */ - if (pass > 4) { - /* - * Persistently unable to drop buffers..... As a - * measure of last resort we fall back to - * swap_page(). 
- */ - unlock_page(newpage); - newpage = NULL; - rc = swap_page(page); - goto next; - } - unlock_both: unlock_page(newpage); @@ -540,7 +537,7 @@ int buffer_migrate_page(struct page *new head = page_buffers(page); - rc = migrate_page_remove_references(newpage, page, 3); + rc = migrate_page_remove_references(newpage, page, 2); if (rc) return rc; @@ -577,7 +574,6 @@ int buffer_migrate_page(struct page *new bh = bh->b_this_page; } while (bh != head); - return 0; } EXPORT_SYMBOL(buffer_migrate_page); Index: linux-2.6.16-mm2/mm/swapfile.c =================================================================== --- linux-2.6.16-mm2.orig/mm/swapfile.c 2006-03-30 21:21:06.000000000 -0800 +++ linux-2.6.16-mm2/mm/swapfile.c 2006-03-30 21:21:16.000000000 -0800 @@ -395,6 +395,9 @@ void free_swap_and_cache(swp_entry_t ent struct swap_info_struct * p; struct page *page = NULL; + if (swp_type(entry) == SWP_TYPE_MIGRATION) + return; + p = swap_info_get(entry); if (p) { if (swap_entry_free(p, swp_offset(entry)) == 1) @@ -609,15 +612,6 @@ static int unuse_mm(struct mm_struct *mm return 0; } -#ifdef CONFIG_MIGRATION -int remove_vma_swap(struct vm_area_struct *vma, struct page *page) -{ - swp_entry_t entry = { .val = page_private(page) }; - - return unuse_vma(vma, entry, page); -} -#endif - /* * Scan swap_map from current position to next entry still in use. * Recycle to start on reaching the end, returning 0 when empty. 
@@ -710,7 +704,6 @@ static int try_to_unuse(unsigned int typ */ swap_map = &si->swap_map[i]; entry = swp_entry(type, i); -again: page = read_swap_cache_async(entry, NULL, 0); if (!page) { /* @@ -745,12 +738,6 @@ again: wait_on_page_locked(page); wait_on_page_writeback(page); lock_page(page); - if (!PageSwapCache(page)) { - /* Page migration has occured */ - unlock_page(page); - page_cache_release(page); - goto again; - } wait_on_page_writeback(page); /* @@ -1704,6 +1691,9 @@ int swap_duplicate(swp_entry_t entry) int result = 0; type = swp_type(entry); + if (type == SWP_TYPE_MIGRATION) + return 1; + if (type >= nr_swapfiles) goto bad_file; p = type + swap_info; Index: linux-2.6.16-mm2/mm/rmap.c =================================================================== --- linux-2.6.16-mm2.orig/mm/rmap.c 2006-03-30 21:21:15.000000000 -0800 +++ linux-2.6.16-mm2/mm/rmap.c 2006-03-30 21:39:16.000000000 -0800 @@ -205,44 +205,6 @@ out: return anon_vma; } -#ifdef CONFIG_MIGRATION -/* - * Remove an anonymous page from swap replacing the swap pte's - * through real pte's pointing to valid pages and then releasing - * the page from the swap cache. - * - * Must hold page lock on page and mmap_sem of one vma that contains - * the page. - */ -void remove_from_swap(struct page *page) -{ - struct anon_vma *anon_vma; - struct vm_area_struct *vma; - unsigned long mapping; - - if (!PageSwapCache(page)) - return; - - mapping = (unsigned long)page->mapping; - - if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) - return; - - /* - * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. - */ - anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); - - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) - remove_vma_swap(vma, page); - - spin_unlock(&anon_vma->lock); - delete_from_swap_cache(page); -} -EXPORT_SYMBOL(remove_from_swap); -#endif - /* * At what user virtual address is page expected in vma? 
*/ @@ -291,7 +253,7 @@ pte_t *page_check_address(struct page *p pgd_t *pgd; pud_t *pud; pmd_t *pmd; - pte_t *pte; + pte_t *ptep, pte; spinlock_t *ptl; pgd = pgd_offset(mm, address); @@ -306,24 +268,77 @@ pte_t *page_check_address(struct page *p if (!pmd_present(*pmd)) return NULL; - pte = pte_offset_map(pmd, address); + ptep = pte_offset_map(pmd, address); + pte = *ptep; /* Make a quick check before getting the lock */ - if (!pte_present(*pte)) { - pte_unmap(pte); + if (pte_none(pte) || pte_file(pte)) { + pte_unmap(ptep); return NULL; } ptl = pte_lockptr(mm, pmd); spin_lock(ptl); - if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { - *ptlp = ptl; - return pte; + if (pte_present(pte)) { + if (page_to_pfn(page) == pte_pfn(pte)) { + *ptlp = ptl; + return ptep; + } + } else { + /* Could still be a migration entry pointing to the page */ + swp_entry_t entry = pte_to_swp_entry(pte); + + if (swp_type(entry) == SWP_TYPE_MIGRATION && + swp_offset(entry) == page_to_pfn(page)) { + *ptlp = ptl; + return ptep; + } } pte_unmap_unlock(pte, ptl); return NULL; } /* + * Restore a potential migration pte to a working pte entry for + * anonymous pages. + */ +static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, + struct page *old, struct page *new) +{ + struct mm_struct *mm = vma->vm_mm; + pte_t *ptep; + spinlock_t *ptl; + + ptep = page_check_address(old, mm, addr, &ptl); + if (!ptep) + return; + + set_pte_at(mm, addr, ptep, pte_mkold(mk_pte(new, vma->vm_page_prot))); + spin_unlock(ptl); +} + +/* + * Get rid of all migration entries and replace them by + * references to the indicated page. 
+ */ +void remove_migration_ptes(struct page *page, struct page *newpage) +{ + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + + if (!PageAnon(newpage)) + return; + + anon_vma = page_lock_anon_vma(newpage); + BUG_ON(!anon_vma); + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) + remove_migration_pte(vma, page_address_in_vma(newpage, vma), + page, newpage); + + spin_unlock(&anon_vma->lock); +} + +/* * Subfunctions of page_referenced: page_referenced_one called * repeatedly from either page_referenced_anon or page_referenced_file. */ @@ -578,7 +593,7 @@ void page_remove_rmap(struct page *page) * repeatedly from either try_to_unmap_anon or try_to_unmap_file. */ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - int ignore_refs) + int migration) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -600,9 +615,13 @@ static int try_to_unmap_one(struct page * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. */ - if ((vma->vm_flags & (VM_LOCKED|VM_DONTMOVE)) || - (ptep_clear_flush_young(vma, address, pte) - && !ignore_refs)) { + if (migration && (vma->vm_flags & VM_DONTMOVE)) { + ret = SWAP_FAIL; + goto out_unmap; + } + + if (!migration && ((vma->vm_flags & VM_LOCKED) || + ptep_clear_flush_young(vma, address, pte))) { ret = SWAP_FAIL; goto out_unmap; } @@ -620,6 +639,17 @@ static int try_to_unmap_one(struct page if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; + + if (!PageSwapCache(page) && migration) { + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page will wait until the page is unlocked + * and then restart the fault handling. + */ + entry = swp_entry(SWP_TYPE_MIGRATION, page_to_pfn(page)); + set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); + goto out_unmap; + } /* * Store the swap location in the pte. * See handle_pte_fault() ... 
@@ -764,7 +794,7 @@ static int try_to_unmap_anon(struct page * * This function is only called from try_to_unmap for object-based pages. */ -static int try_to_unmap_file(struct page *page, int ignore_refs) +static int try_to_unmap_file(struct page *page, int migration) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -778,7 +808,7 @@ static int try_to_unmap_file(struct page spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - ret = try_to_unmap_one(page, vma, ignore_refs); + ret = try_to_unmap_one(page, vma, migration); if (ret == SWAP_FAIL || !page_mapped(page)) goto out; } @@ -863,16 +893,16 @@ out: * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable */ -int try_to_unmap(struct page *page, int ignore_refs) +int try_to_unmap(struct page *page, int migration) { int ret; BUG_ON(!PageLocked(page)); if (PageAnon(page)) - ret = try_to_unmap_anon(page, ignore_refs); + ret = try_to_unmap_anon(page, migration); else - ret = try_to_unmap_file(page, ignore_refs); + ret = try_to_unmap_file(page, migration); if (!page_mapped(page)) ret = SWAP_SUCCESS; Index: linux-2.6.16-mm2/include/linux/rmap.h =================================================================== --- linux-2.6.16-mm2.orig/include/linux/rmap.h 2006-03-19 21:53:29.000000000 -0800 +++ linux-2.6.16-mm2/include/linux/rmap.h 2006-03-30 21:21:16.000000000 -0800 @@ -92,7 +92,6 @@ static inline void page_dup_rmap(struct */ int page_referenced(struct page *, int is_locked); int try_to_unmap(struct page *, int ignore_refs); -void remove_from_swap(struct page *page); /* * Called from mm/filemap_xip.c to unmap empty zero page @@ -105,6 +104,11 @@ pte_t *page_check_address(struct page *, */ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); +/* + * Used by page migration to restore ptes of anonymous pages + */ +void remove_migration_ptes(struct 
page *page, struct page *newpage); + #else /* !CONFIG_MMU */ #define anon_vma_init() do {} while (0) Index: linux-2.6.16-mm2/include/linux/swap.h =================================================================== --- linux-2.6.16-mm2.orig/include/linux/swap.h 2006-03-30 21:21:05.000000000 -0800 +++ linux-2.6.16-mm2/include/linux/swap.h 2006-03-30 21:45:22.000000000 -0800 @@ -29,7 +29,10 @@ static inline int current_is_kswapd(void * the type/offset into the pte as 5/27 as well. */ #define MAX_SWAPFILES_SHIFT 5 -#define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT) +#define MAX_SWAPFILES ((1 << MAX_SWAPFILES_SHIFT)-1) + +/* Use last entry for page migration swap entries */ +#define SWP_TYPE_MIGRATION MAX_SWAPFILES /* * Magic header for a swap area. The first part of the union is @@ -250,7 +253,6 @@ extern int remove_exclusive_swap_page(st struct backing_dev_info; extern spinlock_t swap_lock; -extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page); /* linux/mm/thrash.c */ extern struct mm_struct * swap_token_mm;