我们已经看到在分配页面时,如果页面数不够,那么会调用page_launder,reclaim_page,__free_page将页面换出,并重新投入分配。
为了避免总是在CPU忙碌的时候,也就是在缺页异常发生的时候,临时再来搜寻可供换出的内存页面并加以换出,Linux内核定期地检查并且预先将若干页面换出,腾出空间,以减轻系统在缺页异常发生时的负担。
为此,在Linux内核中设置了两个专司定期将页面换出、回收的“守护神”:kswapd和kreclaimd。
/*
 * kswapd_init - boot-time setup of the page-out machinery.
 *
 * Prints the start-up banner, calls swap_setup(), and then spawns two
 * kernel threads, one running kswapd() and one running kreclaimd().
 * Both threads are created with the same clone flags.
 *
 * Always returns 0; the results of kernel_thread() are not examined.
 */
static int __init kswapd_init(void)
{
	/* Clone flags shared by both daemon threads. */
	const unsigned long daemon_flags = CLONE_FS | CLONE_FILES | CLONE_SIGNAL;

	printk("Starting kswapd v1.8\n");
	swap_setup();

	kernel_thread(kswapd, NULL, daemon_flags);
	kernel_thread(kreclaimd, NULL, daemon_flags);

	return 0;
}
启动了两个内核线程,kswapd和kreclaimd。
首先分析kswapd,代码如下:
/*
 * kswapd - body of the page-out kernel thread (excerpt; "......" marks
 * code elided by the article).
 *
 * After naming itself and blocking all signals, the thread loops forever:
 * when inactive or free pages are short it calls do_try_to_free_pages(),
 * then runs one refill_inactive_scan() pass, and finally either sleeps on
 * kswapd_wait with a timeout of HZ or, when both shortages persist and
 * out_of_memory() reports true, calls oom_kill().
 *
 * @unused: thread argument, not referenced in the visible code.
 *
 * The visible code never returns from the loop.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;

	tsk->session =1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);
	kswapd_task = tsk;
	.....
	/* On official duty: mark this task with PF_MEMALLOC. */
	tsk->flags |= PF_MEMALLOC;

	/*
	 * Kswapd main loop.
	 */
	for (;;) {
		static int recalc = 0;

		/* If needed, try to free some memory. */
		if (inactive_shortage() || free_shortage()) {
			int wait = 0;
			/* Do we need to do some synchronous flushing? */
			if (waitqueue_active(&kswapd_done))
				wait = 1;
			/* The main work routine. */
			do_try_to_free_pages(GFP_KSWAPD, wait);
		}
		......
		refill_inactive_scan(6, 0);
		......
		......
		if (!free_shortage() || !inactive_shortage()) {
			/*
			 * At least one shortage is gone: sleep.  The HZ timeout
			 * wakes the thread after about one second, at which
			 * point the loop starts over.
			 */
			interruptible_sleep_on_timeout(&kswapd_wait, HZ);
			......
		} else if (out_of_memory()) {
			oom_kill();
		}
	}
}
在一些简单的初始化操作以后,程序便进入一个无限循环。在每次循环的末尾一般都会调用interruptible_sleep_on_timeout()进入睡眠,让内核自由地调度别的进程运行。但是内核在1秒钟后又会唤醒并调度kswapd继续运行,这时候kswapd就又回到这无限循环开始的地方。
这个函数执行的主体函数是do_try_to_free_pages,代码如下:
/*
 * do_try_to_free_pages - one round of page reclaim (excerpt; "......"
 * marks code elided by the article).
 *
 * @gfp_mask: allocation mask forwarded to every helper called here.
 * @user:     forwarded unchanged to page_launder() and refill_inactive().
 *
 * Returns the accumulated results of page_launder() and refill_inactive(),
 * or 1 when the second shortage test finds nothing short.
 */
static int do_try_to_free_pages(unsigned int gfp_mask, int user)
{
	int ret = 0;
	......
	/*
	 * Launder when free pages are short, or when the inactive-dirty
	 * pages outnumber the free plus inactive-clean pages.
	 */
	if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() + nr_inactive_clean_pages())
		ret += page_launder(gfp_mask, user);
	......
	if (free_shortage() || inactive_shortage()) {
		/*
		 * Still short: shrink the dentry and inode caches, then
		 * move pages off the active list.
		 */
		shrink_dcache_memory(6, gfp_mask);
		shrink_icache_memory(6, gfp_mask);
		ret += refill_inactive(gfp_mask, user);
	} else {
		......
		/* Nothing is short: only reap the slab caches and report success. */
		kmem_cache_reap(gfp_mask);
		ret = 1;
	}
	return ret;
}
shrink_dcache_memory和shrink_icache_memory用来回收积累起来的大量的dentry数据结构和inode数据结构。这些数据结构在文件关闭以后并不立即释放,而是放在LRU队列中作为后备,以防不久将来的文件操作又要用到。
kmem_cache_reap用于收割slab块。slab管理机制也是倾向于分配和保持更多的空闲物理页面,而不热衷于退还这些页面,所以过一段时间就要通过kmem_cache_reap来收割。
一、我们首先分析的是refill_inactive,代码如下:
/*
 * refill_inactive - deactivate pages until a target count is met
 * (excerpt; "......" marks code elided by the article).
 *
 * The target is inactive_shortage() + free_shortage(), or
 * (1 << page_cluster) when @user is non-zero.  The loop starts at
 * priority 6 and steps down to 0, decrementing only after a pass that
 * made no progress.
 *
 * @gfp_mask: allocation mask forwarded to the helpers.
 * @user:     non-zero selects the smaller, cluster-sized target.
 *
 * Returns non-zero when count ended below its starting value, i.e. at
 * least one page was deactivated or swapped out.
 */
static int refill_inactive(unsigned int gfp_mask, int user)
{
	int priority, count, start_count, made_progress;

	count = inactive_shortage() + free_shortage();
	if (user)
		count = (1 << page_cluster);
	start_count = count;

	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);

	/* Start at the gentlest level, 6, and increase the effort down to level 0. */
	priority = 6;
	do {
		made_progress = 0;

		/*
		 * A kernel thread has to yield voluntarily: it never returns
		 * to user space, so nothing else would check this flag.
		 */
		if (current->need_resched) {
			/* Stay runnable: we only want to give up the CPU for now. */
			__set_current_state(TASK_RUNNING);
			schedule();
		}

		while (refill_inactive_scan(priority, 1)) {
			made_progress = 1;
			/* Target reached: finish early. */
			if (--count <= 0)
				goto done;
		}
		......
		/* Reclaim the dentry and inode structures that have piled up. */
		shrink_dcache_memory(priority, gfp_mask);
		shrink_icache_memory(priority, gfp_mask);
		......
		while (swap_out(priority, gfp_mask)) {
			made_progress = 1;
			/* Target reached: finish early. */
			if (--count <= 0)
				goto done;
		}
		......
		/* Either shortage is gone: finish early as well. */
		if (!inactive_shortage() || !free_shortage())
			goto done;
		......
		/* Only step to the next level after a pass without progress. */
		if (!made_progress)
			priority--;
	} while (priority >= 0);

	/* Always end on a refill_inactive.., may sleep... */
	while (refill_inactive_scan(0, 1)) {
		if (--count <= 0)
			goto done;
	}

done:
	return (count < start_count);
}
1、refill_inactive_scan函数,如下:
/*
 * refill_inactive_scan - age pages on the active list and deactivate
 * those that have aged out (excerpt; "......" marks code elided by the
 * article).
 *
 * Scans at most (nr_active_pages >> priority) pages from the tail of
 * active_list while holding pagemap_lru_lock.
 *
 * @priority: shift applied to nr_active_pages to bound the scan.
 * @oneshot:  non-zero stops the scan after the first page that did not
 *            stay active.
 *
 * Returns 1 if at least one scanned page did not stay on the active
 * list, 0 otherwise.
 */
int refill_inactive_scan(unsigned int priority, int oneshot)
{
	struct list_head * page_lru;
	struct page * page;
	int maxscan, page_active = 0;
	int ret = 0;

	/* Take the lock while messing with the list... */
	spin_lock(&pagemap_lru_lock);
	/* Only priority 0 allows the whole list to be scanned. */
	maxscan = nr_active_pages >> priority;
	while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageActive(page)) {
			printk("VM: refill_inactive, wrong page on list.\n");
			list_del(page_lru);
			nr_active_pages--;
			continue;
		}

		/* Do aging on the pages. */
		if (PageTestandClearReferenced(page)) {
			/* Referenced since the last pass: age it up, keep it active. */
			age_page_up_nolock(page);
			page_active = 1;
		} else {
			/* Not referenced: reduce the page's age. */
			age_page_down_ageonly(page);
			......
			/*
			 * Age has reached zero and the count is within the
			 * limit (2 with buffers, else 1), e.g. a read-ahead
			 * page that no process has claimed: deactivate it.
			 */
			if (page->age == 0 && page_count(page) <= (page->buffers ? 2 : 1)) {
				deactivate_page_nolock(page);
				page_active = 0;
			} else {
				page_active = 1;
			}
		}
		......
		if (page_active || PageActive(page)) {
			/* Still active: re-queue it at the head of active_list. */
			list_del(page_lru);
			list_add(page_lru, &active_list);
		} else {
			ret = 1;
			if (oneshot)
				break;
		}
	}
	spin_unlock(&pagemap_lru_lock);

	return ret;
}
在《Linux内核源代码情景分析-内存管理之用户页面的换入》这篇文章中,我们已经看到:
预读后,未被进程认领的页面最后使用计数为1;
page->list链入mapping->clean_pages;
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了全局的active_list;
预读后,未被进程认领的页面会执行deactivate_page_nolock,代码如下:
/*
 * deactivate_page_nolock - reset a page's age and, when eligible, move
 * it from the active list to the inactive_dirty list.
 *
 * The function takes no lock itself; refill_inactive_scan() calls it
 * with pagemap_lru_lock held.
 *
 * The age and the referenced bit are always cleared.  The list move
 * happens only if the page is on the active list, its use count does
 * not exceed the limit below, and it is not a ramdisk page.
 */
void deactivate_page_nolock(struct page * page)
{
	int maxcount;

	/*
	 * One for the cache, one for the extra reference the
	 * caller has and (maybe) one for the buffers.
	 *
	 * This isn't perfect, but works for just about everything.
	 * Besides, as long as we don't move unfreeable pages to the
	 * inactive_clean list it doesn't need to be perfect...
	 */
	if (page->buffers)
		maxcount = 3;
	else
		maxcount = 2;

	page->age = 0;
	ClearPageReferenced(page);

	/*
	 * Don't touch it if it's not on the active list.
	 * (some pages aren't on any list at all)
	 */
	if (!PageActive(page))
		return;
	if (page_count(page) > maxcount)
		return;
	if (page_ramdisk(page))
		return;

	del_page_from_active_list(page);
	add_page_to_inactive_dirty_list(page);
}
del_page_from_active_list函数,如下:
/*
 * del_page_from_active_list - unlink @page from the active LRU list.
 *
 * Removes the page's lru entry from its list, clears the page's active
 * flag and decrements the global nr_active_pages counter.
 * DEBUG_ADD_PAGE and ZERO_PAGE_BUG are defined elsewhere (presumably
 * sanity checks; confirm against their definitions).
 *
 * Wrapped in do { } while (0) so that "macro(page);" behaves as a single
 * statement, including in an unbraced if/else.
 */
#define del_page_from_active_list(page) do { \
	list_del(&(page)->lru); \
	ClearPageActive(page); \
	nr_active_pages--; \
	DEBUG_ADD_PAGE \
	ZERO_PAGE_BUG \
} while (0)
add_page_to_inactive_dirty_list函数,如下:
/*
 * add_page_to_inactive_dirty_list - put @page on the global
 * inactive_dirty list.
 *
 * Sets the page's inactive-dirty flag, links its lru entry onto
 * inactive_dirty_list and increments both the global
 * nr_inactive_dirty_pages counter and the per-zone
 * inactive_dirty_pages counter.
 * DEBUG_ADD_PAGE and ZERO_PAGE_BUG are defined elsewhere (presumably
 * sanity checks; confirm against their definitions).
 *
 * Every use of the argument is parenthesized, and the body is wrapped in
 * do { } while (0) so that "macro(page);" is a single statement.
 */
#define add_page_to_inactive_dirty_list(page) do { \
	DEBUG_ADD_PAGE \
	ZERO_PAGE_BUG \
	SetPageInactiveDirty(page); \
	list_add(&(page)->lru, &inactive_dirty_list); \
	nr_inactive_dirty_pages++; \
	(page)->zone->inactive_dirty_pages++; \
} while (0)
预读后,未被进程认领的页面,执行后的结果是:
最后使用计数为1;
page->list链入mapping->clean_pages;//由于没有被访问过
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了全局的inactive_dirty_list;
也不用断开映射,因为本来就没有映射。
2、swap_out函数,如下:
/*
 * swap_out - pick the address space with the most pages still to be
 * examined and try to swap pages out of it (excerpt; "......" marks code
 * elided by the article).
 *
 * @priority: right-shift applied to the iteration budget; a smaller
 *            value gives a larger budget.
 * @gfp_mask: allocation mask forwarded to swap_out_mm().
 *
 * Returns the result of swap_out_mm() for the chosen mm, or 0 when no
 * candidate mm was found.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
	int counter;
	int __ret = 0;
	......
	/* The smaller the priority value (more urgent), the larger counter becomes. */
	counter = (nr_threads << SWAP_SHIFT) >> priority;
	if (counter < 1)
		counter = 1;

	for (; counter >= 0; counter--) {
		struct list_head *p;
		unsigned long max_cnt = 0;
		struct mm_struct *best = NULL;
		int assign = 0;
		int found_task = 0;
	select:
		spin_lock(&mmlist_lock);
		p = init_mm.mmlist.next;
		for (; p != &init_mm.mmlist; p = p->next) {
			struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist);
			if (mm->rss <= 0)
				continue;
			found_task++;
			/* Refresh swap_cnt? */
			if (assign == 1) {
				mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
				if (mm->swap_cnt < SWAP_MIN)
					mm->swap_cnt = SWAP_MIN;
			}
			/*
			 * swap_cnt is the number of this mm's pages not yet
			 * examined; remember the mm with the largest value.
			 */
			if (mm->swap_cnt > max_cnt) {
				max_cnt = mm->swap_cnt;
				best = mm;
			}
		}

		/* Make sure it doesn't disappear */
		if (best)
			atomic_inc(&best->mm_users);
		spin_unlock(&mmlist_lock);
		......
		if (!best) {
			/*
			 * No mm had a non-zero swap_cnt.  If there were
			 * candidates at all, refresh every swap_cnt and
			 * search once more.
			 */
			if (!assign && found_task > 0) {
				assign = 1;
				goto select;
			}
			break;
		} else {
			/* Do the actual work on the chosen mm. */
			__ret = swap_out_mm(best, gfp_mask);
			/* Drop the mm_users reference taken above. */
			mmput(best);
			break;
		}
	}
	return __ret;
}
swap_out_mm一层一层地往下调用,经过swap_out_vma(),swap_out_pgd(),swap_out_pmd(),一直到try_to_swap_out,试图换出由一个页面表项pte所指向的内存页面。
try_to_swap_out函数,如下:
/*
 * try_to_swap_out - try to unmap the page referenced by one page-table
 * entry (excerpt; "......" marks code elided by the article, so some
 * branches and the uses of the set_swap_pte / drop_pte labels are not
 * visible here).
 *
 * @mm:         address space being scanned; swap_cnt and rss are updated.
 * @vma:        area containing @address, passed to flush_tlb_page().
 * @address:    virtual address mapped by the entry.
 * @page_table: the page-table entry to examine.
 * @gfp_mask:   not referenced in the visible code.
 *
 * Returns 1 when mm->swap_cnt is already zero; every other visible path
 * returns 0.
 */
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	swp_entry_t entry;
	struct page * page;
	int onlist;

	pte = *page_table;
	/* Nothing to do unless the page is present in memory. */
	if (!pte_present(pte))
		goto out_failed;
	page = pte_page(pte);
	if ((!VALID_PAGE(page)) || PageReserved(page))
		goto out_failed;

	if (!mm->swap_cnt)
		return 1;

	/* One fewer page left to examine for this mm. */
	mm->swap_cnt--;

	onlist = PageActive(page);
	/* Don't look at this pte if it's been accessed recently. */
	if (ptep_test_and_clear_young(page_table)) {
		/* Accessed since the last scan: age it up and give up on it. */
		age_page_up(page);
		goto out_failed;
	}
	if (!onlist)
		/* The page is still mapped, so it can't be freeable... */
		age_page_down_ageonly(page);
	......
	/* Age is still above zero: leave the page alone. */
	if (page->age > 0)
		goto out_failed;

	if (TryLockPage(page))
		goto out_failed;
	......
	/*
	 * Reaching here means the entry was not recently accessed and
	 * page->age is 0: clear the page-table entry.
	 */
	pte = ptep_get_and_clear(page_table);
	flush_tlb_page(vma, address);
	......
	/* The page is already in the swap cache (on a swapper_space queue). */
	if (PageSwapCache(page)) {
		/* Per the article, index holds the location of the block on the swap device. */
		entry.val = page->index;
		/*
		 * The article notes that swap-in set the entry writable and
		 * dirty, so this branch is taken in its scenario.
		 */
		if (pte_dirty(pte))
			set_page_dirty(page);
set_swap_pte:
		swap_duplicate(entry);
		/* Make the page-table entry refer to the swap entry. */
		set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
		UnlockPage(page);
		mm->rss--;
		/* NOTE(review): presumably a wrapper around deactivate_page_nolock(); confirm. */
		deactivate_page(page);
		/* Drop one reference on the page. */
		page_cache_release(page);
out_failed:
		return 0;
	}
	......
}
把活跃页面变成不活跃脏的页面,要满足两点。
第一点,是最近没有被访问过,判断的标准是:
/*
 * ptep_test_and_clear_young - test and clear the accessed bit of a
 * page-table entry.
 *
 * @ptep: pointer to the page-table entry.
 *
 * Returns the result of test_and_clear_bit() on _PAGE_BIT_ACCESSED.
 */
static inline int ptep_test_and_clear_young(pte_t *ptep)
{
	int was_accessed;

	was_accessed = test_and_clear_bit(_PAGE_BIT_ACCESSED, ptep);
	return was_accessed;
}
也就是页面表项中的“是否被访问过”标志位(回忆页面换入时,页面表项的属性被设置为可写、脏)。当i386 CPU的内存映射机制在通过一个页面目录项和一个页面表项将一个线性地址映射成一个物理地址,进而访问这个物理地址时,就会自动将该表项的_PAGE_ACCESSED标志位设成1。
第二点,page->age等于0(代码中的判断是:只要page->age > 0就转到out_failed,不予换出)。
set_page_dirty函数,如下:
/*
 * set_page_dirty - mark a page dirty.
 *
 * Sets PG_dirty in page->flags.  __set_page_dirty() is called only when
 * the bit was previously clear, so a page that is already dirty is not
 * processed again.
 */
static inline void set_page_dirty(struct page * page)
{
	/* Already dirty: nothing more to do. */
	if (test_and_set_bit(PG_dirty, &page->flags))
		return;

	__set_page_dirty(page);
}
/*
 * __set_page_dirty - move a page onto its mapping's dirty_pages list.
 *
 * Under pagecache_lock the page's list entry is unlinked from whatever
 * list it is on and added to mapping->dirty_pages.  After the lock is
 * released, mark_inode_dirty_pages() is called on the mapping's host.
 */
void __set_page_dirty(struct page *page)
{
	struct address_space *owner = page->mapping;

	spin_lock(&pagecache_lock);
	list_del(&page->list);
	list_add(&page->list, &owner->dirty_pages);
	spin_unlock(&pagecache_lock);

	mark_inode_dirty_pages(owner->host);
}
page->flags的对应位被设置为PG_dirty,并且page->list链入mapping->dirty_pages。
最后执行完try_to_swap_out,结果是:
使用计数为1;
page->list链入mapping->dirty_pages;
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了全局的inactive_dirty_list;
page->flags的对应位被设置为PG_dirty。
由于out_failed返回0,使swap_out_mm能够依次考察和处理一个进程的所有页面。
二、看完了refill_inactive,返回函数do_try_to_free_pages,来看一下page_launder,代码如下:
/*
 * page_launder - scan the inactive_dirty list and move cleanable pages
 * to their zone's inactive_clean list (excerpt; "......" marks code
 * elided by the article).
 *
 * The list is scanned at most twice.  The first pass only re-queues
 * dirty pages; the second pass, entered only when __GFP_IO is set and
 * free pages are still short, calls the mapping's writepage method on
 * them.
 *
 * @gfp_mask: only the __GFP_IO bit is examined in the visible code.
 * @sync:     cleared before the second pass if pages were already
 *            cleaned; its use lies in the elided code.
 *
 * Returns the number of pages moved to an inactive_clean list.
 */
int page_launder(int gfp_mask, int sync)
{
	int launder_loop, maxscan, cleaned_pages, maxlaunder;
	int can_get_io_locks;
	struct list_head * page_lru;
	struct page * page;

	/*
	 * We can only grab the IO locks (eg. for flushing dirty
	 * buffers to disk) if __GFP_IO is set.
	 */
	can_get_io_locks = gfp_mask & __GFP_IO;

	launder_loop = 0;
	maxlaunder = 0;
	cleaned_pages = 0;

dirty_page_rescan:
	spin_lock(&pagemap_lru_lock);
	maxscan = nr_inactive_dirty_pages;
	while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list && maxscan-- > 0) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageInactiveDirty(page)) {
			/* Not an inactive-dirty page: it was queued here by mistake. */
			printk("VM: page_launder, wrong page on list.\n");
			list_del(page_lru);
			nr_inactive_dirty_pages--;
			page->zone->inactive_dirty_pages--;
			continue;
		}

		/* Page is or was in use? Move it to the active list. */
		if (PageTestandClearReferenced(page) || page->age > 0 ||
		    /* (the article defers the explanation of this test to a later post) */
		    (!page->buffers && page_count(page) > 1) || page_ramdisk(page)) {
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);
			continue;
		}

		/*
		 * The page is locked. IO in progress?
		 * Move it to the back of the list.
		 */
		if (TryLockPage(page)) {
			/* Already locked: re-queue it with list_add() on the list head. */
			list_del(page_lru);
			list_add(page_lru, &inactive_dirty_list);
			continue;
		}

		/*
		 * Dirty swap-cache page? Write it out if
		 * last copy..
		 */
		if (PageDirty(page)) {
			int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
			int result;

			if (!writepage)
				goto page_active;

			/* First time through? Move it to the back of the list */
			if (!launder_loop) {
				/* launder_loop is 0 on the first pass: only re-queue the page. */
				list_del(page_lru);
				list_add(page_lru, &inactive_dirty_list);
				UnlockPage(page);
				continue;
			}

			/* Second pass: clear the dirty bit and write the page out. */
			ClearPageDirty(page);
			/* Take an extra reference across the write. */
			page_cache_get(page);
			spin_unlock(&pagemap_lru_lock);

			/* Write-out through the mapping's writepage method. */
			result = writepage(page);
			/* Drop the extra reference. */
			page_cache_release(page);

			/* And re-start the thing.. */
			spin_lock(&pagemap_lru_lock);
			if (result != 1)
				continue;
			/* writepage refused to do anything */
			/* (per the article, this path is not taken in its scenario) */
			set_page_dirty(page);
			goto page_active;
		}
		/* Per the article, a page reaching this point is clean. */
		......
		if (page->buffers) {
			......
		} else if (page->mapping && !PageDirty(page)) {
			/*
			 * Clean page with a mapping (including one that was
			 * dirty and has just been written): move it to the
			 * inactive_clean list.
			 */
			.......
			del_page_from_inactive_dirty_list(page);
			add_page_to_inactive_clean_list(page);
			UnlockPage(page);
			cleaned_pages++;
		} else {
page_active:
			......
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);
			UnlockPage(page);
		}
	}
	spin_unlock(&pagemap_lru_lock);
	......
	/* IO is allowed, this was the first pass, and free pages are still short. */
	if (can_get_io_locks && !launder_loop && free_shortage()) {
		/* At most two passes are made in total. */
		launder_loop = 1;
		/* If we cleaned pages, never do synchronous IO. */
		if (cleaned_pages)
			sync = 0;
		/* We only do a few "out of order" flushes. */
		maxlaunder = MAX_LAUNDER;
		/* Kflushd takes care of the rest. */
		wakeup_bdflush(0);
		goto dirty_page_rescan;
	}

	/* Return the number of pages moved to the inactive_clean list. */
	return cleaned_pages;
}
其中del_page_from_inactive_dirty_list函数,如下:
/*
 * del_page_from_inactive_dirty_list - unlink @page from the
 * inactive_dirty list.
 *
 * Removes the page's lru entry from its list, clears the page's
 * inactive-dirty flag and decrements both the global
 * nr_inactive_dirty_pages counter and the per-zone
 * inactive_dirty_pages counter.
 * DEBUG_ADD_PAGE and ZERO_PAGE_BUG are defined elsewhere (presumably
 * sanity checks; confirm against their definitions).
 *
 * Every use of the argument is parenthesized, and the body is wrapped in
 * do { } while (0) so that "macro(page);" is a single statement.
 */
#define del_page_from_inactive_dirty_list(page) do { \
	list_del(&(page)->lru); \
	ClearPageInactiveDirty(page); \
	nr_inactive_dirty_pages--; \
	(page)->zone->inactive_dirty_pages--; \
	DEBUG_ADD_PAGE \
	ZERO_PAGE_BUG \
} while (0)
add_page_to_inactive_clean_list函数如下:
/*
 * add_page_to_inactive_clean_list - put @page on its zone's
 * inactive_clean list.
 *
 * Sets the page's inactive-clean flag, links its lru entry onto
 * page->zone->inactive_clean_list and increments the zone's
 * inactive_clean_pages counter.
 * DEBUG_ADD_PAGE and ZERO_PAGE_BUG are defined elsewhere (presumably
 * sanity checks; confirm against their definitions).
 *
 * Every use of the argument is parenthesized, and the body is wrapped in
 * do { } while (0) so that "macro(page);" is a single statement.
 */
#define add_page_to_inactive_clean_list(page) do { \
	DEBUG_ADD_PAGE \
	ZERO_PAGE_BUG \
	SetPageInactiveClean(page); \
	list_add(&(page)->lru, &(page)->zone->inactive_clean_list); \
	(page)->zone->inactive_clean_pages++; \
} while (0)
最后执行完page_launder,结果是:
使用计数为1;
page->list链入mapping->dirty_pages或者clean_pages(保持原样);
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了page->zone->inactive_clean_list;
然后,我们分析kreclaimd,代码如下:
int kreclaimd(void *unused) { struct task_struct *tsk = current; pg_data_t *pgdat; tsk->session = 1; tsk->pgrp = 1; strcpy(tsk->comm, "kreclaimd"); sigfillset(&tsk->blocked); current->flags |= PF_MEMALLOC;//执行公务 while (1) { /* * We sleep until someone wakes us up from * page_alloc.c::__alloc_pages(). */ interruptible_sleep_on(&kreclaimd_wait); /* * Move some pages from the inactive_clean lists to * the free lists, if it is needed. */ pgdat = pgdat_list; do { int i; for(i = 0; i < MAX_NR_ZONES; i++) { zone_t *zone = pgdat->node_zones + i; if (!zone->size) continue; while (zone->free_pages < zone->pages_low) { struct page * page; page = reclaim_page(zone);//主体代码 if (!page) break; __free_page(page); } } pgdat = pgdat->node_next; } while (pgdat); } }
reclaim_page代码如下:
/*
 * reclaim_page - take one reclaimable page off a zone's inactive_clean
 * list and detach it from the page/swap cache.
 *
 * Scans from the tail of zone->inactive_clean_list, at most
 * zone->inactive_clean_pages entries, holding pagecache_lock and
 * pagemap_lru_lock.  Pages that turn out to be in use go back to the
 * active list; pages with buffers, dirty pages or locked pages go to the
 * inactive_dirty list.  The first page that passes both tests is removed
 * from the swap cache or from its inode's page cache and returned.
 *
 * @zone: zone whose inactive_clean list is scanned.
 *
 * Returns the reclaimed page with its age reset to PAGE_AGE_START, or
 * NULL when the scan found nothing.  memory_pressure is incremented on
 * every call.
 */
struct page * reclaim_page(zone_t * zone)
{
	struct page * page = NULL;
	struct list_head * page_lru;
	int maxscan;

	/*
	 * We only need the pagemap_lru_lock if we don't reclaim the page,
	 * but we have to grab the pagecache_lock before the pagemap_lru_lock
	 * to avoid deadlocks and most of the time we'll succeed anyway.
	 */
	spin_lock(&pagecache_lock);
	spin_lock(&pagemap_lru_lock);
	maxscan = zone->inactive_clean_pages;
	/* Walk zone->inactive_clean_list from its tail. */
	while ((page_lru = zone->inactive_clean_list.prev) != &zone->inactive_clean_list && maxscan--) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageInactiveClean(page)) {
			/* Not an inactive-clean page: it was queued here by mistake. */
			printk("VM: reclaim_page, wrong page on list.\n");
			list_del(page_lru);
			page->zone->inactive_clean_pages--;
			continue;
		}

		/* Page is or was in use? Move it to the active list. */
		/* (the article defers the explanation of this test to a later post) */
		if (PageTestandClearReferenced(page) || page->age > 0 || (!page->buffers && page_count(page) > 1)) {
			del_page_from_inactive_clean_list(page);
			add_page_to_active_list(page);
			continue;
		}

		/* The page is dirty, or locked, move to inactive_dirty list. */
		if (page->buffers || PageDirty(page) || TryLockPage(page)) {
			del_page_from_inactive_clean_list(page);
			add_page_to_inactive_dirty_list(page);
			continue;
		}

		/* OK, remove the page from the caches. */
		/* Swap-cache page (on a swapper_space queue): the case the article follows. */
		if (PageSwapCache(page)) {
			__delete_from_swap_cache(page);
			goto found_page;
		}

		if (page->mapping) {
			__remove_inode_page(page);
			goto found_page;
		}

		/* We should never ever get here. */
		printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
		list_del(page_lru);
		zone->inactive_clean_pages--;
		UnlockPage(page);
	}
	/* Reset page pointer, maybe we encountered an unfreeable page. */
	page = NULL;
	goto out;

found_page:
	/* Take the page off the inactive_clean list and unlock it. */
	del_page_from_inactive_clean_list(page);
	UnlockPage(page);
	page->age = PAGE_AGE_START;
	if (page_count(page) != 1)
		printk("VM: reclaim_page, found page with count %d!\n", page_count(page));
out:
	spin_unlock(&pagemap_lru_lock);
	spin_unlock(&pagecache_lock);
	memory_pressure++;
	return page;
}
__delete_from_swap_cache函数,代码如下:
/*
 * __delete_from_swap_cache - remove a page from the swap cache and
 * release its swap entry.
 *
 * The swap entry is read from page->index before the page is detached,
 * then passed to swap_free() afterwards.  When SWAP_CACHE_INFO is
 * defined, the swap_cache_del_total statistic is incremented.
 */
void __delete_from_swap_cache(struct page *page)
{
	swp_entry_t slot;

	/* Capture the entry first; it is needed after the page is detached. */
	slot.val = page->index;

#ifdef SWAP_CACHE_INFO
	swap_cache_del_total++;
#endif

	remove_from_swap_cache(page);
	swap_free(slot);
}
remove_from_swap_cache函数,代码如下:
/*
 * remove_from_swap_cache - detach a locked swap-cache page from
 * swapper_space.
 *
 * Triggers BUG() if the page's mapping is not swapper_space, and
 * PAGE_BUG() if the page is not both a swap-cache page and locked.
 * Otherwise it clears the swap-cache and dirty flags and hands the page
 * to __remove_inode_page().
 */
static inline void remove_from_swap_cache(struct page *page)
{
	if (page->mapping != &swapper_space)
		BUG();

	if (!(PageSwapCache(page) && PageLocked(page)))
		PAGE_BUG(page);

	PageClearSwapCache(page);
	/* __remove_inode_page() calls BUG() on a dirty page, so clear the bit first. */
	ClearPageDirty(page);
	__remove_inode_page(page);
}
__remove_inode_page函数,代码如下:
/*
 * __remove_inode_page - detach a clean page from its mapping and from
 * the page-cache hash.
 *
 * Calls BUG() if the page is still dirty.  Otherwise it removes the page
 * from its mapping's page queue and from the hash queue, and leaves
 * page->mapping set to NULL.
 */
void __remove_inode_page(struct page *page)
{
	if (PageDirty(page)) {
		BUG();
	}

	remove_page_from_inode_queue(page);
	remove_page_from_hash_queue(page);

	page->mapping = NULL;
}
remove_page_from_inode_queue函数,代码如下:
/*
 * remove_page_from_inode_queue - unlink a page from its mapping's page
 * list.
 *
 * Decrements the mapping's nrpages count, removes page->list from
 * whichever list it is on, and clears page->mapping.
 */
static inline void remove_page_from_inode_queue(struct page * page)
{
	page->mapping->nrpages--;
	list_del(&page->list);
	page->mapping = NULL;
}
remove_page_from_hash_queue函数,代码如下:
/*
 * remove_page_from_hash_queue - unlink a page from its page-cache hash
 * chain.
 *
 * The chain is singly linked through next_hash, with pprev_hash pointing
 * at the slot that refers to this page.  The successor, if there is one,
 * inherits that slot; the slot is then made to refer to the successor.
 * Finally page->pprev_hash is cleared and page_cache_size is
 * decremented.
 */
static inline void remove_page_from_hash_queue(struct page * page)
{
	struct page *succ = page->next_hash;
	struct page **slot = page->pprev_hash;

	if (succ != NULL)
		succ->pprev_hash = slot;
	*slot = succ;

	page->pprev_hash = NULL;
	atomic_dec(&page_cache_size);
}
del_page_from_inactive_clean_list函数,代码如下:
/*
 * del_page_from_inactive_clean_list - unlink @page from its zone's
 * inactive_clean list.
 *
 * Removes the page's lru entry from its list, clears the page's
 * inactive-clean flag and decrements the zone's inactive_clean_pages
 * counter.
 * DEBUG_ADD_PAGE and ZERO_PAGE_BUG are defined elsewhere (presumably
 * sanity checks; confirm against their definitions).
 *
 * Every use of the argument is parenthesized, and the body is wrapped in
 * do { } while (0) so that "macro(page);" is a single statement.
 */
#define del_page_from_inactive_clean_list(page) do { \
	list_del(&(page)->lru); \
	ClearPageInactiveClean(page); \
	(page)->zone->inactive_clean_pages--; \
	DEBUG_ADD_PAGE \
	ZERO_PAGE_BUG \
} while (0)
最后执行完reclaim_page,结果是:
使用计数为1;
page->list为空;
page->next_hash和page->pprev_hash为空;
page->lru为空;
回到kreclaimd,会执行__free_page,此时使用计数减为0,回收这个页面到free_area[MAX_ORDER],下次alloc_page就能分配到了。
/*
 * __free_pages - drop one reference on a page block and free it when
 * that was the last one.
 *
 * @page:  first page of the block.
 * @order: passed through to __free_pages_ok().
 *
 * Reserved pages are left untouched.  For any other page the use count
 * is decremented, and __free_pages_ok() is called only when
 * put_page_testzero() reports that the count reached zero.
 */
void __free_pages(struct page *page, unsigned long order)
{
	if (PageReserved(page))
		return;

	/* Someone else still holds a reference: nothing to free yet. */
	if (!put_page_testzero(page))
		return;

	__free_pages_ok(page, order);
}
总结:
kswapd内核线程:
1、refill_inactive_scan和swap_out,把活跃的页面变成不活跃脏的页面。挑选的原则是最近没有被访问,且age已经减为0。
2、page_launder,把不活跃脏的页面变成不活跃干净的页面。
kreclaimd内核线程:
3、reclaim_page,把不活跃干净的页面的所有链表关系都清除,但使用计数仍然为1。
4、__free_page,此时使用计数减为0,回收这个页面到free_area[MAX_ORDER],下次alloc_page就能分配到了。