Linux Kernel Source Code Scenario Analysis: Establishing the File-to-Virtual-Area Mapping After mmap()

I. Paging In File-Mapped Pages

After mmap() has been called (see "Linux Kernel Source Code Scenario Analysis: The mmap() System Call"), the first access to a page in the mapped region triggers a page fault, because the page has no mapping yet. The fault is routed to handle_pte_fault(), shown below, and from there to the handler do_no_page().
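For context, a minimal user-space sketch of this scenario is shown below (the file name and mapping size are illustrative, not taken from the original text); the first read of p[0] is exactly the access that drives the kernel path analyzed in this section.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("data.txt", O_RDONLY);	/* illustrative file name */
	if (fd < 0)
		return 1;

	/* Map the first page of the file, read-only and private. */
	char *p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	/* First touch: the page table entry is still empty, so the CPU faults,
	 * handle_pte_fault() sees pte_none() and calls do_no_page(). */
	printf("first byte: %c\n", p[0]);

	munmap(p, 4096);
	close(fd);
	return 0;
}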

static inline int handle_pte_fault(struct mm_struct *mm,
	struct vm_area_struct * vma, unsigned long address,
	int write_access, pte_t * pte)
{
	pte_t entry;

	/*
	 * We need the page table lock to synchronize with kswapd
	 * and the SMP-safe atomic PTE updates.
	 */
	spin_lock(&mm->page_table_lock);
	entry = *pte;
	if (!pte_present(entry)) {
		/*
	 * If it truly wasn't present, we know that kswapd
		 * and the PTE updates will not touch it later. So
		 * drop the lock.
		 */
		spin_unlock(&mm->page_table_lock);
		if (pte_none(entry))
			return do_no_page(mm, vma, address, write_access, pte);
		return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);
	}

	if (write_access) {
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address, pte, entry);

		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	establish_pte(vma, address, pte, entry);
	spin_unlock(&mm->page_table_lock);
	return 1;
}

Because pte_none(entry) is true, do_no_page() is executed. Its code is as follows:

static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
	unsigned long address, int write_access, pte_t *page_table)
{
	struct page * new_page;
	pte_t entry;

	if (!vma->vm_ops || !vma->vm_ops->nopage)
		return do_anonymous_page(mm, vma, page_table, write_access, address);

	/*
	 * The third argument is "no_share", which tells the low-level code
	 * to copy, not share the page even if sharing is possible.  It's
	 * essentially an early COW detection.
	 */
	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);	// for a file mapping this points to filemap_nopage()
	if (new_page == NULL)	/* no page was available -- SIGBUS */
		return 0;
	if (new_page == NOPAGE_OOM)
		return -1;
	++mm->rss;
	/*
	 * This silly early PAGE_DIRTY setting removes a race
	 * due to the bad i386 page protection. But it's valid
	 * for other architectures too.
	 *
	 * Note that if write_access is true, we either now have
	 * an exclusive copy of the page, or this is a shared mapping,
	 * so we can make it writable and dirty to avoid having to
	 * handle that later.
	 */
	flush_page_to_ram(new_page);
	flush_icache_page(vma, new_page);
	entry = mk_pte(new_page, vma->vm_page_prot);
	if (write_access) {
		entry = pte_mkwrite(pte_mkdirty(entry));
	} else if (page_count(new_page) > 1 &&
		   !(vma->vm_flags & VM_SHARED))
		entry = pte_wrprotect(entry);
	set_pte(page_table, entry);	// establish the mapping
	/* no need to invalidate: a not-present page shouldn't be cached */
	update_mmu_cache(vma, address, entry);
	return 2;	/* Major fault */
}

filemap_nopage() allocates a free memory page and reads the corresponding page of the file into it:

struct page * filemap_nopage(struct vm_area_struct * area,
	unsigned long address, int no_share)
{
	int error;
	struct file *file = area->vm_file;
	struct inode *inode = file->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;	// mapping comes from inode->i_mapping
	struct page *page, **hash, *old_page;
	unsigned long size, pgoff;

	pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;

retry_all:
	/*
	 * An external ptracer can access pages that normally aren't
	 * accessible..
	 */
	size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if ((pgoff >= size) && (area->vm_mm == current->mm))
		return NULL;

	/*
	 * Do we have something in the page cache already?
	 */
	hash = page_hash(mapping, pgoff);
retry_find:
	page = __find_get_page(mapping, pgoff, hash);	// not found yet on the first pass
	if (!page)
		goto no_cached_page;	// jump to no_cached_page

	/*
	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date.
	 */
	if (!Page_Uptodate(page))	// on the retry after page_cache_read(), the page exists but is not up-to-date yet, so jump to page_not_uptodate
		goto page_not_uptodate;

success:
 	/*
	 * Try read-ahead for sequential areas.
	 */
	if (VM_SequentialReadHint(area))
		nopage_sequential_readahead(area, pgoff, size);

	/*
	 * Found the page and have a reference on it, need to check sharing
	 * and possibly copy it over to another page..
	 */
	old_page = page;
	if (no_share) {
		struct page *new_page = page_cache_alloc();

		if (new_page) {
			copy_user_highpage(new_page, old_page, address);
			flush_page_to_ram(new_page);
		} else
			new_page = NOPAGE_OOM;
		page_cache_release(page);
		return new_page;
	}

	flush_page_to_ram(old_page);
	return old_page;

no_cached_page:
	/*
	 * If the requested offset is within our file, try to read a whole
	 * cluster of pages at once.
	 *
	 * Otherwise, we're off the end of a privately mapped file,
	 * so we need to map a zero page.
	 */
	if ((pgoff < size) && !VM_RandomReadHint(area))
		error = read_cluster_nonblocking(file, pgoff, size);
	else
		error = page_cache_read(file, pgoff);	// allocate a page structure and insert it into the relevant queues

	/*
	 * The page we want has now been added to the page cache.
	 * In the unlikely event that someone removed it in the
	 * meantime, we'll just come back here and read it again.
	 */
	if (error >= 0)
		goto retry_find;	// go back to retry_find

	/*
	 * An error return from page_cache_read can result if the
	 * system is low on memory, or a problem occurs while trying
	 * to schedule I/O.
	 */
	if (error == -ENOMEM)
		return NOPAGE_OOM;
	return NULL;

page_not_uptodate:
	lock_page(page);

	/* Did it get unhashed while we waited for it? */
	if (!page->mapping) {
		UnlockPage(page);
		page_cache_release(page);
		goto retry_all;
	}

	/* Did somebody else get it up-to-date? */
	if (Page_Uptodate(page)) {
		UnlockPage(page);
		goto success;
	}

	if (!mapping->a_ops->readpage(file, page)) {
		wait_on_page(page);
		if (Page_Uptodate(page))
			goto success;
	}

	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	lock_page(page);

	/* Somebody truncated the page on us? */
	if (!page->mapping) {
		UnlockPage(page);
		page_cache_release(page);
		goto retry_all;
	}

	/* Somebody else successfully read it in? */
	if (Page_Uptodate(page)) {
		UnlockPage(page);
		goto success;
	}
	ClearPageError(page);
	if (!mapping->a_ops->readpage(file, page)) {	// points to ext2_readpage(), which reads the file from disk into the memory page
		wait_on_page(page);
		if (Page_Uptodate(page))
			goto success;
	}

	/*
	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	 */
	page_cache_release(page);
	return NULL;
}

page_cache_read() allocates a page structure and inserts it into the corresponding queues; the code is as follows:

static inline int page_cache_read(struct file * file, unsigned long offset)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	struct page **hash = page_hash(mapping, offset);
	struct page *page; 

	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);
	spin_unlock(&pagecache_lock);
	if (page)
		return 0;

	page = page_cache_alloc();	// allocate a page structure
	if (!page)
		return -ENOMEM;

	if (!add_to_page_cache_unique(page, mapping, offset, hash)) {	// insert it into the relevant queues
		int error = mapping->a_ops->readpage(file, page);
		page_cache_release(page);
		return error;
	}
	/*
	 * We arrive here in the unlikely event that someone
	 * raced with us and added our page to the cache first.
	 */
	page_cache_free(page);
	return 0;
}

At this point:

page->list is linked into mapping->clean_pages;

page->next_hash and page->pprev_hash link the page into the global hash table;

page->lru is linked into the global active_list.
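
These three linkages are established when the page is first added to the page cache. A simplified paraphrase of that step is sketched below; the wrapper name is made up for illustration, while the helpers follow the 2.4-era mm/filemap.c and mm/swap.c, so treat it as a sketch rather than verbatim kernel code.

/* Sketch: how a freshly allocated page acquires the three linkages
 * listed above (2.4-era page cache, paraphrased, not verbatim code). */
static void add_new_page_to_cache(struct page *page,
				  struct address_space *mapping,
				  unsigned long offset,
				  struct page **hash)
{
	page->index = offset;
	add_page_to_inode_queue(mapping, page);	/* page->list -> mapping->clean_pages */
	add_page_to_hash_queue(page, hash);	/* page->next_hash/pprev_hash -> global hash table */
	lru_cache_add(page);			/* page->lru -> global active_list */
}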

Execution then returns to retry_find; this time __find_get_page() finds the page, and control reaches goto page_not_uptodate. There mapping->a_ops->readpage, which for an ext2 file points to ext2_readpage(), reads the page of the file from disk into memory.
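For ext2, the readpage method itself is only a thin wrapper around the generic buffer layer; as far as I recall the 2.4 version of fs/ext2/inode.c, it looks essentially like this (shown as a sketch):

static int ext2_readpage(struct file *file, struct page *page)
{
	/* block_read_full_page() maps the page's blocks with ext2_get_block()
	 * and submits the disk I/O; the page is marked up-to-date and
	 * unlocked when the read completes. */
	return block_read_full_page(page, ext2_get_block);
}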

II. Paging Out File-Mapped Pages

refill_inactive_scan() and swap_out() turn active pages into inactive dirty pages. A page is chosen only if it has not been accessed recently and its age has decayed to 0 (try_to_swap_out() bails out as long as page->age > 0).
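The aging itself is done by helpers such as age_page_up() and age_page_down_ageonly(), which also appear in try_to_swap_out() below: an access raises the age, each scan without an access decays it, and only a page whose age has reached 0 becomes a candidate. A simplified paraphrase follows (the constants and exact bodies differ between 2.4.x releases, so this is a sketch, not verbatim code):

/* Paraphrased 2.4-era page aging (illustrative constants, not verbatim). */
#define PAGE_AGE_ADV	3
#define PAGE_AGE_MAX	64

void age_page_up(struct page *page)
{
	/* The page was referenced recently: make it harder to evict. */
	page->age += PAGE_AGE_ADV;
	if (page->age > PAGE_AGE_MAX)
		page->age = PAGE_AGE_MAX;
}

void age_page_down_ageonly(struct page *page)
{
	/* Not referenced this scan: decay exponentially towards 0. */
	page->age /= 2;
}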

page_launder() then turns inactive dirty pages into inactive clean pages.

An inactive dirty page has the following characteristics:

its use count is 1;

page->list is linked into mapping->dirty_pages or mapping->clean_pages;

page->next_hash and page->pprev_hash link the page into the global hash table;

page->lru is linked into the global inactive_dirty_list;

the PG_dirty bit in page->flags is set.

An inactive clean page has the following characteristics:

its use count is 1;

page->list stays linked into mapping->dirty_pages or mapping->clean_pages (unchanged);

page->next_hash and page->pprev_hash link the page into the global hash table;

page->lru is linked into page->zone->inactive_clean_list.
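
The move from the active list onto the inactive dirty list is done by deactivate_page(), which try_to_swap_out() calls below. A minimal paraphrase of its effect is given here (list helpers named after the 2.4-era include/linux/swap.h; locking and the extra reference checks are omitted, so this is a sketch only):

/* Sketch: demote a page from the active list to the inactive dirty list
 * (2.4-era semantics, simplified; not verbatim kernel code). */
void deactivate_page(struct page *page)
{
	page->age = 0;
	ClearPageReferenced(page);
	if (PageActive(page)) {
		del_page_from_active_list(page);
		add_page_to_inactive_dirty_list(page);
	}
}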

swap_out() eventually calls try_to_swap_out(); its code is as follows:

static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	swp_entry_t entry;
	struct page * page;
	int onlist;

	pte = *page_table;
	if (!pte_present(pte))	// is the physical page present in memory?
		goto out_failed;
	page = pte_page(pte);
	if ((!VALID_PAGE(page)) || PageReserved(page))
		goto out_failed;

	if (!mm->swap_cnt)
		return 1;

	mm->swap_cnt--;	// one fewer page left to examine in this mm

	onlist = PageActive(page);
	/* Don't look at this pte if it's been accessed recently. */
	if (ptep_test_and_clear_young(page_table)) {	// if the page has been accessed recently, bail out via out_failed
		age_page_up(page);
		goto out_failed;
	}
	if (!onlist)
		/* The page is still mapped, so it can't be freeable... */
		age_page_down_ageonly(page);

	......
	if (page->age > 0)	// if the page's age is still greater than 0, bail out via out_failed
		goto out_failed;

	if (TryLockPage(page))
		goto out_failed;

	......
	pte = ptep_get_and_clear(page_table);	// reaching this point means the page was not accessed recently and its age has decayed to 0; clear the page table entry
	flush_tlb_page(vma, address);

	......
	if (PageSwapCache(page)) {	// for a file-mapped page this is false: the page is not in swapper_space (the swap cache)
		entry.val = page->index;
		if (pte_dirty(pte))
			set_page_dirty(page);
set_swap_pte:
		swap_duplicate(entry);
		set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
		UnlockPage(page);
		mm->rss--;
		deactivate_page(page);
		page_cache_release(page);
out_failed:
		return 0;
	}
	flush_cache_page(vma, address);
	if (!pte_dirty(pte))	// for a clean file-mapped page, jump to drop_pte here
		goto drop_pte;

	/*
	 * Ok, it's really dirty. That means that
	 * we should either create a new swap cache
	 * entry for it, or we should write it back
	 * to its own backing store.
	 */
	if (page->mapping) {	// for a dirty file-mapped page, mark it dirty and jump to drop_pte here
		set_page_dirty(page);
		goto drop_pte;
	}

	/*
	 * This is a dirty, swappable page.  First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	entry = get_swap_page();
	if (!entry.val)
		goto out_unlock_restore; /* No swap space left */

	/* Add it to the swap cache and mark it dirty */
	add_to_swap_cache(page, entry);
	set_page_dirty(page);
	goto set_swap_pte;

out_unlock_restore:
	set_pte(page_table, pte);
	UnlockPage(page);
}

File-mapped pages are therefore handled differently from ordinary swap-in/swap-out pages. For the former, the mapping is simply dismantled and the page table entry is cleared to 0; for the latter, the mapping is redirected so that the page table entry holds a swap entry pointing at the on-disk copy of the page.
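The contrast can be condensed into one hypothetical helper (the function name is invented for illustration; the calls are the same ones used by try_to_swap_out() above):

/* Sketch: the two possible fates of a PTE when its page is evicted. */
static void unmap_for_eviction(pte_t *page_table, struct page *page,
			       swp_entry_t entry)
{
	pte_t pte = ptep_get_and_clear(page_table);	/* mapping removed */

	if (page->mapping && !PageSwapCache(page)) {
		/* File-mapped page: leave the PTE empty (0).  The file and
		 * the page cache remember the data, so the next access
		 * faults into do_no_page() -> filemap_nopage(). */
		if (pte_dirty(pte))
			set_page_dirty(page);
		return;
	}

	/* Swap-backed page: refill the PTE with a swap entry, so the next
	 * access faults into do_swap_page(). */
	swap_duplicate(entry);
	set_pte(page_table, swp_entry_to_pte(entry));
}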

III. Re-establishing the Mapping of File-Mapped Pages

1. For inactive dirty and inactive clean pages, a page fault enters do_no_page() because the page table entry is empty; it does not enter do_swap_page() the way a swapped-out page would.

Inside do_no_page(), filemap_nopage() is called. When the page is accessed again, __find_get_page() finds it in the global hash table and raises its reference count from 1 to 2, but the page has not yet been moved back to the active list. So when is it moved back to the active list?

The answer lies in page_launder() and reclaim_page().

page_launder:

	if (PageTestandClearReferenced(page) || page->age > 0 ||
	    (!page->buffers && page_count(page) > 1) ||	// here the reference count is greater than 1
	    page_ramdisk(page)) {
		del_page_from_inactive_dirty_list(page);
		add_page_to_active_list(page);
		continue;
	}

reclaim_page:

	if (PageTestandClearReferenced(page) || page->age > 0 ||
	    (!page->buffers && page_count(page) > 1)) {	// here the reference count is greater than 1
		del_page_from_inactive_clean_list(page);
		add_page_to_active_list(page);
		continue;
	}

2. If reclaim_page() has already reclaimed an inactive clean page, all of its list linkage is torn down, but its use count remains 1.

If a page fault then occurs, do_no_page() again calls filemap_nopage(); this time __find_get_page() returns NULL, so page_cache_read() is executed once more, allocating a free memory page and reading the corresponding page of the file into it.

At this point:

page->list is linked into mapping->clean_pages;

page->next_hash and page->pprev_hash link the page into the global hash table;

page->lru is linked into the global active_list.
