Linux内核源代码情景分析-系统调用mmap()

一个进程可以通过系统调用mmap(),将一个已打开文件的内容映射到它的用户空间,其用户界面为:

mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset)。

参数fd代表着一个已打开文件,offset为文件中的起点,而start为映射到用户空间中的起始地址,length则为长度。还有两个参数prot和flags,前者用于指定对所映射区间的访问模式,如可写、可执行等;后者则用于其他控制目的。从应用程序设计的角度来说,比之常规的文件操作,如read()、write()、lseek(),将文件映射到用户空间后像访问内存一样地访问文件显然要方便得多。

mmap对应系统调用为sys_mmap2,代码为:

/*
 * sys_mmap2 - system-call entry point for mmap().
 *
 * Hands all six arguments, unchanged and in the same order, to do_mmap2()
 * and returns that function's result.
 */
asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
	unsigned long prot, unsigned long flags,
	unsigned long fd, unsigned long pgoff)
{
	long ret;

	ret = do_mmap2(addr, len, prot, flags, fd, pgoff);
	return ret;
}

do_mmap2,代码如下:

static inline long do_mmap2(
	unsigned long addr, unsigned long len,
	unsigned long prot, unsigned long flags,
	unsigned long fd, unsigned long pgoff)
{
	int error = -EBADF;
	struct file * file = NULL;

	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
	if (!(flags & MAP_ANONYMOUS)) {//MAP_ANONYMOUS设成1,表示没有文件,实际上只是用来"圈地"
		file = fget(fd);//获取file结构
		if (!file)
			goto out;
	}

	down(t->mm->mmap_sem);
	error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
	up(¤t->mm->mmap_sem);

	if (file)
		fput(file);
out:
	return error;
}

inline函数do_mmap(),是供内核自己用的,它也是将已打开文件映射到当前进程空间。代码为:

/*
 * do_mmap - byte-offset front end to do_mmap_pgoff().
 *
 * Rejects the request with -EINVAL when offset + PAGE_ALIGN(len) wraps
 * around, or when offset is not page aligned. Otherwise converts the byte
 * offset into a page offset and returns the result of do_mmap_pgoff().
 */
static inline unsigned long do_mmap(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long offset)
{
	/* Wrap-around of the end offset. */
	if ((offset + PAGE_ALIGN(len)) < offset)
		return -EINVAL;

	/* The offset must sit on a page boundary. */
	if (offset & ~PAGE_MASK)
		return -EINVAL;

	return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}

两者都调用do_mmap_pgoff,代码如下:

/*
 * do_mmap_pgoff - create a mapping of len bytes at addr in current->mm.
 *
 * file is the file to map, or NULL for a mapping without a file; pgoff is
 * stored in the new vma as vm_pgoff. The function allocates a
 * vm_area_struct, fills in its range and flags, removes any old mapping in
 * the range with do_munmap(), checks RLIMIT_AS and (for private writable
 * mappings) memory availability, lets file->f_op->mmap() or
 * shmem_zero_setup() finish the vma, and inserts it into mm.
 *
 * Returns the start address of the mapping on success. On failure it
 * returns -EINVAL, -ENOMEM, or the error reported by deny_write_access(),
 * f_op->mmap() or shmem_zero_setup().
 *
 * NOTE: part of the function body is elided in this listing (the "....."
 * line below).
 */
unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len,
	unsigned long prot, unsigned long flags, unsigned long pgoff)
{
	struct mm_struct * mm = current->mm;
	struct vm_area_struct * vma;
	int correct_wcount = 0;
	int error;

	.....// various sanity checks, skipped for now
	if (flags & MAP_FIXED) {
		if (addr & ~PAGE_MASK)
			return -EINVAL;
	} else {// MAP_FIXED clear: addr is only a hint; if it cannot be honoured the kernel picks an address
		addr = get_unmapped_area(addr, len);// choose a start address in the current process's user space
		if (!addr)
			return -ENOMEM;
	}

	/* Determine the object being mapped and call the appropriate
	 * specific mapper. the address has already been validated, but
	 * not unmapped, but the maps are removed from the list.
	 */
	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);// being mapped to a particular file is itself an attribute, and ranges with different attributes cannot share one region, so a separate vma is always created
	if (!vma)
		return -ENOMEM;

	vma->vm_mm = mm;
	vma->vm_start = addr;// start address
	vma->vm_end = addr + len;// end address
	vma->vm_flags = vm_flags(prot,flags) | mm->def_flags;

	if (file) {// set up vma->vm_flags
		VM_ClearReadHint(vma);
		vma->vm_raend = 0;

		if (file->f_mode & FMODE_READ)
			vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
		if (flags & MAP_SHARED) {
			vma->vm_flags |= VM_SHARED | VM_MAYSHARE;

			/* This looks strange, but when we don't have the file open
			 * for writing, we can demote the shared mapping to a simpler
			 * private mapping. That also takes care of a security hole
			 * with ptrace() writing to a shared mapping without write
			 * permissions.
			 *
			 * We leave the VM_MAYSHARE bit on, just to get correct output
			 * from /proc/xxx/maps..
			 */
			if (!(file->f_mode & FMODE_WRITE))
				vma->vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
		}
	} else {
		vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
		if (flags & MAP_SHARED)
			vma->vm_flags |= VM_SHARED | VM_MAYSHARE;
	}
	vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
	vma->vm_ops = NULL;
	vma->vm_pgoff = pgoff;// start of the mapped content within the file; with it, a page fault can compute the page's position in the file from the virtual address
	vma->vm_file = NULL;
	vma->vm_private_data = NULL;

	/* Clear old maps */
	error = -ENOMEM;
	if (do_munmap(mm, addr, len))// if the target range is already in use in the current process's address space the old mapping is removed; on failure goto free_vma. Needed because the MAP_FIXED path above did not check this.
		goto free_vma;

	/* Check against address space limit. */
	if ((mm->total_vm << PAGE_SHIFT) + len // would the address-space usage exceed its configured upper limit?
	    > current->rlim[RLIMIT_AS].rlim_cur)
		goto free_vma;

	/* Private writable mapping? Check memory availability.. */
	if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&// are there enough physical pages?
	    !(flags & MAP_NORESERVE)				 &&
	    !vm_enough_memory(len >> PAGE_SHIFT))
		goto free_vma;

	if (file) {
		if (vma->vm_flags & VM_DENYWRITE) {
			error = deny_write_access(file);// presumably blocks conventional write access to the file while it is mapped; confirm against deny_write_access()
			if (error)
				goto free_vma;
			correct_wcount = 1;
		}
		vma->vm_file = file;// key step: the vma now refers to the file
		get_file(file);
		error = file->f_op->mmap(file, vma);// in this walk-through f_op->mmap is generic_file_mmap
		if (error)
			goto unmap_and_free_vma;
	} else if (flags & MAP_SHARED) {
		error = shmem_zero_setup(vma);
		if (error)
			goto free_vma;
	}

	/* Can addr have changed??
	 *
	 * Answer: Yes, several device drivers can do it in their
	 *         f_op->mmap method. -DaveM
	 */
	flags = vma->vm_flags;
	addr = vma->vm_start;

	insert_vm_struct(mm, vma);// insert the vma into mm
	if (correct_wcount)
		atomic_inc(&file->f_dentry->d_inode->i_writecount);

	mm->total_vm += len >> PAGE_SHIFT;
	if (flags & VM_LOCKED) {// make_pages_present is called only for locked mappings
		mm->locked_vm += len >> PAGE_SHIFT;
		make_pages_present(addr, addr + len);
	}
	return addr;// the start virtual address; normally its low 12 bits are zero

unmap_and_free_vma:
	if (correct_wcount)
		atomic_inc(&file->f_dentry->d_inode->i_writecount);
	vma->vm_file = NULL;
	fput(file);
	/* Undo any partial mapping done by a device driver. */
	flush_cache_range(mm, vma->vm_start, vma->vm_end);
	zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
	flush_tlb_range(mm, vma->vm_start, vma->vm_end);
free_vma:
	kmem_cache_free(vm_area_cachep, vma);
	return error;
}

generic_file_mmap函数,代码如下:

/*
 * generic_file_mmap - attach the vm operations for a file mapping to vma.
 *
 * A mapping that is both VM_SHARED and VM_MAYWRITE gets file_shared_mmap,
 * provided the inode's address space has a writepage method (-EINVAL if
 * not); every other mapping gets file_private_mmap.
 *
 * Returns 0 on success, -EACCES when the inode has no superblock or is not
 * a regular file, and -ENOEXEC when there is no readpage method.
 */
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct vm_operations_struct *chosen;
	int shared_writable;

	shared_writable = (vma->vm_flags & VM_SHARED) &&
			  (vma->vm_flags & VM_MAYWRITE);

	if (!shared_writable) {
		chosen = &file_private_mmap;
	} else if (inode->i_mapping->a_ops->writepage) {
		chosen = &file_shared_mmap;
	} else {
		/* A shared writable mapping needs a way to write pages back. */
		return -EINVAL;
	}

	if (!(inode->i_sb && S_ISREG(inode->i_mode)))
		return -EACCES;

	if (!inode->i_mapping->a_ops->readpage)
		return -ENOEXEC;

	UPDATE_ATIME(inode);

	/* Key step: later faults on this vma go through these operations. */
	vma->vm_ops = chosen;
	return 0;
}

其中file_private_mmap,代码如下:

/* vm operations for private file mappings: only a nopage handler is set. */
static struct vm_operations_struct file_private_mmap = {
	.nopage = filemap_nopage,
};

inode->i_mapping->a_ops->writepage和inode->i_mapping->a_ops->readpage指向了:

/* Address-space operations table for ext2. */
struct address_space_operations ext2_aops = {
	.readpage	= ext2_readpage,
	.writepage	= ext2_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= ext2_prepare_write,
	.commit_write	= generic_commit_write,
	.bmap		= ext2_bmap
};

最后,vm_area_struct数据结构中重要的字段(vm_ops、vm_file、vm_pgoff、vm_start、vm_end)都设置好了,do_mmap_pgoff返回起始虚拟地址。

读者也许感到困惑,在文件与虚拟区间之间建立映射难道就这么简单?而且我们根本就没有看到页面映射的建立!

那么什么时候建立映射呢?

当这个区间的一个页面首次受到访问时,会由于页面无映射而发生缺页异常,相应的处理函数是do_no_page(),而不是do_swap_page()。代码如下:

/*
 * do_no_page - handle a fault on an address that has no page mapped yet.
 *
 * When the vma has no vm_ops or no nopage method, the fault is passed to
 * do_anonymous_page(). Otherwise the vma's nopage method supplies the page,
 * and a pte built from vma->vm_page_prot is installed at page_table.
 *
 * Returns the result of do_anonymous_page() in the first case; otherwise
 * 0 when nopage returned NULL, -1 when it returned NOPAGE_OOM, and 2 once
 * the pte has been installed.
 */
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
	unsigned long address, int write_access, pte_t *page_table)
{
	struct page * new_page;
	pte_t entry;

	if (!vma->vm_ops || !vma->vm_ops->nopage)
		return do_anonymous_page(mm, vma, page_table, write_access, address);

	/*
	 * The third argument is "no_share", which tells the low-level code
	 * to copy, not share the page even if sharing is possible.  It's
	 * essentially an early COW detection.
	 */
	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);// for the file mapping in this walk-through, this calls filemap_nopage
	if (new_page == NULL)	/* no page was available -- SIGBUS */
		return 0;
	if (new_page == NOPAGE_OOM)
		return -1;
	++mm->rss;
	/*
	 * This silly early PAGE_DIRTY setting removes a race
	 * due to the bad i386 page protection. But it's valid
	 * for other architectures too.
	 *
	 * Note that if write_access is true, we either now have
	 * an exclusive copy of the page, or this is a shared mapping,
	 * so we can make it writable and dirty to avoid having to
	 * handle that later.
	 */
	flush_page_to_ram(new_page);
	flush_icache_page(vma, new_page);
	entry = mk_pte(new_page, vma->vm_page_prot);
	if (write_access) {
		entry = pte_mkwrite(pte_mkdirty(entry));
	} else if (page_count(new_page) > 1 &&
		   !(vma->vm_flags & VM_SHARED))
		entry = pte_wrprotect(entry);
	set_pte(page_table, entry);
	/* no need to invalidate: a not-present page shouldn't be cached */
	update_mmu_cache(vma, address, entry);
	return 2;	/* Major fault */
}

do_no_page会调用filemap_nopage,filemap_nopage会调用ext2_readpage()分配一个空闲内存页面并从文件读入相应的页面,然后建立起映射。

时间: 2024-11-02 13:07:55

Linux内核源代码情景分析-系统调用mmap()的相关文章

Linux内核源代码情景分析-系统调用brk()

首先看下进程地址空间示意图: 我们简单的说,从低地址到高地址,代码区和数据区,空洞,堆栈区.    在Linux内核源代码情景分析-内存管理之用户堆栈的扩展,我们申请了从堆栈区往下,数据区上面的页面.    在Linux内核源代码情景分析-内存管理之用户页面的换入,我们申请了用于换入/换出的页面.    在本文中,我们申请的是从数据区往上,堆栈区下面的页面.    我们通过一个实例来分析,brk(),见下图: 1.由于新边界比旧边界地址高,我们申请旧边界和新边界之间的页面.就是把对应的虚拟地址映

Linux内核源代码情景分析-系统调用mknod

普通文件可以用open或者create创建,FIFO文件可以用pipe创建,mknod主要用于设备文件的创建. 在内核中,mknod是由sys_mknod实现的,代码如下: asmlinkage long sys_mknod(const char * filename, int mode, dev_t dev) //比如filename为/tmp/server_socket,dev是设备号 { int error = 0; char * tmp; struct dentry * dentry;

Linux内核源代码情景分析-系统调用

一.系统调用初始化 void __init trap_init(void) { ...... set_system_gate(SYSCALL_VECTOR,&system_call);//0x80 ...... } 对0x80中断向量,设置了系统调用的总入口system_call. static void __init set_system_gate(unsigned int n, void *addr) { _set_gate(idt_table+n,15,3,addr); } 在IDT中设置

Linux内核源代码情景分析-系统调用select以及异步输入/输出

一.系统调用select,把原来当前进程的单睡眠等待状态变成了现在的多睡眠等待状态.具体请看代码,select在内核中的实现为sys_select,代码如下: asmlinkage long sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)//inp,outp,exp是关于已打开文件的位图,tvp表明准备睡眠等待的最长时间,0表示无限期的睡眠等待,这些指针都指向了用户空间,详细解释请看下面

Linux内核源代码情景分析-mmap后,文件与虚拟区间建立映射

一.文件映射的页面换入 在mmap后,mmap参考Linux内核源代码情景分析-系统调用mmap(),当这个区间的一个页面首次受到访问时,会由于页面无映射而发生缺页异常,相应的异常处理程序do_no_page(). static inline int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long address, int write_access, pte_t * pte) {

Linux内核源代码情景分析-进程的创建,执行,等待,消亡

我们先看下面的程序: #include <stdio.h> int main() { int child; char *args[] = {"/bin/echo", "Hello", "World!", NULL}; if(!(child = fork())) { /* child */ execve("/bin/echo", args, NULL}); printf("I am back, someth

Linux内核源代码情景分析-fork()

父进程fork出子进程: fork经过系统调用,来到了sys_fork,详细过程请参考Linux内核源代码情景分析-系统调用. asmlinkage int sys_fork(struct pt_regs regs) { return do_fork(SIGCHLD, regs.esp, &regs, 0); } int do_fork(unsigned long clone_flags, unsigned long stack_start, //stack_start为用户空间堆栈指针 str

Linux内核源代码情景分析-内存管理

用户空间的页面有下面几种: 1.普通的用户空间页面,包括进程的代码段.数据段.堆栈段.以及动态分配的"存储堆". 2.通过系统调用mmap()映射到用户空间的已打开文件的内容. 3.进程间的共享内存区. 这些页面的的周转有两方面的意思. 1.页面的分配,使用,回收.如进程压栈时新申请的页面,这类页面不进行盘区交换,不使用时释放得以回收. 这部分通过一个场景来解释: Linux内核源代码情景分析-内存管理之用户堆栈的扩展. 2.盘区交换.如要执行硬盘上的对应代码段.把硬盘上的代码段换入内

Linux内核源代码情景分析-文件系统的安装

执行sudo mount -t ext2 /dev/sdb1 /mnt/sdb,将文件系统挂载到/mnt/sdb上.系统调用mount,映射到内核层执行的是sys_mount.假设/dev/sdb1和/mnt/sdb都位于ext2文件系统中. asmlinkage long sys_mount(char * dev_name, char * dir_name, char * type, unsigned long flags, void * data)//dev_name指向了"/dev/sdb