在下面几种情况下会发生页面出错异常(也叫缺页中断):
1、相应的页面目录项或者页面表项为空,也就是该线性地址与物理地址的映射关系尚未建立,或者已经撤销。本文讨论的就是这种情况。
2、相应的物理页面不在内存中。
3、指令中规定的访问方式与页面的权限不符,例如企图写一个“只读”的页面。
首先看下进程地址空间示意图:
假设现在需要调用某个子程序,因此CPU需将返回地址压入堆栈,也就是要将返回地址写入虚拟空间地址为(%esp-4)的地方。可是,在我们这个情景中地址(%esp-4)落入了空洞中,这是尚未映射的地址,因此必然要引起一次页面出错异常。
这里假定CPU的运行已经到达了页面异常服务程序的主体do_page_fault()的入口处。代码如下:
arch/i386/mm/fault.c
/*
 * do_page_fault - main body of the i386 page-fault exception handler.
 *
 * regs:       copy of the CPU registers as they were when the exception hit.
 * error_code: reason bits supplied with the exception:
 *               bit 0 == 0 no page found, 1 protection fault
 *               bit 1 == 0 read,          1 write
 *               bit 2 == 0 kernel,        1 user mode
 *
 * In the scenario walked through in this article the faulting address is
 * (%esp - 4), which lies in the unmapped hole just below the stack region,
 * and error_code is binary 110 (user mode, write, page not found).
 */
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct * vma;
	unsigned long address;
	unsigned long page;
	unsigned long fixup;
	int write;
	siginfo_t info;

	/* get the address */
	/* CR2 holds the address whose translation failed; here that is %esp-4. */
	__asm__("movl %%cr2,%0":"=r" (address));

	tsk = current;	/* task_struct of the current process */

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 */
	if (address >= TASK_SIZE)
		goto vmalloc_fault;

	mm = tsk->mm;	/* mm_struct of the current process */
	info.si_code = SEGV_MAPERR;

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (in_interrupt() || !mm)
		goto no_context;

	down(&mm->mmap_sem);

	/* Look up the first region whose end address is above `address`. */
	vma = find_vma(mm, address);
	/*
	 * Nothing found: no region ends above this address, i.e. it lies
	 * above every mapped region of the process.
	 */
	if (!vma)
		goto bad_area;
	/*
	 * The region also starts at or below the address, so the address is
	 * inside a mapped region; go on to check why the access failed.
	 */
	if (vma->vm_start <= address)
		goto good_area;
	/*
	 * The region starts above the address: the address is in a hole.
	 * Only a hole directly below a VM_GROWSDOWN (stack) region can be
	 * handled; anything else is a bad access.
	 */
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & 4) {	/* fault happened in user mode */
		/*
		 * accessing the stack below %esp is always a bug.
		 * The "+ 32" is there due to some instructions (like
		 * pusha) doing post-decrement on the stack and that
		 * doesn't show up until later..
		 */
		/*
		 * Make sure this looks like a push: a normal push writes 4
		 * bytes, and pusha writes at most 32 bytes below %esp.
		 */
		if (address + 32 < regs->esp)
			goto bad_area;
	}
	/* Extend the stack region downward to cover the address. */
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & 3) {	/* in this scenario: 110 & 011 = 2 */
		default:	/* 3: write, present */
#ifdef TEST_VERIFY_AREA
			if (regs->cs == KERNEL_CS)
				printk("WP fault at %08lx\n", regs->eip);
#endif
			/* fall through */
		case 2:		/* write, not present */
			if (!(vma->vm_flags & VM_WRITE))
				goto bad_area;
			write++;	/* the scenario in this article ends up here */
			break;
		case 1:		/* read, present */
			goto bad_area;
		case 0:		/* read, not present */
			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
				goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case 1:
		tsk->min_flt++;
		break;
	case 2:
		tsk->maj_flt++;
		break;
	case 0:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (regs->eflags & VM_MASK) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
	up(&mm->mmap_sem);
	return;
	/* The rest of the function (the error-path labels) is elided in this excerpt. */
	.......
}
内核的中断/异常响应机制还传过来两个参数。一个是pt_regs结构指针regs,它指向例外发生前夕CPU中各寄存器内容的一份副本。而error_code则进一步指明映射失败的具体原因。
error_code:
bit 0 == 0 means no page found, 1 means protection fault
bit 1 == 0 means read, 1 means write
bit 2 == 0 means kernel, 1 means user-mode
此时,error_code为二进制110(即6):bit 2为1表示发生在用户态,bit 1为1表示写操作,bit 0为0表示页面尚未映射。
expand_stack函数,代码如下:
static inline int expand_stack(struct vm_area_struct * vma, unsigned long address) { unsigned long grow; address &= PAGE_MASK;//地址按页面边界对齐 grow = (vma->vm_start - address) >> PAGE_SHIFT;//本例中grow为1个页面 if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) return -ENOMEM; vma->vm_start = address;//起始地址向低地址移了一个页面的距离 vma->vm_pgoff -= grow; vma->vm_mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) vma->vm_mm->locked_vm += grow; return 0; }
handle_mm_fault函数,代码如下:
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long address, int write_access) { int ret = -1; pgd_t *pgd; pmd_t *pmd; pgd = pgd_offset(mm, address);//返回页面表项指针 pmd = pmd_alloc(pgd, address);//中转了一下,还是页目录表项指针 if (pmd) { pte_t * pte = pte_alloc(pmd, address);//返回指向页表项的指针 if (pte) ret = handle_pte_fault(mm, vma, address, write_access, pte); } return ret; }
pgd_offset函数,如下:
/*
 * Pointer to the page-directory entry for `address`: the base of the
 * mm's page directory plus the index that pgd_index() derives from the
 * address.
 */
#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
pmd_alloc函数,如下:
/*
 * pmd_alloc - return the middle-directory entry for `address`.
 *
 * In this variant nothing is allocated: the page-directory entry pointer
 * is handed back unchanged, merely viewed as a pmd_t pointer. `address`
 * is not used. A NULL pgd is treated as a kernel bug.
 */
extern inline pmd_t * pmd_alloc(pgd_t *pgd, unsigned long address)
{
	pmd_t *as_pmd = (pmd_t *) pgd;

	if (!as_pmd)
		BUG();

	return as_pmd;
}
pte_alloc函数,如下:
/*
 * pte_alloc - return a pointer to the page-table entry for `address`,
 * creating the page table first when the directory entry is still empty.
 *
 * pmd:     directory entry that should point at the page table.
 * address: the virtual address being resolved.
 *
 * Returns the page-table entry pointer, the result of get_pte_slow() when
 * the fast allocator has no page available, or NULL when the directory
 * entry is bad.
 */
extern inline pte_t * pte_alloc(pmd_t * pmd, unsigned long address)
{
	/* Index of the entry inside the page table. */
	address = (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);

	/* Directory entry is empty: a page table has to be created. */
	if (pmd_none(*pmd))
		goto getnew;
	if (pmd_bad(*pmd))
		goto fix;
	/* A page table already exists: return the entry inside it. */
	return (pte_t *)pmd_page(*pmd) + address;
getnew:
{
	/* Try the fast path for obtaining a page to hold the page table. */
	unsigned long page = (unsigned long) get_pte_fast();

	if (!page)
		return get_pte_slow(pmd, address);
	/* Make the directory entry point at the new page table. */
	set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(page)));
	return (pte_t *)page + address;
}
fix:
	__handle_bad_pmd(pmd);
	return NULL;
}
handle_pte_fault函数,如下:
/*
 * handle_pte_fault - dispatch a fault according to the state of the
 * page-table entry for the faulting address.
 *
 * - entry not present and completely empty  -> do_no_page()
 * - entry not present but not empty         -> do_swap_page()
 * - entry present, write to a non-writable entry -> do_wp_page()
 * - otherwise the entry is marked young (and dirty for a write) and
 *   written back, and 1 is returned.
 */
static inline int handle_pte_fault(struct mm_struct *mm,
	struct vm_area_struct * vma, unsigned long address,
	int write_access, pte_t * pte)
{
	pte_t entry;

	/*
	 * We need the page table lock to synchronize with kswapd
	 * and the SMP-safe atomic PTE updates.
	 */
	spin_lock(&mm->page_table_lock);
	entry = *pte;	/* snapshot of the page-table entry */
	if (!pte_present(entry)) {	/* the entry is not marked present */
		/*
		 * If it truly wasn't present, we know that kswapd
		 * and the PTE updates will not touch it later. So
		 * drop the lock.
		 */
		spin_unlock(&mm->page_table_lock);
		/* Entry is entirely empty: no mapping has been set up yet. */
		if (pte_none(entry))
			return do_no_page(mm, vma, address, write_access, pte);
		/* Not present but not empty: handled as a swap entry. */
		return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);
	}

	if (write_access) {
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address, pte, entry);

		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	establish_pte(vma, address, pte, entry);
	spin_unlock(&mm->page_table_lock);
	return 1;
}
do_no_page函数,如下:
/*
 * do_no_page - handle a fault on an address whose page-table entry is
 * empty.
 *
 * When the region has no vm_ops, or its vm_ops has no nopage handler, the
 * fault is passed on to do_anonymous_page(). The remaining path is elided
 * in this excerpt.
 */
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
	unsigned long address, int write_access, pte_t *page_table)
{
	struct page * new_page;
	pte_t entry;

	/* In the article's scenario this condition is assumed to hold. */
	if (!vma->vm_ops || !vma->vm_ops->nopage)
		return do_anonymous_page(mm, vma, page_table, write_access, address);
	/* Remainder of the function is omitted from this excerpt. */
	.......
	return 2;	/* Major fault */
}
do_anonymous_page函数,如下:
/*
 * do_anonymous_page - install a page-table entry for an address that has
 * no backing nopage handler.
 *
 * For a read fault the entry is a write-protected mapping of ZERO_PAGE.
 * For a write fault a fresh page is allocated and cleared, and the entry
 * maps it dirty and writable; mm->rss is incremented.
 *
 * Returns 1 (minor fault) on success, -1 when no page could be allocated.
 */
static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma,
	pte_t *page_table, int write_access, unsigned long addr)
{
	/* Default: a write-protected mapping of the zero page. */
	pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));

	if (write_access) {	/* 1 in the article's scenario */
		struct page *fresh = alloc_page(GFP_HIGHUSER);

		if (!fresh)
			return -1;
		clear_user_highpage(fresh, addr);

		/* Build an entry for the new page, then mark it dirty and writable. */
		entry = mk_pte(fresh, vma->vm_page_prot);
		entry = pte_mkdirty(entry);
		entry = pte_mkwrite(entry);

		mm->rss++;
		flush_page_to_ram(fresh);
	}

	/* Store the prepared entry into the page table. */
	set_pte(page_table, entry);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, addr, entry);
	return 1;	/* Minor fault */
}
依次返回,从异常处理返回以后,堆栈区已经扩展了,再重新执行一遍以前夭折的那条压栈指令,然后就可以继续往下执行了。对于用户程序来说,这整个过程都是“透明”的,就像什么事也没有发生,而堆栈区间就仿佛从一开始就已经分配好了足够大的空间一样。