Linux Kernel Source Code Scenario Analysis: exit()

After /bin/echo finishes running, do_exit is called to tear down the child process.

Let us start from the exit() system call. Entering kernel mode, exit() is implemented by sys_exit:

asmlinkage long sys_exit(int error_code)
{
	do_exit((error_code&0xff)<<8);
}
NORET_TYPE void do_exit(long code)
{
	struct task_struct *tsk = current;

	if (in_interrupt())//exiting from inside an interrupt handler is not allowed
		panic("Aiee, killing interrupt handler!");
	if (!tsk->pid)//the idle task (pid 0) is not allowed to exit
		panic("Attempted to kill the idle task!");
	if (tsk->pid == 1)//the init process is not allowed to exit
		panic("Attempted to kill init!");
	tsk->flags |= PF_EXITING;
	del_timer_sync(&tsk->real_timer);

fake_volatile:
#ifdef CONFIG_BSD_PROCESS_ACCT
	acct_process(code);
#endif
	__exit_mm(tsk);//if the mm is shared via a pointer, this only decrements the share count mm->mm_users; if the process has its own, the mm_struct and vm_area_structs are freed, along with the page directory and page tables

	lock_kernel();
	sem_exit();//System V semaphore cleanup (undo operations); we come back to this after covering interprocess communication
	__exit_files(tsk);//if shared via a pointer, this only decrements the share count files->count; if the process has its own, the files_struct is freed
	__exit_fs(tsk);//if shared via a pointer, this only decrements the share count fs->count; if the process has its own, the fs_struct is freed
	exit_sighand(tsk);//if shared via a pointer, this only decrements the share count sig->count; if the process has its own, the signal_struct is freed
	exit_thread();//an empty function on i386

	if (current->leader)
		disassociate_ctty(1);

	put_exec_domain(tsk->exec_domain);
	if (tsk->binfmt && tsk->binfmt->module)
		__MOD_DEC_USE_COUNT(tsk->binfmt->module);

	tsk->exit_code = code;
	exit_notify();//mark the current process as a zombie, signal its parent, and reparent its children to init
	schedule();
	BUG();
/*
 * In order to get rid of the "volatile function does return" message
 * I did this little loop that confuses gcc to think do_exit really
 * is volatile. In fact it's schedule() that is volatile in some
 * circumstances: when current->state = ZOMBIE, schedule() never
 * returns.
 *
 * In fact the natural way to do all this is to have the label and the
 * goto right after each other, but I put the fake_volatile label at
 * the start of the function just in case something /really/ bad
 * happens, and the schedule returns. This way we can try again. I'm
 * not paranoid: it's just that everybody is out to get me.
 */
	goto fake_volatile;
}
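Before moving on, the (error_code & 0xff) << 8 encoding done by sys_exit is easy to observe from user space. Below is a minimal sketch (my illustration, not from the book): the parent recovers the child's exit code with WEXITSTATUS, which on Linux extracts exactly the byte that sys_exit shifted into bits 8-15 of the status word.

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int status;
	pid_t pid = fork();

	if (pid == 0) {
		execl("/bin/echo", "echo", "hello", (char *)NULL);
		_exit(127);	/* only reached if exec failed */
	}
	waitpid(pid, &status, 0);
	/* sys_exit stored (error_code & 0xff) << 8 in exit_code;
	 * WEXITSTATUS(status) recovers it, i.e. (status >> 8) & 0xff */
	if (WIFEXITED(status))
		printf("child exited with %d (raw status 0x%x)\n",
		       WEXITSTATUS(status), status);
	return 0;
}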

__exit_mm: if the child has its own address space, it frees the mm_struct and vm_area_structs, and releases the page directory and page tables:

static inline void __exit_mm(struct task_struct * tsk)
{
	struct mm_struct * mm = tsk->mm;

	mm_release();
	if (mm) {
		atomic_inc(&mm->mm_count);
		if (mm != tsk->active_mm) BUG();
		/* more a memory barrier than a real lock */
		task_lock(tsk);
		tsk->mm = NULL;
		task_unlock(tsk);
		enter_lazy_tlb(mm, current, smp_processor_id());
		mmput(mm);//this is the key call
	}
}
void mmput(struct mm_struct *mm)
{
	if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) {//taken when mm->mm_users was 1, i.e. we were the last user
		list_del(&mm->mmlist);
		spin_unlock(&mmlist_lock);
		exit_mmap(mm);//free the vm_area_structs and clear the page-directory and page-table entries
		mmdrop(mm);//free the mm_struct together with the page directory and page tables
	}
	}
}
void exit_mmap(struct mm_struct * mm)
{
	struct vm_area_struct * mpnt;

	release_segments(mm);
	spin_lock(&mm->page_table_lock);
	mpnt = mm->mmap;
	mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL;
	spin_unlock(&mm->page_table_lock);
	mm->rss = 0;
	mm->total_vm = 0;
	mm->locked_vm = 0;
	while (mpnt) {
		struct vm_area_struct * next = mpnt->vm_next;
		unsigned long start = mpnt->vm_start;
		unsigned long end = mpnt->vm_end;
		unsigned long size = end - start;

		if (mpnt->vm_ops) {
			if (mpnt->vm_ops->close)
				mpnt->vm_ops->close(mpnt);
		}
		mm->map_count--;
		remove_shared_vm_struct(mpnt);
		flush_cache_range(mm, start, end);
		zap_page_range(mm, start, size);
		if (mpnt->vm_file)
			fput(mpnt->vm_file);
		kmem_cache_free(vm_area_cachep, mpnt);
		mpnt = next;
	}

	/* This is just debugging */
	if (mm->map_count)
		printk("exit_mmap: map count is %d\n", mm->map_count);

	clear_page_tables(mm, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
}
static inline void mmdrop(struct mm_struct * mm)
{
	if (atomic_dec_and_test(&mm->mm_count))
		__mmdrop(mm);
}
inline void __mmdrop(struct mm_struct *mm)
{
	if (mm == &init_mm) BUG();
	pgd_free(mm->pgd);
	destroy_context(mm);
	free_mm(mm);
}
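Note the two counters here: mm->mm_users counts tasks actually using the address space, while mm->mm_count counts references to the mm_struct itself — including one held on behalf of all the users, plus one per lazy-TLB borrower, which is why __exit_mm did atomic_inc(&mm->mm_count) before calling mmput (the matching mmdrop happens later in schedule()). A sketch of this two-level refcount idiom, with illustrative names rather than kernel API:

#include <stdatomic.h>
#include <stdlib.h>

struct mm_sketch {
	atomic_int users;	/* like mm_users: tasks whose tsk->mm points here */
	atomic_int count;	/* like mm_count: refs to the struct itself */
	/* ... vm areas, page tables ... */
};

static void exit_mmap_sketch(struct mm_sketch *mm)
{
	(void)mm;	/* analogue of exit_mmap(): tear down the mappings */
}

static void mmdrop_sketch(struct mm_sketch *mm)
{
	if (atomic_fetch_sub(&mm->count, 1) == 1)
		free(mm);	/* kernel analogue: pgd_free() + free_mm() */
}

static void mmput_sketch(struct mm_sketch *mm)
{
	if (atomic_fetch_sub(&mm->users, 1) == 1) {
		exit_mmap_sketch(mm);	/* last real user: the address space dies */
		mmdrop_sketch(mm);	/* drop the ref held on behalf of all users */
	}
}

int main(void)
{
	struct mm_sketch *mm = calloc(1, sizeof *mm);
	atomic_store(&mm->users, 1);	/* one task uses this mm */
	atomic_store(&mm->count, 1);	/* one structural ref, held for the users */
	mmput_sketch(mm);		/* last user gone: unmap, then drop the struct */
	return 0;
}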

Back in do_exit, execution continues with __exit_files. If the child has its own descriptor table, the files_struct must be freed; the code is as follows:

static inline void __exit_files(struct task_struct *tsk)
{
	struct files_struct * files = tsk->files;

	if (files) {
		task_lock(tsk);
		tsk->files = NULL;
		task_unlock(tsk);
		put_files_struct(files);
	}
}
void put_files_struct(struct files_struct *files)
{
	if (atomic_dec_and_test(&files->count)) {//taken when files->count was 1, i.e. we were the last user
		close_files(files);
		/*
		 * Free the fd and fdset arrays if we expanded them.
		 */
		if (files->fd != &files->fd_array[0])
			free_fd_array(files->fd, files->max_fds);
		if (files->max_fdset > __FD_SETSIZE) {
			free_fdset(files->open_fds, files->max_fdset);
			free_fdset(files->close_on_exec, files->max_fdset);
		}
		kmem_cache_free(files_cachep, files);
	}
}
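Because fork() copies the files_struct (each copy starting with count == 1, whereas clone() with CLONE_FILES would share a single one and just bump the count), closing a descriptor in the child leaves the parent's table untouched. A small user-space sketch — /etc/hostname is just an arbitrary readable file, and error checking is omitted:

#include <fcntl.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	char buf[16];
	int fd = open("/etc/hostname", O_RDONLY);	/* any readable file */

	if (fork() == 0) {
		close(fd);	/* drops only the child's copy of the entry */
		_exit(0);
	}
	wait(NULL);
	/* the parent's files_struct is its own; its descriptor still works */
	printf("read after child's close: %zd bytes\n", read(fd, buf, sizeof buf));
	return 0;
}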

Next, __exit_fs runs. If the child has its own fs_struct, it must be freed; the code is as follows:

static inline void __exit_fs(struct task_struct *tsk)
{
	struct fs_struct * fs = tsk->fs;

	if (fs) {
		task_lock(tsk);
		tsk->fs = NULL;
		task_unlock(tsk);
		__put_fs_struct(fs);
	}
}
static inline void __put_fs_struct(struct fs_struct *fs)
{
	/* No need to hold fs->lock if we are killing it */
	if (atomic_dec_and_test(&fs->count)) {//taken when fs->count was 1, i.e. we were the last user
		dput(fs->root);
		mntput(fs->rootmnt);
		dput(fs->pwd);
		mntput(fs->pwdmnt);
		if (fs->altroot) {
			dput(fs->altroot);
			mntput(fs->altrootmnt);
		}
		kmem_cache_free(fs_cachep, fs);
	}
}
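The fs_struct behaves the same way: fork() copies it, clone() with CLONE_FS shares it. A quick sketch showing that the child's chdir() does not disturb the parent's working directory:

#include <limits.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	char cwd[PATH_MAX];

	if (fork() == 0) {
		chdir("/");	/* modifies only the child's copy of fs_struct */
		_exit(0);
	}
	wait(NULL);
	printf("parent cwd unchanged: %s\n", getcwd(cwd, sizeof cwd));
	return 0;
}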

Next, exit_sighand runs. If the child has its own signal_struct, it must be freed; the code is as follows:

void exit_sighand(struct task_struct *tsk)
{
	struct signal_struct * sig = tsk->sig;

	spin_lock_irq(&tsk->sigmask_lock);
	if (sig) {
		tsk->sig = NULL;
		if (atomic_dec_and_test(&sig->count))//taken when sig->count was 1, i.e. we were the last user
			kmem_cache_free(sigact_cachep, sig);
	}
	tsk->sigpending = 0;
	flush_sigqueue(&tsk->pending);
	spin_unlock_irq(&tsk->sigmask_lock);
}

Next, exit_notify runs: it marks the current process as a zombie, signals its parent, and makes init the parent of the current process's children. The code is as follows:

static void exit_notify(void)
{
	struct task_struct * p, *t;

	forget_original_parent(current);//make the reaper (normally init) the parent of the current process's children
	/*
	 * Check to see if any process groups have become orphaned
	 * as a result of our exiting, and if they have any stopped
	 * jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
	 *
	 * Case i: Our father is in a different pgrp than we are
	 * and we were the only connection outside, so our pgrp
	 * is about to become orphaned.
	 */

	t = current->p_pptr;//the process notified is p_pptr; what forget_original_parent set was p->p_opptr = reaper

	if ((t->pgrp != current->pgrp) &&
	    (t->session == current->session) &&
	    will_become_orphaned_pgrp(current->pgrp, current) &&
	    has_stopped_jobs(current->pgrp)) {
		kill_pg(current->pgrp,SIGHUP,1);
		kill_pg(current->pgrp,SIGCONT,1);
	}

	/* Let father know we died
	 *
	 * Thread signals are configurable, but you aren't going to use
	 * that to send signals to arbitrary processes.
	 * That stops right now.
	 *
	 * If the parent exec id doesn't match the exec id we saved
	 * when we started then we know the parent has changed security
	 * domain.
	 *
	 * If our self_exec id doesn't match our parent_exec_id then
	 * we have changed execution domain as these two values started
	 * the same after a fork.
	 *
	 */

	if(current->exit_signal != SIGCHLD &&
	    ( current->parent_exec_id != t->self_exec_id  ||
	      current->self_exec_id != current->parent_exec_id)
	    && !capable(CAP_KILL))
		current->exit_signal = SIGCHLD;//the signal sent to the parent is SIGCHLD

	/*
	 * This loop does two things:
	 *
  	 * A.  Make init inherit all the child processes
	 * B.  Check to see if any process groups have become orphaned
	 *	as a result of our exiting, and if they have any stopped
	 *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
	 */

	write_lock_irq(&tasklist_lock);
	current->state = TASK_ZOMBIE;//mark the current process as a zombie
	do_notify_parent(current, current->exit_signal);//signal the parent
	while (current->p_cptr != NULL) {
		p = current->p_cptr;
		current->p_cptr = p->p_osptr;
		p->p_ysptr = NULL;
		p->ptrace = 0;

		p->p_pptr = p->p_opptr;//here p_pptr and p_opptr are unified: both now point to the reaper
		p->p_osptr = p->p_pptr->p_cptr;
		if (p->p_osptr)
			p->p_osptr->p_ysptr = p;
		p->p_pptr->p_cptr = p;
		if (p->state == TASK_ZOMBIE)
			do_notify_parent(p, p->exit_signal);
		/*
		 * process group orphan check
		 * Case ii: Our child is in a different pgrp
		 * than we are, and it was the only connection
		 * outside, so the child pgrp is now orphaned.
		 */
		if ((p->pgrp != current->pgrp) &&
		    (p->session == current->session)) {
			int pgrp = p->pgrp;

			write_unlock_irq(&tasklist_lock);
			if (is_orphaned_pgrp(pgrp) && has_stopped_jobs(pgrp)) {
				kill_pg(pgrp,SIGHUP,1);
				kill_pg(pgrp,SIGCONT,1);
			}
			write_lock_irq(&tasklist_lock);
		}
	}
	write_unlock_irq(&tasklist_lock);
}
static inline void forget_original_parent(struct task_struct * father)
{
	struct task_struct * p, *reaper;

	read_lock(&tasklist_lock);

	/* Next in our thread group */
	reaper = next_thread(father);
	if (reaper == father)
		reaper = child_reaper;//the init process

	for_each_task(p) {
		if (p->p_opptr == father) {
			/* We dont want people slaying init */
			p->exit_signal = SIGCHLD;
			p->self_exec_id++;
			p->p_opptr = reaper;//the current process's children get the reaper (init) as their parent; note that it is p_opptr that is set here
			if (p->pdeath_signal) send_sig(p->pdeath_signal, p, 0);
		}
	}
	read_unlock(&tasklist_lock);
}
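The effect of this reparenting is visible directly from user space: orphan a child and its getppid() changes. A minimal sketch — classically the new parent is init (pid 1), though on a modern system a "subreaper" process may claim orphans instead:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	if (fork() == 0) {
		sleep(1);	/* give the parent time to exit first */
		/* by now the kernel has reparented us */
		printf("orphan's new parent: %d\n", (int)getppid());
		_exit(0);
	}
	return 0;	/* the parent exits at once, orphaning the child */
}

Back in exit_notify, do_notify_parent then delivers the actual notification: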
void do_notify_parent(struct task_struct *tsk, int sig)//here sig is SIGCHLD
{
	struct siginfo info;
	int why, status;

	info.si_signo = sig;
	info.si_errno = 0;
	info.si_pid = tsk->pid;
	info.si_uid = tsk->uid;

	/* FIXME: find out whether or not this is supposed to be c*time. */
	info.si_utime = tsk->times.tms_utime;
	info.si_stime = tsk->times.tms_stime;

	status = tsk->exit_code & 0x7f;
	why = SI_KERNEL;	/* shouldn't happen */
	switch (tsk->state) {
	case TASK_STOPPED:
		/* FIXME -- can we deduce CLD_TRAPPED or CLD_CONTINUED? */
		if (tsk->ptrace & PT_PTRACED)
			why = CLD_TRAPPED;
		else
			why = CLD_STOPPED;
		break;

	default:
		if (tsk->exit_code & 0x80)
			why = CLD_DUMPED;
		else if (tsk->exit_code & 0x7f)
			why = CLD_KILLED;
		else {
			why = CLD_EXITED;
			status = tsk->exit_code >> 8;
		}
		break;
	}
	info.si_code = why;
	info.si_status = status;

	send_sig_info(sig, &info, tsk->p_pptr);//send SIGCHLD to the parent
	wake_up_parent(tsk->p_pptr);//wake the parent: wait() had set its state to TASK_INTERRUPTIBLE, and it now becomes TASK_RUNNING
}
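The siginfo fields that do_notify_parent fills in (si_pid, si_code, si_status) are exactly what a user-space parent sees through a SA_SIGINFO handler. A sketch of my own (it ignores the race where SIGCHLD could arrive before pause(); a production program would use sigsuspend):

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static volatile sig_atomic_t chld_pid, chld_code, chld_status;

static void on_chld(int sig, siginfo_t *si, void *ctx)
{
	(void)sig; (void)ctx;
	chld_pid = si->si_pid;		/* kernel set info.si_pid = tsk->pid */
	chld_code = si->si_code;	/* CLD_EXITED for a normal exit */
	chld_status = si->si_status;	/* tsk->exit_code >> 8 */
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof sa);
	sa.sa_sigaction = on_chld;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGCHLD, &sa, NULL);

	if (fork() == 0)
		_exit(7);
	pause();	/* woken by the SIGCHLD from do_notify_parent */
	printf("pid=%d si_code=%d (CLD_EXITED=%d) si_status=%d\n",
	       (int)chld_pid, (int)chld_code, CLD_EXITED, (int)chld_status);
	return 0;
}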

At this point the process's basic resources have all been released, but the corpse of the current process still occupies a bare minimum of resources: its task_struct and the two pages holding its kernel-mode stack. The process does not free these two pages itself — much as people do not cancel their own household registration at the moment of death — but instead notifies its parent and leaves the aftermath to it. With its state set to TASK_ZOMBIE, the process will never be selected by schedule() again.
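The zombie state is easy to reproduce: let the child exit while the parent stays alive without calling wait(). During the sleep in the sketch below, ps reports the child with STAT 'Z' (defunct):

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0)
		_exit(0);	/* child is now TASK_ZOMBIE until reaped */
	printf("try: ps -o pid,stat,comm -p %d   (expect STAT 'Z')\n", (int)pid);
	sleep(30);	/* the parent deliberately does not wait() yet */
	return 0;	/* once the parent dies, init reaps the zombie */
}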

Finally, schedule() runs. Assuming only the parent and the child exist, and the parent is already in TASK_RUNNING, the CPU switches to the parent and resumes execution there.

#define switch_to(prev,next,last) do {					\
	asm volatile("pushl %%esi\n\t"					\ //push esi onto the current process prev's stack
		     "pushl %%edi\n\t"					\ //push edi onto prev's stack
		     "pushl %%ebp\n\t"					\ //push ebp onto prev's stack
		     "movl %%esp,%0\n\t"	/* save ESP */		\ //save prev's esp into prev->thread.esp
		     "movl %3,%%esp\n\t"	/* restore ESP */	\ //load next->thread.esp into esp; the stack is switched from here on
		     "movl $1f,%1\n\t"		/* save EIP */		\ //save prev's resume address (the "1:" label below) into prev->thread.eip
		     "pushl %4\n\t"		/* restore EIP */	\ //push next->thread.eip, which will become the new eip
		     "jmp __switch_to\n"				\ //whatever __switch_to does, when the CPU reaches its ret instruction, we arrived via jmp rather than call, so the next->thread.eip just pushed becomes the return address
		     "1:\t"						\ //unless next is a newly forked child, next->thread.eip was saved as some earlier prev->thread.eip, i.e. the address of this very line
		     "popl %%ebp\n\t"					\ //the stack has already been switched, so these pops restore the registers pushed at the top of this sequence when the incoming process was last switched out
		     "popl %%edi\n\t"					\
		     "popl %%esi\n\t"					\
		     :"=m" (prev->thread.esp),"=m" (prev->thread.eip),	\
		      "=b" (last)					\
		     :"m" (next->thread.esp),"m" (next->thread.eip),	\
		      "a" (prev), "d" (next),				\
		      "b" (prev));					\
} while (0)

The parent had been waiting in sys_wait4; it resumes at the "1:" label and continues executing the rest of sys_wait4.
