1、概念

1.1 什么是进程？

进程是程序运行的一个实例，可以看作充分描述程序已经运行到何种程度的数据结构的集合。

从内核观点看。进程的目的就是担当分配系统资源（CPU时间，内存等）的实体。

我们熟悉的fork()库函数，它有两种使用方法：

(1)、一个父进程希望复制自己，使父子进程运行不同的代码段。经常使用于网络服务程序。

(2)、一个进程要运行一个不同的程序，fork()后马上exec()，如shell。

1.2 什么是线程？

有时候，一个进程希望有多个运行流，如一款麻将游戏，三个由电脑控制的人都应被看做是“独立思考的”，它们须要并行工作。

但它们又不是全然无关的，不能设计成单独的进程。所以就须要比进程更小的单位，它们独立被调度，又共享一些资源。

1.3 Linux内核怎样实现线程？

Linux内核并没有标准的线程，Linux使用轻量级进程的方式实现线程，也可以认为轻量级进程就是Linux线程。

所谓轻量级进程，就是它的资源并非独享的。而是与一组轻量级进程共享。

这就是线程组。

getpid()、kill()、_exit()这样的一些系统调用，对线程组整体起作用。

仅有内核提供的这些功能还远远不够，必须有用户线程库的支持。线程库使用户看起来，线程和进程是独立的概念。

POSIX兼容的pthread库：LinuxThreads，Native Posix Thread Library（NPTL）、IBM的Next Generation
Posix Threading Package（NGPT）

线程使用独立于进程的一套库，pthread_create(...)，pthread_exit(...)，pthread_join(...), pthread_cancel(...)

2、预备知识：进程描写叙述符task_struct

内核将关于一个进程的全部信息放在一个结构体里以方便管理。

进程、轻量级进程、内核线程都与task_struct严格一一对应。

想一想，一个进程会有哪些信息？【include/linux/sched.h: struct task_struct】

进程标识符PID：使用唯一的数字来标识当前进程。task_struct的pid字段用来存放pid，进程按创建先后被顺序编号，pid值达到上限就回滚使用闲置的小pid，内核管理一个pid位图pidmap_array。

另外注意，当考虑到线程这个因素之后，pid含义不再是进程id，而仅仅能看成是进程线程全局唯一的数字标识。应用常常须要的是进程id，所以引入tgid字段，对进程它是自己的pid，对轻量级进程它是领头进程pid。getpid()系统调用返回的是tgid值而不是pid值。

task_struct结构体的声明例如以下：


/*
 * Process descriptor, quoted from include/linux/sched.h. Groups the
 * scheduling, relationship, credential, signal and resource fields the
 * kernel keeps for one task.
 */
struct task_struct {

	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */

	struct thread_info *thread_info;

	atomic_t usage;

	unsigned long flags;	/* per process flags, defined below */

	unsigned long ptrace;

 

	int lock_depth;		/* Lock depth */

 

	int prio, static_prio;

	struct list_head run_list;

	prio_array_t *array;

 

	unsigned long sleep_avg;

	unsigned long long timestamp, last_ran;

	int activated;

 

	unsigned long policy;

	cpumask_t cpus_allowed;

	unsigned int time_slice, first_time_slice;

 

#ifdef CONFIG_SCHEDSTATS

	struct sched_info sched_info;

#endif

 

	struct list_head tasks;

	/*

	 * ptrace_list/ptrace_children forms the list of my children

	 * that were stolen by a ptracer.

	 */

	struct list_head ptrace_children;

	struct list_head ptrace_list;

 

	struct mm_struct *mm, *active_mm;

 

/* task state */

	struct linux_binfmt *binfmt;

	long exit_state;

	int exit_code, exit_signal;

	int pdeath_signal;  /*  The signal sent when the parent dies  */

	/* ??? */

	unsigned long personality;

	unsigned did_exec:1;

	pid_t pid;

	pid_t tgid;

	/* 

	 * pointers to (original) parent process, youngest child, younger sibling,

	 * older sibling, respectively.  (p->father can be replaced with 

	 * p->parent->pid)

	 */

	struct task_struct *real_parent; /* real parent process (when being debugged) */

	struct task_struct *parent;	/* parent process */

	/*

	 * children/sibling forms the list of my children plus the

	 * tasks I'm ptracing.

	 */

	struct list_head children;	/* list of my children */

	struct list_head sibling;	/* linkage in my parent's children list */

	struct task_struct *group_leader;	/* threadgroup leader */

 

	/* PID/PID hash table linkage. */

	struct pid pids[PIDTYPE_MAX];

 

	struct completion *vfork_done;		/* for vfork() */

	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */

	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */

 

	unsigned long rt_priority;

	unsigned long it_real_value, it_real_incr;

	cputime_t it_virt_value, it_virt_incr;

	cputime_t it_prof_value, it_prof_incr;

	struct timer_list real_timer;

	cputime_t utime, stime;

	unsigned long nvcsw, nivcsw; /* context switch counts */

	struct timespec start_time;

/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */

	unsigned long min_flt, maj_flt;

/* process credentials */

	uid_t uid,euid,suid,fsuid;

	gid_t gid,egid,sgid,fsgid;

	struct group_info *group_info;

	kernel_cap_t   cap_effective, cap_inheritable, cap_permitted;

	unsigned keep_capabilities:1;

	struct user_struct *user;

#ifdef CONFIG_KEYS

	struct key *session_keyring;	/* keyring inherited over fork */

	struct key *process_keyring;	/* keyring private to this process (CLONE_THREAD) */

	struct key *thread_keyring;	/* keyring private to this thread */

#endif

	int oomkilladj; /* OOM kill score adjustment (bit shift). */

	char comm[TASK_COMM_LEN];

/* file system info */

	int link_count, total_link_count;

/* ipc stuff */

	struct sysv_sem sysvsem;

/* CPU-specific state of this task */

	struct thread_struct thread;

/* filesystem information */

	struct fs_struct *fs;

/* open file information */

	struct files_struct *files;

/* namespace */

	struct namespace *namespace;

/* signal handlers */

	struct signal_struct *signal;

	struct sighand_struct *sighand;

 

	sigset_t blocked, real_blocked;

	struct sigpending pending;

 

	unsigned long sas_ss_sp;

	size_t sas_ss_size;

	int (*notifier)(void *priv);

	void *notifier_data;

	sigset_t *notifier_mask;

	

	void *security;

	struct audit_context *audit_context;

 

/* Thread group tracking */

   	u32 parent_exec_id;

   	u32 self_exec_id;

/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */

	spinlock_t alloc_lock;

/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */

	spinlock_t proc_lock;

/* context-switch lock */

	spinlock_t switch_lock;

 

/* journalling filesystem info */

	void *journal_info;

 

/* VM state */

	struct reclaim_state *reclaim_state;

 

	struct dentry *proc_dentry;

	struct backing_dev_info *backing_dev_info;

 

	struct io_context *io_context;

 

	unsigned long ptrace_message;

	siginfo_t *last_siginfo; /* For ptrace use.  */

/*

 * current io wait handle: wait queue entry to use for io waits

 * If this thread is processing aio, this points at the waitqueue

 * inside the currently handled kiocb. It may be NULL (i.e. default

 * to a stack based synchronous wait) if its doing sync IO.

 */

	wait_queue_t *io_wait;

/* i/o counters(bytes read/written, #syscalls */

	u64 rchar, wchar, syscr, syscw;

#if defined(CONFIG_BSD_PROCESS_ACCT)

	u64 acct_rss_mem1;	/* accumulated rss usage */

	u64 acct_vm_mem1;	/* accumulated virtual memory usage */

	clock_t acct_stimexpd;	/* clock_t-converted stime since last update */

#endif

#ifdef CONFIG_NUMA

  	struct mempolicy *mempolicy;

	short il_next;

#endif

};

2.1 怎样获得进程描写叙述符

内核必须快速得到当前进程的task_struct，current宏可以得到当前进程的task_struct指针。它是怎样实现的？

Linux内核用了一个小技巧，在8KB内核栈配置下（内核栈能够配置为8KB或4KB，详见《深入理解linux内核》“多种类型的内核栈”），在该进程内核栈的顶端（低地址）放一个小的结构体thread_info（52字节）。再用thread_info.task指向task_struct。


// include/linux/sched.h

 

/*
 * Kernel stack area of one process in the 8KB-stack configuration.
 * The thread_info member overlays the lowest addresses of the stack
 * array, so masking the stack pointer down to the area's alignment
 * yields the thread_info of the running process.
 */
union thread_union {
    struct thread_info thread_info;
    unsigned long stack[2048];  /* 2048 longs = 8KB when long is 4 bytes (32-bit x86) */
};

esp是CPU的栈指针，用来存放栈顶单元的地址，可以直接得到。(esp & 0xffffe000)可以得到thread_info的指针，thread_info.task就是task_struct的指针。

注：

1、task是struct thread_info中的第一个元素，应该在最低地址，图中的画法是错误的；

2、current应该指向进程描写叙述符，而不是struct thread_info。

很多其它细节：

2.6曾经的内核，直接task_struct放在内核栈的尾端。2.6以后使用slab分配器分配task_struct，所以成了我们看到的这样子；对照PPC这部分的实现，PPC使用r2寄存器存储task_struct指针。x86属于便宜处理器，寄存器数量有限，无法做到这一点。

2.2 进程链表

内核将全部进程用链表链接。

链表头是init_task描写叙述符，它是cpu0上的0进程。也叫swapper进程。

疑问：

swapper是per cpu的，那链表头有多个？不是，仅仅有cpu0的0进程描写叙述符才是init_task这个静态全局变量，其他cpu的0进程描写叙述符不是init_task，也不是静态。也不是全局。

几个操作宏：

SET_LINKS宏：向进程链表中插入一个task_struct。

REMOVE_LINKS宏：从进程链表中删除一个task_struct。

for_each_process宏：从init_task開始遍历进程链表。

3、进程状态及各状态的组织

3.1 进程状态

task_struct中的state字段描写叙述了进程当前所处的状态：

TASK_RUNNING：可执行状态，执行或等待执行。

TASK_INTERRUPTIBLE：可中断的等待状态，进程被挂起，直到某个条件变为真，

如产生硬件中断、等待的资源被释放、接收到一个信号。

TASK_UNINTERRUPTIBLE：不可中断的等待状态，与可中断的等待状态类似，但不能被信号中断。仅用在特定情况（进程必须等待，直到一个不能被中断的事件发生），比如当进程打开一个设备文件。其对应的设备驱动程序開始探測对应的硬件设备时。探測完毕曾经，设备驱动程序不能被中断，否则，硬件设备会处于不可预知的状态。

TASK_STOPPED：暂停状态。进程收到SIGSTOP、SIGTSTP、SIGTTIN、SIGTTOU信号后会进入暂停状态。

TASK_TRACED：跟踪状态，进程的运行由debugger程序暂停。当一个进程被另一个进程监控时，任何信号都可以把这个进程置于TASK_TRACED状态。

EXIT_ZOMBIE：僵死状态，进程死亡，等待父进程获取其死亡信息。此时进程描述符还不能删除。

EXIT_DEAD：僵死撤消状态。进程彻底消亡，为防止竞争条件（其它进程再一次wait()），而设置EXIT_DEAD。

宏：

set_task_state宏：设置指定进程的状态。

set_current_state宏：设置当前进程的状态。

进程状态转换图：

3.2 各个状态的进程的组织

TASK_RUNNING：进程调度必须高速找出最佳可执行进程，因此可执行进程的组织结构至关重要。

Linux2.6为了让调度程序能在固定的时间内选出“最佳”可执行进程，建立了多个可执行进程链表，每种进程优先权（0~139）相应一个不同的链表。在SMP中。每一个CPU有自己的进程链表集。

TASK_INTERRUPTIBLE、TASK_UNINTERRUPTIBLE：进程被划分为多个子类，每一个子类对应一个特定事件。

这种情况下，进程状态没有提供足够的信息来高速恢复进程。所以有必要引进附加的进程链表。linux用等待队列实现这种链表。

等待队列是Linux中实现的一种机制，它实现了在事件上的条件等待：希望等待特定事件的进程把自己放进合适的等待队列。并放弃控制权。等待队列表示一组睡眠的进程，当某一条件变为真时，由内核唤醒它们。

为了避免“惊群效应”，等待队列分为两种：互斥类和非互斥类。等待互斥类资源的进程由内核有选择地唤醒，而非互斥进程总是由内核在事件发生时唤醒。

TASK_STOPPED、EXIT_ZOMBIE、EXIT_DEAD：没有专门的链表。

4、进程从何而来？

4.1 进程间关系

因为进程不是凭空创建，而是由已有进程复制创建的，所以进程之间有父子关系。

也就是说，Linux进程之间的关系能够组织为一棵树，其根节点为0号进程。

task_struct中相关字段：

real_parent：创建pid为p的进程为父进程，假设这个父进程不复存在（如父进程先于子进程死亡），进程p就由init进程（1号进程）收养。

parent：通常与real_parent一致。但偶尔不同，如还有一个进程发出监控p的ptrace()系统调用请求时。

children：子进程链表头。

sibling：兄弟进程链表。

除了父子关系，进程还存在其它关系（线程组。进程组，登录会话，调试跟踪）：

group_leader：当前进程p所在线程组的领头进程的描述符指针（task_struct中该字段的注释为threadgroup leader）。

signal->pgrp：p所在进程组的领头进程的PID。

tgid：p所在线程组的领头进程的PID，getpid()系统调用返回该值，而不是pid。

signal->session：p的登录会话领头进程的PID。

ptrace_children：被debugger程序跟踪的p的子进程的链表头。

ptrace_list：指向所跟踪进程事实上际父进程链表的前一个和下一个元素。

进程组和会话中的进程安排：

进程组一般是由shell的管道线将几个进程编成一组的。比如，由下列形式的shell命令形成的进程组会话例如以下图：


proc1 | proc2 &

proc3 | proc4 | proc5

因此，一个进程拥有4个id：PID。tgid，pgrp，session。内核为了加速查找，以这四个值为索引。将task_struct组织为4个散列表。并用链表来解决散列冲突。

task_struct结构中：struct pid pids[4]；


/*
 * Per-ID-type linkage embedded in task_struct (one entry per element of
 * the pids[] array): hooks the task into a PID hash table and into the
 * list of tasks that share the same ID value.
 */
struct pid {
    int nr;                         /* the numeric ID; the article asks whether it is redundant */
    struct hlist_node pid_chain;    /* hash-bucket chain, used to resolve hash collisions */
    struct list_head pid_list;      /* list of tasks carrying this same ID */
};

所以散列表看起来是这种：

4.2 怎样创建进程

进程是在系统执行过程中动态创建的。比如：用户在shell中输入一条命令、程序执行fork或pthread_create等。

此时，进程怎样创建呢？-->

fork系统调用，曾经的做法是，子进程复制父进程所拥有的资源。

可是非常多情况下，子进程要做与父进程不同的事。所以子进程马上调用execve()，复制的数据马上丢弃。所以效率低。

后来引入了vfork系统调用，子进程共享其父进程的内存地址空间，并堵塞父进程的运行，一直到子进程退出或运行一个新的程序。

如今的fork引入了写时复制技术（copy-on-write） --> vfork的优势不再，应避免使用。

此外，clone系统调用同意仔细地控制子进程共享哪些父进程的数据。被用来实现轻量级进程。

下表列出了clone的共享标志：


// include/linux/sched.h

 

/*

 * cloning flags:

 */

#define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */

#define CLONE_VM	0x00000100	/* set if VM shared between processes */

#define CLONE_FS	0x00000200	/* set if fs info shared between processes */

#define CLONE_FILES	0x00000400	/* set if open files shared between processes */

#define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */

#define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */

#define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */

#define CLONE_PARENT	0x00008000	/* set if we want to have the same parent as the cloner */

#define CLONE_THREAD	0x00010000	/* Same thread group? */

#define CLONE_NEWNS	0x00020000	/* New namespace group? */

#define CLONE_SYSVSEM	0x00040000	/* share system V SEM_UNDO semantics */

#define CLONE_SETTLS	0x00080000	/* create a new TLS for the child */

#define CLONE_PARENT_SETTID	0x00100000	/* set the TID in the parent */

#define CLONE_CHILD_CLEARTID	0x00200000	/* clear the TID in the child */

#define CLONE_DETACHED		0x00400000	/* Unused, ignored */

#define CLONE_UNTRACED		0x00800000	/* set if the tracing process can't force CLONE_PTRACE on this clone */

#define CLONE_CHILD_SETTID	0x01000000	/* set the TID in the child */

#define CLONE_STOPPED		0x02000000	/* Start in stopped state */

 

/*

 * List of flags we want to share for kernel threads,

 * if only because they are not used by them anyway.

 */

#define CLONE_KERNEL    (CLONE_FS | CLONE_FILES | CLONE_SIGHAND)

fork()、clone()、vfork()系统调用均使用do_fork()函数实现。


//  kernel/fork.c

 

/*

 *  Ok, this is the main fork-routine.

 *

 * It copies the process, and if successful kick-starts

 * it and waits for it to finish using the VM if required.

 */

long do_fork(unsigned long clone_flags,

	      unsigned long stack_start,

	      struct pt_regs *regs,

	      unsigned long stack_size,

	      int __user *parent_tidptr,

	      int __user *child_tidptr)

{

	struct task_struct *p;

	int trace = 0;

	long pid = alloc_pidmap();	/* allocate a new PID for the child (article note: found by searching the pidmap_array bitmap) */

	if (pid < 0)

		return -EAGAIN;

	/* If the caller is being ptraced, check whether the child must be traced too. */

	if (unlikely(current->ptrace)) {

		trace = fork_traceflag (clone_flags);

		if (trace)

			clone_flags |= CLONE_PTRACE;

	}

	/* The core step. Article note: copies the parent's task_struct and allocates the child's kernel stack and thread_info (copy_process() is not shown here). */

	p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);

	/*

	 * Do this prior waking up the new thread - the thread pointer

	 * might get invalid after that point, if the thread exits quickly.

	 */

	if (!IS_ERR(p)) {

		struct completion vfork;

		if (clone_flags & CLONE_VFORK) {

			p->vfork_done = &vfork;

			init_completion(&vfork);

		}

		/* If the child is ptraced or CLONE_STOPPED was requested, queue a SIGSTOP on the child and flag it as having a pending signal. Article note: the child then does not run until another process resumes it, typically with SIGCONT. */

		if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {

			/*

			 * We'll start up with an immediate SIGSTOP.

			 */

			sigaddset(&p->pending.signal, SIGSTOP);

			set_tsk_thread_flag(p, TIF_SIGPENDING);

		}

		/* Wake the child unless it must start stopped. Article note (wake_up_new_task() is not shown here): 1) same CPU and page tables not shared (CLONE_VM clear): the child is queued ahead of the parent to avoid needless copy-on-write; 2) different CPU or CLONE_VM set: the child goes to the tail of the run queue. */

		if (!(clone_flags & CLONE_STOPPED))

			wake_up_new_task(p, clone_flags);

		else

			p->state = TASK_STOPPED;

		/* If the fork is being traced, record the child's pid so the tracer (debugger) can fetch it, then notify the tracer. */

		if (unlikely (trace)) {

			current->ptrace_message = pid;

			ptrace_notify ((trace << 8) | SIGTRAP);

		}

		/* vfork semantics: block the parent on the completion; article note: it is completed when the child exits or executes a new program. */

		if (clone_flags & CLONE_VFORK) {

			wait_for_completion(&vfork);

			if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))

				ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);

		}

	} else {

		/* copy_process() failed: give the PID back and return its error code. */

		free_pidmap(pid);

		pid = PTR_ERR(p);

	}

	return pid;

}

子进程怎样运行？

系统调用返回时会触发进程调度，这时候子进程会被优先运行（假设同cpu且不共享页表），子进程描写叙述符thread字段的值被装入几个cpu寄存器，特别是esp（内核态堆栈指针）和eip（ret_from_fork()函数的地址）。最后schedule_tail()函数用存放在栈中的值装载全部的寄存器，并强迫cpu返回到用户态。子进程终于得到运行。之后进程调度，父进程可能得到运行。这就是为什么fork()运行一次。返回两次。

4.3 我们接下来从内核中走出来，站在应用程序猿的角度看看进程

系统调用的返回值放在eax寄存器中：返回给子进程的值是0。返回给父进程的值是子进程的PID。这是UNIX的通用做法。应用开发人员能够利用这一事实。使用基于PID值的条件语句，使子进程和父进程有不同的行为。

例如以下所看到的：


#include <stdio.h>

#include <bits/types.h>

#include <unistd.h>

#include <stdlib.h>

 

int glob = 6;                       /* global variable, initialized data */
char buf[] = "a write to stdout\n";

/*
 * fork() demonstration: the child increments its own copies of glob and
 * var, the parent sleeps, and both print their pid and values.
 *
 * write() is unbuffered while printf() goes through stdio buffering, so
 * the output differs depending on whether stdout is a terminal.
 *
 * Returns (via exit) 0 on success, EXIT_FAILURE if write() or fork() fails.
 */
int main(void)
{
	int var = 88;   /* automatic variable on the stack */
	pid_t pid;

	/* Compare as ssize_t so a -1 error return is not converted to unsigned. */
	if (write(STDOUT_FILENO, buf, sizeof(buf) - 1) != (ssize_t)(sizeof(buf) - 1)) {
		perror("write error");
		exit(EXIT_FAILURE);
	}

	printf("before fork\n");    /* deliberately not flushed before fork() */

	pid = fork();
	if (pid < 0) {
		/* No child exists: report and stop instead of falling through as if it had worked. */
		perror("fork error");
		exit(EXIT_FAILURE);
	} else if (pid == 0) {      /* child: modifies only its own copies */
		glob++;
		var++;
	} else {                    /* parent: give the child time to run first */
		sleep(2);
	}

	printf("pid = %d, glob = %d, var = %d\n", (int)getpid(), glob, var);

	exit(0);
}

这里引出另一个问题：直接在终端上运行与把标准输出重定向到文件运行，这两种不同的运行方式会得到不同的输出结果，原因如下。

fork时。父进程数据空间拷贝到子进程中时，缓冲区也被拷贝到子进程中。

write是不带缓冲的，标准IO库（如printf）是带缓冲的。假设标准输出连接到终端设备，则它是行缓冲的，否则它是全缓冲的。

（详见《UNIX环境高级编程》）

从用户态看来，子进程继承了父进程的（有些须要结合《UNIX环境高级编程》上下文才干看懂）：

打开文件

实际用户ID、实际组ID、有效用户ID、有效组ID

附加组ID

进程组ID

会话ID

控制终端

设置用户ID标志和设置组ID标志

当前工作文件夹

根文件夹

文件模式创建屏蔽字

信号屏蔽和安排

针对随意打开文件描写叙述符的在运行时关闭标志

环境

连接的共享存储段

存储映射

资源限制

父进程和子进程的差别是：

fork的返回值

进程ID不同

父进程ID

子进程的tms_utime、tms_stime、tms_cutime以及tms_cstime均被设置为0

父进程设置的文件锁不会被子进程继承

子进程的未处理的闹钟被清除

子进程的未处理信号集设置为空集

父子进程对打开文件的共享：

注意差别于独立进程打开文件：

4.3 进程究竟从何而来——从start_kernel()開始


//  init/main.c

/*

 *	Activate the first processor.

 */

 

asmlinkage void __init start_kernel(void)

{

	char * command_line;

	extern struct kernel_param __start___param[], __stop___param[];

/*

 * Interrupts are still disabled. Do necessary setups, then

 * enable them

 */

	lock_kernel();

	page_address_init();

	printk(linux_banner);

	setup_arch(&command_line);

	setup_per_cpu_areas();

	/*

	 * Mark the boot cpu "online" so that it can call console drivers in

	 * printk() and can access its per-cpu storage.

	 */

	smp_prepare_boot_cpu();

 

	/*

	 * Set up the scheduler prior starting any interrupts (such as the

	 * timer interrupt). Full topology setup happens at smp_init()

	 * time - but meanwhile we still have a functioning scheduler.

	 */

	sched_init();

	/*

	 * Disable preemption - early bootup scheduling is extremely

	 * fragile until we cpu_idle() for the first time.

	 */

	preempt_disable();

	build_all_zonelists();

	page_alloc_init();	/* article note: initializes the buddy system - TODO confirm */

	printk("Kernel command line: %s\n", saved_command_line);

	parse_early_param();

	parse_args("Booting kernel", command_line, __start___param,

		   __stop___param - __start___param,

		   &unknown_bootoption);

	sort_main_extable();

	trap_init();

	rcu_init();

	init_IRQ();

	pidhash_init();

	init_timers();

	softirq_init();

	time_init();

	/*

	 * HACK ALERT! This is early. We're enabling the console before

	 * we've done PCI setups etc, and console_init() must be aware of

	 * this. But we do want output early, in case something goes wrong.

	 */

	console_init();

	if (panic_later)

		panic(panic_later, panic_param);

	profile_init();

	local_irq_enable();

#ifdef CONFIG_BLK_DEV_INITRD

	if (initrd_start && !initrd_below_start_ok &&

			initrd_start < min_low_pfn << PAGE_SHIFT) {

		printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "

		    "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT);

		initrd_start = 0;

	}

#endif

	vfs_caches_init_early();

	mem_init();

	kmem_cache_init();	/* article note: initializes the slab allocator */

	numa_policy_init();

	if (late_time_init)

		late_time_init();

	calibrate_delay();	/* article note: determines the CPU clock speed */

	pidmap_init();

	pgtable_cache_init();

	prio_tree_init();

	anon_vma_init();

#ifdef CONFIG_X86

	if (efi_enabled)

		efi_enter_virtual_mode();

#endif

	fork_init(num_physpages);

	proc_caches_init();

	buffer_init();

	unnamed_dev_init();

	security_init();

	vfs_caches_init(num_physpages);

	radix_tree_init();

	signals_init();

	/* rootfs populating might need page-writeback */

	page_writeback_init();

#ifdef CONFIG_PROC_FS

	proc_root_init();

#endif

	check_bugs();

	acpi_early_init(); /* before LAPIC and SMP init */

	/* Do the rest non-__init'ed, we're now alive */

	rest_init();	/* continues the boot: creates the init process (pid 1) and finally enters cpu_idle(), which runs when the CPU has nothing else to run */

}

/*

 * We need to finalize in a non-__init function or else race conditions

 * between the root thread and the init thread may cause start_kernel to

 * be reaped by free_initmem before the root thread has proceeded to

 * cpu_idle.

 *

 * gcc-3.4 accidentally inlines this function, so use noinline.

 */

 

static void noinline rest_init(void)

	__releases(kernel_lock)

{

	/* Spawn a kernel thread that runs init(); it shares fs info and signal handlers with this task. */

	kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND);

	numa_default_policy();

	/* Annotated __releases(kernel_lock): drops the lock this boot path holds. */

	unlock_kernel();

	preempt_enable_no_resched();

	/* The calling task ends up in the idle loop. */

	cpu_idle();

}

疑问：

并非仅仅有代码就能够运行，没有内核栈，task_struct等数据结构怎么行？

0号进程使用的全部数据结构都是静态创建的（全部其它进程的数据结构都是动态分配的），也就是说，当磁盘中的内核映像载入内存的时候，0号进程的数据结构也就有了。

0号进程完成诸多初始化的光荣使命之后，循环在cpu_idle()中，此后只有在cpu没有其它可调度的进程时，才会运行0号进程。

SMP系统中，每一个CPU都有一个进程0.启动时仅仅用到一个cpu，禁用其它CPU，当0进程激活其它CPU时，通过copy_process()创建其它cpu的0号进程。

【linux-2.6.11】init() --> smp_prepare_cpus() --> smp_boot_cpus() --> do_boot_cpu() --> idle = fork_idle() --> task = copy_process()

进程1

进程0使用kernel_thread()创建进程1，此时的进程1还是内核线程，它运行内核中的init()函数[init/main.c]，继续初始化工作。

之后init()调用execve()装入可运行程序，通常是/sbin/init。

这个程序的主要工作是：用户系统初始化，启动各种daemon进程（依据《UNIX环境高级编程》，内核线程也属于守护进程），启动tty和图形界面。

此时init内核线程变为一个普通进程。在系统关闭之前，init进程一直存活，由于它创建和监控在操作系统外层运行的全部进程的活动。


static int init(void * unused)

{

	lock_kernel();

	/*

	 * Tell the world that we're going to be the grim

	 * reaper of innocent orphaned children.

	 *

	 * We don't want people to have to make incorrect

	 * assumptions about where in the task array this

	 * can be found.

	 */

	child_reaper = current;

	/* Sets up cpus_possible() */

	smp_prepare_cpus(max_cpus);   /* article note: the idle (pid 0) tasks of the other CPUs are created from here */

	do_pre_smp_initcalls();

	fixup_cpu_present_map();

	smp_init();

	sched_init_smp();

	/*

	 * Do this before initcalls, because some drivers want to access

	 * firmware files.

	 */

	populate_rootfs();

	do_basic_setup();

	/*

	 * check if there is an early userspace init.  If yes, let it do all

	 * the work

	 */

	if (sys_access((const char __user *) "/init", 0) == 0)

		execute_command = "/init";

	else

		prepare_namespace();

	/*

	 * Ok, we have completed the initial bootup, and

	 * we're essentially up and running. Get rid of the

	 * initmem segments and start the user-mode stuff..

	 */

	free_initmem();

	unlock_kernel();

	system_state = SYSTEM_RUNNING;

	numa_default_policy();

	/* Open the console, then duplicate descriptor 0 twice. */

	if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)

		printk("Warning: unable to open an initial console.\n");

	(void) sys_dup(0);

	(void) sys_dup(0);

	

	/*

	 * We try each of these until one succeeds.

	 *

	 * The Bourne shell can be used instead of init if we are 

	 * trying to recover a really broken machine.

	 */

	if (execute_command)

		run_init_process(execute_command);

	run_init_process("/sbin/init");

	run_init_process("/etc/init");

	run_init_process("/bin/init");

	run_init_process("/bin/sh");

	/* Reached only if none of the candidates above could be started. */

	panic("No init found.  Try passing init= option to kernel.");

}

其它内核线程

内核中有一些重要任务，以线性的方式运行效率不高，委托给独立的、可被调度的运行流做比较合适。每一个进程就是一个独立的运行流，而这些为内核工作的运行流共享使用内核的地址空间等资源，于是就被叫做内核线程。

Linux使用非常多其它内核线程，当中一些在初始化阶段创建。一些在系统执行过程中动态创建。

一些内核线程的样例：

keventd（也被称为events）：运行keventd_wq工作队列中的函数。

kapmd：处理与高级电源管理（APM）相关的事件。

kswapd：运行周期内存回收。

pdflush：刷新“脏”缓冲区中的的内容到磁盘以回收内存。

kblockd：运行kblockd_workqueue工作队列中的函数，周期性的激活块设备驱动程序。

ksoftirqd：执行tasklet，系统中每cpu都有这样一个内核线程。

4.4 撤销进程

进程结束时必须通知内核，以便内核释放进程所拥有的资源，包含内存，打开文件，信号量等。

进程终止有8种方式：

正常终止：

1、从main返回

2、调用exit（做一些清理并结束进程）

3、调用_exit或_Exit（直接结束进程）（基于exit_group()系统调用）

4、最后一个线程从其启动例程返回

5、最后一个线程调用pthread_exit（基于exit()系统调用）

异常终止：

6、调用abort

7、接收到一个信号并终止

8、最后一个线程对取消请求作出响应

两个进程终止的系统调用：

1、exit_group()，终止整个线程组，也适用于单线程进程。

c库函数exit()基于此系统调用

2、exit()。终止某一个线程，而无论该线程所属线程组中的全部其它线程。linux线程库函数pthread_exit()基于此系统调用


//  kernel/exit.c

fastcall NORET_TYPE void do_exit(long code)

{

	struct task_struct *tsk = current;

	int group_dead;

	profile_task_exit(tsk);

	/* Exiting is refused (panic) from interrupt context, for the idle task (pid 0) and for init (pid 1). */

	if (unlikely(in_interrupt()))

		panic("Aiee, killing interrupt handler!");

	if (unlikely(!tsk->pid))

		panic("Attempted to kill the idle task!");

	if (unlikely(tsk->pid == 1))

		panic("Attempted to kill init!");

	if (tsk->io_context)

		exit_io_context();

	if (unlikely(current->ptrace & PT_TRACE_EXIT)) {

		current->ptrace_message = code;

		ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);

	}

	/* Update the state: mark the task as exiting. */

	tsk->flags |= PF_EXITING;

	del_timer_sync(&tsk->real_timer);

	if (unlikely(in_atomic()))

		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",

				current->comm, current->pid,

				preempt_count());

	acct_update_integrals();

	update_mem_hiwater();

	/* group_dead is set when this decrement brings signal->live to zero. */

	group_dead = atomic_dec_and_test(&tsk->signal->live);

	if (group_dead)

		acct_process(code);

	/* Drop the task's references to its memory, semaphores, open files, filesystem info, namespace, etc. Article note: resources that are not shared are freed (the helpers are not shown here). */

	exit_mm(tsk);

	exit_sem(tsk);

	__exit_files(tsk);

	__exit_fs(tsk);

	exit_namespace(tsk);

	exit_thread();

	exit_keys(tsk);

	if (group_dead && tsk->signal->leader)

		disassociate_ctty(1);

	module_put(tsk->thread_info->exec_domain->module);

	if (tsk->binfmt)

		module_put(tsk->binfmt->module);

	/* Article note: exit_code is the system call argument on normal termination, or a kernel-supplied error code on abnormal termination. */

	tsk->exit_code = code;

	/* Article note (exit_notify() is not shown here): it updates the

	 * kinship relations (children are adopted by a sibling or by init),

	 * decides whether SIGCHLD must be sent to the parent,

	 * may call release_task() to reclaim the task's remaining data structures,

	 * and leaves the task in EXIT_DEAD or EXIT_ZOMBIE. */

	exit_notify(tsk);

#ifdef CONFIG_NUMA

	mpol_free(tsk->mempolicy);

	tsk->mempolicy = NULL;

#endif

 

	BUG_ON(!(current->flags & PF_DEAD));

	/* Hand over to the scheduler; returning from schedule() is treated as a bug. */

	schedule();

	BUG();

	/* Avoid "noreturn function does return".  */

	for (;;) ;

}

至此。进程已死，永不会复活。但其尸体（某些数据结构）还在，对僵死进程的处理有两种可能的方式：

假设父进程不须要接收来自子进程的信号，就调用do_exit()；

假设已经给父进程发送了一个信号，就调用wait4()或waitpid()系统调用。

后一种情况下，release_task()函数将回收进程描写叙述符所占用的内存空间；

而在前一种情况下。内存的回收将由进程调度程序来完毕。

5、进程切换

这里仅仅涉及内核怎样完毕进程切换。而不涉及调度机制和算法策略。也就是说。这里假定调度程序已经选好了合适的进程，怎样换下旧进程。装上新进程。

虽然每一个进程能够拥有属于自己的地址空间，但全部的进程必须共享cpu寄存器。

因此cpu寄存器的保存和恢复是进程切换的重要内容。

进程恢复前必须装入的一组数据称为硬件上下文。Linux中，硬件上下文一部分存放在task_struct。剩余部分存放在内核态堆栈中。

进程切换仅仅发生在内核态。

在运行进程切换之前，用户态进程使用的全部寄存器内容都已保存在内核态堆栈上。

task_struct中的类型为thread_struct的thread字段用于在进程切换时保存硬件上下文，它包括了大部分CPU寄存器，但不包括eax等通用寄存器，通用寄存器保留在内核堆栈中。

进程切换由两步组成：

1、切换页全局目录以安装一个新的地址空间。

2、切换内核态堆栈和硬件上下文。由switch_to宏完毕。

6、进程资源限制

每一个进程都有一组相关的资源限制，避免用户过分使用系统资源（CPU，磁盘等）。

当前进程的限制存放在current->signal->rlim[]数组字段，数组每一项元素代表一种资源

这些资源包含：


// include/asm-generic/resource.h

 

/* Resource indices; each one selects an element of the rlim[] array. */

#define RLIMIT_CPU      0               /* CPU time in seconds */

#define RLIMIT_FSIZE    1               /* Maximum filesize */

#define RLIMIT_DATA     2               /* max data size */

#define RLIMIT_STACK    3               /* max stack size */

#define RLIMIT_CORE     4               /* max core file size */

#define RLIMIT_RSS      5               /* max resident set size */

#define RLIMIT_NPROC    6               /* max number of processes */

#define RLIMIT_NOFILE   7               /* max number of open files */

#define RLIMIT_MEMLOCK  8               /* max locked-in-memory address space */

#define RLIMIT_AS       9               /* address space limit */

#define RLIMIT_LOCKS    10              /* maximum file locks held */

#define RLIMIT_SIGPENDING 11            /* max number of pending signals */

#define RLIMIT_MSGQUEUE 12              /* maximum bytes in POSIX mqueues */

 

#define RLIM_NLIMITS    13              /* number of resource indices defined above */

当中rlim字段数据结构：


// include/linux/resource.h

 

/* One resource limit: the current value and the ceiling it may be raised to. */
struct rlimit {

        unsigned long   rlim_cur;       /* current limit */

        unsigned long   rlim_max;       /* upper bound for rlim_cur */

};

getrlimit()系统调用：读取rlim_cur

setrlimit()系统调用：改变rlim_cur。以rlim_max为上限。

只有具有CAP_SYS_RESOURCE权限的超级用户才能改变rlim_max。

大多数资源限制的值为RLIM_INFINITY，即内核没有对该资源施加限制。然而系统管理员可以给一些资源施加更强的限制。

时间： 2024-10-25 03:35:27

Linux内核剖析之进程简单介绍

1、概念

1.1 什么是进程？

1.2 什么是线程？

1.3 Linux内核怎样实现线程？

2、预备知识：进程描写叙述符task_struct

2.1 怎样获得进程描写叙述符

2.2 进程链表

3、进程状态及各状态的组织

3.1 进程状态

3.2 各个状态的进程的组织

4、进程从何而来？

4.1 进程间关系

4.2 怎样创建进程

4.3 我们接下来从内核中走出来，站在应用程序猿的角度看看进程

4.3 进程究竟从何而来——从start_kernel()開始

4.4 撤销进程

5、进程切换

6、进程资源限制

Linux内核剖析之进程简单介绍的相关文章

Linux内核剖析之进程地址空间（二）

Linux内核剖析之进程简介

【转载】linux内核笔记之进程地址空间

linux内核学习：进程管理

linux内核中与进程相关的数据结构（基于linux-mainline-rc4）

Linux内核调试技术——进程上下文R状态死锁监测

Linux内核剖析之内存管理

Linux内核剖析之内核同步

内核调试神器SystemTap — 简单介绍与使用（一）

Linux内核剖析 之 进程简单介绍

1、概念

1.1 什么是进程？

1.2 什么是线程？

1.3 Linux内核怎样实现线程？

2、预备知识：进程描写叙述符task_struct

2.1 怎样获得进程描写叙述符

2.2 进程链表

3、进程状态及各状态的组织

3.1 进程状态

3.2 各个状态的进程的组织

4、进程从何而来？

4.1 进程间关系

4.2 怎样创建进程

4.3 我们接下来从内核中走出来，站在应用程序猿的角度看看进程

4.3 进程究竟从何而来——从start_kernel()開始

4.4 撤销进程

5、进程切换

6、进程资源限制

Linux内核剖析 之 进程简单介绍的相关文章

Linux内核剖析之进程简单介绍

Linux内核剖析之进程简单介绍的相关文章