Linux内核源代码情景分析-系统初始化 / 憋错料

我们跳过boot，setup，直接来到head代码，内核映像的起点是stext，也是_stext，引导和解压缩以后的整个映像放在内存从0x100000即1MB开始的区间。CPU执行内核映像的入口startup_32就在内核映像开头的地方，因此其物理地址也是0x100000。

然而，在正常运行时整个内核映像都应该在系统空间中，系统空间的虚拟地址与物理地址间有个固定的位移，这就是0xC0000000，即3GB。所以，在连接内核映像时已经在所有的符号地址加了一个偏移量0xC0000000，这样startup_32的虚拟地址就成了0xC0100000。

进入startup_32时CPU运行于保护模式下的段式寻址方式。段描述表中与__KERNEL_CS和__KERNEL_DS相对应的描述项所提供的基地址都是0。其中代码段寄存器CS已在进入startup_32之前设置成__KERNEL_CS，数据段寄存器则尚未设置成__KERNEL_DS。

虽然代码段寄存器已经设置成__KERNEL_CS，并且startup_32的连接地址为0xC0100000，但是在转入这个入口时使用的指令是“ljmp 0x100000”而不是“ljmp startup_32”，所以装入CPU中的寄存器IP的地址是物理地址0x100000而不是虚拟地址0xC0100000。这样，CPU在进入startup_32以后就会继续以物理地址取指令。只要不在代码段中引用某个地址，例如向某个地址绝对转移，或者调用某个子程序，就可以一直这样运行下去，而与CS的内容无关。此外，CPU的中断已在进入startup_32之前关闭了。

从startup_32开始的汇编代码在arch/i386/kernel/head.S中，代码如下：

/*
 *  linux/arch/i386/head.S -- the 32-bit startup code.
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Enhanced CPU detection and feature setting code by Mike Jagdis
 *  and Martin Mares, November 1997.
 */

.text
#include <linux/config.h>
#include <linux/threads.h>
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/desc.h>

#define OLD_CL_MAGIC_ADDR	0x90020
#define OLD_CL_MAGIC		0xA33F
#define OLD_CL_BASE_ADDR	0x90000
#define OLD_CL_OFFSET		0x90022
#define NEW_CL_POINTER		0x228	/* Relative to real mode data */

/*
 * References to members of the boot_cpu_data structure.
 */

#define CPU_PARAMS	SYMBOL_NAME(boot_cpu_data)
#define X86		CPU_PARAMS+0
#define X86_VENDOR	CPU_PARAMS+1
#define X86_MODEL	CPU_PARAMS+2
#define X86_MASK	CPU_PARAMS+3
#define X86_HARD_MATH	CPU_PARAMS+6
#define X86_CPUID	CPU_PARAMS+8
#define X86_CAPABILITY	CPU_PARAMS+12
#define X86_VENDOR_ID	CPU_PARAMS+16

/*
 * swapper_pg_dir is the main page directory, address 0x00101000
 *
 * On entry, %esi points to the real-mode code as a 32-bit pointer.
 */
ENTRY(stext)
ENTRY(_stext)
startup_32:
/*
 * Set segments to known values
 */
	cld
	movl $(__KERNEL_DS),%eax
	movl %eax,%ds
	movl %eax,%es
	movl %eax,%fs
	movl %eax,%gs  // ds, es, fs and gs now all hold __KERNEL_DS
        ......
/*
 * Initialize page tables
 *
 * Fills the 8KB between pg0 and empty_zero_page (two pages, i.e. 2K
 * entries) with a temporary page table whose entries map physical
 * pages 0, 1, 2, ... in order.  2K entries of 4KB each cover 8MB,
 * which the article gives as the kernel's minimum memory requirement.
 */
	movl $pg0-__PAGE_OFFSET,%edi // pg0 is a link-time (virtual) address; subtracting __PAGE_OFFSET (3GB) gives its physical address
	movl $007,%eax		// "007" means PRESENT+RW+USER

2:	stosl // store %eax at (%edi) and advance %edi by 4 (DF was cleared by cld above)
	add $0x1000,%eax // the next entry maps the next 4KB page
	cmp $empty_zero_page-__PAGE_OFFSET,%edi // stop once %edi reaches empty_zero_page
	jne 2b // not there yet: write the next entry

/*
 * Enable paging
 */
3:
	movl $swapper_pg_dir-__PAGE_OFFSET,%eax // physical address of the page directory
	movl %eax,%cr3		// point CR3 at the page directory
	movl %cr0,%eax
	orl $0x80000000,%eax
	movl %eax,%cr0		// turn paging on
	// Execution is still at the low (physical) address here.  The jump
	// works because the first two of the low 768 page-directory entries
	// are 0x00102007 and 0x00103007, a transitional mapping of the
	// low addresses onto pg0/pg1.
	jmp 1f
1:
	movl $1f,%eax
	// $1f is the link-time address of the next label, i.e. its physical
	// address + 3GB.  Jumping through %eax therefore switches execution
	// to virtual addresses.  Paging translates that address through the
	// first two of the HIGH 256 page-directory entries (also 0x00102007
	// and 0x00103007), which yields the label's physical address again:
	// in effect (physical + 3GB) - 3GB.
	jmp *%eax
1:
	/* Set up the stack pointer */
	lss stack_start,%esp// load the stack pointer pair from stack_start

        ......

/*
 * Clear BSS first so that there are no surprises...
 * No need to cld as DF is already clear from cld above...
 */
	xorl %eax,%eax// fill value 0 (the article does not analyze this part)
	movl $ SYMBOL_NAME(__bss_start),%edi
	movl $ SYMBOL_NAME(_end),%ecx
	subl %edi,%ecx		// %ecx = _end - __bss_start = number of bytes to clear
	rep
	stosb

/*
 * start system 32-bit setup. We need to re-do some of the things done
 * in 16-bit mode for the "real" operations.
 */
	call setup_idt// fill idt_table so every entry points at ignore_int
/*
 * Initialize eflags.  Some BIOS's leave bits like NT set.  This would
 * confuse the debugger if this code is traced.
 * XXX - best to initialize before switching to protected mode.
 */
	pushl $0
	popfl
/*
 * Copy bootup parameters out of the way. First 2kB of
 * _empty_zero_page is for boot parameters, second 2kB
 * is for the command line.
 *
 * Note: %esi still has the pointer to the real-mode data.
 */
	movl $ SYMBOL_NAME(empty_zero_page),%edi// destination for the boot parameters and command line handed over by setup
	movl $512,%ecx		// 512 dwords = 2kB of boot parameters
	cld
	rep
	movsl
	xorl %eax,%eax
	movl $512,%ecx		// zero the following 2kB (command-line area)
	rep
	stosl
	movl SYMBOL_NAME(empty_zero_page)+NEW_CL_POINTER,%esi
	andl %esi,%esi		// non-zero pointer => new command line protocol
	jnz 2f			# New command line protocol
	cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR
	jne 1f			// no old-protocol magic either (target label is in the elided part)
	movzwl OLD_CL_OFFSET,%esi
	addl $(OLD_CL_BASE_ADDR),%esi	// %esi = OLD_CL_BASE_ADDR + 16-bit offset
2:
	movl $ SYMBOL_NAME(empty_zero_page)+2048,%edi
	movl $512,%ecx		// copy 2kB of command line into the second half
	rep
	movsl
        ......

	movl $-1,X86_CPUID		#  -1 for no CPUID initially

/* check if it is 486 or 386. */
/*
 * XXX - this does a lot of unnecessary setup.  Alignment checks don't
 * apply at our cpl of 0 and the stack ought to be aligned already, and
 * we don't need to preserve eflags.
 */

	movl $3,X86		# at least 386 (the article does not analyze CPU detection)
	pushfl			# push EFLAGS
	popl %eax		# get EFLAGS
	movl %eax,%ecx		# save original EFLAGS
	xorl $0x40000,%eax	# flip AC bit in EFLAGS
	pushl %eax		# copy to EFLAGS
	popfl			# set EFLAGS
	pushfl			# get new EFLAGS
	popl %eax		# put it in eax
	xorl %ecx,%eax		# change in flags
	andl $0x40000,%eax	# check if AC bit changed
	je is386

	movl $4,X86		# at least 486
	movl %ecx,%eax
	xorl $0x200000,%eax	# check ID flag
	pushl %eax
	popfl			# if we are on a straight 486DX, SX, or
	pushfl			# 487SX we can't change it
	popl %eax
	xorl %ecx,%eax
	pushl %ecx		# restore original EFLAGS
	popfl
	andl $0x200000,%eax
	je is486

	/* get vendor info */
	xorl %eax,%eax			# call CPUID with 0 -> return vendor ID
	cpuid
	movl %eax,X86_CPUID		# save CPUID level
	movl %ebx,X86_VENDOR_ID		# lo 4 chars
	movl %edx,X86_VENDOR_ID+4	# next 4 chars
	movl %ecx,X86_VENDOR_ID+8	# last 4 chars

	orl %eax,%eax			# do we have processor info as well?
	je is486

	movl $1,%eax		# Use the CPUID instruction to get CPU type
	cpuid
	movb %al,%cl		# save reg for future use
	andb $0x0f,%ah		# mask processor family
	movb %ah,X86
	andb $0xf0,%al		# mask model
	shrb $4,%al
	movb %al,X86_MODEL
	andb $0x0f,%cl		# mask mask revision
	movb %cl,X86_MASK
	movl %edx,X86_CAPABILITY

is486:
	movl %cr0,%eax		# 486 or better
	andl $0x80000011,%eax	# Save PG,PE,ET
	orl $0x50022,%eax	# set AM, WP, NE and MP
	jmp 2f

is386:	pushl %ecx		# restore original EFLAGS
	popfl
	movl %cr0,%eax		# 386
	andl $0x80000011,%eax	# Save PG,PE,ET
	orl $2,%eax		# set MP
2:	movl %eax,%cr0
	call check_x87
        ......
	lgdt gdt_descr // load the CPU's global descriptor table register (GDTR)
	lidt idt_descr// load the CPU's interrupt descriptor table register (IDTR)
	ljmp $(__KERNEL_CS),$1f  // far jump reloads cs; ds, es, fs and gs are reloaded below
1:	movl $(__KERNEL_DS),%eax	# reload all the segment registers
	movl %eax,%ds		# after changing gdt.
	movl %eax,%es
	movl %eax,%fs
	movl %eax,%gs
        ......
	lss stack_start,%esp	# Load processor stack
        ......
	xorl %eax,%eax
	lldt %ax          // load a zero selector into LDTR
	cld			# gcc2 wants the direction flag cleared at all times
        ......
	call SYMBOL_NAME(start_kernel) // hand control to start_kernel
L6:
	jmp L6			# main should never return here, but
				# just in case, we know what happens.

#ifdef CONFIG_SMP
ready:	.byte 0
#endif

/*
 * We depend on ET to be correct. This checks for 287/387.
 */
/*
 * check_x87
 *
 * Records in X86_HARD_MATH whether a coprocessor answered: 0 is stored
 * first, then 1 if the low byte of the status word read back after
 * fninit is zero.  Otherwise bit 2 (EM) of %cr0 is flipped and the
 * routine returns with X86_HARD_MATH still 0.
 * Uses %eax.
 */
check_x87:
	movb $0,X86_HARD_MATH
	clts
	fninit
	fstsw %ax
	cmpb $0,%al		/* low status byte is 0 => coprocessor present */
	je 1f
	movl %cr0,%eax		/* no coprocessor: have to set bits */
	xorl $4,%eax		/* set EM */
	movl %eax,%cr0
	ret
	ALIGN
1:	movb $1,X86_HARD_MATH
	.byte 0xDB,0xE4		/* fsetpm for 287, ignored by 387 */
	ret

/*
 *  setup_idt
 *
 *  sets up a idt with 256 entries pointing to
 *  ignore_int, interrupt gates. It doesn‘t actually load
 *  idt - that can be done only after paging has been enabled
 *  and the kernel moved to PAGE_OFFSET. Interrupts
 *  are enabled elsewhere, when we can be relatively
 *  sure everything is ok.
 */
// Each IDT entry is 8 bytes and there are 256 of them; every entry is
// set to the same gate pointing at the handler ignore_int.
// %eax holds the low dword of the gate (__KERNEL_CS in the upper 16
// bits, low 16 bits of the handler address in the lower 16 bits);
// %edx holds the high dword (upper 16 bits of the handler address,
// 0x8E00 in the lower 16 bits).
// Uses %eax, %ecx, %edx and %edi.
setup_idt:
	lea ignore_int,%edx
	movl $(__KERNEL_CS << 16),%eax
	movw %dx,%ax		/* selector = 0x0010 = cs */
	movw $0x8E00,%dx	/* interrupt gate - dpl=0, present */

	lea SYMBOL_NAME(idt_table),%edi
	mov $256,%ecx		/* number of entries still to write */
rp_sidt:
	movl %eax,(%edi)	/* low dword of the gate */
	movl %edx,4(%edi)	/* high dword of the gate */
	addl $8,%edi		/* advance to the next 8-byte entry */
	dec %ecx
	jne rp_sidt
	ret

// Operand of the "lss stack_start,%esp" instructions: initial %esp
// followed by the stack segment selector.  Per the article, the
// task_struct and the stack share two pages (8192 bytes), with the
// stack at the high-address end.
ENTRY(stack_start)
	.long SYMBOL_NAME(init_task_union)+8192
	.long __KERNEL_DS

/* This is the default interrupt "handler" :-) */
int_msg:
	.asciz "Unknown interrupt\n"
	ALIGN
// Default interrupt handler installed in every IDT entry by setup_idt:
// saves %eax, %ecx, %edx, %es and %ds, switches %ds/%es to
// __KERNEL_DS, prints int_msg via printk, restores the registers and
// returns with iret.
ignore_int:
	cld
	pushl %eax
	pushl %ecx
	pushl %edx
	pushl %es
	pushl %ds
	movl $(__KERNEL_DS),%eax
	movl %eax,%ds
	movl %eax,%es
	pushl $int_msg		/* argument for printk */
	call SYMBOL_NAME(printk)
	popl %eax		/* discard the printk argument */
	popl %ds
	popl %es
	popl %edx
	popl %ecx
	popl %eax
	iret

/*
 * The interrupt descriptor table has room for 256 idt‘s,
 * the global descriptor table is dependent on the number
 * of tasks we can have..
 */
#define IDT_ENTRIES	256
#define GDT_ENTRIES	(__TSS(NR_CPUS))

.globl SYMBOL_NAME(idt)
.globl SYMBOL_NAME(gdt)

	ALIGN
	.word 0
idt_descr:
	.word IDT_ENTRIES*8-1		// limit field: IDT size in bytes minus 1
SYMBOL_NAME(idt):
	.long SYMBOL_NAME(idt_table) // base address of the IDT; idt_table is a global variable

	.word 0
gdt_descr:
	.word GDT_ENTRIES*8-1// limit field: GDT size in bytes minus 1
SYMBOL_NAME(gdt):
	.long SYMBOL_NAME(gdt_table) // base address of the GDT; gdt_table appears later in this file

/*
 * This is initialized to create an identity-mapping at 0-8M (for bootup
 * purposes) and another mapping of the 0-8M area at virtual address
 * PAGE_OFFSET.
 */
.org 0x1000
ENTRY(swapper_pg_dir)// see the explanation in the article text further down
	.long 0x00102007 // points at pg0
	.long 0x00103007 // points at pg1
	.fill BOOT_USER_PGD_PTRS-2,4,0 // BOOT_USER_PGD_PTRS is 768 (user-space entries)
	/* default: 766 entries */
	.long 0x00102007 // points at pg0
	.long 0x00103007 // points at pg1
	/* default: 254 entries */
	.fill BOOT_KERNEL_PGD_PTRS-2,4,0 // BOOT_KERNEL_PGD_PTRS is 256 (kernel-space entries)

/*
 * The page tables are initialized to only 8MB here - the final page
 * tables are set up later depending on memory size.
 */
.org 0x2000 // physical address 0x00102000 (image loaded at 0x100000); the swapper_pg_dir entry 0x00102007 is this address plus the 007 flag bits
ENTRY(pg0)

.org 0x3000 // physical address 0x00103000; the swapper_pg_dir entry 0x00103007 is this address plus the 007 flag bits
ENTRY(pg1)

/*
 * empty_zero_page must immediately follow the page tables ! (The
 * initialization loop counts until empty_zero_page)
 */

/* Each symbol below is placed at its own 0x1000-byte (4KB) offset via .org. */
.org 0x4000
ENTRY(empty_zero_page)

.org 0x5000
ENTRY(empty_bad_page)

.org 0x6000
ENTRY(empty_bad_pte_table)

#if CONFIG_X86_PAE

 /* With CONFIG_X86_PAE one more 4KB slot is reserved, ending at 0x8000. */
 .org 0x7000
 ENTRY(empty_bad_pmd_table)

 .org 0x8000

#else

 /* Without CONFIG_X86_PAE the reserved area ends at 0x7000. */
 .org 0x7000

#endif

/*
 * This starts the data section. Note that the above is all
 * in the text section because it has alignment requirements
 * that we cannot fulfill any other way.
 */
.data

ALIGN
/*
 * This contains typically 140 quadwords, depending on NR_CPUS.
 *
 * NOTE! Make sure the gdt descriptor in head.S matches this if you
 * change anything.
 */
ENTRY(gdt_table)
	.quad 0x0000000000000000	/* NULL descriptor */
	.quad 0x0000000000000000	/* not used */
	.quad 0x00cf9a000000ffff	/* 0x10 kernel 4GB code at 0x00000000 */
	.quad 0x00cf92000000ffff	/* 0x18 kernel 4GB data at 0x00000000 */
	.quad 0x00cffa000000ffff	/* 0x23 user   4GB code at 0x00000000 */
	.quad 0x00cff2000000ffff	/* 0x2b user   4GB data at 0x00000000 */
	.quad 0x0000000000000000	/* not used */
	.quad 0x0000000000000000	/* not used */
	/*
	 * The APM segments have byte granularity and their bases
	 * and limits are set at run time.
	 */
	.quad 0x0040920000000000	/* 0x40 APM set up for bad BIOS's */
	.quad 0x00409a0000000000	/* 0x48 APM CS    code */
	.quad 0x00009a0000000000	/* 0x50 APM CS 16 code (16 bit) */
	.quad 0x0040920000000000	/* 0x58 APM DS    data */
	.fill NR_CPUS*4,8,0		/* space for TSS's and LDT's */

/*
 * This is to aid debugging, the various locking macros will be putting
 * code fragments here.  When an oops occurs we'd rather know that it's
 * inside the .text.lock section rather than as some offset from whatever
 * function happens to be last in the .text segment.
 */
.section .text.lock
ENTRY(stext_lock)

/* Excerpt repeated by the article for discussion: the boot page directory. */
.org 0x1000
ENTRY(swapper_pg_dir)
	.long 0x00102007 /* entry 0: pg0 address 0x00102000 plus flag bits 007 */
	.long 0x00103007 /* entry 1: pg1 address 0x00103000 plus flag bits 007 */
	.fill BOOT_USER_PGD_PTRS-2,4,0 // BOOT_USER_PGD_PTRS is 768 (user-space entries)
	/* default: 766 entries */
	.long 0x00102007 /* entry 768: same page table as entry 0 */
	.long 0x00103007 /* entry 769: same page table as entry 1 */
	/* default: 254 entries */
	.fill BOOT_KERNEL_PGD_PTRS-2,4,0 // BOOT_KERNEL_PGD_PTRS is 256 (kernel-space entries)

我们单独解释下这段代码，一个页目录表有1024个表项，共代表着4GB的虚拟空间。Linux内核以3GB为界把整个虚拟空间分成用户空间和系统空间。所以，页目录表中低768个表项用于用户空间的映射，而高256个表项用于系统空间的映射。

在Linux0.11中，内核空间和用户空间是这样切换的。

首先页目录项是这样的：

页目录表的前4项用于内核空间，分别指向页表0，页表1，页表2，页表3，共映射16MB的空间，内核态使用GDT，基地址为0，可以访问到所有的内存地址。

当处于进程2的用户态时，对应的页目录项是第32~47项（共16项），对应的16个页表是进程自己创建的。由于用户态使用LDT，基地址为128MB。比如cs：eip，其中eip为0，那么经过分段机制，虚拟地址为128MB；再经过分页机制，首先根据虚拟地址的前10位选择页目录表中的第32项，然后根据虚拟地址的中间10位选择第32项所指向页表中的第一个页表项，最后由于后12位都为0，这个页表项指向的内存地址便是要访问的物理地址。

在Linux2.4中，内核空间和用户空间是这样切换的。

每个进程有不同的页目录表，页目录表有1024个表项，共代表着4GB的虚拟空间。Linux内核以3GB为界把整个虚拟空间分成用户空间和系统空间。所以，页目录表中低768个表项用于用户空间的映射，而高256个表项用于系统空间的映射。

用户空间的虚拟地址是0~3G，也就是对应了页目录表中的低768个表项。还记得我们分配用户空间的虚拟地址就是从0分配到3G么？参见《Linux内核源代码情景分析-execve()》。

内核空间的虚拟地址是3G~4G，对应的是页目录表中的高256个表项，由于内核空间的标识符经过链接后都在实际的物理地址上加上了3G，所以访问内核空间时，虚拟地址在3G~4G，经过分页机制(如上)就变成了实际的物理地址(其实就是虚拟地址减去3G)。

Linux2.4不使用LDT，只使用GDT，无论在内核空间还是用户空间，逻辑地址经过分段机制，得到的虚拟地址与逻辑地址相同。

GDT如下：

ENTRY(gdt_table)
	.quad 0x0000000000000000	/* NULL descriptor */
	.quad 0x0000000000000000	/* not used */
	.quad 0x00cf9a000000ffff	/* 0x10 kernel 4GB code at 0x00000000 */
	.quad 0x00cf92000000ffff	/* 0x18 kernel 4GB data at 0x00000000 */
	.quad 0x00cffa000000ffff	/* 0x23 user   4GB code at 0x00000000 */
	.quad 0x00cff2000000ffff	/* 0x2b user   4GB data at 0x00000000 */
	.quad 0x0000000000000000	/* not used */
	.quad 0x0000000000000000	/* not used */
	/*

时间： 2024-10-26 22:22:26

Linux内核源代码情景分析-系统初始化

Linux内核源代码情景分析-系统初始化的相关文章

Linux内核源代码情景分析-fork()

Linux内核源代码情景分析-特殊文件系统/proc

Linux内核源代码情景分析-内存管理之用户页面的定期换出

Linux内核源代码情景分析-设备文件系统devfs

Linux内核源代码情景分析-内存管理之slab-回收

Linux内核源代码情景分析-文件系统的安装

Linux内核源代码情景分析-访问权限与文件安全性

Linux内核源代码情景分析-强制性调度

Linux内核源代码情景分析-特殊文件系统/proc-对/proc/self/cwd的访问