Linux kernel被bootloader加载到内存后,CPU首先执行header.S中的start_of_setup等代码,然后跳转到main.c中的main();main()在早期初始化过程中会调用detect_memory()函数探测内存:
/*
 * Run all three BIOS memory probes (E820, E801, 88h).
 *
 * Every probe is always executed, because each one records its own result
 * as a side effect; the return value only says whether at least one of
 * them succeeded.
 *
 * Returns 0 if any probe succeeded, -1 if all of them failed.
 */
int detect_memory(void)
{
	int any_ok = 0;

	/* E820 reports the number of map entries it stored. */
	if (detect_memory_e820() > 0)
		any_ok = 1;

	/* E801 and 88h return 0 on success. */
	if (detect_memory_e801() == 0)
		any_ok = 1;

	if (detect_memory_88() == 0)
		any_ok = 1;

	return any_ok ? 0 : -1;
}
Linux内核通过detect_memory_xxx()来获取内存相关信息;这几个函数都是通过触发int 0x15中断来获取的,调用前分别把AX寄存器设置为0xe820、0xe801,或者把AH寄存器设置为0x88。
对于e820();
struct e820entry { __u64 addr; /* start of memory segment */该内存段的起始地址 __u64 size; /* size of memory segment */该内存段段的大小 __u32 type; /* type of memory segment */该内存段的类型 } __attribute__((packed)); struct e820map { <span style="white-space:pre"> </span>__u32 nr_map; <span style="white-space:pre"> </span>struct e820entry map[E820_X_MAX]; };
type:该内存段的类型,可分为Usable (normal) RAM、Reserved (unusable)、ACPI reclaimable memory、ACPI NVS memory、Area containing bad memory。要获取所有内存段的信息,detect_memory_e820()通过一个do-while循环不断触发int 0x15中断,
每次获取一个内存段的信息,并且将这些信息保存在一个struct e820entry类型的数组中。
static int detect_memory_e820(void) { int count = 0; struct biosregs ireg, oreg; struct e820entry *desc = boot_params.e820_map; static struct e820entry buf; /* static so it is zeroed */ initregs(&ireg); ireg.ax = 0xe820; ireg.cx = sizeof buf; ireg.edx = SMAP; ireg.di = (size_t)&buf; /* * Note: at least one BIOS is known which assumes that the * buffer pointed to by one e820 call is the same one as * the previous call, and only changes modified fields. Therefore, * we use a temporary buffer and copy the results entry by entry. * * This routine deliberately does not try to account for * ACPI 3+ extended attributes. This is because there are * BIOSes in the field which report zero for the valid bit for * all ranges, and we don't currently make any use of the * other attribute bits. Revisit this if we see the extended * attribute bits deployed in a meaningful way in the future. */ do { <span style="white-space:pre"> </span> /*在执行这条内联汇编语句时输入的参数有: eax寄存器=0xe820 dx寄存器=’SMAP’ edi寄存器=desc ebx寄存器=next ecx寄存器=size 返回给c语言代码的参数有: id=eax寄存器 rr=edx寄存器 ext=ebx寄存器 size=ecx寄存器 desc指向的内存地址在执行0x15中断调用时被设置 */ <span style="white-space:pre"> </span> intcall(0x15, &ireg, &oreg);/*触发中断0x15*/ ireg.ebx = oreg.ebx; /* for next iteration... */ /* BIOSes which terminate the chain with CF = 1 as opposed to %ebx = 0 don't always report the SMAP signature on the final, failing, probe. */ if (oreg.eflags & X86_EFLAGS_CF) break; /* Some BIOSes stop returning SMAP in the middle of the search loop. We don't know exactly how the BIOS screwed up the map at that point, we might have a partial map, the full map, or complete garbage, so just return failure. */ if (oreg.eax != SMAP) { count = 0; break; } *desc++ = buf;/*保存获取的内存段信息*/ count++; /*获取的内存段数目加1*/ } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map)); <span style="white-space:pre"> </span>/*将内存块数保持到变量中*/ return boot_params.e820_entries = count; }
static int detect_memory_e801(void) { struct biosregs ireg, oreg; initregs(&ireg); ireg.ax = 0xe801; intcall(0x15, &ireg, &oreg); if (oreg.eflags & X86_EFLAGS_CF) return -1; /* Do we really need to do this? */ if (oreg.cx || oreg.dx) { oreg.ax = oreg.cx; oreg.bx = oreg.dx; } if (oreg.ax > 15*1024) { return -1; /* Bogus! */ } else if (oreg.ax == 15*1024) { boot_params.alt_mem_k = (oreg.bx << 6) + oreg.ax; } else { /* * This ignores memory above 16MB if we have a memory * hole there. If someone actually finds a machine * with a memory hole at 16MB and no support for * 0E820h they should probably generate a fake e820 * map. */ boot_params.alt_mem_k = oreg.ax; } return 0; } static int detect_memory_88(void) { struct biosregs ireg, oreg; initregs(&ireg); ireg.ah = 0x88; intcall(0x15, &ireg, &oreg); boot_params.screen_info.ext_mem_k = oreg.ax; return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */ }
对于32位的系统,通过调用链arch/x86/boot/main.c:main()--->arch/x86/boot/pm.c:go_to_protected_mode()--->arch/x86/boot/pmjump.S:protected_mode_jump()--->arch/x86/boot/compressed/head_32.S:startup_32()--->arch/x86/kernel/head_32.S:startup_32()--->arch/x86/kernel/head32.c:i386_start_kernel()--->init/main.c:start_kernel(),到达众所周知的Linux内核启动函数start_kernel()。这里会调用setup_arch()完成与体系结构相关的一系列初始化工作,其中就包括各种内存的初始化工作,如内存图的建立、管理区的初始化等等。对x86体系结构,setup_arch()函数在arch/x86/kernel/setup.c中,如下:
/*
 * Architecture-specific setup, abridged to the memory-related steps.
 * Each "......" comment marks code left out of this excerpt.
 */
void __init setup_arch(char **cmdline_p)
{
	/* ...... */

	x86_init.oem.arch_setup();

	setup_memory_map();	/* build the memory map */
	e820_reserve_setup_data();

	/* ...... */

	/*
	 * partially used pages are not usable - thus
	 * we are rounding upwards:
	 */
	max_pfn = e820_end_of_ram_pfn();	/* highest usable page frame number */

	/* ...... */

#ifdef CONFIG_X86_32
	/* max_low_pfn is updated here */
	find_low_pfn_range();	/* find the highest low-memory page frame number */
#else
	num_physpages = max_pfn;
	/* ...... */
#endif

	/* max_pfn_mapped is updated here */
	/* set up the memory mapping */
	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
	max_pfn_mapped = max_low_pfn_mapped;

	/* ...... */

	initmem_init(0, max_pfn);	/* start the memory allocator */

	/* ...... */

	x86_init.paging.pagetable_setup_start(swapper_pg_dir);
	paging_init();		/* build the complete page tables */
	x86_init.paging.pagetable_setup_done(swapper_pg_dir);

	/* ...... */
}
在 start_kernel---->setup_arch()--------------->setup_memory_map;
/*
 * Build the memory map through the platform's memory_setup hook, keep a
 * copy of the result in e820_saved, and print the map.
 */
void __init setup_memory_map(void)
{
	/* The hook fills e820 and returns a label naming where the map came from. */
	char *origin = x86_init.resources.memory_setup();

	/* Snapshot the map as it stands right after setup. */
	memcpy(&e820_saved, &e820, sizeof(struct e820map));

	printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
	e820_print_map(origin);
}
在x86_init.c中定义了x86下的memory_setup函数:
/*
 * The platform setup functions are preset with the default functions
 * for standard PC hardware.
 */
struct x86_init_ops x86_init __initdata = {

	/* memory_setup is the hook that setup_memory_map() calls. */
	.resources = {
		.probe_roms		= probe_roms,
		.reserve_resources	= reserve_standard_io_resources,
		.memory_setup		= default_machine_specific_memory_setup,
	},

	.mpparse = {
		.mpc_record		= x86_init_uint_noop,
		.setup_ioapic_ids	= x86_init_noop,
		.mpc_apic_id		= default_mpc_apic_id,
		.smp_read_mpc_oem	= default_smp_read_mpc_oem,
		.mpc_oem_bus_info	= default_mpc_oem_bus_info,
		.find_smp_config	= default_find_smp_config,
		.get_smp_config		= default_get_smp_config,
	},

	.irqs = {
		.pre_vector_init	= init_ISA_irqs,
		.intr_init		= native_init_IRQ,
		.trap_init		= x86_init_noop,
	},

	/* arch_setup is called from setup_arch(); the default is a no-op. */
	.oem = {
		.arch_setup		= x86_init_noop,
		.banner			= default_banner,
	},

	.mapping = {
		.pagetable_reserve	= native_pagetable_reserve,
	},

	/* setup_arch() calls these two hooks around paging_init(). */
	.paging = {
		.pagetable_setup_start	= native_pagetable_setup_start,
		.pagetable_setup_done	= native_pagetable_setup_done,
	},

	.timers = {
		.setup_percpu_clockev	= setup_boot_APIC_clock,
		.tsc_pre_init		= x86_init_noop,
		.timer_init		= hpet_time_init,
		.wallclock_init		= x86_init_noop,
	},

	.iommu = {
		.iommu_init		= iommu_init_noop,
	},

	.pci = {
		.init			= x86_default_pci_init,
		.init_irq		= x86_default_pci_init_irq,
		.fixup_irqs		= x86_default_pci_fixup_irqs,
	},
};
可知会回调:default_machine_specific_memory_setup();
/*
 * Default memory_setup hook: fill the kernel's e820 map from the data the
 * boot code left in boot_params.
 *
 * Try to copy the BIOS-supplied E820 map.  If that fails, fake a memory
 * map instead: one section from 0k->640k, the next section from
 * 1mb->appropriate_mem_k.
 *
 * Returns a string naming the source that was used.
 */
char *__init default_machine_specific_memory_setup(void)
{
	u32 new_nr = boot_params.e820_entries;
	u64 mem_size;
	char *who;

	/* Remove overlapping ranges from the BIOS map; new_nr gets the new count. */
	sanitize_e820_map(boot_params.e820_map,
			  ARRAY_SIZE(boot_params.e820_map),
			  &new_nr);
	boot_params.e820_entries = new_nr;

	/* Copy the layout from boot_params.e820_map into e820. */
	if (append_e820_map(boot_params.e820_map, boot_params.e820_entries) >= 0)
		return "BIOS-e820";

	/* compare results from other methods and take the greater */
	if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
		mem_size = boot_params.screen_info.ext_mem_k;
		who = "BIOS-88";
	} else {
		mem_size = boot_params.alt_mem_k;
		who = "BIOS-e801";
	}

	/* Start from an empty map and add the two fake regions. */
	e820.nr_map = 0;
	e820_add_region(0, LOWMEMSIZE(), E820_RAM);
	e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);

	/* In case someone cares... */
	return who;
}
1.消除内存段的重叠部分
2.将内存布局信息从boot_params.e820_map拷贝到e820中
append_e820_map(boot_params.e820_map, boot_params.e820_entries)将会调用以下函数:
/*
 * Add nr_map entries, starting at biosmap, to the e820 map.
 *
 * Returns 0 when every entry was added, or -1 as soon as an entry is found
 * whose start + size overflows 64 bits.  Entries added before the bad one
 * stay in the map.
 */
static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
{
	while (nr_map) {
		u64 start = biosmap->addr;
		u64 size = biosmap->size;
		u64 end = start + size;
		u32 type = biosmap->type;

		/* Overflow in 64 bits? Ignore the memory map. */
		if (start > end)
			return -1;

		/* One region is added per pass; the loop runs nr_map times. */
		e820_add_region(start, size, type);

		biosmap++;
		nr_map--;
	}
	return 0;
}
/*
 * Add one region to the global e820 map.
 *
 * start: first address of the region
 * size:  length of the region
 * type:  region type, e.g. E820_RAM
 */
void __init e820_add_region(u64 start, u64 size, int type)
{
	/* Thin wrapper: always targets the global e820 map. */
	__e820_add_region(&e820, start, size, type);
}
/* The global memory map that e820_add_region() adds regions to. */
struct e820map e820;
至此,物理内存的布局信息就已经从BIOS中读出来并存放到全局变量e820中。
建立内存图后:
setup_arch------------->e820_end_of_ram_pfn;
/*
 * partially used pages are not usable - thus
 * we are rounding upwards:
 */
/* max_pfn takes the page frame number returned by e820_end_of_ram_pfn(). */
max_pfn = e820_end_of_ram_pfn();
/*
 * Find the highest end page frame number among the e820 regions of the
 * given type, capped at limit_pfn.
 *
 * limit_pfn: regions starting at or above this frame are ignored, and a
 *            region that reaches past it ends the search with limit_pfn
 * type:      only regions of this type are considered
 *
 * Returns the page frame number found (0 if no region matched), never
 * larger than MAX_ARCH_PFN.  The result is also logged.
 */
static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
{
	/* NOTE(review): presumably the frame count of the addressable space - confirm. */
	unsigned long max_arch_pfn = MAX_ARCH_PFN;
	unsigned long highest = 0;
	int idx;

	/* Walk every entry of the memory map. */
	for (idx = 0; idx < e820.nr_map; idx++) {
		struct e820entry *region = &e820.map[idx];
		unsigned long first_pfn;
		unsigned long past_pfn;

		if (region->type != type)
			continue;

		/* A region that starts at or beyond the limit is ignored. */
		first_pfn = region->addr >> PAGE_SHIFT;
		if (first_pfn >= limit_pfn)
			continue;

		/* A region reaching past the limit settles the answer. */
		past_pfn = (region->addr + region->size) >> PAGE_SHIFT;
		if (past_pfn > limit_pfn) {
			highest = limit_pfn;
			break;
		}

		/* Otherwise keep the largest end frame seen so far. */
		if (past_pfn > highest)
			highest = past_pfn;
	}

	/* Never report more than the architecture maximum. */
	if (highest > max_arch_pfn)
		highest = max_arch_pfn;

	printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
			 highest, max_arch_pfn);
	return highest;
}
/*
 * Return the page frame number that e820_end_pfn() finds for E820_RAM
 * regions, using MAX_ARCH_PFN as the limit.
 */
unsigned long __init e820_end_of_ram_pfn(void)
{
	return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
}
/*
 * MAXMEM: the distance from PAGE_OFFSET up to VMALLOC_END, less
 * __VMALLOC_RESERVE.
 * NOTE(review): presumably MAXMEM_PFN, the boundary tested by
 * find_low_pfn_range(), is derived from this value - confirm.
 */
#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
其中__VMALLOC_RESERVE为128M,上图说明了第4GB的内存划分
可知:MAXMEM为一个略小于896M的值(896M-8K-4M-4M)即略小于低端内存的上限,高端内存的起始地址
setup_arch()-->find_low_pfn_range().该函数用来划分低端内存和高端内存的界限,确定高端内存的起始地址
/*
 * max_low_pfn gets updated here: find_low_pfn_range() hands over to
 * lowmem_pfn_init() or highmem_pfn_init(), and both of them set it.
 */
find_low_pfn_range();
/*
 * Determine low and high memory ranges.
 *
 * When max_pfn is above MAXMEM_PFN the split is done by
 * highmem_pfn_init(); otherwise lowmem_pfn_init() handles it.
 */
void __init find_low_pfn_range(void)
{
	/* it could update max_pfn */

	if (max_pfn > MAXMEM_PFN)
		highmem_pfn_init();
	else
		lowmem_pfn_init();
}
/*
 * We have more RAM than fits into lowmem - we try to put it into
 * highmem, also taking the highmem=x boot parameter into account.
 *
 * The number of highmem pages may have been configured at boot; when it
 * was not, it is worked out here.  Sets max_low_pfn, settles
 * highmem_pages, and may lower max_pfn.
 */
void __init highmem_pfn_init(void)
{
	/* The boundary between low and high memory is MAXMEM_PFN. */
	max_low_pfn = MAXMEM_PFN;

	/* No size was set at boot: everything above the boundary is highmem. */
	if (highmem_pages == -1)
		highmem_pages = max_pfn - MAXMEM_PFN;

	/*
	 * A size set at boot may disagree with the real amount of memory,
	 * so both directions are checked.  Less than available: trim
	 * max_pfn down to match the request.
	 */
	if (highmem_pages + MAXMEM_PFN < max_pfn)
		max_pfn = MAXMEM_PFN + highmem_pages;

	/* More than available: warn and use no highmem at all. */
	if (highmem_pages + MAXMEM_PFN > max_pfn) {
		printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
			pages_to_mb(max_pfn - MAXMEM_PFN),
			pages_to_mb(highmem_pages));
		highmem_pages = 0;
	}

#ifdef CONFIG_HIGHMEM
	/* Highmem is supported. */
# ifndef CONFIG_HIGHMEM64G
	/* Without HIGHMEM64G, max_pfn is capped at MAX_NONPAE_PFN. */
	if (max_pfn > MAX_NONPAE_PFN) {
		max_pfn = MAX_NONPAE_PFN;
		printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
	}
# endif /* !CONFIG_HIGHMEM64G */
#else /* !CONFIG_HIGHMEM */
	/* Maximum memory usable is what is directly addressable */
	printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
	if (max_pfn > MAX_NONPAE_PFN)
		printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
	else
		printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
	max_pfn = MAXMEM_PFN;
#endif /* CONFIG_HIGHMEM */
}
当实际内存小于896M时
/*
 * Set up the low/high memory split for the case where find_low_pfn_range()
 * found max_pfn to be no larger than MAXMEM_PFN.
 *
 * All memory starts out as low memory; if a highmem size was requested and
 * the kernel supports highmem, that many pages are carved off the top.
 */
void __init lowmem_pfn_init(void)
{
	/* max_low_pfn is 0, we already have early_res support */

	/* Start with the boundary at the top of physical memory. */
	max_low_pfn = max_pfn;

	/* No highmem size was set: use none. */
	if (highmem_pages == -1)
		highmem_pages = 0;

#ifdef CONFIG_HIGHMEM
	/* A request for at least as many pages as exist cannot be met. */
	if (highmem_pages >= max_pfn) {
		printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
			pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
		highmem_pages = 0;
	}

	if (!highmem_pages)
		return;

	/* Low memory must not drop below 64MB. */
	if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
		printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
			pages_to_mb(highmem_pages));
		highmem_pages = 0;
	}

	/* Move the boundary down by the highmem pages that remain. */
	max_low_pfn -= highmem_pages;
#else
	if (highmem_pages)
		printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
#endif
}
当实际的物理内存大于896M,由highmem_pfn_init()进行分配 void __init highmem_pfn_init(void) { max_low_pfn = MAXMEM_PFN; /*设定高端内存和低端内存的分界线*/ if (highmem_pages == -1) /*未设定高端内存的页框数*/ highmem_pages = max_pfn - MAXMEM_PFN; /*默认为最大页框数减去MAXMEM_PFN*/ if (highmem_pages + MAXMEM_PFN < max_pfn) /*高端内存页框数加上MAXMEM_PFN小于最大页框数*/ max_pfn = MAXMEM_PFN + highmem_pages; /*将最大页框数下调到前两者的和*/ if (highmem_pages + MAXMEM_PFN > max_pfn){ /*申请的高端内存超过范围则不分配*/ printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL, pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages)); highmem_pages = 0; } #ifndef CONFIG_HIGHMEM /* Maximum memory usable is what is directly addressable */ printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); if (max_pfn > MAX_NONPAE_PFN) printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); else printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); max_pfn = MAXMEM_PFN; #else /* !CONFIG_HIGHMEM */ #ifndef CONFIG_HIGHMEM64G if (max_pfn > MAX_NONPAE_PFN) { max_pfn = MAX_NONPAE_PFN; printk(KERN_WARNING MSG_HIGHMEM_TRIMMED); } #endif /* !CONFIG_HIGHMEM64G */ #endif /* !CONFIG_HIGHMEM */ }