1、struct page
/*
 * Each physical page in the system has a struct page associated with
 * it to keep track of whatever it is we are using the page for at the
 * moment. Note that we have no way to track which tasks are using
 * a page, though if it is a pagecache page, rmap structures can tell us
 * who is mapping it.
 */
struct page {
    unsigned long flags;            /* Atomic flags, some possibly
                                     * updated asynchronously */
    atomic_t _count;                /* Usage count, see below. */
    union {
        atomic_t _mapcount;         /* Count of ptes mapped in mms,
                                     * to show when page is mapped
                                     * & limit reverse map searches.
                                     */
        struct {                    /* SLUB uses */
            short unsigned int inuse;
            short unsigned int offset;
        };
    };
    union {
        struct {
            unsigned long private;          /* Mapping-private opaque data:
                                             * usually used for buffer_heads
                                             * if PagePrivate set; used for
                                             * swp_entry_t if PageSwapCache;
                                             * indicates order in the buddy
                                             * system if PG_buddy is set.
                                             */
            struct address_space *mapping;  /* If low bit clear, points to
                                             * inode address_space, or NULL.
                                             * If page mapped as anonymous
                                             * memory, low bit is set, and
                                             * it points to anon_vma object:
                                             * see PAGE_MAPPING_ANON below.
                                             */
        };
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
        spinlock_t ptl;
#endif
        struct {                    /* SLUB uses */
            void **lockless_freelist;
            struct kmem_cache *slab;        /* Pointer to slab */
        };
        struct {
            struct page *first_page;        /* Compound pages */
        };
    };
    union {
        pgoff_t index;              /* Our offset within mapping. */
        void *freelist;             /* SLUB: freelist req. slab lock */
    };
    struct list_head lru;           /* Pageout list, eg. active_list
                                     * protected by zone->lru_lock !
                                     */
    /*
     * On machines where all RAM is mapped into kernel address space,
     * we can simply calculate the virtual address. On machines with
     * highmem some memory is mapped into kernel virtual memory
     * dynamically, so we need a place to store that address.
     * Note that this field could be 16 bits on x86 ... ;)
     *
     * Architectures with slow multiplication can define
     * WANT_PAGE_VIRTUAL in asm/page.h
     */
#if defined(WANT_PAGE_VIRTUAL)
    void *virtual;                  /* Kernel virtual address (NULL if
                                       not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
};
flags: the flags field stores the page's status, for example whether the page is dirty or whether it is locked in memory. Each bit of flags represents one state independently, so the field can record 32 different states.
_count: the _count field stores the page's reference count, i.e. how many references to the page exist. When the count drops to -1, the kernel holds no reference to the page, so it becomes available for a new allocation.
virtual: the virtual field is the page's virtual address, i.e. the address of the page in kernel virtual memory. Some memory (so-called high memory) is not permanently mapped into the kernel address space; for such pages this field is NULL, and the pages must be mapped dynamically when needed.
The page structure is associated with physical pages, not with virtual pages. What it describes is therefore transient: even if the data contained in a page persists, it may not always be associated with the same page structure, for example because of swapping. The kernel uses this structure only to describe what the physical page holds at the present moment; the purpose of the data structure is to describe physical memory itself, not the data stored in it. The kernel manages every page in the system with this structure, because it needs to know whether a page is free (that is, whether it has been allocated), and, if the page has been allocated, who owns it. Possible owners include user-space processes, dynamically allocated kernel data, static kernel code, the page cache, and so on.
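As a quick, hedged illustration (not from the original text), kernel code normally inspects these fields through helper macros and functions rather than touching them directly; the sketch below assumes the struct page pointer comes from somewhere else, e.g. an earlier allocation:

#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel.h>

static void inspect_page(struct page *page)
{
    if (PageDirty(page))                  /* tests a bit in page->flags */
        printk(KERN_INFO "page is dirty\n");
    if (PageLocked(page))                 /* another page->flags bit */
        printk(KERN_INFO "page is locked in memory\n");

    /* page_count() reports how many references are held on the page */
    printk(KERN_INFO "references: %d\n", page_count(page));
}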
2、Memory zones
Because some pages reside at particular physical addresses, they cannot be used for certain tasks. Owing to this limitation, the kernel divides pages into different zones; it uses zones to group pages with similar properties.
struct zone {
    /* Fields commonly accessed by the page allocator */
    unsigned long       pages_min, pages_low, pages_high;
    /*
     * We don't know if the memory that we're going to allocate will be freeable
     * or/and it will be released eventually, so to avoid totally wasting several
     * GB of ram we must reserve some of the lower zone memory (otherwise we risk
     * to run OOM on the lower zones despite there's tons of freeable ram
     * on the higher zones). This array is recalculated at runtime if the
     * sysctl_lowmem_reserve_ratio sysctl changes.
     */
    unsigned long       lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
    int node;
    /*
     * zone reclaim becomes active if more unmapped pages exist.
     */
    unsigned long       min_unmapped_pages;
    unsigned long       min_slab_pages;
    struct per_cpu_pageset  *pageset[NR_CPUS];
#else
    struct per_cpu_pageset  pageset[NR_CPUS];
#endif
    /*
     * free areas of different sizes
     */
    spinlock_t          lock;
#ifdef CONFIG_MEMORY_HOTPLUG
    /* see spanned/present_pages for more description */
    seqlock_t           span_seqlock;
#endif
    struct free_area    free_area[MAX_ORDER];


    ZONE_PADDING(_pad1_)

    /* Fields commonly accessed by the page reclaim scanner */
    spinlock_t          lru_lock;
    struct list_head    active_list;
    struct list_head    inactive_list;
    unsigned long       nr_scan_active;
    unsigned long       nr_scan_inactive;
    unsigned long       pages_scanned;      /* since last reclaim */
    int                 all_unreclaimable;  /* All pages pinned */

    /* A count of how many reclaimers are scanning this zone */
    atomic_t            reclaim_in_progress;

    /* Zone statistics */
    atomic_long_t       vm_stat[NR_VM_ZONE_STAT_ITEMS];

    /*
     * prev_priority holds the scanning priority for this zone. It is
     * defined as the scanning priority at which we achieved our reclaim
     * target at the previous try_to_free_pages() or balance_pgdat()
     * invokation.
     *
     * We use prev_priority as a measure of how much stress page reclaim is
     * under - it drives the swappiness decision: whether to unmap mapped
     * pages.
     *
     * Access to both this field is quite racy even on uniprocessor. But
     * it is expected to average out OK.
     */
    int prev_priority;


    ZONE_PADDING(_pad2_)
    /* Rarely used or read-mostly fields */

    /*
     * wait_table           -- the array holding the hash table
     * wait_table_hash_nr_entries  -- the size of the hash table array
     * wait_table_bits      -- wait_table_size == (1 << wait_table_bits)
     *
     * The purpose of all these is to keep track of the people
     * waiting for a page to become available and make them
     * runnable again when possible. The trouble is that this
     * consumes a lot of space, especially when so few things
     * wait on pages at a given time. So instead of using
     * per-page waitqueues, we use a waitqueue hash table.
     *
     * The bucket discipline is to sleep on the same queue when
     * colliding and wake all in that wait queue when removing.
     * When something wakes, it must check to be sure its page is
     * truly available, a la thundering herd. The cost of a
     * collision is great, but given the expected load of the
     * table, they should be so rare as to be outweighed by the
     * benefits from the saved space.
     *
     * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
     * primary users of these fields, and in mm/page_alloc.c
     * free_area_init_core() performs the initialization of them.
     */
    wait_queue_head_t   *wait_table;
    unsigned long       wait_table_hash_nr_entries;
    unsigned long       wait_table_bits;

    /*
     * Discontig memory support fields.
     */
    struct pglist_data  *zone_pgdat;
    /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
    unsigned long       zone_start_pfn;

    /*
     * zone_start_pfn, spanned_pages and present_pages are all
     * protected by span_seqlock. It is a seqlock because it has
     * to be read outside of zone->lock, and it is done in the main
     * allocator path. But, it is written quite infrequently.
     *
     * The lock is declared along with zone->lock because it is
     * frequently read in proximity to zone->lock. It's good to
     * give them a chance of being in the same cacheline.
     */
    unsigned long       spanned_pages;  /* total size, including holes */
    unsigned long       present_pages;  /* amount of memory (excluding holes) */

    /*
     * rarely used fields:
     */
    const char          *name;
} ____cacheline_internodealigned_in_smp;
name: the name field is a NULL-terminated string holding the name of the zone. The kernel initializes it during boot, in mm/page_alloc.c; the three names are DMA, Normal, and HighMem.
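A small sketch (assuming kernel context; not from the original text) that walks every zone with the standard for_each_zone() helper and prints the fields discussed above:

#include <linux/mmzone.h>
#include <linux/kernel.h>

static void dump_zones(void)
{
    struct zone *zone;

    for_each_zone(zone) {
        printk(KERN_INFO "zone %-8s: spanned %lu pages, present %lu pages\n",
               zone->name, zone->spanned_pages, zone->present_pages);
    }
}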
Getting pages
struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
void *page_address(struct page *page)
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
struct page *alloc_page(gfp_t gfp_mask)
unsigned long __get_free_page(gfp_t gfp_mask)
unsigned long get_zeroed_page(gfp_t gfp_mask)
__get_dma_pages(gfp_mask, order)
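A minimal sketch of how these calls fit together in kernel code; the function names grab_pages()/drop_pages() are made up for illustration and error handling is kept to the bare minimum:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/errno.h>

static struct page *pg;        /* 2^3 = 8 physically contiguous pages */
static unsigned long buf;      /* one page, handled via its virtual address */

static int grab_pages(void)
{
    pg = alloc_pages(GFP_KERNEL, 3);      /* returns a struct page * */
    if (!pg)
        return -ENOMEM;

    buf = __get_free_page(GFP_KERNEL);    /* returns a kernel virtual address */
    if (!buf) {
        __free_pages(pg, 3);
        return -ENOMEM;
    }

    /* page_address() converts a struct page to its kernel virtual address */
    printk(KERN_INFO "pages mapped at %p\n", page_address(pg));
    return 0;
}

static void drop_pages(void)
{
    free_page(buf);
    __free_pages(pg, 3);
}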
Getting byte-sized chunks of memory
void *kmalloc(size_t size,gfp_t flags)
void kfree(const void *ptr)
void *vmalloc(unsigned long size)
void vfree(const void *addr)
Memory allocated by kmalloc() is contiguous both physically and virtually. Memory allocated by vmalloc() is contiguous in virtual address space but not necessarily in physical address space. The latter is also how the user-space allocation function works: the address returned by malloc() is a virtual address (more precisely, an address in the process heap); all of a process's memory is virtual, and there is no guarantee that those virtual addresses are backed by contiguous physical RAM. vmalloc() is also somewhat less efficient than kmalloc(), because mapping physically non-contiguous memory into a contiguous virtual range requires setting up dedicated page-table entries. vmalloc() is therefore generally used only when a large buffer is needed.
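A short hedged sketch contrasting the two interfaces; struct demo_ctx and the 4 MB buffer size are invented purely for illustration:

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>

struct demo_ctx {                  /* hypothetical small control structure */
    int id;
    char name[32];
};

static int alloc_examples(void)
{
    struct demo_ctx *ctx;
    void *big;

    /* small, physically contiguous allocation: kmalloc() */
    ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
    if (!ctx)
        return -ENOMEM;

    /* large buffer that only needs to be virtually contiguous: vmalloc() */
    big = vmalloc(4 * 1024 * 1024);
    if (!big) {
        kfree(ctx);
        return -ENOMEM;
    }

    /* ... use the buffers ... */

    vfree(big);
    kfree(ctx);
    return 0;
}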
The buddy algorithm
一. Algorithm overview
The algorithm is described on Wikipedia, roughly as follows:
Allocating memory:
1. Look for a block of suitable size (greater than or equal to the requested size and the closest power of two; e.g. for a request of 27, 32 is actually allocated).
   1. If one is found, allocate it to the application.
   2. If not, split a larger block to produce one of suitable size:
      1. Split in half a free block that is larger than the requested size.
      2. If the lower limit has been reached, allocate that size.
      3. Go back to step 1 (look for a block of suitable size).
      4. Repeat until a suitable block is found.
Freeing memory:
1. Free the block of memory.
   1. Look at the neighboring block: has it been freed as well?
   2. If it has, merge the two blocks, and repeat the process until a neighbor that is still in use is encountered, or the upper limit is reached (i.e. all memory has been freed).
This looks rather cryptic, so a concrete example (often drawn as a diagram of successive splits) makes it clearer.
Suppose we start with a 1024K block of memory and need to allocate 70K to A:
1. Half of 1024K is still larger than 70K, so we split the 1024K block into two halves of 512K each.
2. Half of 512K is still larger than 70K, so we split one 512K half into two halves of 256K each.
3. 256K is still larger than 70K, so we split one 256K half into two halves of 128K each.
4. At this point half of 128K is smaller than 70K, so we allocate a 128K block to A.
B, C and D are handled in the same way. When memory is freed, adjacent blocks are merged back together step by step (merging must proceed as the exact reverse of the splits).
As you can see, a binary tree is a natural data structure for implementing such an algorithm.
二. Algorithm implementation
One way to implement a buddy allocator is to manage the memory through a complete binary tree stored in an array. Each node of the tree records the usage state of the corresponding block: nodes near the root stand for large blocks, nodes near the leaves for small blocks, and during allocation and freeing the splitting and merging of blocks is driven by these node markings. For example, for a total of 16 units of memory we build a full binary tree of depth 5: the root node, at array index [0], covers the whole block of size 16; its two children, at indices [1]-[2], each cover a block of size 8; the third level, indices [3]-[6], covers blocks of size 4; and so on.
In the allocation phase we first search for a block of matching size. Suppose the first request is for 3 units: rounded up to a power of two this becomes 4, and getting from 16 down to 4 takes two halvings, so we descend from node [0] to node [3] and mark it as allocated. A second request for 3 marks node [4]. A third request for 6 (rounded up to a block of size 8) is served by node [2], because the size-8 block monitored by node [1] is already partly occupied by [3] and [4].
In the freeing phase, suppose we release the first two allocations in order, i.e. first [3] and then [4]. When [4] is freed we notice that its neighbor [3] has already been freed, so we immediately merge the two nodes; the next time a block of size 8 is requested, node [1] will match. If [2] is freed as well and merged in the same way, the whole memory returns to its initial state.
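Below is a minimal user-space sketch of the array-based scheme just described; it is not kernel code, and names such as buddy_alloc(), buddy_free() and the longest[] array are illustrative. Instead of a bare used/free mark, each tree node stores the size of the largest free block in its subtree, which makes both the descent during allocation and the merge test during freeing easy to express:

#include <stdio.h>

#define TOTAL 16                      /* total units, must be a power of two */

static unsigned longest[2 * TOTAL];   /* complete binary tree, 1-based; [0] unused */

static unsigned round_up_pow2(unsigned n)
{
    unsigned p = 1;
    while (p < n)
        p <<= 1;
    return p;
}

static void buddy_init(void)
{
    unsigned node_size = 2 * TOTAL;
    for (unsigned i = 1; i < 2 * TOTAL; i++) {
        if ((i & (i - 1)) == 0)       /* i is a power of two: next tree level */
            node_size /= 2;
        longest[i] = node_size;       /* every block starts out fully free */
    }
}

/* Returns the offset of the allocated block, or -1 on failure. */
static int buddy_alloc(unsigned size)
{
    size = round_up_pow2(size ? size : 1);
    if (longest[1] < size)
        return -1;

    unsigned i = 1, node_size;
    for (node_size = TOTAL; node_size != size; node_size /= 2)
        i = (longest[2 * i] >= size) ? 2 * i : 2 * i + 1;   /* descend */

    longest[i] = 0;                   /* mark this block as fully used */
    int offset = i * node_size - TOTAL;

    while (i > 1) {                   /* update ancestors' largest free size */
        i /= 2;
        longest[i] = longest[2 * i] > longest[2 * i + 1] ?
                     longest[2 * i] : longest[2 * i + 1];
    }
    return offset;
}

/* offset must have been returned by buddy_alloc(). */
static void buddy_free(int offset)
{
    unsigned node_size = 1;
    unsigned i = offset + TOTAL;      /* start at the leaf for this offset */

    for (; longest[i]; i /= 2)        /* climb until the allocated node is found */
        node_size *= 2;
    longest[i] = node_size;           /* mark it free again */

    while (i > 1) {                   /* merge buddies on the way up */
        i /= 2;
        node_size *= 2;
        unsigned l = longest[2 * i], r = longest[2 * i + 1];
        longest[i] = (l + r == node_size) ? node_size : (l > r ? l : r);
    }
}

int main(void)
{
    buddy_init();
    int a = buddy_alloc(3);           /* rounded up to 4 */
    int b = buddy_alloc(3);
    int c = buddy_alloc(6);           /* rounded up to 8 */
    printf("a=%d b=%d c=%d\n", a, b, c);
    buddy_free(a);
    buddy_free(b);                    /* a and b merge back into a size-8 block */
    return 0;
}

Running main() reproduces the example above: the two size-3 requests are rounded up to 4 and land at offsets 0 and 4, the size-6 request is rounded up to 8 and lands at offset 8, and freeing the first two blocks merges them back into one free block of size 8.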
The slab allocator ---- what is it? ---- why do we need it?
Allocating and freeing data structures is one of the most common operations in any kernel. To make frequent allocation and freeing cheap, programmers often use free lists. A free list contains ready-to-use, already allocated blocks of the data structure. When code needs a new instance of the structure, it can grab one from the free list instead of performing a fresh allocation, which improves efficiency; when the instance is no longer needed, it is put back on the free list rather than actually freed. In this sense the free list acts as an object cache: it caches frequently used object types. Our familiar process descriptor, struct task_struct, is exactly the kind of structure that can be allocated this way through the slab allocator.
To provide this kind of caching of data structures, Linux implements the slab layer (the slab allocator), which plays the role of a generic data-structure caching layer. The slab layer divides objects into cache groups, each cache holding a different type of object: one cache per object type. (Later we will see how a cache is created and how objects are then obtained from it.) For example, one cache holds process descriptors (a free list of task_struct structures), while another holds inode objects (struct inode). The kmalloc() interface is itself built on top of the slab layer, using a family of general-purpose caches; when we look at kmalloc() below we will see that it calls kmem_cache_alloc(malloc_sizes[i].cs_cachep, flags) (or the cs_dmacachep variant for DMA allocations). The caches are in turn divided into slabs, which is where this subsystem gets its name. A slab consists of one or more physically contiguous pages; typically a slab is just a single page. Each cache may consist of multiple slabs.
Each slab contains a number of objects, i.e. the data structures being cached. A slab is always in one of three states: full, partial, or empty. A full slab has no free objects (all of its objects are allocated); an empty slab has no allocated objects (all of its objects are free); a partial slab has some allocated and some free objects. When some part of the kernel needs a new object, the request is satisfied from a partial slab first, if one exists; otherwise from an empty slab; and if there is no empty slab either, a new slab has to be created.
How the slab allocator creates slabs
/*
 * Interface to system's page allocator. No need to hold the cache-lock.
 *
 * If we requested dmaable memory, we will get it. Even if we
 * did not request dmaable memory, we might get it, but that
 * would be relatively rare and ignorable.
 */
static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
    struct page *page;
    int nr_pages;
    int i;

#ifndef CONFIG_MMU
    /*
     * Nommu uses slab's for process anonymous memory allocations, and thus
     * requires __GFP_COMP to properly refcount higher order allocations
     */
    flags |= __GFP_COMP;
#endif

    flags |= cachep->gfpflags;

    page = alloc_pages_node(nodeid, flags, cachep->gfporder);
    if (!page)
        return NULL;

    nr_pages = (1 << cachep->gfporder);
    if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
        add_zone_page_state(page_zone(page),
            NR_SLAB_RECLAIMABLE, nr_pages);
    else
        add_zone_page_state(page_zone(page),
            NR_SLAB_UNRECLAIMABLE, nr_pages);
    for (i = 0; i < nr_pages; i++)
        __SetPageSlab(page + i);
    return page_address(page);
}

/*
 * Interface to system's page release.
 */
static void kmem_freepages(struct kmem_cache *cachep, void *addr)
{
    unsigned long i = (1 << cachep->gfporder);
    struct page *page = virt_to_page(addr);
    const unsigned long nr_freed = i;

    if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
        sub_zone_page_state(page_zone(page),
                NR_SLAB_RECLAIMABLE, nr_freed);
    else
        sub_zone_page_state(page_zone(page),
                NR_SLAB_UNRECLAIMABLE, nr_freed);
    while (i--) {
        BUG_ON(!PageSlab(page));
        __ClearPageSlab(page);
        page++;
    }
    if (current->reclaim_state)
        current->reclaim_state->reclaimed_slab += nr_freed;
    free_pages((unsigned long)addr, cachep->gfporder);
}
Slab allocation sits on top of the buddy algorithm: the page frames backing a slab are requested through the buddy system, because at the lowest level the slab layer still relies on the page allocation functions described earlier, such as __get_free_pages(); those are the foundation of the slab allocator discussed below. Put plainly, the slab allocator improves how efficiently memory is used: for memory that is constantly allocated and freed, the slab layer keeps a cache, so already allocated memory can be handed out directly instead of going back to the page allocator every time it is needed. For data structures such as file descriptors in the filesystem and process descriptors, both of which are allocated and released constantly, this kind of caching is a real blessing. As an aside, the kernel does a lot of caching in general: the page cache used when accessing block devices follows a similar idea, except that its goal is to reduce the cost of repeatedly reading from and writing to slow storage devices by caching their blocks in memory and flushing them back periodically.
Requests that are not tied to a specific object type are handled by a set of general-purpose caches; the kmalloc() interface described earlier is built on top of the slab layer and uses exactly this group of general caches.
static inline void *kmalloc(size_t size, gfp_t flags)
{
    if (__builtin_constant_p(size)) {
        int i = 0;
#define CACHE(x) \
        if (size <= x) \
            goto found; \
        else \
            i++;
#include "kmalloc_sizes.h"
#undef CACHE
        {
            extern void __you_cannot_kmalloc_that_much(void);
            __you_cannot_kmalloc_that_much();
        }
found:
#ifdef CONFIG_ZONE_DMA
        if (flags & GFP_DMA)
            return kmem_cache_alloc(malloc_sizes[i].cs_dmacachep,
                        flags);
#endif
        return kmem_cache_alloc(malloc_sizes[i].cs_cachep, flags);
    }
    return __kmalloc(size, flags);
}

static inline void *kzalloc(size_t size, gfp_t flags)
{
    if (__builtin_constant_p(size)) {
        int i = 0;
#define CACHE(x) \
        if (size <= x) \
            goto found; \
        else \
            i++;
#include "kmalloc_sizes.h"
#undef CACHE
        {
            extern void __you_cannot_kzalloc_that_much(void);
            __you_cannot_kzalloc_that_much();
        }
found:
#ifdef CONFIG_ZONE_DMA
        if (flags & GFP_DMA)
            return kmem_cache_zalloc(malloc_sizes[i].cs_dmacachep,
                        flags);
#endif
        return kmem_cache_zalloc(malloc_sizes[i].cs_cachep, flags);
    }
    return __kzalloc(size, flags);
}
Design of the slab layer
The kmem_cache structure
Every cache is represented by a kmem_cache structure. It contains three lists -- slabs_full, slabs_partial and slabs_free (the empty slabs) -- stored in a kmem_list3 structure; these lists hold all the slabs belonging to the cache. The slab descriptor, struct slab, describes each individual slab.
struct kmem_cache {
/* 1) per-cpu data, touched during every alloc/free */
    struct array_cache *array[NR_CPUS];
/* 2) Cache tunables. Protected by cache_chain_mutex */
    unsigned int batchcount;
    unsigned int limit;
    unsigned int shared;

    unsigned int buffer_size;
    u32 reciprocal_buffer_size;
/* 3) touched by every alloc & free from the backend */

    unsigned int flags;         /* constant flags */
    unsigned int num;           /* # of objs per slab */

/* 4) cache_grow/shrink */
    /* order of pgs per slab (2^n) */
    unsigned int gfporder;

    /* force GFP flags, e.g. GFP_DMA */
    gfp_t gfpflags;

    size_t colour;              /* cache colouring range */
    unsigned int colour_off;    /* colour offset */
    struct kmem_cache *slabp_cache;
    unsigned int slab_size;
    unsigned int dflags;        /* dynamic flags */

    /* constructor func */
    void (*ctor) (void *, struct kmem_cache *, unsigned long);

/* 5) cache creation/removal */
    const char *name;
    struct list_head next;

/* 6) statistics */
#if STATS
    unsigned long num_active;
    unsigned long num_allocations;
    unsigned long high_mark;
    unsigned long grown;
    unsigned long reaped;
    unsigned long errors;
    unsigned long max_freeable;
    unsigned long node_allocs;
    unsigned long node_frees;
    unsigned long node_overflow;
    atomic_t allochit;
    atomic_t allocmiss;
    atomic_t freehit;
    atomic_t freemiss;
#endif
#if DEBUG
    /*
     * If debugging is enabled, then the allocator can add additional
     * fields and/or padding to every object. buffer_size contains the total
     * object size including these internal fields, the following two
     * variables contain the offset to the user object and its size.
     */
    int obj_offset;
    int obj_size;
#endif
    /*
     * We put nodelists[] at the end of kmem_cache, because we want to size
     * this array to nr_node_ids slots instead of MAX_NUMNODES
     * (see kmem_cache_init())
     * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
     * is statically defined, so we reserve the max number of nodes.
     */
    struct kmem_list3 *nodelists[MAX_NUMNODES];
    /*
     * Do not add fields after nodelists[]
     */
};
The kmem_list3 structure
/*
 * The slab lists for all objects.
 */
struct kmem_list3 {
    struct list_head slabs_partial; /* partial list first, better asm code */
    struct list_head slabs_full;
    struct list_head slabs_free;
    unsigned long free_objects;
    unsigned int free_limit;
    unsigned int colour_next;       /* Per-node cache coloring */
    spinlock_t list_lock;
    struct array_cache *shared;     /* shared per node */
    struct array_cache **alien;     /* on other nodes */
    unsigned long next_reap;        /* updated without locking */
    int free_touched;               /* updated without locking */
};
The struct slab structure
/*
 * struct slab
 *
 * Manages the objs in a slab. Placed either at the beginning of mem allocated
 * for a slab, or allocated from an general cache.
 * Slabs are chained into three list: fully used, partial, fully free slabs.
 */
struct slab {
    struct list_head list;
    unsigned long colouroff;
    void *s_mem;            /* including colour offset */
    unsigned int inuse;     /* num of objs active in slab */
    kmem_bufctl_t free;
    unsigned short nodeid;
};
The slab allocator interface
A new cache is created with kmem_cache_create():
struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *))
On success kmem_cache_create() returns a pointer to the newly created cache; otherwise it returns NULL. The function must not be called from interrupt context, because it may sleep.
If the given cache contains neither partial nor empty slabs, the slab layer obtains memory by calling kmem_getpages(), which (as the code above shows) goes down to the buddy allocator via alloc_pages_node(). When available memory becomes tight and the system tries to free more memory, or when a cache is explicitly destroyed, kmem_freepages() is called to return the memory.
To destroy a cache, call:
int kmem_cache_destroy(struct kmem_cache *cachep)
This function destroys the given cache. It is typically invoked from a module's exit code, by modules that created their own caches. It must not be called from interrupt context, because it too may sleep. Two conditions must hold when calling it (a short usage sketch follows the list):
1、All slabs in the cache must be empty.
2、No one may access the cache while kmem_cache_destroy() is running. On success the function returns 0; otherwise it returns a nonzero value.
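As a hedged sketch of this part of the interface: struct my_record, the cache name and the SLAB_HWCACHE_ALIGN choice below are all illustrative, and the exact kmem_cache_create() signature varies between kernel versions (this follows the five-argument form shown above):

#include <linux/slab.h>
#include <linux/errno.h>

struct my_record {                       /* hypothetical object type */
    int id;
    char data[64];
};

static struct kmem_cache *my_record_cachep;

static int my_cache_init(void)
{
    my_record_cachep = kmem_cache_create("my_record",
                                         sizeof(struct my_record),
                                         0,                  /* default alignment */
                                         SLAB_HWCACHE_ALIGN, /* cache-line align objects */
                                         NULL);              /* no constructor */
    if (!my_record_cachep)
        return -ENOMEM;
    return 0;
}

static void my_cache_exit(void)
{
    /* every object must have been freed back to the cache before this point */
    kmem_cache_destroy(my_record_cachep);
}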
Allocating slab objects from a cache
Once a cache has been created, an object is obtained from it with void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags). This function returns a pointer to an object from the given cache cachep. If no slab in the cache has a free object, the slab layer must obtain new pages via kmem_getpages(), exactly as described above for how the slab layer allocates a new slab.
To free an object and return it to its originating slab, use void kmem_cache_free(struct kmem_cache *cachep, void *objp); this marks the object objp in cachep as free.
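Continuing the sketch above, allocating an object from that cache and handing it back looks like this:

static struct my_record *get_record(void)
{
    struct my_record *rec;

    rec = kmem_cache_alloc(my_record_cachep, GFP_KERNEL);
    if (!rec)
        return NULL;
    rec->id = 0;               /* initialize the freshly allocated object */
    return rec;
}

static void put_record(struct my_record *rec)
{
    kmem_cache_free(my_record_cachep, rec);
}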
A slab allocator example ---------> task_struct
1、The kernel keeps a global variable that points to the task_struct cache: struct kmem_cache *task_struct_cachep;
2、During kernel initialization the cache is created in fork_init(), defined in kernel/fork.c:
task_struct_cachep = kmem_cache_create("task_struct", sizeof(struct task_struct), ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
void __init fork_init(unsigned long mempages)
{
#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN  L1_CACHE_BYTES
#endif
    /* create a slab on which task_structs can be allocated */
    task_struct_cachep =
        kmem_cache_create("task_struct", sizeof(struct task_struct),
            ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
#endif

    /*
     * The default maximum number of threads is set to a safe
     * value: the thread structures can take up at most half
     * of memory.
     */
    max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);

    /*
     * we need to allow at least 20 threads to boot a system
     */
    if (max_threads < 20)
        max_threads = 20;

    init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
    init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
    init_task.signal->rlim[RLIMIT_SIGPENDING] =
        init_task.signal->rlim[RLIMIT_NPROC];
}
This creates a cache named "task_struct" that stores objects of type struct task_struct. The objects are aligned within their slabs on an ARCH_MIN_TASKALIGN-byte boundary, which defaults to L1_CACHE_BYTES as the code above shows.
Every time a process calls fork(), a new process descriptor must be created. This is done in dup_task_struct(), which is reached from do_fork() along the following path:
fork()----------->sys_fork()---------->do_fork()----------->copy_process()------------>dup_task_struct()
do_fork()
/*
 * Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 */
long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          struct pt_regs *regs,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
    struct task_struct *p;
    int trace = 0;
    struct pid *pid = alloc_pid();
    long nr;

    if (!pid)
        return -EAGAIN;
    nr = pid->nr;
    if (unlikely(current->ptrace)) {
        trace = fork_traceflag (clone_flags);
        if (trace)
            clone_flags |= CLONE_PTRACE;
    }

    /* copy_process() eventually calls dup_task_struct() */
    p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    if (!IS_ERR(p)) {
        struct completion vfork;

        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
        }

        if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
            /*
             * We'll start up with an immediate SIGSTOP.
             */
            sigaddset(&p->pending.signal, SIGSTOP);
            set_tsk_thread_flag(p, TIF_SIGPENDING);
        }

        if (!(clone_flags & CLONE_STOPPED))
            wake_up_new_task(p, clone_flags);
        else
            p->state = TASK_STOPPED;

        if (unlikely (trace)) {
            current->ptrace_message = nr;
            ptrace_notify ((trace << 8) | SIGTRAP);
        }

        if (clone_flags & CLONE_VFORK) {
            freezer_do_not_count();
            wait_for_completion(&vfork);
            freezer_count();
            if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
                current->ptrace_message = nr;
                ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
            }
        }
    } else {
        free_pid(pid);
        nr = PTR_ERR(p);
    }
    return nr;
}
copy_process()
/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
                    unsigned long stack_start,
                    struct pt_regs *regs,
                    unsigned long stack_size,
                    int __user *parent_tidptr,
                    int __user *child_tidptr,
                    struct pid *pid)
{
    int retval;
    struct task_struct *p = NULL;

    if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
        return ERR_PTR(-EINVAL);

    /*
     * Thread groups must share signals as well, and detached threads
     * can only be started up within the thread group.
     */
    if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
        return ERR_PTR(-EINVAL);

    /*
     * Shared signal handlers imply shared VM. By way of the above,
     * thread groups also imply shared VM. Blocking this case allows
     * for various simplifications in other code.
     */
    if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
        return ERR_PTR(-EINVAL);

    retval = security_task_create(clone_flags);
    if (retval)
        goto fork_out;

    retval = -ENOMEM;
    p = dup_task_struct(current);
    if (!p)
        goto fork_out;
    ..........
}
dup_task_struct
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
    struct task_struct *tsk;
    struct thread_info *ti;

    prepare_to_copy(orig);

    tsk = alloc_task_struct();
    if (!tsk)
        return NULL;

    ti = alloc_thread_info(tsk);
    if (!ti) {
        free_task_struct(tsk);
        return NULL;
    }

    *tsk = *orig;
    tsk->stack = ti;
    setup_thread_stack(tsk, orig);

#ifdef CONFIG_CC_STACKPROTECTOR
    tsk->stack_canary = get_random_int();
#endif

    /* One for us, one for whoever does the "release_task()" (usually parent) */
    atomic_set(&tsk->usage, 2);
    atomic_set(&tsk->fs_excl, 0);
#ifdef CONFIG_BLK_DEV_IO_TRACE
    tsk->btrace_seq = 0;
#endif
    tsk->splice_pipe = NULL;
    return tsk;
}
When a process terminates, if no child processes are waiting on it, its process descriptor is released and handed back to the task_struct_cachep slab cache. This happens in free_task_struct() (where tsk is the exiting task):
/**
 * kmem_cache_free - Deallocate an object
 * @cachep: The cache the allocation was from.
 * @objp: The previously allocated object.
 *
 * Free an object which was previously allocated from this
 * cache.
 */
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
    unsigned long flags;

    BUG_ON(virt_to_cache(objp) != cachep);

    local_irq_save(flags);
    debug_check_no_locks_freed(objp, obj_size(cachep));
    __cache_free(cachep, objp);
    local_irq_restore(flags);
}
kmem_cache_free(task_struct_cachep, tsk);
Because the process descriptor is a core part of the kernel and is needed all the time, the task_struct_cachep cache is never destroyed: process descriptors are only ever released with kmem_cache_free(), never with kmem_cache_destroy(). Done. (This article draws mainly on Linux Kernel Development, Understanding the Linux Kernel, and the complete annotation of Linux kernel 0.11; I am only beginning to study the Linux kernel, so please point out any mistakes.)