Linux内核源代码情景分析-共享内存

一、库函数shmget()--共享内存区的创建与寻找

/*
 * sys_shmget(): create a new SysV shared memory segment, or look up an
 * existing one by key.  Returns the "unified" (sequence-encoded) identifier
 * on success, or a negative errno.
 */
asmlinkage long sys_shmget (key_t key, size_t size, int shmflg)
{
	struct shmid_kernel *shp;
	int err, id = 0;

	down(&shm_ids.sem);
	if (key == IPC_PRIVATE) {
		err = newseg(key, shmflg, size);	/* IPC_PRIVATE: always create a fresh segment private to this process; returns the unified id */
	} else if ((id = ipc_findkey(&shm_ids, key)) == -1) {	/* search shm_ids for a segment with this key; -1 means none exists, otherwise id is the slot index */
		if (!(shmflg & IPC_CREAT))	/* not found and creation not requested -> error */
			err = -ENOENT;
		else
			err = newseg(key, shmflg, size);	/* not found but IPC_CREAT set -> create the segment */
	} else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) {	/* found, but caller demanded exclusive creation -> error */
		err = -EEXIST;
	} else {	/* found and no exclusive creation requested: the normal lookup path */
		shp = shm_lock(id);	/* turn the slot index into the shmid_kernel, taking the ids spinlock */
		if(shp==NULL)
			BUG();
		if (shp->shm_segsz < size)	/* existing segment must be large enough */
			err = -EINVAL;
		else if (ipcperms(&shp->shm_perm, shmflg))
			err = -EACCES;
		else
			err = shm_buildid(id, shp->shm_perm.seq);	/* encode slot index + sequence number into the unified id */
		shm_unlock(id);
	}
	up(&shm_ids.sem);
	return err;	/* whether created or found, this is the unified identifier (or -errno) */
}

键值IPC_PRIVATE,即0,是特殊的,它表示要分配一个共享内存区供本进程专用。其他键值则表示要创建或寻找的是"共享"内存区。而标志位IPC_CREAT则表示目的在于创建。

1、当键值是IPC_PRIVATE时,会调用newseg,分配一个共享内存区供本进程专用,代码如下:

/*
 * newseg(): allocate and initialize a new shared memory segment.  Creates the
 * backing file in the internal "shm" filesystem, links the descriptor into
 * shm_ids, and returns the unified identifier (or a negative errno).
 */
static int newseg (key_t key, int shmflg, size_t size)
{
	int error;
	struct shmid_kernel *shp;
	int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT;	/* pages needed, rounded up */
	struct file * file;
	char name[13];	/* "SYSV" + 8 hex digits + NUL */
	int id;

	if (size < SHMMIN || size > shm_ctlmax)
		return -EINVAL;

	if (shm_tot + numpages >= shm_ctlall)	/* enforce the system-wide page limit */
		return -ENOSPC;

	shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_USER);	/* allocate the per-segment descriptor */
	if (!shp)
		return -ENOMEM;
	sprintf (name, "SYSV%08x", key);
	file = shmem_file_setup(name, size);	/* create the backing (mapping) file in the internal "shm" filesystem */
	error = PTR_ERR(file);
	if (IS_ERR(file))
		goto no_file;

	error = -ENOSPC;
	id = shm_addid(shp);	/* link the descriptor into the global shm_ids table; returns the slot index (lock held on success) */
	if(id == -1)
		goto no_id;
	shp->shm_perm.key = key;
	shp->shm_flags = (shmflg & S_IRWXUGO);
	shp->shm_cprid = current->pid;	/* creator pid */
	shp->shm_lprid = 0;	/* pid of last shmat/shmdt; none yet */
	shp->shm_atim = shp->shm_dtim = 0;
	shp->shm_ctim = CURRENT_TIME;
	shp->shm_segsz = size;
	shp->shm_nattch = 0;	/* no attachments yet */
	shp->id = shm_buildid(id,shp->shm_perm.seq);	/* encode slot index + sequence into the unified identifier */
	shp->shm_file = file;	/* remember the newly created backing file */
	file->f_dentry->d_inode->i_ino = shp->id;
	file->f_op = &shm_file_operations;	/* override shmem_file_operations (set by shmem_file_setup) with the SysV-specific ops */
	shm_tot += numpages;
	shm_unlock (id);
	return shp->id;	/* the unified identifier */

no_id:
	fput(file);
no_file:
	kfree(shp);
	return error;
}

shmid_kernel结构如下:

/* Per-segment descriptor for one SysV shared memory segment. */
struct shmid_kernel /* private to the kernel */
{
	struct kern_ipc_perm	shm_perm;	/* key/ownership/permissions; also the entry linked into shm_ids */
	struct file *		shm_file;	/* backing file in the internal "shm" filesystem */
	int			id;		/* unified (sequence-encoded) identifier */
	unsigned long		shm_nattch;	/* number of current attaches */
	unsigned long		shm_segsz;	/* segment size in bytes */
	time_t			shm_atim;	/* last shmat() time */
	time_t			shm_dtim;	/* last shmdt() time */
	time_t			shm_ctim;	/* creation / last change time */
	pid_t			shm_cprid;	/* creator pid */
	pid_t			shm_lprid;	/* pid of last shmat/shmdt */
};

shmem_file_setup,在特殊文件系统"shm"中建立映射文件,代码如下:

/*
 * shmem_file_setup(): create an (unlinked) file of the given name and size in
 * the internal "shm" filesystem, to serve as the backing object for a shared
 * memory segment.  Returns the file or an ERR_PTR-encoded errno.
 */
struct file *shmem_file_setup(char * name, loff_t size)
{
	int error;
	struct file *file;
	struct inode * inode;
	struct dentry *dentry, *root;
	struct qstr this;
	int vm_enough_memory(long pages);

	error = -ENOMEM;
	if (!vm_enough_memory((size) >> PAGE_SHIFT))
		goto out;

	this.name = name;
	this.len = strlen(name);
	this.hash = 0; /* will go */
	root = shmem_fs_type.kern_mnt->mnt_root;	/* dentry of the root of the internal "shm" filesystem */
	dentry = d_alloc(root, &this);	/* allocate a dentry for the new shm file, under that root */
	if (!dentry)
		goto out;

	error = -ENFILE;
	file = get_empty_filp();
	if (!file)
		goto put_dentry;

	error = -ENOSPC;
	inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);	/* allocate the inode for the shm file */
	if (!inode)
		goto close_file;

	d_instantiate(dentry, inode);	/* bind the new dentry to the new inode */
	dentry->d_inode->i_size = size;
	file->f_vfsmnt = mntget(shmem_fs_type.kern_mnt);
	file->f_dentry = dentry;	/* the file points at the dentry just created */
	file->f_op = &shmem_file_operations;	/* generic shmem ops; newseg() overrides this afterwards */
	file->f_mode = FMODE_WRITE | FMODE_READ;
	inode->i_nlink = 0;	/* It is unlinked */
	return(file);

close_file:
	put_filp(file);
put_dentry:
	dput (dentry);
out:
	return ERR_PTR(error);
}

其中shmem_fs_type.kern_mnt->mnt_root是在init_shmem_fs中建立的。

/* The internal "shm" filesystem type; kern_mounted below so kern_mnt is valid. */
static DECLARE_FSTYPE(shmem_fs_type, "shm", shmem_read_super, FS_LITTER);

/*
 * init_shmem_fs(): register and kernel-mount the "shm" filesystem at boot.
 * This establishes shmem_fs_type.kern_mnt->mnt_root used by shmem_file_setup().
 */
static int __init init_shmem_fs(void)
{
	int error;
	struct vfsmount * res;

	if ((error = register_filesystem(&shmem_fs_type))) {
		printk (KERN_ERR "Could not register shmem fs\n");
		return error;
	}

	res = kern_mount(&shmem_fs_type);	/* internal mount, not visible in any namespace */
	if (IS_ERR (res)) {
		printk (KERN_ERR "could not kern_mount shmem fs\n");
		unregister_filesystem(&shmem_fs_type);
		return PTR_ERR(res);
	}

	devfs_mk_dir (NULL, "shm", NULL);
	return 0;
}

shmem_get_inode,分配shm节点的inode结构,代码如下:

/*
 * shmem_get_inode(): allocate and initialize an inode in the "shm" filesystem.
 * Decrements the superblock's free-inode count and installs the shmem
 * operations appropriate to the file type.  Returns NULL on failure.
 */
struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev)
{
	struct inode * inode;

	spin_lock (&sb->u.shmem_sb.stat_lock);
	if (!sb->u.shmem_sb.free_inodes) {	/* per-sb inode quota exhausted */
		spin_unlock (&sb->u.shmem_sb.stat_lock);
		return NULL;
	}
	sb->u.shmem_sb.free_inodes--;
	spin_unlock (&sb->u.shmem_sb.stat_lock);

	inode = new_inode(sb);
	if (inode) {
		inode->i_mode = mode;
		inode->i_uid = current->fsuid;
		inode->i_gid = current->fsgid;
		inode->i_blksize = PAGE_CACHE_SIZE;
		inode->i_blocks = 0;
		inode->i_rdev = to_kdev_t(dev);
		inode->i_mapping->a_ops = &shmem_aops;	/* writepage = shmem_writepage; used when pages are laundered */
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		spin_lock_init (&inode->u.shmem_i.lock);
		switch (mode & S_IFMT) {
		default:
			init_special_inode(inode, mode, dev);
			break;
		case S_IFREG:	/* regular file: the case used for shared memory segments */
			inode->i_op = &shmem_inode_operations;
			inode->i_fop = &shmem_file_operations;
			break;
		case S_IFDIR:
			inode->i_op = &shmem_dir_inode_operations;
			inode->i_fop = &shmem_dir_operations;
			break;
		case S_IFLNK:
			inode->i_op = &page_symlink_inode_operations;
			break;
		}
		spin_lock (&shmem_ilock);
		list_add (&inode->u.shmem_i.list, &shmem_inodes);	/* global list of all shmem inodes */
		spin_unlock (&shmem_ilock);
	}
	return inode;
}

inode->i_op = &shmem_inode_operations,代码如下:

/* Inode operations for regular shmem files: only truncate is provided. */
static struct inode_operations shmem_inode_operations = {
	truncate:	shmem_truncate,
};

inode->i_fop = &shmem_file_operations,代码如下:

/* File operations for shmem files: mapping is the only supported operation. */
static struct file_operations shmem_file_operations = {
	mmap:		shmem_mmap
};

inode->i_mapping->a_ops = &shmem_aops,代码如下:

/* Address-space operations for shmem pages: only writepage (page-out path). */
static struct address_space_operations shmem_aops = {
	writepage: shmem_writepage
};

返回到shmem_file_setup,file->f_op = &shmem_file_operations,如下:

/* (Shown again for reference) shmem file operations: mmap only. */
static struct file_operations shmem_file_operations = {
	mmap:		shmem_mmap
};

返回到newseg,shm_addid,将shmid_kernel结构链入shm_ids,代码如下:

/*
 * shm_addid(): insert a new segment descriptor into the global shm_ids table.
 * Returns the slot index, or -1 when the table is full.
 */
static inline int shm_addid(struct shmid_kernel *shp)
{
	return ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni+1);	/* shm_perm (a kern_ipc_perm) is what actually gets linked in */
}
/*
 * ipc_addid(): find a free slot in the ids table (growing it if needed),
 * stamp the new entry with owner credentials and the next sequence number,
 * and link it in.  Returns the slot index, or -1 when no slot is free.
 * NOTE: on success it returns with ids->ary spin-locked; the caller must
 * unlock (e.g. via shm_unlock()).
 */
int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
{
	int id;

	size = grow_ary(ids,size);	/* make sure the entries array is at least `size` long */
	for (id = 0; id < size; id++) {
		if(ids->entries[id].p == NULL)
			goto found;
	}
	return -1;
found:
	ids->in_use++;
	if (id > ids->max_id)
		ids->max_id = id;

	new->cuid = new->uid = current->euid;
	new->gid = new->cgid = current->egid;

	new->seq = ids->seq++;	/* sequence number distinguishes reused slots */
	if(ids->seq > ids->seq_max)
		ids->seq = 0;

	spin_lock(&ids->ary);
	ids->entries[id].p = new;	/* link the kern_ipc_perm into the global table (e.g. shm_ids) */
	return id;	/* slot index, not the unified id */
}

其中shm_ids如下:

/* Table of all IPC objects of one kind (here: shared memory segments). */
struct ipc_ids {
	int size;		/* capacity of entries[] */
	int in_use;		/* number of occupied slots */
	int max_id;		/* highest slot index ever used */
	unsigned short seq;	/* next sequence number to hand out */
	unsigned short seq_max;
	struct semaphore sem;	/* serializes create/find/destroy */
	spinlock_t ary;		/* protects entries[] */
	struct ipc_id* entries;
};
static struct ipc_ids shm_ids;	/* the global shared-memory id table */
/* One table slot: points at the kern_ipc_perm embedded in the object. */
struct ipc_id {
	struct kern_ipc_perm* p;
};

继续执行,shm_buildid,将这个标识号转换成一个一体化的标示号,代码如下:

/* Combine slot index and sequence number into the unified user-visible id. */
#define shm_buildid(id, seq) 	ipc_buildid(&shm_ids, id, seq)
extern inline int ipc_buildid(struct ipc_ids* ids, int id, int seq)
{
	return SEQ_MULTIPLIER*seq + id;	/* id recoverable as (unified % SEQ_MULTIPLIER) */
}

2、当我们需要根据执行的key来寻找共享内存区时,调用ipc_findkey,代码如下:

/*
 * ipc_findkey(): linear search of the ids table for an object with the given
 * key.  Returns the raw slot index (NOT the unified id), or -1 if not found.
 */
int ipc_findkey(struct ipc_ids* ids, key_t key)
{
	int id;
	struct kern_ipc_perm* p;

	for (id = 0; id <= ids->max_id; id++) {
		p = ids->entries[id].p;
		if(p==NULL)	/* empty slot */
			continue;
		if (key == p->key)
			return id;	/* slot index; caller converts to unified id if needed */
	}
	return -1;
}

如果找到了,也不要求创建,就是正常情况下了,执行shm_lock,通过标识号id获取共享内存区,如下:

/*
 * ipc_lock(): look an object up by id and lock the table.  The modulo makes
 * it accept either a raw slot index or a unified (seq-encoded) id.
 * NOTE: on success it returns with ids->ary spin-locked; on failure the lock
 * is not held (or is released before returning NULL).
 */
#define shm_lock(id)	((struct shmid_kernel*)ipc_lock(&shm_ids,id))
extern inline struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
{
	struct kern_ipc_perm* out;
	int lid = id % SEQ_MULTIPLIER;	/* works for both raw slot indices and unified ids */
	if(lid > ids->size)
		return NULL;

	spin_lock(&ids->ary);
	out = ids->entries[lid].p;
	if(out==NULL)	/* empty slot: drop the lock before reporting failure */
		spin_unlock(&ids->ary);
	return out;
}

二、库函数shmat()--建立共享内存区的映射

通过shmget()以给定键值创建了一个共享内存区,或者取得了已创建共享内存区的一体化的标示号以后,还要通过shmat()将这个内存区映射到本进程的虚拟空间,sys_shmat代码如下:

/*
 * sys_shmat(): attach (map) an existing shared memory segment into the
 * calling process' address space.  shmaddr is the requested user-space start
 * address (0 lets the kernel choose); on success the actual mapping address
 * is stored through raddr.
 */
asmlinkage long sys_shmat (int shmid, char *shmaddr, int shmflg, ulong *raddr)
{
	struct shmid_kernel *shp;
	unsigned long addr;
	struct file * file;
	int    err;
	unsigned long flags;
	unsigned long prot;
	unsigned long o_flags;
	int acc_mode;
	void *user_addr;

	if (shmid < 0)
		return -EINVAL;

	if ((addr = (ulong)shmaddr)) {
		if (addr & (SHMLBA-1)) {	/* caller-supplied address must be SHMLBA-aligned */
			if (shmflg & SHM_RND)
				addr &= ~(SHMLBA-1);	   /* round down */
			else
				return -EINVAL;
		}
		flags = MAP_SHARED | MAP_FIXED;	/* honour the exact requested address */
	} else
		flags = MAP_SHARED;	/* let the kernel pick a free region */

	if (shmflg & SHM_RDONLY) {
		prot = PROT_READ;
		o_flags = O_RDONLY;
		acc_mode = S_IRUGO;
	} else {
		prot = PROT_READ | PROT_WRITE;
		o_flags = O_RDWR;
		acc_mode = S_IRUGO | S_IWUGO;
	}

	/*
	 * We cannot rely on the fs check since SYSV IPC does have an
	 * aditional creator id...
	 */
	shp = shm_lock(shmid);	/* unified id -> segment descriptor */
	if(shp == NULL)
		return -EINVAL;
	if (ipcperms(&shp->shm_perm, acc_mode)) {
		shm_unlock(shmid);
		return -EACCES;
	}
	file = shp->shm_file;	/* backing file created by newseg() */
	shp->shm_nattch++;	/* temporarily pin the segment across the unlocked do_mmap() below */
	shm_unlock(shmid);

	down(&current->mm->mmap_sem);
	user_addr = (void *) do_mmap (file, addr, file->f_dentry->d_inode->i_size, prot, flags, 0);	/* map the whole backing file into this process' address space */
	up(&current->mm->mmap_sem);

	down (&shm_ids.sem);
	if(!(shp = shm_lock(shmid)))
		BUG();
	shp->shm_nattch--;	/* drop the temporary pin (shm_mmap()'s shm_inc() keeps the real count) */
	if(shp->shm_nattch == 0 &&
	   shp->shm_flags & SHM_DEST)
		shm_destroy (shp);	/* marked for destruction and nobody attached: tear it down */
	shm_unlock(shmid);
	up (&shm_ids.sem);

	*raddr = (unsigned long) user_addr;
	err = 0;
	if (IS_ERR(user_addr))
		err = PTR_ERR(user_addr);
	return err;

}

do_mmap,建立起文件与虚拟空间的映射。代码如下:

/*
 * do_mmap(): sanity-check the byte offset (must be page-aligned and not
 * overflow when added to the length) and hand off to do_mmap_pgoff() with the
 * offset converted to a page index.
 */
static inline unsigned long do_mmap(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long offset)
{
	unsigned long ret = -EINVAL;
	if ((offset + PAGE_ALIGN(len)) < offset)	/* offset+len would wrap */
		goto out;
	if (!(offset & ~PAGE_MASK))	/* offset must be page-aligned */
		ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
out:
	return ret;
}
/*
 * do_mmap_pgoff(): create a vm_area_struct describing the new mapping, clear
 * any old mappings in the range, and let the file's mmap method install its
 * vm_ops.  Returns the start address of the mapping or a negative errno.
 * (Excerpt: the "......" marks argument validation elided by the article.)
 */
unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len,
	unsigned long prot, unsigned long flags, unsigned long pgoff)
{
	struct mm_struct * mm = current->mm;
	struct vm_area_struct * vma;
	int correct_wcount = 0;
	int error;

	......
	if (flags & MAP_FIXED) {
		if (addr & ~PAGE_MASK)	/* fixed mappings must be page-aligned */
			return -EINVAL;
	} else {
		addr = get_unmapped_area(addr, len);	/* addr==0 (or just a hint): let the kernel find a free region */
		if (!addr)
			return -ENOMEM;
	}

	/* Determine the object being mapped and call the appropriate
	 * specific mapper. the address has already been validated, but
	 * not unmapped, but the maps are removed from the list.
	 */
	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);	/* allocate the vm_area_struct for this mapping */
	if (!vma)
		return -ENOMEM;

	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_flags = vm_flags(prot,flags) | mm->def_flags;

	if (file) {
		VM_ClearReadHint(vma);
		vma->vm_raend = 0;

		if (file->f_mode & FMODE_READ)
			vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
		if (flags & MAP_SHARED) {
			vma->vm_flags |= VM_SHARED | VM_MAYSHARE;

			/* This looks strange, but when we don‘t have the file open
			 * for writing, we can demote the shared mapping to a simpler
			 * private mapping. That also takes care of a security hole
			 * with ptrace() writing to a shared mapping without write
			 * permissions.
			 *
			 * We leave the VM_MAYSHARE bit on, just to get correct output
			 * from /proc/xxx/maps..
			 */
			if (!(file->f_mode & FMODE_WRITE))
				vma->vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
		}
	} else {
		vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
		if (flags & MAP_SHARED)
			vma->vm_flags |= VM_SHARED | VM_MAYSHARE;
	}
	vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
	vma->vm_ops = NULL;
	vma->vm_pgoff = pgoff;
	vma->vm_file = NULL;
	vma->vm_private_data = NULL;

	/* Clear old maps */
	error = -ENOMEM;
	if (do_munmap(mm, addr, len))
		goto free_vma;

	/* Check against address space limit. */
	if ((mm->total_vm << PAGE_SHIFT) + len
	    > current->rlim[RLIMIT_AS].rlim_cur)
		goto free_vma;

	/* Private writable mapping? Check memory availability.. */
	if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
	    !(flags & MAP_NORESERVE)				 &&
	    !vm_enough_memory(len >> PAGE_SHIFT))
		goto free_vma;

	if (file) {
		if (vma->vm_flags & VM_DENYWRITE) {
			error = deny_write_access(file);
			if (error)
				goto free_vma;
			correct_wcount = 1;
		}
		vma->vm_file = file;	/* key step: the vma now references the backing file */
		get_file(file);
		error = file->f_op->mmap(file, vma);	/* for SysV shm this is shm_mmap(), which installs shm_vm_ops */
		if (error)
			goto unmap_and_free_vma;
	} else if (flags & MAP_SHARED) {
		error = shmem_zero_setup(vma);
		if (error)
			goto free_vma;
	}

	/* Can addr have changed??
	 *
	 * Answer: Yes, several device drivers can do it in their
	 *         f_op->mmap method. -DaveM
	 */
	flags = vma->vm_flags;
	addr = vma->vm_start;

	insert_vm_struct(mm, vma);
	if (correct_wcount)
		atomic_inc(&file->f_dentry->d_inode->i_writecount);

	mm->total_vm += len >> PAGE_SHIFT;
	if (flags & VM_LOCKED) {
		mm->locked_vm += len >> PAGE_SHIFT;
		make_pages_present(addr, addr + len);
	}
	return addr;	/* start address of the new mapping */

unmap_and_free_vma:
	if (correct_wcount)
		atomic_inc(&file->f_dentry->d_inode->i_writecount);
	vma->vm_file = NULL;
	fput(file);
	/* Undo any partial mapping done by a device driver. */
	flush_cache_range(mm, vma->vm_start, vma->vm_end);
	zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
	flush_tlb_range(mm, vma->vm_start, vma->vm_end);
free_vma:
	kmem_cache_free(vm_area_cachep, vma);
	return error;
}

file->f_op->mmap(file, vma),由于newseg中把f_op重新设置成了shm_file_operations,实际调用的是shm_mmap,代码如下:

/*
 * shm_mmap(): the f_op->mmap method for SysV shared memory files (installed
 * by newseg() via shm_file_operations).  Installs shm_vm_ops on the vma and
 * bumps the segment's attach count; actual page mapping is deferred to the
 * nopage handler.
 */
static int shm_mmap(struct file * file, struct vm_area_struct * vma)
{
	UPDATE_ATIME(file->f_dentry->d_inode);
	vma->vm_ops = &shm_vm_ops;	/* nopage = shmem_nopage: pages are faulted in on first access */
	shm_inc(file->f_dentry->d_inode->i_ino);	/* i_ino holds the unified id; increments shm_nattch */
	return 0;
}
/* vm operations for SysV shm mappings. */
static struct vm_operations_struct shm_vm_ops = {
	open:	shm_open,	/* callback for a new vm-area open */
	close:	shm_close,	/* callback for when the vm-area is released */
	nopage:	shmem_nopage,	/* demand-fault handler: supplies the page on first access */
};

在sys_shmat()中实际上并没有建立页面的映射,而是把它推迟到了实际需要的时候。

三、所以,在将一块共享内存区纳入一个进程的存储空间以后,当其中的任何一个页面首次受到访问时就会因为“缺页”而产生一次页面异常。从do_page_fault()开始,顺着handle_mm_fault()、handle_pte_fault(),一直到do_no_page。在do_no_page()中,如果产生异常的地址所属区间的指针vm_ops指向一个vm_operations_struct数据结构,并且该结构中的函数指针nopage非零,就会调用这个函数来建立所在页面的映射表项。

/*
 * do_no_page(): handle a fault on a not-present page with no swap entry.
 * If the vma has a nopage method (as shm mappings do), call it to obtain the
 * page; otherwise treat it as an anonymous page.  Installs the pte.
 */
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
	unsigned long address, int write_access, pte_t *page_table)
{
	struct page * new_page;
	pte_t entry;

	if (!vma->vm_ops || !vma->vm_ops->nopage)
		return do_anonymous_page(mm, vma, page_table, write_access, address);

	/*
	 * The third argument is "no_share", which tells the low-level code
	 * to copy, not share the page even if sharing is possible.  It‘s
	 * essentially an early COW detection.
	 */
	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);	/* for shared memory this is shmem_nopage() */
	if (new_page == NULL)	/* no page was available -- SIGBUS */
		return 0;
	if (new_page == NOPAGE_OOM)
		return -1;
	++mm->rss;
	/*
	 * This silly early PAGE_DIRTY setting removes a race
	 * due to the bad i386 page protection. But it‘s valid
	 * for other architectures too.
	 *
	 * Note that if write_access is true, we either now have
	 * an exclusive copy of the page, or this is a shared mapping,
	 * so we can make it writable and dirty to avoid having to
	 * handle that later.
	 */
	flush_page_to_ram(new_page);
	flush_icache_page(vma, new_page);
	entry = mk_pte(new_page, vma->vm_page_prot);
	if (write_access) {
		entry = pte_mkwrite(pte_mkdirty(entry));
	} else if (page_count(new_page) > 1 &&
		   !(vma->vm_flags & VM_SHARED))
		entry = pte_wrprotect(entry);	/* shared page, private mapping: keep it read-only for COW */
	set_pte(page_table, entry);	/* point the pte at the new page: the mapping now exists */
	/* no need to invalidate: a not-present page shouldn‘t be cached */
	update_mmu_cache(vma, address, entry);
	return 2;	/* Major fault */
}

vma->vm_ops->nopage,对于共享内存来说,指向了shmem_nopage,代码如下:

/*
 * shmem_nopage(): demand-fault handler for shmem/SysV-shm mappings.  Finds
 * the page for the faulting offset in the page cache, reads it back from
 * swap if it was paged out, or allocates and zeroes a brand-new page.
 */
struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int no_share)
{
	unsigned long size;
	struct page * page;
	unsigned int idx;
	swp_entry_t *entry;
	struct inode * inode = vma->vm_file->f_dentry->d_inode;
	struct address_space * mapping = inode->i_mapping;
	struct shmem_inode_info *info;

	idx = (address - vma->vm_start) >> PAGE_SHIFT;	/* page index within the mapping... */
	idx += vma->vm_pgoff;	/* ...adjusted by the mapping's file offset */

	down (&inode->i_sem);
	size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;	/* file size in pages */
	page = NOPAGE_SIGBUS;
	if ((idx >= size) && (vma->vm_mm == current->mm))	/* faulting beyond the end of the segment */
		goto out;

	/* retry, we may have slept */
	page = __find_lock_page(mapping, idx, page_hash (mapping, idx));	/* already in this inode's page cache? (how a second attacher finds pages) */
	if (page)
		goto cached_page;

	info = &inode->u.shmem_i;
	entry = shmem_swp_entry (info, idx);	/* the file's swap-entry slot for this page */
	if (!entry)
		goto oom;
	if (entry->val) {	/* nonzero: the page was written out to swap earlier */
		unsigned long flags;

		/* Look it up and read it in.. */
		page = lookup_swap_cache(*entry);
		if (!page) {
			lock_kernel();
			swapin_readahead(*entry);	/* read-ahead from the swap device */
			page = read_swap_cache(*entry);	/* synchronous read of the wanted page */
			unlock_kernel();
			if (!page)
				goto oom;
		}

		/* We have to this with page locked to prevent races */
		spin_lock (&info->lock);
		swap_free(*entry);
		lock_page(page);
		delete_from_swap_cache_nolock(page);	/* move the page from the swap cache... */
		*entry = (swp_entry_t) {0};	/* ...and clear its swap-entry slot */
		flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced) | (1 << PG_arch_1));
		page->flags = flags | (1 << PG_dirty);
		add_to_page_cache_locked(page, mapping, idx);	/* ...back into the inode's page cache */
		info->swapped--;
		spin_unlock (&info->lock);
	} else {	/* entry->val == 0: first access, allocate a fresh page */
		spin_lock (&inode->i_sb->u.shmem_sb.stat_lock);
		if (inode->i_sb->u.shmem_sb.free_blocks == 0)
			goto no_space;
		inode->i_sb->u.shmem_sb.free_blocks--;
		spin_unlock (&inode->i_sb->u.shmem_sb.stat_lock);
		/* Ok, get a new page */
		page = page_cache_alloc();	/* allocate one physical page */
		if (!page)
			goto oom;
		clear_user_highpage(page, address);	/* zero-fill it */
		inode->i_blocks++;
		add_to_page_cache (page, mapping, idx);	/* key step: insert into the inode's mapping (NOT &swapper_space) */
	}
	/* We have the page */
	SetPageUptodate (page);

cached_page:
	UnlockPage (page);
	up(&inode->i_sem);

	if (no_share) {	/* private COW fault: hand back a private copy */
		struct page *new_page = page_cache_alloc();

		if (new_page) {
			copy_user_highpage(new_page, page, address);
			flush_page_to_ram(new_page);
		} else
			new_page = NOPAGE_OOM;
		page_cache_release(page);
		return new_page;
	}

	flush_page_to_ram (page);
	return(page);
no_space:
	spin_unlock (&inode->i_sb->u.shmem_sb.stat_lock);
oom:
	page = NOPAGE_OOM;
out:
	up(&inode->i_sem);
	return page;
}

add_to_page_cache,将page加入到相关队列中去。相关代码请参考Linux内核源代码情景分析-内存管理之用户页面的换入,只不过此时的mapping是inode->mapping,而不是交换分区&swapper_space。

page->list链入mapping->clean_pages;
    page->next_hash和page->pprev_hash链入全局的Hash表;
    page->lru链入了全局的active_list;

page->mapping来自于inode->mapping。也是在这里赋值的。

返回到do_no_page,把页表项指向新申请的page。这样就建立了映射。

假设两个进程一个是申请sys_shmget的共享内存区,一个是查找sys_shmget刚刚申请的共享内存区;都通过sys_shmat,将这个内存区映射到本进程的虚拟空间。

第一个进程如上面的步骤,建立了映射后,往共享内存区写数据。第二个进程会调用page = __find_lock_page(mapping, idx, page_hash (mapping, idx));找到刚刚分配的内存,并建立映射。这样第二个进程就能读取刚刚写入的数据。

四、当内存紧张时,共享内存区也会被换入到交换分区,参考Linux内核源代码情景分析-内存管理之用户页面的定期换出

kswapd内核线程:

1、refill_inactive_scan和swap_out,把活跃的页面变成不活跃脏的页面。挑选的原则是最近没有被访问,且age小于0。

2、page_launder,把不活跃脏的页面变成不活跃干净的页面。

我们这里主要分析page_launder,算法如下:

                  if (PageDirty(page)) {
			int (*writepage)(struct page *) = page->mapping->a_ops->writepage;//还记得我们设置过shmem_writepage
			int result;

			if (!writepage)
				goto page_active;

			/* First time through? Move it to the back of the list */
			if (!launder_loop) {
				list_del(page_lru);
				list_add(page_lru, &inactive_dirty_list);
				UnlockPage(page);
				continue;
			}

			/* OK, do a physical asynchronous write to swap.  */
			ClearPageDirty(page);
			page_cache_get(page);
			spin_unlock(&pagemap_lru_lock);

			result = writepage(page);//shmem_writepage
			page_cache_release(page);

			/* And re-start the thing.. */
			spin_lock(&pagemap_lru_lock);
			if (result != 1)
				continue;
			/* writepage refused to do anything */
			set_page_dirty(page);
			goto page_active;
		}

inode->i_mapping->a_ops = &shmem_aops,代码如下:

/* (Shown again for reference) shmem address-space ops: writepage only. */
static struct address_space_operations shmem_aops = {
	writepage: shmem_writepage
};

writepage(page),也就是shmem_writepage(page),代码如下:

/*
 * shmem_writepage(): page-out method for shmem pages.  Allocates a slot on
 * the swap device, records it in the file's swap-entry table, and moves the
 * page from the inode's page cache into the swap cache so that the next
 * launder pass writes it out via swap_writepage().  Returns 0 on success,
 * 1 to tell the caller the page could not be written now, -EAGAIN on race.
 */
static int shmem_writepage(struct page * page)
{
	int error;
	struct shmem_inode_info *info;
	swp_entry_t *entry, swap;

	info = &page->mapping->host->u.shmem_i;
	if (info->locked)	/* segment locked in memory: refuse to page out */
		return 1;
	swap = __get_swap_page(2);	/* allocate a page slot on the swap device */
	if (!swap.val)
		return 1;

	spin_lock(&info->lock);
	entry = shmem_swp_entry (info, page->index);	/* the file's swap-entry slot for this page index; currently empty */
	if (!entry)	/* this had been allocted on page allocation */
		BUG();
	error = -EAGAIN;
	if (entry->val) {	/* someone else swapped it meanwhile: give our slot back */
                __swap_free(swap, 2);
		goto out;
        }

        *entry = swap;	/* remember where on the swap device this page now lives */
	error = 0;
	/* Remove the from the page cache */
	lru_cache_del(page);
	remove_inode_page(page);	/* detach from the inode's mapping and the page hash */

	/* Add it to the swap cache */
	add_to_swap_cache(page, swap);	/* re-attach under &swapper_space keyed by the swap entry */
	page_cache_release(page);
	set_page_dirty(page);	/* still dirty: the actual device write happens on the next launder pass */
	info->swapped++;
out:
	spin_unlock(&info->lock);
	UnlockPage(page);
	return error;
}

shmem_swp_entry,根据物理页面号,通过这个函数在文件的swp_entry_t表中找到相应的表项,此表项表示一个页面在交换设备上的页面号。

/*
 * shmem_swp_entry(): return the address of the swap-entry slot for a given
 * page index of a shmem file.  The first SHMEM_NR_DIRECT pages use the
 * inline i_direct[] array; beyond that a doubly-indirect table is allocated
 * on demand.  Returns NULL on out-of-range index or allocation failure.
 */
static swp_entry_t * shmem_swp_entry (struct shmem_inode_info *info, unsigned long index)
{
	if (index < SHMEM_NR_DIRECT)
		return info->i_direct+index;	/* direct slot */

	index -= SHMEM_NR_DIRECT;
	if (index >= ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)	/* beyond what two levels can address */
		return NULL;

	if (!info->i_indirect) {
		info->i_indirect = (swp_entry_t **) get_zeroed_page(GFP_USER);	/* first-level table, allocated lazily */
		if (!info->i_indirect)
			return NULL;
	}
	if(!(info->i_indirect[index/ENTRIES_PER_PAGE])) {
		info->i_indirect[index/ENTRIES_PER_PAGE] = (swp_entry_t *) get_zeroed_page(GFP_USER);	/* second-level table, also lazy */
		if (!info->i_indirect[index/ENTRIES_PER_PAGE])
			return NULL;
	}

	return info->i_indirect[index/ENTRIES_PER_PAGE]+index%ENTRIES_PER_PAGE;
}
/* Per-inode shmem bookkeeping (lives in inode->u.shmem_i). */
struct shmem_inode_info {
	spinlock_t	lock;
	swp_entry_t	i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */
	swp_entry_t   **i_indirect; /* doubly indirect blocks */
	unsigned long	swapped;	/* number of this file's pages currently on swap */
	int		locked;     /* into memory */
	struct list_head	list;	/* link in the global shmem_inodes list */
};

返回到shmem_writepage,执行如下:

	/* Remove the from the page cache */
	lru_cache_del(page);
	remove_inode_page(page);

page->list为空;

page->next_hash和page->pprev_hash为空;

page->lru为空;

继续执行,代码如下:

	/* Add it to the swap cache */
	add_to_swap_cache(page, swap);
	page_cache_release(page);
/*
 * add_to_swap_cache(): insert a page into the swap cache, i.e. into the
 * &swapper_space mapping keyed by its swap entry.  The page must be locked
 * and must not already belong to a mapping or the swap cache.
 */
void add_to_swap_cache(struct page *page, swp_entry_t entry)
{
	unsigned long flags;

#ifdef SWAP_CACHE_INFO
	swap_cache_add_total++;
#endif
	if (!PageLocked(page))
		BUG();
	if (PageTestandSetSwapCache(page))	/* must not already be in the swap cache */
		BUG();
	if (page->mapping)	/* must have been removed from its inode mapping first */
		BUG();
	flags = page->flags & ~((1 << PG_error) | (1 << PG_arch_1));
	page->flags = flags | (1 << PG_uptodate);
	add_to_page_cache_locked(page, &swapper_space, entry.val);	/* mapping is now &swapper_space, so writepage becomes swap_writepage */
}

参考Linux内核源代码情景分析-内存管理之用户页面的换入,执行后的结果如下:

page->list链入mapping->clean_pages;
    page->next_hash和page->pprev_hash链入全局的Hash表;
    page->lru链入了全局的active_list;

只是此时mapping是交换分区&swapper_space,而不是inode->mapping,所以page->mapping->a_ops->writepage就指向了swap_writepage了。

当page_launder再次扫描到这个页面时,它的page->mapping->a_ops->writepage已经指向了swap_writepage了。流程就和Linux内核源代码情景分析-内存管理之用户页面的定期换出完全一样了。

/* writepage method of &swapper_space: write the page to the swap device. */
static int swap_writepage(struct page *page)
{
	rw_swap_page(WRITE, page, 0);
	return 0;
}

把页面写入了交换分区。最后:

page->list链入mapping->dirty_pages或者clean_pages(保持原样);
    page->next_hash和page->pprev_hash链入全局的Hash表;
    page->lru链入了page->zone->inactive_clean_list;

五、恢复映射

1、如果refill_inactive_scan和swap_out,把活跃的页面变成不活跃脏的页面。挑选的原则是最近没有被访问,且age小于0。

或者,page_launder,把不活跃脏的页面变成不活跃干净的页面。

不活跃脏的页面,有如下特点:

使用计数为1;

page->list链入mapping->dirty_pages/clean_pages;

page->next_hash和page->pprev_hash链入全局的Hash表;

page->lru链入了全局的inactive_dirty_list;

page->flags对应为设置为PG_dirty。

不活跃干净的页面,有如下特点:

使用计数为1;

page->list链入mapping->dirty_pages/clean_pages(保持原样);

page->next_hash和page->pprev_hash链入全局的Hash表;

page->lru链入了page->zone->inactive_clean_list;

如果发生缺页中断,do_no_page中调用shmem_nopage,再次访问到这个页面,那么会调用lookup_swap_cache,会在全局的Hash表找到对应的页面,并且引用计数加1,变成2,但还没有移到活跃队列中。什么时候转移到活跃队列中呢?

答案在,page_launder和reclaim_page中。

page_launder:

if (PageTestandClearReferenced(page) || page->age > 0 ||   //此时引用计数大于1
                (!page->buffers && page_count(page) > 1) ||
                page_ramdisk(page)) {
            del_page_from_inactive_dirty_list(page);
            add_page_to_active_list(page);
            continue;
} 

reclaim_page:

if (PageTestandClearReferenced(page) || page->age > 0 ||
                (!page->buffers && page_count(page) > 1)) {//此时引用计数大于1
            del_page_from_inactive_clean_list(page);
            add_page_to_active_list(page);
            continue;
}  

2、如果reclaim_page,把不活跃干净的页面,所有的链表关系都清除,但使用计数仍然为1。

如果发生缺页中断,do_no_page调用shmem_nopage,再次访问到这个页面,调用lookup_swap_cache返回NULL,所以继续执行,代码位于shmem_nopage:

if (entry->val) {//目前不为0了,应为刚刚换出时设置了
		unsigned long flags;

		/* Look it up and read it in.. */
		page = lookup_swap_cache(*entry);
		if (!page) {
			lock_kernel();
			swapin_readahead(*entry);//从交换区预读
			page = read_swap_cache(*entry);//从交换区真读
			unlock_kernel();
			if (!page)
				goto oom;
		}

		/* We have to this with page locked to prevent races */
		spin_lock (&info->lock);
		swap_free(*entry);
		lock_page(page);
		delete_from_swap_cache_nolock(page);//从交换区队列中移除
		*entry = (swp_entry_t) {0};//swap_entry_t项清零
		flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced) | (1 << PG_arch_1));
		page->flags = flags | (1 << PG_dirty);
		add_to_page_cache_locked(page, mapping, idx);
		info->swapped--;
		spin_unlock (&info->lock);
	} else {

add_to_page_cache_locked,最后的结构就是:

page->list链入mapping->clean_pages;
    page->next_hash和page->pprev_hash链入全局的Hash表;
    page->lru链入了全局的active_list。

时间: 2024-08-05 21:19:51

Linux内核源代码情景分析-共享内存的相关文章

Linux内核源代码情景分析-交换分区

在Linux内核源代码情景分析-共享内存中,共享内存,当内存紧张时是换出到交换分区. 在Linux内核源代码情景分析-mmap后,文件与虚拟区间建立映射中,文件映射的页面,当内存紧张时是换出到硬盘上的文件中. 这里的交换分区,就是是swap分区,记得给电脑安装ubuntu时,就有一项是swap分区. 交换分区和文件的区别是: 文件是在一个具体的文件系统之下的,交换分区没有这个必要,它可能是一个裸分区,不需要文件系统. 需要说明一点,并不是所有从物理内存中交换出来的数据都会被放到Swap中(如果这

Linux内核源代码情景分析-内存管理

用户空间的页面有下面几种: 1.普通的用户空间页面,包括进程的代码段.数据段.堆栈段.以及动态分配的"存储堆". 2.通过系统调用mmap()映射到用户空间的已打开文件的内容. 3.进程间的共享内存区. 这些页面的的周转有两方面的意思. 1.页面的分配,使用,回收.如进程压栈时新申请的页面,这类页面不进行盘区交换,不使用时释放得以回收. 这部分通过一个场景来解释: Linux内核源代码情景分析-内存管理之用户堆栈的扩展. 2.盘区交换.如要执行硬盘上的对应代码段.把硬盘上的代码段换入内

Linux内核源代码情景分析-内存管理之slab-回收

在上一篇文章Linux内核源代码情景分析-内存管理之slab-分配与释放,最后形成了如下图的结构: 图 1 我们看到空闲slab块占用的若干页面,不会自己释放:我们是通过kmem_cache_reap和kmem_cache_shrink来回收的.他们的区别是: 1.我们先看kmem_cache_shrink,代码如下: int kmem_cache_shrink(kmem_cache_t *cachep) { if (!cachep || in_interrupt() || !is_chaine

Linux内核源代码情景分析-内存管理之用户页面的定期换出

我们已经看到在分配页面时,如果页面数不够,那么会调用page_launder,reclaim_page,__free_page将页面换出,并重新投入分配. 为了避免总是在CPU忙碌的时候,也就是在缺页异常发生的时候,临时再来搜寻可供换出的内存页面并加以换出,Linux内核定期地检查并且预先将若干页面换出,腾出空间,以减轻系统在缺页异常发生时的负担. 为此,在Linux内核中设置了一个专司定期将页面换出的"守护神"kswapd和kreclaimd. static int __init k

Linux内核源代码情景分析-fork()

父进程fork出子进程: fork经过系统调用,来到了sys_fork,详细过程请参考Linux内核源代码情景分析-系统调用. asmlinkage int sys_fork(struct pt_regs regs) { return do_fork(SIGCHLD, regs.esp, &regs, 0); } int do_fork(unsigned long clone_flags, unsigned long stack_start, //stack_start为用户空间堆栈指针 str

Linux内核源代码情景分析-系统调用mknod

普通文件可以用open或者create创建,FIFO文件可以用pipe创建,mknod主要用于设备文件的创建. 在内核中,mknod是由sys_mknod实现的,代码如下: asmlinkage long sys_mknod(const char * filename, int mode, dev_t dev) //比如filename为/tmp/server_socket,dev是设备号 { int error = 0; char * tmp; struct dentry * dentry;

Linux内核源代码情景分析-文件系统的安装

执行sudo mount -t ext2 /dev/sdb1 /mnt/sdb,将文件系统挂在到/mnt/sdb上.系统调用mount,映射到内核层执行的是sys_mount.假设/dev/sdb1和/mnt/sdb都位于ext2文件系统中. asmlinkage long sys_mount(char * dev_name, char * dir_name, char * type, unsigned long flags, void * data)//dev_name指向了"/dev/sdb

Linux内核源代码情景分析-mmap后,文件与虚拟区间建立映射

一.文件映射的页面换入 在mmap后,mmap参考Linux内核源代码情景分析-系统调用mmap(),当这个区间的一个页面首次受到访问时,会由于见面无映射而发生缺页异常,相应的异常处理程序do_no_page(). static inline int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long address, int write_access, pte_t * pte) {

Linux内核源代码情景分析-系统调用brk()

首先看下进程地址空间示意图: 我们简单的说,从低地址到高地址,代码区和数据区,空洞,堆栈区.    在Linux内核源代码情景分析-内存管理之用户堆栈的扩展,我们申请了从堆栈区往下,数据区上面的页面.    在Linux内核源代码情景分析-内存管理之用户页面的换入,我们申请了用于换入/换出的页面.    在本文中,我们申请的是从数据区往上,堆栈区下面的页面.    我们通过一个实例来分析,brk(),见下图: 1.由于新边界比旧边界地址高,我们申请旧边界和新边界之间的页面.就是把对应的虚拟地址映