A write to disk on Linux passes through several layers: VFS -> page cache -> a concrete filesystem (ext2/3/4, XFS, ReiserFS, etc.) -> block I/O -> the device driver -> SCSI commands (or another command set). Overall, the path a file write takes to the disk is fairly involved; this article walks through it layer by layer.
1. VFS (Virtual File System)
Linux uses the VFS to hide the differences between filesystems. When another device or filesystem is needed, it is attached with mount (here a filesystem can be thought of as a concrete device). Because everything goes through the VFS, all mounted filesystems are accessed through one unified directory tree: the entire storage space is managed under a single file tree, and the differences between the underlying filesystems are invisible to applications. If you want to integrate a filesystem of your own into the Linux kernel and access it through the VFS, you build it as a loadable module and install the module file under /lib/modules/<kernel-version>/kernel/fs. The VFS does more than this, of course: devices are also accessed through it, since under Linux everything is a file, which greatly simplifies access to the system.
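As a small userspace illustration of this uniformity, the same statfs(2) call reports filesystem information for any path, no matter which concrete filesystem backs it. This is only a minimal sketch; the default path "/" is an arbitrary choice:

#include <stdio.h>
#include <sys/vfs.h>    /* statfs, struct statfs */

int main(int argc, char *argv[])
{
    struct statfs sb;
    const char *path = argc > 1 ? argv[1] : "/";

    /* The same VFS entry point works no matter which concrete
     * filesystem (ext4, XFS, tmpfs, ...) backs the path. */
    if (statfs(path, &sb) != 0) {
        perror("statfs");
        return 1;
    }
    printf("%s: f_type=0x%lx, block size=%ld\n",
           path, (unsigned long)sb.f_type, (long)sb.f_bsize);
    return 0;
}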
1.1 Normally, all file operations enter the VFS through system calls; only special cases operate on the raw device directly. The system call for writing to a file is:
#include <unistd.h>
ssize_t write(int fd, const void *buf, size_t count);
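A minimal usage sketch (the file name /tmp/demo.txt is illustrative): write(2) may write fewer bytes than requested, so careful callers loop until everything is written:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    const char buf[] = "hello, page cache\n";
    size_t remaining = sizeof(buf) - 1;
    const char *p = buf;

    int fd = open("/tmp/demo.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* write() may perform a short write; retry until done or error. */
    while (remaining > 0) {
        ssize_t n = write(fd, p, remaining);
        if (n < 0) {
            perror("write");
            close(fd);
            return 1;
        }
        p += n;
        remaining -= (size_t)n;
    }
    close(fd);
    return 0;
}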
1.2 When a write enters the VFS through the system call, the VFS layer takes over. The most important functions on this path are vfs_write and generic_file_aio_write:
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
    ssize_t ret;

    if (!(file->f_mode & FMODE_WRITE))
        return -EBADF;
    if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
        return -EINVAL;
    if (unlikely(!access_ok(VERIFY_READ, buf, count)))
        return -EFAULT;

    ret = rw_verify_area(WRITE, file, pos, count);
    if (ret >= 0) {
        count = ret;
        if (file->f_op->write)
            ret = file->f_op->write(file, buf, count, pos);
        else
            ret = do_sync_write(file, buf, count, pos);
        if (ret > 0) {
            fsnotify_modify(file->f_path.dentry);
            add_wchar(current, ret);
        }
        inc_syscw(current);
    }

    return ret;
}
/**
 * generic_file_aio_write - write data to a file
 * @iocb:    IO state structure
 * @iov:     vector with data to write
 * @nr_segs: number of segments in the vector
 * @pos:     position in file where to write
 *
 * This is a wrapper around __generic_file_aio_write() to be used by most
 * filesystems. It takes care of syncing the file in case of O_SYNC file
 * and acquires i_mutex as needed.
 */
ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
        unsigned long nr_segs, loff_t pos)
{
    struct file *file = iocb->ki_filp;
    struct inode *inode = file->f_mapping->host;
    ssize_t ret;

    BUG_ON(iocb->ki_pos != pos);

    mutex_lock(&inode->i_mutex);
    ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
    mutex_unlock(&inode->i_mutex);

    if (ret > 0 || ret == -EIOCBQUEUED) {
        ssize_t err;

        err = generic_write_sync(file, pos, ret);
        if (err < 0 && ret > 0)
            ret = err;
    }
    return ret;
}
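Note the generic_write_sync() call at the end: for files opened with O_SYNC, the write path itself flushes the data before returning. A minimal userspace sketch of what that buys you (the file name is illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    /* With O_SYNC, each successful write() implies the data (and the
     * metadata needed to retrieve it) has been pushed to the device --
     * the generic_write_sync() step in the code above. */
    int fd = open("/tmp/sync-demo.txt",
                  O_WRONLY | O_CREAT | O_TRUNC | O_SYNC, 0644);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (write(fd, "durable\n", 8) != 8)
        perror("write");
    close(fd);
    return 0;
}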
2. The VFS layer supports both page-cache (buffered) and non-page-cache (direct) I/O; the discussion below focuses on the page-cache path.
In the VFS, every open file has an associated address_space structure in the kernel, reached through its inode. The structure represents the file's cached pages, and a file has exactly one address_space no matter how many times it is opened. It is defined as follows:
struct address_space {
    struct inode *host;                  /* owner: inode, block_device */
    struct radix_tree_root page_tree;    /* radix tree of all pages */
    spinlock_t tree_lock;                /* and lock protecting it */
    unsigned int i_mmap_writable;        /* count VM_SHARED mappings */
    struct prio_tree_root i_mmap;        /* tree of private and shared mappings */
    struct list_head i_mmap_nonlinear;   /* list VM_NONLINEAR mappings */
    spinlock_t i_mmap_lock;              /* protect tree, count, list */
    unsigned int truncate_count;         /* cover race condition with truncate */
    unsigned long nrpages;               /* number of total pages */
    pgoff_t writeback_index;             /* writeback starts here */
    const struct address_space_operations *a_ops; /* methods */
    unsigned long flags;                 /* error bits/gfp mask */
    struct backing_dev_info *backing_dev_info; /* device readahead, etc */
    spinlock_t private_lock;             /* for use by the address_space */
    struct list_head private_list;       /* ditto */
    struct address_space *assoc_mapping; /* ditto */
    struct mutex unmap_mutex;            /* to protect unmapping */
} __attribute__((aligned(sizeof(long))));
The file's cached contents are indexed by a radix tree, rooted at the page_tree member; see [1] and [2] for background on radix trees. A toy illustration of the idea follows.
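To give a feel for how page_tree maps a page index (file offset >> PAGE_SHIFT) to a cached page, here is a deliberately simplified userspace radix tree: fixed fanout and depth, no RCU, no tags. It is a sketch of the idea only, not the kernel's real implementation:

#include <stdio.h>
#include <stdlib.h>

#define RADIX_BITS   4                 /* 16-way fanout per level */
#define RADIX_SLOTS  (1 << RADIX_BITS)
#define RADIX_LEVELS 4                 /* handles 16-bit page indexes */

struct radix_node {
    void *slots[RADIX_SLOTS];          /* child nodes, or cached "pages"
                                          at the leaf level */
};

/* Walk from the most significant nibble down, allocating on the way. */
static int radix_insert(struct radix_node *root, unsigned key, void *page)
{
    struct radix_node *node = root;
    int level;

    for (level = RADIX_LEVELS - 1; level > 0; level--) {
        unsigned idx = (key >> (level * RADIX_BITS)) & (RADIX_SLOTS - 1);
        if (!node->slots[idx]) {
            node->slots[idx] = calloc(1, sizeof(struct radix_node));
            if (!node->slots[idx])
                return -1;
        }
        node = node->slots[idx];
    }
    node->slots[key & (RADIX_SLOTS - 1)] = page;
    return 0;
}

static void *radix_lookup(struct radix_node *root, unsigned key)
{
    struct radix_node *node = root;
    int level;

    for (level = RADIX_LEVELS - 1; level > 0; level--) {
        unsigned idx = (key >> (level * RADIX_BITS)) & (RADIX_SLOTS - 1);
        node = node->slots[idx];
        if (!node)
            return NULL;
    }
    return node->slots[key & (RADIX_SLOTS - 1)];
}

int main(void)
{
    static struct radix_node root;     /* zero-initialized */
    static char page42[4096];          /* stand-in for a cached page */

    radix_insert(&root, 42, page42);
    printf("lookup(42): %s\n", radix_lookup(&root, 42) == page42
                               ? "found the page" : "missed");
    printf("lookup(43): %s\n", radix_lookup(&root, 43) ? "found" : "missed");
    return 0;
}

With that picture in mind, here are the important functions on the page-cache write path.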
ssize_t
generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
        unsigned long nr_segs, loff_t pos, loff_t *ppos,
        size_t count, ssize_t written)
{
    struct file *file = iocb->ki_filp;
    struct address_space *mapping = file->f_mapping;
    ssize_t status;
    struct iov_iter i;

    iov_iter_init(&i, iov, nr_segs, count, written);
    status = generic_perform_write(file, &i, pos);

    if (likely(status >= 0)) {
        written += status;
        *ppos = pos + status;
    }

    /*
     * If we get here for O_DIRECT writes then we must have fallen through
     * to buffered writes (block instantiation inside i_size). So we sync
     * the file data here, to try to honour O_DIRECT expectations.
     */
    if (unlikely(file->f_flags & O_DIRECT) && written)
        status = filemap_write_and_wait_range(mapping,
                        pos, pos + written - 1);

    return written ? written : status;
}
generic_perform_write then drives the write_begin and write_end methods that the concrete filesystem registers on the address_space.
Note: a write through the VFS may begin at any byte offset in the file, including offsets that do not fall on a block boundary. Such unaligned writes need special handling, and all of it is completed inside the write_begin call; the addressing arithmetic involved is sketched below.
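The first step of that handling is pure arithmetic: split the file position into a page index and a range [from, to) within the page, exactly as ext4_write_begin in the next section does. A standalone sketch of the same computation (assuming 4 KiB pages; the kernel uses PAGE_CACHE_SHIFT and PAGE_CACHE_SIZE):

#include <stdio.h>

#define PAGE_SHIFT 12                      /* 4 KiB pages assumed */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
    unsigned long pos = 10000;             /* byte offset of the write */
    unsigned long len = 3000;              /* bytes to write */

    unsigned long index = pos >> PAGE_SHIFT;        /* which page */
    unsigned long from  = pos & (PAGE_SIZE - 1);    /* start within page */
    unsigned long to    = from + len;               /* may cross the page */

    printf("write at %lu..%lu -> page %lu, bytes [%lu, %lu)\n",
           pos, pos + len, index, from, to);
    /* pos=10000 -> page 2, from=1808; since to=4808 > PAGE_SIZE, the
     * write continues on the next page in a later iteration. */
    return 0;
}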
3. File handling in ext2/3/4
When the page-cache code reaches write_begin, ext4 handles it in ext4_write_begin:
static int ext4_write_begin(struct file *file, struct address_space *mapping,
        loff_t pos, unsigned len, unsigned flags,
        struct page **pagep, void **fsdata)
{
    struct inode *inode = mapping->host;
    int ret, needed_blocks;
    handle_t *handle;
    int retries = 0;
    struct page *page;
    pgoff_t index;
    unsigned from, to;
    .........

    index = pos >> PAGE_CACHE_SHIFT;
    from = pos & (PAGE_CACHE_SIZE - 1);
    to = from + len;

retry:
    handle = ext4_journal_start(inode, needed_blocks);
    if (IS_ERR(handle)) {
        ret = PTR_ERR(handle);
        goto out;
    }

    /* We cannot recurse into the filesystem as the transaction is already
     * started */
    flags |= AOP_FLAG_NOFS;

    page = grab_cache_page_write_begin(mapping, index, flags);
    if (!page) {
        ext4_journal_stop(handle);
        ret = -ENOMEM;
        goto out;
    }
    *pagep = page;

    ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                ext4_get_block);

    if (!ret && ext4_should_journal_data(inode)) {
        ret = walk_page_buffers(handle, page_buffers(page),
                from, to, NULL, do_journal_get_write_access);
    }

    if (ret) {
        unlock_page(page);
        page_cache_release(page);
        /*
         * block_write_begin may have instantiated a few blocks
         * outside i_size. Trim these off again. Don't need
         * i_size_read because we hold i_mutex.
         *
         * Add inode to orphan list in case we crash before
         * truncate finishes
         */
        if (pos + len > inode->i_size && ext4_can_truncate(inode))
            ext4_orphan_add(handle, inode);

        ext4_journal_stop(handle);
        if (pos + len > inode->i_size) {
            ext4_truncate_failed_write(inode);
            /*
             * If truncate failed early the inode might
             * still be on the orphan list; we need to
             * make sure the inode is removed from the
             * orphan list in that case.
             */
            if (inode->i_nlink)
                ext4_orphan_del(NULL, inode);
        }
    }

    if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
        goto retry;
out:
    return ret;
}
ext4_write_begin does a lot of work, including allocating physical blocks for the file (assuming ext4's delayed-allocation feature is not enabled) and handling writes that cover only part of a block; see the sketch below for how an application can sidestep allocation at write time.
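Because allocation happens here, a write can fail mid-stream with ENOSPC (note the retry logic above). Applications that want to rule that out can reserve blocks ahead of time; a minimal, hedged sketch using posix_fallocate(3), with an illustrative file name and size:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/tmp/prealloc.bin", O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* Ask the filesystem to reserve 1 MiB of blocks now, so later
     * writes into this range will not fail with ENOSPC. */
    int err = posix_fallocate(fd, 0, 1 << 20);
    if (err != 0)    /* returns the error number, does not set errno */
        fprintf(stderr, "posix_fallocate: %s\n", strerror(err));
    close(fd);
    return 0;
}

The most important helpers called from ext4_write_begin are shown next.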
/*
 * block_write_begin takes care of the basic task of block allocation and
 * bringing partial write blocks uptodate first.
 *
 * If *pagep is not NULL, then block_write_begin uses the locked page
 * at *pagep rather than allocating its own. In this case, the page will
 * not be unlocked or deallocated on failure.
 */
int block_write_begin(struct file *file, struct address_space *mapping,
        loff_t pos, unsigned len, unsigned flags,
        struct page **pagep, void **fsdata,
        get_block_t *get_block)
{
    struct inode *inode = mapping->host;
    int status = 0;
    struct page *page;
    pgoff_t index;
    unsigned start, end;
    int ownpage = 0;

    index = pos >> PAGE_CACHE_SHIFT;
    start = pos & (PAGE_CACHE_SIZE - 1);
    end = start + len;

    page = *pagep;
    if (page == NULL) {
        ownpage = 1;
        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page) {
            status = -ENOMEM;
            goto out;
        }
        *pagep = page;
    } else
        BUG_ON(!PageLocked(page));

    status = __block_prepare_write(inode, page, start, end, get_block);
    if (unlikely(status)) {
        ClearPageUptodate(page);

        if (ownpage) {
            unlock_page(page);
            page_cache_release(page);
            *pagep = NULL;

            /*
             * prepare_write() may have instantiated a few blocks
             * outside i_size. Trim these off again. Don't need
             * i_size_read because we hold i_mutex.
             */
            if (pos + len > inode->i_size)
                vmtruncate(inode, inode->i_size);
        }
    }

out:
    return status;
}
static int __block_prepare_write(struct inode *inode, struct page *page,
        unsigned from, unsigned to, get_block_t *get_block)
{
    unsigned block_start, block_end;
    sector_t block;
    int err = 0;
    unsigned blocksize, bbits;
    struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;

    BUG_ON(!PageLocked(page));
    BUG_ON(from > PAGE_CACHE_SIZE);
    BUG_ON(to > PAGE_CACHE_SIZE);
    BUG_ON(from > to);

    blocksize = 1 << inode->i_blkbits;
    if (!page_has_buffers(page))
        create_empty_buffers(page, blocksize, 0);
    head = page_buffers(page);

    bbits = inode->i_blkbits;
    block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);

    for (bh = head, block_start = 0; bh != head || !block_start;
         block++, block_start = block_end, bh = bh->b_this_page) {
        block_end = block_start + blocksize;
        if (block_end <= from || block_start >= to) {
            if (PageUptodate(page)) {
                if (!buffer_uptodate(bh))
                    set_buffer_uptodate(bh);
            }
            continue;
        }
        if (buffer_new(bh))
            clear_buffer_new(bh);
        if (!buffer_mapped(bh)) {
            WARN_ON(bh->b_size != blocksize);
            err = get_block(inode, block, bh, 1);
            if (err)
                break;
            if (buffer_new(bh)) {
                unmap_underlying_metadata(bh->b_bdev,
                            bh->b_blocknr);
                if (PageUptodate(page)) {
                    clear_buffer_new(bh);
                    set_buffer_uptodate(bh);
                    mark_buffer_dirty(bh);
                    continue;
                }
                if (block_end > to || block_start < from)
                    zero_user_segments(page,
                        to, block_end,
                        block_start, from);
                continue;
            }
        }
        if (PageUptodate(page)) {
            if (!buffer_uptodate(bh))
                set_buffer_uptodate(bh);
            continue;
        }
        if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
            !buffer_unwritten(bh) &&
            (block_start < from || block_end > to)) {
            ll_rw_block(READ, 1, &bh);
            *wait_bh++ = bh;
        }
    }
    /*
     * If we issued read requests - let them complete.
     */
    while (wait_bh > wait) {
        wait_on_buffer(*--wait_bh);
        if (!buffer_uptodate(*wait_bh))
            err = -EIO;
    }
    if (unlikely(err))
        page_zero_new_buffers(page, from, to);
    return err;
}
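The loop above classifies each block of the page against the write range [from, to): blocks outside the range are skipped, fully covered blocks need no read, and partially covered blocks must be read first (read-modify-write). A standalone sketch of that classification, assuming 4 KiB pages and a 1 KiB block size (the kernel derives the latter from inode->i_blkbits):

#include <stdio.h>

#define PAGE_SIZE  4096UL
#define BLOCK_SIZE 1024UL   /* assumed; really 1 << inode->i_blkbits */

int main(void)
{
    unsigned long from = 1500, to = 3000;   /* write range within the page */
    unsigned long block_start;

    for (block_start = 0; block_start < PAGE_SIZE; block_start += BLOCK_SIZE) {
        unsigned long block_end = block_start + BLOCK_SIZE;

        if (block_end <= from || block_start >= to)
            printf("block [%4lu,%4lu): untouched\n", block_start, block_end);
        else if (block_start >= from && block_end <= to)
            printf("block [%4lu,%4lu): fully overwritten, no read needed\n",
                   block_start, block_end);
        else
            printf("block [%4lu,%4lu): partial write -> read-modify-write\n",
                   block_start, block_end);
    }
    return 0;
}

When a partial block does need to be read, the read is submitted through ll_rw_block():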
/**
 * ll_rw_block: low-level access to block devices (DEPRECATED)
 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
 * @nr: number of &struct buffer_heads in the array
 * @bhs: array of pointers to &struct buffer_head
 *
 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
 * requests an I/O operation on them, either a %READ or a %WRITE. The third
 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
 * are sent to disk. The fourth %READA option is described in the documentation
 * for generic_make_request() which ll_rw_block() calls.
 *
 * This function drops any buffer that it cannot get a lock on (with the
 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
 * clean when doing a write request, and any buffer that appears to be
 * up-to-date when doing read request. Further it marks as clean buffers that
 * are processed for writing (the buffer cache won't assume that they are
 * actually clean until the buffer gets unlocked).
 *
 * ll_rw_block sets b_end_io to simple completion handler that marks
 * the buffer up-to-date (if approriate), unlocks the buffer and wakes
 * any waiters.
 *
 * All of the buffers must be for the same device, and must also be a
 * multiple of the current approved size for the device.
 */
void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
{
    int i;

    for (i = 0; i < nr; i++) {
        struct buffer_head *bh = bhs[i];

        if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
            lock_buffer(bh);
        else if (!trylock_buffer(bh))
            continue;

        if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
            rw == SWRITE_SYNC_PLUG) {
            if (test_clear_buffer_dirty(bh)) {
                bh->b_end_io = end_buffer_write_sync;
                get_bh(bh);
                if (rw == SWRITE_SYNC)
                    submit_bh(WRITE_SYNC, bh);
                else
                    submit_bh(WRITE, bh);
                continue;
            }
        } else {
            if (!buffer_uptodate(bh)) {
                bh->b_end_io = end_buffer_read_sync;
                get_bh(bh);
                submit_bh(rw, bh);
                continue;
            }
        }
        unlock_buffer(bh);
    }
}
In ext4, the block allocation itself is managed by the code in fs/ext4/balloc.c and fs/ext4/mballoc.c.
4. When dirty data in the page cache must be flushed to disk, the block I/O layer takes over.
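Normally this flush happens asynchronously, driven by the kernel's writeback threads or by memory pressure, but an application can force it with fsync(2) or fdatasync(2). A minimal sketch (the file name is illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/tmp/flush-demo.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (write(fd, "dirty page\n", 11) != 11)   /* lands in the page cache */
        perror("write");

    /* Force the dirty pages (and metadata) down through the block
     * layer to the device; fdatasync(fd) skips non-essential metadata. */
    if (fsync(fd) != 0)
        perror("fsync");
    close(fd);
    return 0;
}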
Two data structures are central to flushing a file's page cache to disk: buffer_head and bio.
struct buffer_head {
    unsigned long b_state;              /* buffer state bitmap (see above) */
    struct buffer_head *b_this_page;    /* circular list of page's buffers */
    struct page *b_page;                /* the page this bh is mapped to */

    sector_t b_blocknr;                 /* start block number */
    size_t b_size;                      /* size of mapping */
    char *b_data;                       /* pointer to data within the page */

    struct block_device *b_bdev;
    bh_end_io_t *b_end_io;              /* I/O completion */
    void *b_private;                    /* reserved for b_end_io */
    struct list_head b_assoc_buffers;   /* associated with another mapping */
    struct address_space *b_assoc_map;  /* mapping this buffer is
                                           associated with */
    atomic_t b_count;                   /* users using this buffer_head */
};
/*
 * main unit of I/O for the block layer and lower layers (ie drivers and
 * stacking drivers)
 */
struct bio {
    sector_t bi_sector;                 /* device address in 512 byte
                                           sectors */
    struct bio *bi_next;                /* request queue link */
    struct block_device *bi_bdev;
    unsigned long bi_flags;             /* status, command, etc */
    unsigned long bi_rw;                /* bottom bits READ/WRITE,
                                         * top bits priority
                                         */

    unsigned short bi_vcnt;             /* how many bio_vec's */
    unsigned short bi_idx;              /* current index into bvl_vec */
    ...............

    /*
     * We can inline a number of vecs at the end of the bio, to avoid
     * double allocations for a small number of bio_vecs. This member
     * MUST obviously be kept at the very end of the bio.
     */
    struct bio_vec bi_inline_vecs[0];
};
The block I/O layer merges and schedules I/O requests; this work is managed by the elevator framework. Several schedulers are available, including noop, deadline, anticipatory, and CFQ. Which one is the default depends on the kernel configuration, and the scheduler can be changed per device at runtime to best match the workload.
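The active scheduler for a block device can be inspected (and, as root, changed) through sysfs. A small sketch that prints it; the device name sda is an assumption, substitute your own:

#include <stdio.h>

int main(void)
{
    /* The currently selected scheduler is shown in [brackets], e.g.
     * "noop deadline [cfq]"; writing a name to this file switches it. */
    FILE *f = fopen("/sys/block/sda/queue/scheduler", "r");
    char line[256];

    if (!f) {
        perror("fopen");
        return 1;
    }
    if (fgets(line, sizeof(line), f))
        fputs(line, stdout);
    fclose(f);
    return 0;
}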
[1] Radix tree (基数树). http://blog.csdn.net/joker0910/article/details/8250085
[2] Radix tree. http://en.wikipedia.org/wiki/Radix_tree