分类

Storage?

从flashcache的创建开始，介绍flashcache在SSD上的layout和内存数据结构，简单地说就是数据组织形式。

        sprintf(dmsetup_cmd, "echo 0 %lu flashcache %s %s %s %d 2 %lu %lu %d %lu %d %lu"
                " | dmsetup create %s",
                disk_devsize, disk_devname, ssd_devname, cachedev, cache_mode, block_size,
                cache_size, associativity, disk_associativity, write_cache_only, md_block_size,
                cachedev);

从flashcache之后的参数算起：

dmc的成员	dmsetup create中的参数	默认值	含义	?
disk_dev	disk_devname	无	慢速块设备的名字	?
cache_dev	ssd_devname	无	SSD设备的名字	?
dm_vdevname	flashcache的名字	无	flashcache起的名字	?
cache_mode	cache_mode	无	三种合法值：write_back,write_through和write_around	?
persistence（非dmc的成员变量）	2	2	实际flashcache_ctr函数，即为flashcache_create服务，也为flashcache_load服务	?
block_size	block_size	8	8个扇区即4K	?
size	cache_size	设备扇区总数/block_size，	注意这个值的含义是block的个数，即总扇区除以block_size.	?
assoc	associativity	512	合法值为（256,8192）之间的2的整数幂，不包含256和8192	?
disk_assoc	?	?	?	?
write_only_cache	write_cache_only	0	write_back模式有一个子模式，即write_only	?
md_block_size	?	8	?	?
num_sets	?	dmc->size?? dmc->assoc_shift	?	?
?	?	?	?	?

影响flashcache布局的几个参数有：

block_size: 默认情况下值为8，即8个扇区组成一个block，即block的大小为4KB
size : block的个数

注意，注意在


      //截止到此处，dmc->size是SSD设备的扇区个数，
      //后面调用dmc->size /= (dmc->block_size)执行之后，才变成block的个数。

        dmc->md_blocks = INDEX_TO_MD_BLOCK(dmc, dmc->size / dmc->block_size) + 1 + 1;
        /*总扇区数减去md_block需要的扇区数，得到最多可以用于存放数据的扇区数*/
        dmc->size -= dmc->md_blocks * MD_SECTORS_PER_BLOCK(dmc);
        /*可以用来存放cache数据的block个数，默认情况下即4K的个数*/
        dmc->size /= dmc->block_size;
　　　　/*注意，block是要组成set的，因此有assoc的概念，默认512个block组成一个set
　　　　 *因此block的个数需要向下对齐512的倍数*/
        dmc->size = (dmc->size / dmc->assoc) * dmc->assoc;           

        /*有了准确的block的个数，需要的meta data block重新计算*/
        dmc->md_blocks = INDEX_TO_MD_BLOCK(dmc, dmc->size) + 1 + 1;
        DMINFO("flashcache_writeback_create: md_blocks = %d, md_sectors = %dn",
               dmc->md_blocks, dmc->md_blocks * MD_SECTORS_PER_BLOCK(dmc));
        dev_size = to_sector(dmc->cache_dev->bdev->bd_inode->i_size);
        cache_size = dmc->md_blocks * MD_SECTORS_PER_BLOCK(dmc) + (dmc->size * dmc->block_size);
        if (cache_size > dev_size) {
                DMERR("Requested cache size exceeds the cache device's capacity"
                      "(%lu>%lu)",
                      cache_size, dev_size);
                vfree((void *)header);
                return 1;
        }

这段代码的执行过后，我们基本就能对flashcache的组织形式有一定的了解了。首先是8个扇区做成一个block，然后是512个block组成一个set，这样的话，一个set的大小为2M，将SSD整体空间扣除meta需要的部分之后，组织成这样的结构：

注意，这只是cache block的部分，对于flashcache来说，还有metadata block和superblock。和文件系统一样，flashcache也有superblock，介绍flashcache的组织形式：

        header = (struct flash_superblock *)vmalloc(MD_BLOCK_BYTES(dmc));
        if (!header) {
                DMERR("flashcache_writeback_create: Unable to allocate sector");
                return 1;
        }
struct flash_superblock {
        sector_t size;          /* Cache size */
        u_int32_t block_size;   /* Cache block size */
        u_int32_t assoc;        /* Cache associativity */
        u_int32_t cache_sb_state;       /* Clean shutdown ? */
        char cache_devname[DEV_PATHLEN]; /* Contains dm_vdev name as of v2 modifications */
        sector_t cache_devsize;
        char disk_devname[DEV_PATHLEN]; /* underlying block device name (use UUID paths!) */
        sector_t disk_devsize;
        u_int32_t cache_version;
        u_int32_t md_block_size;
        u_int32_t disk_assoc;
        u_int32_t write_only_cache;
};

尽管flashcache的superblock需要的空间比较小，但是flashcache给superblock预留了一个meta data block的大小，即默认情况下4KB的大小，为将来可能的扩展预留的空间。

flash_superblock这个数据结构存在在SSD这个设备的第一个4K，当机器重启之后，flashcache_load会阅读该设备，确保头部扇区中存放的内容，即superblock的内容：

        ssd_devname = argv[optind++];
        cache_fd = open(ssd_devname, O_RDONLY);
        if (cache_fd < 0) {
                fprintf(stderr, "Failed to open %sn", ssd_devname);
                exit(1);
        }
        lseek(cache_fd, 0, SEEK_SET);
        if (read(cache_fd, buf, 512) < 0) {
                fprintf(stderr, "Cannot read Flashcache superblock %sn", ssd_devname);
                exit(1);
        }
        if (!(sb->cache_sb_state == CACHE_MD_STATE_DIRTY ||
              sb->cache_sb_state == CACHE_MD_STATE_CLEAN ||
              sb->cache_sb_state == CACHE_MD_STATE_FASTCLEAN ||
              sb->cache_sb_state == CACHE_MD_STATE_UNSTABLE)) {
                fprintf(stderr, "%s: Invalid Flashcache superblock %sn", pname, ssd_devname);
                exit(1);
        }

创建flashcache的时候，flashcache_create也会阅读SSD设备的第一个扇区，来确保SSD是不是已经创建了flashcache。

对于上面网格图中的任何一个cache block，都需要数据结构来描述其状态，比如值是否有效，是否DIRTY等，其数据结构如下：

#ifdef FLASHCACHE_DO_CHECKSUMS
struct flash_cacheblock {
        sector_t        dbn;    /* Sector number of the cached block */
        u_int64_t       checksum;
        u_int32_t       cache_state; /* INVALID | VALID | DIRTY */
} __attribute__ ((aligned(32)));
#else
struct flash_cacheblock {
        sector_t        dbn;    /* Sector number of the cached block */
        u_int32_t       cache_state; /* INVALID | VALID | DIRTY */
} __attribute__ ((aligned(16)));
#endif

对于我们而言，flash_cacheblock的大小为16字节，因此，每个cache block都会有16字节的元数据。这16字节描述了一个cache block。每个meta data block 默认有4KB，即每个meta data block可以存放256 个cache block的元数据信息。

综合上述讨论，一个完整的SSD layout如下所示：

上面的布局，主要是块设备上的布局，除此外，flashcache正常运行期间，需要消耗内存，内存中有数据结构管理这些cache block，如下所示：

 order = dmc->size * sizeof(struct cacheblock);
 struct cacheblock {
        u_int16_t       cache_state;
        int16_t         nr_queued;      /* jobs in pending queue */
        u_int16_t       lru_prev, lru_next;
        u_int8_t        use_cnt;
        u_int8_t        lru_state;
        sector_t        dbn;    /* Sector number of the cached block */
        u_int16_t       hash_prev, hash_next;
#ifdef FLASHCACHE_DO_CHECKSUMS
        u_int64_t       checksum;
#endif
} __attribute__((packed));

目前来讲，不考虑checksum，内存中18 Byte 描述一个cache block（默认4KB）。

        order = dmc->size * sizeof(struct cacheblock);
        DMINFO("Allocate %luKB (%luB per) mem for %lu-entry cache"
               "(capacity:%luMB, associativity:%u, block size:%u "
               "sectors(%uKB))",
               order >> 10, sizeof(struct cacheblock), dmc->size,
               cache_size >> (20-SECTOR_SHIFT), dmc->assoc, dmc->block_size,
               dmc->block_size >> (10-SECTOR_SHIFT));
        dmc->cache = (struct cacheblock *)vmalloc(order);
        if (!dmc->cache) {
                vfree((void *)header);
                DMERR("flashcache_writeback_create: Unable to allocate cache md");
                return 1;
        }
        memset(dmc->cache, 0, order);
        /* Initialize the cache structs */
        for (i = 0; i < dmc->size ; i++) {
                dmc->cache[i].dbn = 0;
#ifdef FLASHCACHE_DO_CHECKSUMS
                dmc->cache[i].checksum = 0;
#endif
                dmc->cache[i].cache_state = INVALID;
                dmc->cache[i].lru_state = 0;
                dmc->cache[i].nr_queued = 0;
        }

通过这个18 Byte的内存描述一个flashcache 的cache block，我们可以估算，一个400G 的SSD作为flashcache的SSD部分，消耗的内存约为：

400G/4KB*18 = 1.8GB

cache_set

dmc的assoc 默认是512，表示512个block组成一个set，即512*4K= 2MB：

init:
        /*计算整个flashcache set的个数*/
        dmc->num_sets = dmc->size >> dmc->assoc_shift;
        order = dmc->num_sets * sizeof(struct cache_set);
        dmc->cache_sets = (struct cache_set *)vmalloc(order);
        if (!dmc->cache_sets) {
                ti->error = "Unable to allocate memory";
                r = -ENOMEM;
                vfree((void *)dmc->cache);
                goto bad3;
        }
        memset(dmc->cache_sets, 0, order);
        for (i = 0 ; i < dmc->num_sets ; i++) {
                dmc->cache_sets[i].set_fifo_next = i * dmc->assoc;
                dmc->cache_sets[i].set_clean_next = i * dmc->assoc;
                dmc->cache_sets[i].fallow_tstamp = jiffies;
                dmc->cache_sets[i].fallow_next_cleaning = jiffies;
                dmc->cache_sets[i].hotlist_lru_tail = FLASHCACHE_NULL;
                dmc->cache_sets[i].hotlist_lru_head = FLASHCACHE_NULL;
                dmc->cache_sets[i].warmlist_lru_tail = FLASHCACHE_NULL;
                dmc->cache_sets[i].warmlist_lru_head = FLASHCACHE_NULL;
                spin_lock_init(&dmc->cache_sets[i].set_spin_lock);
        }

对于每个set有单独的数据结构描述：

struct cache_set {
        spinlock_t              set_spin_lock;
        u_int32_t               set_fifo_next;
        u_int32_t               set_clean_next;
        u_int16_t               clean_inprog;
        u_int16_t               nr_dirty;
        u_int16_t               dirty_fallow;
        unsigned long           fallow_tstamp;
        unsigned long           fallow_next_cleaning;
        /*
         * 2 LRU queues/cache set.
         * 1) A block is faulted into the MRU end of the warm list from disk.
         * 2) When the # of accesses hits a threshold, it is promoted to the
         * (MRU) end of the hot list. To keep the lists in equilibrium, the
         * LRU block from the host list moves to the MRU end of the warm list.
         * 3) Within each list, an access will move the block to the MRU end.
         * 4) Reclaims happen from the LRU end of the warm list. After reclaim
         * we move a block from the LRU end of the hot list to the MRU end of
         * the warm list.
         */
        u_int16_t               hotlist_lru_head, hotlist_lru_tail;
        u_int16_t               warmlist_lru_head, warmlist_lru_tail;
        u_int16_t               lru_hot_blocks, lru_warm_blocks;
#define NUM_BLOCK_HASH_BUCKETS          512
        u_int16_t               hash_buckets[NUM_BLOCK_HASH_BUCKETS];
        u_int16_t               invalid_head;
};

注意，对于同一个set的cache block而言，根据状态，位于三个不同的链表之中：

INVALID
- invalid_head为头部的invalid 链表
VALID
- hot：
  - hotlist_lru_head为头部，hotlist_lru_tail为尾部的hot链表
- warm
  - warmlist_lru_head为头部，warmlist_lru_tail为尾部的warm链表

注意，一个cacheblock只会位于其中的一条链表之中，不会同时属于hot和warm，更不会同时属于invalid和warm。

在64位系统上，指针的长度是8Byte，如果用普通的链表，prev next就要消耗16B的空间，这样是比较浪费的，flashcache使用是的u_int16_t类型的，每一个cacheblock通过一个2字节的short值，记录前一个cacheblock的值和后一个cacheblock的值。注意该值是同一个set的index值，因为默认set只有512，所以，2Byte的short足够记录下。

注意，当cacheblock中没有任何数据的时候，它位于invalid链表中，即这个链表里面都没啥数据。毫无疑问，当新建的flashcache里面，其实并没有任何有用的数据，并不和SATA DISK的数据相关联，因此，都会位于invalid 链表。在flashcache_ctr之中有如下的语句：

        for (i = 0 ; i < dmc->size ; i++) {
                dmc->cache[i].hash_prev = FLASHCACHE_NULL;
                dmc->cache[i].hash_next = FLASHCACHE_NULL;
                /*注意，flashcache_ctr并非只有创建flashcache一种情况，
                 *还有flashcache使用了一段时间之后，重启机器后的flashcache_load
                 *因此，需要判断对应的cacheblock的cache_state状态值，来初始化到合适的链表*/

                /*如果cache_state状态中VALID置位，则插入的flashcache_hash，方便查找*/
                if (dmc->cache[i].cache_state & VALID) {
                        flashcache_hash_insert(dmc, i);
                        atomic_inc(&dmc->cached_blocks);
                }
                /*如果dirty，则dirty统计增加*/
                if (dmc->cache[i].cache_state & DIRTY) {
                        dmc->cache_sets[i / dmc->assoc].nr_dirty++;
                        atomic_inc(&dmc->nr_dirty);
                }
                /*如果是新创建，或者该cacheblock并无有效数据，则插入Invalid链表
                 *对应新创建的flashcahce，所有的cacheblock都在invalid链表，
                 *注意，并不是1条链表，而是每个cacheset都有1条链表*/
                if (dmc->cache[i].cache_state & INVALID)
                        flashcache_invalid_insert(dmc, i);

下面来介绍hotlist和warmlist，flashcache采用的缓存置换算法是LRU算法，它维护着2条链表：hot和warm。当然了，顾名思义，hot链表的数据更热，更不应该被置换出去。每条链表有head和tail，约靠近尾部的cacheblock，越热，越不应该被置换出去。

数据是被访问的，因此，频繁访问的数据，可能会从warm迁到（premote）hot，如果hot链表中最冷的数据（即靠近head的数据），也可能会被降级（demote）到warm中。

除此以外，可能某个cacheblock中存在合法的数据（VALID），但是由于新的io进来，第一反应肯定是会不会我请求的IO对应的地址 dbn恰好在flashcahce的中并且状态为VALID，如果找到皆大欢喜；如果找不到，第二反应是寻找一个无人用的cacheblock，即位于INVALID链表的cacheblock。如果很不幸，没有INVALID的cache block，所有的block都已经用了（VALID），这时候，就必须要寻找牺牲品了，即reclaim策略。

接下来我们以flashcache_read为例，详细介绍寻找cacheblock的方法。

寻找cacheblock

对于读请求，由函数flashcache_read负责处理，注意，对于那些注定不会进入cacheblock的读写，在进入flashcache_read之前都已经过滤掉了：

        uncacheable = (unlikely(dmc->bypass_cache) ||
                       (to_sector(bio->bi_size) != dmc->block_size) ||
                       /*
                        * If the op is a READ, we serve it out of cache whenever possible,
                        * regardless of cacheablity
                        */
                       (bio_data_dir(bio) == WRITE &&
                        ((dmc->cache_mode == FLASHCACHE_WRITE_AROUND) ||
                         flashcache_uncacheable(dmc, bio))));
        spin_unlock_irqrestore(&dmc->ioctl_lock, flags);
        if (uncacheable) {
                flashcache_setlocks_multiget(dmc, bio);
                queued = flashcache_inval_blocks(dmc, bio);
                flashcache_setlocks_multidrop(dmc, bio);
                if (queued) {
                        if (unlikely(queued < 0))
                                flashcache_bio_endio(bio, -EIO, dmc, NULL);
                } else {
                        /* Start uncached IO */
                        /*绕过flashcache，直接访问慢速设备*/
                        flashcache_start_uncached_io(dmc, bio);
                }
        } else {
                /*如果io类型可以走flashcache，那么根据类型分别调用
                 *flashcache_read和flashcache_write*/
                if (bio_data_dir(bio) == READ)
                        flashcache_read(大专栏  flashcache 源码解析lass="n">dmc, bio);
                else
                        flashcache_write(dmc, bio);
        }
        return DM_MAPIO_SUBMITTED;

剩下内容的重点是cacheblock的查找置换的策略，什么io走flashcache，什么io直接访问慢速设备，并不是我们关心的内容。我们继续以flashcache_read为例，介绍寻找cacheblock的过程。

下面代码是查找cacheblock的方法，主要的寻找过程位于flashcache_lookup函数。

        flashcache_setlocks_multiget(dmc, bio);
        res = flashcache_lookup(dmc, bio, &index);
        /* Cache Read Hit case */
        if (res > 0) {
                cacheblk = &dmc->cache[index];
                if ((cacheblk->cache_state & VALID) &&
                    (cacheblk->dbn == bio->bi_sector)) {
                        flashcache_read_hit(dmc, bio, index);
                        return;
                }
        }
        /*
         * In all cases except for a cache hit (and VALID), test for potential
         * invalidations that we need to do.
         */
        queued = flashcache_inval_blocks(dmc, bio);
        if (queued) {
                if (unlikely(queued < 0))
                        flashcache_bio_endio(bio, -EIO, dmc, NULL);
                if ((res > 0) &&
                    (dmc->cache[index].cache_state == INVALID))
                        /*
                         * If happened to pick up an INVALID block, put it back on the
                         * per cache-set invalid list
                         */
                        flashcache_invalid_insert(dmc, index);
                flashcache_setlocks_multidrop(dmc, bio);
                return;
        }

因为数据是流动的，因此整个flashcache N个cacheset，每个cacheset M个cache block，其状态都是流动的，刚才我可能是invalid，可能很快我就位于warmlist了，再有数据访问，我可能就迁移到了hotlist了。因此理解flashcache_lookup，知道当用户某一个请求要访问sector_t dbn = bio->bi_sector 这个扇区的时候，如何查找cacheblock是理解状态流动的非常关键的一步。

static int
flashcache_lookup(struct cache_c *dmc, struct bio *bio, int *index)
{
        sector_t dbn = bio->bi_sector;
#if DMC_DEBUG
        int io_size = to_sector(bio->bi_size);
#endif
        unsigned long set_number = hash_block(dmc, dbn);
        int invalid, oldest_clean = -1;
        int start_index;

        start_index = dmc->assoc * set_number;
        DPRINTK("Cache lookup : dbn %llu(%lu), set = %d",
                dbn, io_size, set_number);
        find_valid_dbn(dmc, dbn, start_index, index);
        if (*index >= 0) {
                DPRINTK("Cache lookup HIT: Block %llu(%lu): VALID index %d",
                             dbn, io_size, *index);
                /* We found the exact range of blocks we are looking for */
                return VALID;
        }
        invalid = find_invalid_dbn(dmc, set_number);
        if (invalid == -1) {
                /* We didn't find an invalid entry, search for oldest valid entry */
                find_reclaim_dbn(dmc, start_index, &oldest_clean);
        }
        /*
         * Cache miss :
         * We can't choose an entry marked INPROG, but choose the oldest
         * INVALID or the oldest VALID entry.
         */
        *index = start_index + dmc->assoc;
        if (invalid != -1) {
                DPRINTK("Cache lookup MISS (INVALID): dbn %llu(%lu), set = %d, index = %d, start_index = %d", dbn, io_size, set_number, invalid, start_index);
                *index = invalid;
        } else if (oldest_clean != -1) {
                DPRINTK("Cache lookup MISS (VALID): dbn %llu(%lu), set = %d, index = %d, start_index = %d",
                             dbn, io_size, set_number, oldest_clean, start_index);
                *index = oldest_clean;
        } else {
                DPRINTK_LITE("Cache read lookup MISS (NOROOM): dbn %llu(%lu), set = %d",
                        dbn, io_size, set_number);
        }
        if (*index < (start_index + dmc->assoc))
                return INVALID;
        else {
                dmc->flashcache_stats.noroom++;
                return -1;
        }
}

注意，这就是寻找cacheblock的算法了。第一步是要寻找合适的set，因为flashcache默认情况下，每个set 512个cache block，首先要定位到那个cacheset，然后再cacheset中确定合适的cache block。通俗点说，就是分两步走：

找到合适的cache set
从该cache set中找到合适的cache block

第一步比较简单，根据bio的扇区号，计算hash，然后映射到对应的cache set：

unsigned long
hash_block(struct cache_c *dmc, sector_t dbn)
{
        unsigned long set_number, value;
        int num_cache_sets = dmc->size >> dmc->assoc_shift;

        /*
         * Starting in Flashcache SSD Version 3 :
         * We map a sequential cluster of disk_assoc blocks onto a given set.
         * But each disk_assoc cluster can be randomly placed in any set.
         * But if we are running on an older on-ssd cache, we preserve old
         * behavior.
         */
        if (dmc->on_ssd_version < 3 || dmc->disk_assoc == 0) {
                value = (unsigned long)
                        (dbn >> (dmc->block_shift + dmc->assoc_shift));
        } else {
                /*我们走本分支*/
                value = (unsigned long) (dbn >> dmc->disk_assoc_shift);
                /* Then place it in a random set */
                value = jhash_1word(value, 0xbeef);
        }
        set_number = value % num_cache_sets;
        DPRINTK("Hash: %llu(%lu)->%lu", dbn, value, set_number);
        return set_number;
}

我们走else分支，这里面有一个参数，初看flashcache不容易理解，即disk_assoc_shift，这个参数在创建flashcache的时候可以指定disk_associativity :

root@XMT-S02:~# dmsetup table
osd4: 0 70316455903 flashcache conf:
	ssd dev (/dev/disk/by-partlabel/osd4-ssd), disk dev (/dev/disk/by-partlabel/osd4-data) cache mode(WRITE_BACK)
	capacity(446572M), associativity(512), data block size(4K) metadata block size(4096b)
	disk assoc(256K)
	skip sequential thresh(32K)
	total blocks(114322432), cached blocks(96119380), cache percent(84)
	dirty blocks(41155646), dirty percent(35)
	nr_queued(0)

我们看到，默认情况下，disk assoc的值是256K，事实上这个控制选项发挥作用也就是在寻找合适的cache set中发挥控制作用，如果没有这个选项，直接拿dbn进行hash，然后map到cacheset，相邻的两个dbn，可能压根就不会位于同一个cache set，那么将来对同一个cache set的io进行merge也就没啥必要了，因为相邻的dbn在同一个set的可能性并不大。

有了这个disk assoc参数就不同了，它hash之前，首先执行：

value = (unsigned long) (dbn >> dmc->disk_assoc_shift);

它确保的是，在同一个256KB块内的扇区，最终会得到同一个value，然后hash会map到同一个cache set，将来就有可能将相邻的请求merge，从而提高性能。

除了此处不太好理解意外，其他基本就是算出hash值，然后对cache set的个数求余，来决定落在那个cache set中。

第一步已经解决了，接下来是第二部，如何在cache set中找到

其算法核心可以分成三部：

find_valid_dbn
find_invalid_dbn
find_reclaim_dbn

find_valid_dbn

static void
find_valid_dbn(struct cache_c *dmc, sector_t dbn,
               int start_index, int *index)
{
        *index = flashcache_hash_lookup(dmc, start_index / dmc->assoc, dbn);
        if (*index == -1)
                return;
        if (dmc->sysctl_reclaim_policy == FLASHCACHE_LRU &&
            ((dmc->cache[*index].cache_state & BLOCK_IO_INPROG) == 0))
                flashcache_lru_accessed(dmc, *index);
        /*
         * If the block was DIRTY and earmarked for cleaning because it was old, make
         * the block young again.
         */
        flashcache_clear_fallow(dmc, *index);
}

int
flashcache_hash_lookup(struct cache_c *dmc,
                       int set,
                       sector_t dbn)
{
        struct cache_set *cache_set = &dmc->cache_sets[set];
        int index;
        struct cacheblock *cacheblk;
        u_int16_t set_ix;
#if 0
        int start_index, end_index, i;
#endif

        set_ix = *flashcache_get_hash_bucket(dmc, cache_set, dbn);
        while (set_ix != FLASHCACHE_NULL) {
                index = set * dmc->assoc + set_ix;
                cacheblk = &dmc->cache[index];
                /* Only VALID blocks on the hash queue */
                VERIFY(cacheblk->cache_state & VALID);
                VERIFY((cacheblk->cache_state & INVALID) == 0);
                if (dbn == cacheblk->dbn)
                        return index;
                set_ix = cacheblk->hash_next;
        }
        return -1;
}  

static inline u_int16_t *
flashcache_get_hash_bucket(struct cache_c *dmc, struct cache_set *cache_set, sector_t dbn)
{
        unsigned int hash = jhash_1word(dbn, 0xfeed);

        return &cache_set->hash_buckets[hash % NUM_BLOCK_HASH_BUCKETS];
}

我们已经找到了cache set，默认情况下cache set中有512个cache block，这些cache block中是否有我们需要的扇区呢？

最容易想到的是，逐个cache block比对，看下dbn号是否一致，状态是否是VALID。但是这种方法太蠢，效率太低。正确的方法是hash。

如果cache block中存在有效数据，他会根据对应的扇区号 dbn来计算hash，放入cache set中的合适bucket中。这种hash的做法，加速了cache set内部对某dbn是否存在在某个cacheblock的查找。

对于读来讲最完美的情况是，请求要求的数据块，恰巧位于flashcache的SSD 设备中，这种情况称为读命中。如果命中的话，因为该cacheblock的数据，相当于获得一次有效的访问，那么当空间吃紧的时候，应该降低该block被替换出去的概率，即提升其热度。

        if (dmc->sysctl_reclaim_policy == FLASHCACHE_LRU &&
            ((dmc->cache[*index].cache_state & BLOCK_IO_INPROG) == 0))
                flashcache_lru_accessed(dmc, *index);

这个flashcache_lru_accessed函数，即某个cacheblock最近被访问时，需要执行的操作，代码中有一段注释，言简意赅地介绍了这部分的算法：

/*
 * Block is accessed.
 *
 * Algorithm :
   if (block is in the warm list) {
       block_lru_refcnt++;
       if (block_lru_refcnt >= THRESHOLD) {
          clear refcnt
          Swap this block for the block at LRU end of hot list
       } else
          move it to MRU end of the warm list
   }
   if (block is in the hot list)
       move it to MRU end of the hot list
 */

如果block目前在warm list
- 引用计数++
  - 如果引用计数大于等于门限值（sysctl_lru_promote_thresh），一般是2，则从warm list 移入 hot list的LRU端（最左端）
  - 如果引用计数低于门限值，则从移入 warm list的MRU端，即最右端
如果block 目前在hot list
- 将该block移入hot list的MRU端，即最右端。

两个链表host list和warm list，其最左端都是LRU端（Least Recent Used），其最右端是MRU端（Most Recent Used）。一旦需要置换，将某些cacheblock中的内容踢出出去，选择的顺序如下：

Worm List  LRU -------->Worm List MRU--------->Hot List LRU -------------> Hot List MRU

代码部分就不列了，简单的链表操作。

从cache set中寻找cache block的第一步就完成，这种情况是最幸运的一种，即要读取的内容所在的扇区，恰好在flashcache的 SSD部分中，数据有效VALID，可以拿到cache block的index，因为本次访问，将该cache block的热度提升到合适的位置。

但是也许并没有这么幸运，SSD中没有dbn对应扇区的内容，这种情况下，需要选择一个cache block来盛放即将从慢速设备的扇区中读取上来的内容。这种情况下，第一选择是选择一个并且投入使用的cache block，即INVALID状态的cache block。

Why？

如果不这么做，选择一个VALID状态的cache block，该cache block的内容就会被新的dbn的内容替换，那么该cache block中老的内容，就被逐出SSD了，如果紧接着发来一个访问cache block 中老的dbn扇区的内容的请求，就会造成miss。更恶劣的情况是该cache block的内容是dirty，flashcache 可能不得不先等待dirty内容flush下去之后，方能使用该cache block。

所以当命中已成不可能的时候，选择INVALID状态的cache block是上策：

find_invalid_dbn

static int
find_invalid_dbn(struct cache_c *dmc, int set)
{
        int index = flashcache_invalid_get(dmc, set);

        if (index != -1) {
                if (dmc->sysctl_reclaim_policy == FLASHCACHE_LRU)
                        flashcache_lru_accessed(dmc, index);
                VERIFY((dmc->cache[index].cache_state & FALLOW_DOCLEAN) == 0);
        }
        return index;
}

寻找INVALID状态的cache block比较简单，因为对于一个cache set而言，所有的invalid都位于invalid_head为头部的链表，只需要摘下头部的cache block就可以了

int
flashcache_invalid_get(struct cache_c *dmc, int set)
{
        struct cache_set *cache_set;
        int index;
        struct cacheblock *cacheblk;

        cache_set = &dmc->cache_sets[set];
        index = cache_set->invalid_head;
        if (index == FLASHCACHE_NULL)
                return -1;
        index += (set * dmc->assoc);
        cacheblk = &dmc->cache[index];
        VERIFY(cacheblk->cache_state == INVALID);
        flashcache_invalid_remove(dmc, index);
        return index;
}

同样的道理，因为该cache block从INVALID迁移到了warm list的MRU端。

其实这种情况还不错，因为还能找到闲置的cache block。随着flashcahe的使用，很可能这种情况也不可得。很可能该cache set下的所有的cache block都投入了战局，在该cache set已经找不到一块闲置的cache block了。

find_reclaim_dbn

这种情况下，就需要从投入使用的cacheblock中寻找一个牺牲品了，也就是cache block要回收了。优于SSD Dev和DIsk Dev大小的关系，不可能所有数据都存入SSD，所有的缓存算法都需要缓存替换算法，高效的缓存替换算法，能够获得更大的性能提升。

当选择牺牲品的时候，长期以来，我们维护hot list 和warm list的操作，就有了价值，这些信息给了我们选择牺牲品的依据。

static void
find_reclaim_dbn(struct cache_c *dmc, int start_index, int *index)
{
        if (dmc->sysctl_reclaim_policy == FLASHCACHE_FIFO)
                flashcache_reclaim_fifo_get_old_block(dmc, start_index, index);
        else /* flashcache_reclaim_policy == FLASHCACHE_LRU */
                flashcache_reclaim_lru_get_old_block(dmc, start_index, index);
}

Flashcache目前支持两种策略，FIFO和LRU。我们此处讨论LRU，这种算法就是将最近最不常使用的cache block替换出去。

代码给了注释：

/*
 * Get least recently used LRU block
 *
 * Algorithm :
 *      Always pick block from the LRU end of the warm list.
 *      And move it to the MRU end of the warm list.
 *      If we don't find a suitable block in the "warm" list,
 *      pick the block from the hot list, demote it to the warm
 *      list and move a block from the warm list to the hot list.
 */

总是从worm list的LRU端找，然后把它移到MRU端。如果在warm list找不到合适的，那么从hot list的LRU端找，如果找到，执行demote操作，即将hot list的LRU和worm list MRU互换位置。

都是一些简单的链表操作，就不在此处贴代码了。

原文地址：https://www.cnblogs.com/sanxiandoupi/p/11692198.html

时间： 2024-10-14 10:13:51

flashcache 源码解析

cache_set

寻找cacheblock

find_valid_dbn

find_invalid_dbn

find_reclaim_dbn

flashcache 源码解析的相关文章

ChrisRenke/DrawerArrowDrawable源码解析

五.jQuery源码解析之jQuery.extend(),jQuery.fn.extend()

eclipse中导入jdk源码、SpringMVC注解@RequestParam、SpringMVC文件上传源码解析、ajax上传excel文件

String源码解析（一）

Flume-ng源码解析之Channel组件

Spring源码解析-applicationContext

socketserver源码解析和协程版socketserver

Handler机制(四)---Handler源码解析

Android EventBus源码解析，带你深入理解EventBus