f2fs系列之写操作的相关流程

Windows Windows 2个月前 (08-15) 10次浏览 未收录 0个评论 扫描二维码
文章目录[隐藏]

背景

熟悉了文件系统的数据layout之后,理解它主要的IO路径(特别是写操作)是掌握一个文件系统必须要啃下来的骨头。F2FS也一样,IO相关的主要阶段包括:写提交;check point; GC。

写提交

IO相关的数据结构

最主要的两个数据结构如下:

 /*
  * Per-page IO descriptor passed down f2fs's submission path
  * (e.g. f2fs_submit_page_write()); one instance describes a single
  * page read/write, including the target block addresses and flags
  * that control how the bio is built and merged.
  */
 struct f2fs_io_info {
        struct f2fs_sb_info *sbi;       /* f2fs_sb_info pointer */
        nid_t ino;              /* inode number */
        enum page_type type;    /* contains DATA/NODE/META/META_FLUSH */
        enum temp_type temp;    /* contains HOT/WARM/COLD */
        int op;                 /* contains REQ_OP_ */
        int op_flags;           /* req_flag_bits */
        block_t new_blkaddr;    /* new block address to be written */
        block_t old_blkaddr;    /* old block address before Cow */
        struct page *page;      /* page to be written */
        struct page *encrypted_page;    /* encrypted page */
        struct list_head list;          /* serialize IOs */
        bool submitted;         /* indicate IO submission */
        int need_lock;          /* indicate we need to lock cp_rwsem */
        bool in_list;           /* indicate fio is in io_list */
        bool is_meta;           /* indicate borrow meta inode mapping or not */
        bool retry;             /* need to reallocate block address */
        enum iostat_type io_type;       /* io type */
        struct writeback_control *io_wbc; /* writeback control */
        unsigned char version;          /* version of the node */
};

#define is_read_io(rw) ((rw) == READ)
/*
 * Per-(page_type, temp) bio accumulator: consecutive pages of the same
 * type are merged into 'bio' until it is submitted; 'fio' keeps the
 * buffered io info for merge decisions, and io_rwsem/io_lock serialize
 * concurrent submitters on this channel.
 */
struct f2fs_bio_info {
        struct f2fs_sb_info *sbi;       /* f2fs superblock */
        struct bio *bio;                /* bios to merge */
        sector_t last_block_in_bio;     /* last block number */
        struct f2fs_io_info fio;        /* store buffered io info. */
        struct rw_semaphore io_rwsem;   /* blocking op for bio */
        spinlock_t io_lock;             /* serialize DATA/NODE IOs */
        struct list_head io_list;       /* track fios */
};

和page cache的交互

主要的代码在f2fs_file_operations中实现,头文件在f2fs.h中。

fs/f2fs/f2fs.h中列出了目录、文件、inode等主要数据结构等主要操作。如下所示:

/* Main operation tables exported by f2fs (declared in fs/f2fs/f2fs.h). */
extern const struct file_operations f2fs_dir_operations;
extern const struct file_operations f2fs_file_operations;
extern const struct inode_operations f2fs_file_inode_operations;
extern const struct address_space_operations f2fs_dblock_aops;
extern const struct address_space_operations f2fs_node_aops;
extern const struct address_space_operations f2fs_meta_aops;
extern const struct inode_operations f2fs_dir_inode_operations;
extern const struct inode_operations f2fs_symlink_inode_operations;
extern const struct inode_operations f2fs_encrypted_symlink_inode_operations;
extern const struct inode_operations f2fs_special_inode_operations;

其中,上面和文件相关的操作是f2fs_file_operations, 支持write、read、seek、open等标准等操作。具体代码可以参考:fs/f2fs/file.c:

const struct file_operations f2fs_file_operations = { .llseek = f2fs_llseek, .read_iter = generic_file_read_iter, .write_iter = f2fs_file_write_iter, .open = f2fs_file_open, .release = f2fs_release_file, .mmap = f2fs_file_mmap, .flush = f2fs_file_flush, .fsync = f2fs_sync_file, .fallocate = f2fs_fallocate, .unlocked_ioctl = f2fs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = f2fs_compat_ioctl, #endif .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, };

其中上面的f2fs_file_write_iter会穿过kernel的page cache机制,
f2fs_file_write_iter->generic_file_write_iter->generic_file_direct_write->filemap_write_and_wait_range->filemap_fdatawrite_range->do_writepages->generic_writepages->write_cache_pages->__writepage。

最后会调用f2fs向page cache 注册的__writepage 函数f2fs_write_meta_page / f2fs_write_node_page / f2fs_write_data_page,分别写meta(CP/SIT/NAT)/node/data 类型的数据区域。

#bash:~/workspace/linux-4.19.10/fs/f2fs$grep ".writepage" * -r
checkpoint.c: .writepage = f2fs_write_meta_page,
checkpoint.c: .writepages = f2fs_write_meta_pages,
data.c: .writepage = f2fs_write_data_page,
data.c: .writepages = f2fs_write_data_pages,
node.c: .writepage = f2fs_write_node_page,
node.c: .writepages = f2fs_write_node_pages,

这里,需要注意的是f2fs对meta/node/data 区域的读写模式是不同的,参考下面的注释:

/*
 * Page types of the bios used in submit_bio():
 *   DATA         user data pages, written asynchronously
 *   NODE         node pages, written asynchronously
 *   META         FS metadata pages such as SIT, NAT, CP
 *   NR_PAGE_TYPE number of real page types
 *   META_FLUSH   like META, but waits for the bio's completion
 * Types past NR_PAGE_TYPE are folded back to META by PAGE_TYPE_OF_BIO().
 */
#define PAGE_TYPE_OF_BIO(type)	((type) > META ? META : (type))
enum page_type {
	DATA,
	NODE,
	META,
	NR_PAGE_TYPE,
	META_FLUSH,
	INMEM,
	/* the types below are used by tracepoints only */
	INMEM_DROP,
	INMEM_INVALIDATE,
	INMEM_REVOKE,
	IPU,
	OPU,
};

下面分别看三种类型的写和page cache的交互。

write_meta_page

先看write_meta_page:
f2fs_write_meta_pages->f2fs_write_meta_page->f2fs_do_write_meta_page()->f2fs_submit_page_write()->构造fio,submit_merged_bio()-->__submit_bio()

上面f2fs_write_meta_page的主要流程如下:

void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, enum iostat_type io_type) { struct f2fs_io_info fio = { .sbi = sbi, .type = META, .temp = HOT, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = page->index, .new_blkaddr = page->index, .page = page, .encrypted_page = NULL, .in_list = false, }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) fio.op_flags &= ~REQ_META; set_page_writeback(page); ClearPageError(page); f2fs_submit_page_write(&fio); f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); }

最终通过submit_merged_bio把IO请求丢到通用块层,可以看到上面调用完成之后,meta 数据实际已经write back 写到磁盘里。

write_node_page

f2fs_write_node_page 会调用__write_node_page(), 后者会先构造node类型的f2fs_io_info, 然后通过set_page_writeback()把页面标记为回写状态,并且直接调用写回函数:

 ............. set_page_writeback(page); ClearPageError(page); if (f2fs_in_warm_node_list(sbi, page)) { seq = f2fs_add_fsync_node_entry(sbi, page); if (seq_id) *seq_id = seq; } fio.old_blkaddr = ni.blk_addr; f2fs_do_write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); up_read(&sbi->node_write);

可以看到对node 类型的数据,上面返回之后数据就写落盘了。

write_data_page

data相关的操作主要在:fs/f2fs/data.c中:

const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, .readpages = f2fs_read_data_pages, .writepage = f2fs_write_data_page, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, //准备页框 .write_end = f2fs_write_end, // set_page_dirty .set_page_dirty = f2fs_set_data_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, .direct_IO = f2fs_direct_IO, .bmap = f2fs_bmap, #ifdef CONFIG_MIGRATION .migratepage = f2fs_migrate_page, #endif };

f2fs_write_data_page 会调用__write_data_page().同样也会构造data 类型的f2fs_io_info,然后drop page cache,把对应inode里的dirty page cache计数减一,等kernel 后台刷page cache的线程把数据刷到磁盘。

 if (f2fs_is_drop_cache(inode)) goto out; ........ inode_dec_dirty_pages(inode); if (err) ClearPageUptodate(page); if (wbc->for_reclaim) { f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA); clear_inode_flag(inode, FI_HOT_DATA); f2fs_remove_dirty_inode(inode); submitted = NULL; } ......

check point

Main area 可以认为是data和node的journal写的区域;
SSA (Segment Summary Area,段摘要区域)可以认为是 NAT/SIT的journal 区域;
check point 的时候,就是把这些写到SSA,但尚未落到NAT/SIT的数据写到各自的区域。

由于f2fs的log-structure特性,每次写一个数据块,需要相应更改direct node,NAT和SIT,尤其是NAT和SIT区域,可能仅仅需要修改一个entry几个字节的信息,就要重写整个page,这会严重降低文件系统的性能和SSD的使用寿命,因此,f2fs使用了journal的机制来减少NAT和SIT的写次数。所谓journal,其实就是把NAT和SIT的更改写到f2fs_summary_block中,当写checkpoint时,才把dirty的SIT和NAT区域回写。

GC

上面做完check point之后,SSA区域中那些已经落盘到NAT/SIT的journal数据所占的空间就可以回收了,这个可以交给GC去做。

以section 为单位
两种victim 策略:
拥有最少有效block 数量的section: foreground cleaning process
拥有最老blocks平均age的section: background cleaning process

喜欢 (0)
[]
分享 (0)
关于作者:
发表我的评论
取消评论
表情 贴图 加粗 删除线 居中 斜体 签到

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址