Linux Block块设备层内存映射

    xiaoxiao2022-07-02  128

    作者

    QQ群:852283276 微信:arm80x86 微信公众号:青儿创客基地 B站:主页 https://space.bilibili.com/208826118

    参考

    Linux那些事儿之我是Block层(11)传说中的内存映射(上) Linux那些事儿之我是Block层(12)传说中的内存映射(下) linux内核分析:read过程分析 write每次最大能写多少字节的数据 The iov_iter interface

    用户态内存的映射

    Linux使用blk_rq_map_user来完成用户态内存和bio的映射,blk_rq_map_user会填充struct request的struct bio结构体,

    /*
     * blk_rq_map_user - map one contiguous user-space buffer to a request.
     *
     * Wraps @ubuf/@len in a single-entry iovec via import_single_range(),
     * using the request's data direction, and hands the resulting iterator
     * to blk_rq_map_user_iov().
     *
     * Returns the negative value reported by import_single_range() if it
     * fails, otherwise whatever blk_rq_map_user_iov() returns.
     */
    int blk_rq_map_user(struct request_queue *q, struct request *rq,
                        struct rq_map_data *map_data, void __user *ubuf,
                        unsigned long len, gfp_t gfp_mask)
    {
        struct iov_iter iter;
        struct iovec vec;
        int err;

        err = import_single_range(rq_data_dir(rq), ubuf, len, &vec, &iter);
        if (unlikely(err < 0))
            return err;

        return blk_rq_map_user_iov(q, rq, map_data, &iter, gfp_mask);
    }
    EXPORT_SYMBOL(blk_rq_map_user);

    函数import_single_range中,MAX_RW_COUNT是一个宏:INT_MAX & PAGE_MASK,INT_MAX是2^31-1,在页大小为4KB(PAGE_MASK清除低12位)时,理论上每次write可写的buff大小是2^31-2^12=2147479552,然后使用了迭代器,

    /*
     * import_single_range - describe one user buffer as a single-entry iovec.
     *
     * The length is capped at MAX_RW_COUNT, the (possibly shortened) range is
     * checked with access_ok(), and on success @iov is filled in and @i is
     * initialised to iterate over that one segment.
     *
     * Returns 0 on success, -EFAULT if access_ok() rejects the range.
     */
    int import_single_range(int rw, void __user *buf, size_t len,
                            struct iovec *iov, struct iov_iter *i)
    {
        /* Never describe more than MAX_RW_COUNT bytes in one go. */
        const size_t capped = (len > MAX_RW_COUNT) ? MAX_RW_COUNT : len;

        if (unlikely(!access_ok(!rw, buf, capped)))
            return -EFAULT;

        iov->iov_base = buf;
        iov->iov_len = capped;
        iov_iter_init(i, rw, iov, 1, capped);

        return 0;
    }
    EXPORT_SYMBOL(import_single_range);

    blk_rq_map_user_iov调用__blk_rq_map_user_iov

    /**
     * blk_rq_map_user_iov - map user data to a request, for passthrough requests
     * @q:        request queue where request should be inserted
     * @rq:       request to map data to
     * @map_data: pointer to the rq_map_data holding pages (if necessary)
     * @iter:     iovec iterator
     * @gfp_mask: memory allocation flags
     *
     * Description:
     *    Data will be mapped directly for zero copy I/O, if possible. Otherwise
     *    a kernel bounce buffer is used.
     *
     *    A matching blk_rq_unmap_user() must be issued at the end of I/O, while
     *    still in process context.
     *
     *    Note: The mapped bio may need to be bounced through blk_queue_bounce()
     *    before being submitted to the device, as pages mapped may be out of
     *    reach. It's the caller's responsibility to make sure this happens. The
     *    original bio must be passed back in to blk_rq_unmap_user() for proper
     *    unmapping.
     *
     * Return: 0 on success; -EINVAL if @iter is not an iovec iterator; otherwise
     * the non-zero value returned by the failing __blk_rq_map_user_iov() call.
     * On any failure rq->bio is reset to NULL.
     */
    int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
                            struct rq_map_data *map_data,
                            const struct iov_iter *iter, gfp_t gfp_mask)
    {
        bool copy = false;
        unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
        struct bio *bio = NULL;
        struct iov_iter i;
        /* Default error for the early "not an iovec" exit below. */
        int ret = -EINVAL;

        if (!iter_is_iovec(iter))
            goto fail;

        /*
         * Decide whether the data has to be copied instead of mapped directly:
         * caller-supplied pages, misaligned segments, or gaps that conflict
         * with the queue's virt boundary all force a copy.
         */
        if (map_data)
            copy = true;
        else if (iov_iter_alignment(iter) & align)
            copy = true;
        else if (queue_virt_boundary(q))
            copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter);

        /* Work on a private copy so the caller's iterator is left untouched. */
        i = *iter;
        do {
            ret = __blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy);
            if (ret)
                goto unmap_rq;
            /* Remember the first bio; it is what gets unmapped on failure. */
            if (!bio)
                bio = rq->bio;
        } while (iov_iter_count(&i));

        if (!bio_flagged(bio, BIO_USER_MAPPED))
            rq->rq_flags |= RQF_COPY_USER;
        return 0;

    unmap_rq:
        /*
         * NOTE(review): bio is still NULL when the very first mapping attempt
         * fails; presumably __blk_rq_unmap_user() tolerates NULL - confirm.
         */
        __blk_rq_unmap_user(bio);
    fail:
        rq->bio = NULL;
        /* Propagate the real error instead of flattening it to -EINVAL. */
        return ret;
    }
    EXPORT_SYMBOL(blk_rq_map_user_iov);

    函数bio_add_pc_page把page添加到bio,

    /**
     * bio_add_pc_page - attempt to add page to bio
     * @q:      the target queue
     * @bio:    destination bio
     * @page:   page to add
     * @len:    vec entry length
     * @offset: vec entry offset
     *
     * Attempt to add a page to the bio_vec maplist. This can fail for a
     * number of reasons, such as the bio being full or target block device
     * limitations. The target block device must allow bio's up to PAGE_SIZE,
     * so it is always possible to add a single page to an empty bio.
     *
     * This should only be used by REQ_PC bios.
     *
     * Return: @len if the data was added (either merged into the last entry
     * or stored in a new one), 0 if it could not be added.
     */
    int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
                        *page, unsigned int len, unsigned int offset)
    {
        int retried_segments = 0;
        struct bio_vec *bvec;

        /*
         * cloned bio must not modify vec list
         */
        if (unlikely(bio_flagged(bio, BIO_CLONED)))
            return 0;

        /*
         * Shifting by 9 divides the byte count by 512 before comparing it
         * with the queue's queue_max_hw_sectors() limit.
         */
        if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q))
            return 0;

        /*
         * For filesystems with a blocksize smaller than the pagesize
         * we will often be called with the same page as last time and
         * a consecutive offset.  Optimize this special case.
         */
        if (bio->bi_vcnt > 0) {
            struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];

            /* Same page, directly following the previous entry: extend it. */
            if (page == prev->bv_page &&
                offset == prev->bv_offset + prev->bv_len) {
                prev->bv_len += len;
                bio->bi_iter.bi_size += len;
                goto done;
            }

            /*
             * If the queue doesn't support SG gaps and adding this
             * offset would create a gap, disallow it.
             */
            if (bvec_gap_to_prev(q, prev, offset))
                return 0;
        }

        /* No free slot left in the vec array. */
        if (bio->bi_vcnt >= bio->bi_max_vecs)
            return 0;

        /*
         * setup the new entry, we might clear it again later if we
         * cannot add the page
         */
        bvec = &bio->bi_io_vec[bio->bi_vcnt];
        bvec->bv_page = page;
        bvec->bv_len = len;
        bvec->bv_offset = offset;
        bio->bi_vcnt++;
        bio->bi_phys_segments++;
        bio->bi_iter.bi_size += len;

        /*
         * Perform a recount if the number of segments is greater
         * than queue_max_segments(q).
         *
         * The recount is attempted once; if the limit is still exceeded
         * afterwards the new entry is rolled back at the failed label.
         */
        while (bio->bi_phys_segments > queue_max_segments(q)) {

            if (retried_segments)
                goto failed;

            retried_segments = 1;
            blk_recount_segments(q, bio);
        }

        /* If we may be able to merge these biovecs, force a recount */
        if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
            bio_clear_flag(bio, BIO_SEG_VALID);

     done:
        return len;

     failed:
        /* Undo the tentative insertion and recompute the segment count. */
        bvec->bv_page = NULL;
        bvec->bv_len = 0;
        bvec->bv_offset = 0;
        bio->bi_vcnt--;
        bio->bi_iter.bi_size -= len;
        blk_recount_segments(q, bio);
        return 0;
    }
    EXPORT_SYMBOL(bio_add_pc_page);

    request映射

    使用blk_rq_map_sg把request映射到一个scatterlist,

    //petalinux2015.2.1, kernel 3.19 blk_rq_map_sg //block\blk-merge.c __blk_bios_map_sg __blk_segment_map_sg

    在__blk_segment_map_sg中,BIOVEC_PHYS_MERGEABLE用于检测相邻的两个bio_vec在物理上是否可以合并,BIOVEC_SEG_BOUNDARY用于检测合并后是否会跨越段边界,比如某些硬件不能跨64MB边界传输,其单次发起DMA的最大大小就是64MB;此外,可以通过queue_max_segment_size来限制scatterlist中单个表项的最大长度。

    /*
     * __blk_segment_map_sg - place one bio_vec into the scatterlist being built.
     *
     * If clustering is on and a current entry exists, @bvec is folded into that
     * entry provided the combined length stays within queue_max_segment_size()
     * and both BIOVEC_PHYS_MERGEABLE() and BIOVEC_SEG_BOUNDARY() accept the
     * pair (@bvprv, @bvec). Otherwise a new entry is started and *@nsegs is
     * incremented. In every case *@bvprv is updated to *@bvec.
     */
    static inline void
    __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
                         struct scatterlist *sglist, struct bio_vec *bvprv,
                         struct scatterlist **sg, int *nsegs, int *cluster)
    {
        int nbytes = bvec->bv_len;
        /* Checks short-circuit in the same order as the original gotos. */
        int can_merge = *sg && *cluster &&
                        !((*sg)->length + nbytes > queue_max_segment_size(q)) &&
                        BIOVEC_PHYS_MERGEABLE(bvprv, bvec) &&
                        BIOVEC_SEG_BOUNDARY(q, bvprv, bvec);

        if (can_merge) {
            (*sg)->length += nbytes;
        } else {
            if (*sg) {
                /*
                 * A driver may have mapped a shorter list earlier, leaving
                 * a stale termination bit on this entry. More entries must
                 * follow (the driver would be buggy otherwise), so clear
                 * the bit here rather than requiring a full sg_init_table()
                 * in drivers for each command.
                 */
                sg_unmark_end(*sg);
                *sg = sg_next(*sg);
            } else {
                /* First segment: start at the head of the list. */
                *sg = sglist;
            }

            sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
            (*nsegs)++;
        }

        *bvprv = *bvec;
    }
    最新回复(0)