A Walkthrough of the NVMe over TCP Write/Read Command Submission Flow

Summary

This post walks through the write and read command submission flow of NVMe over TCP.

1. Environment

This post only covers the nvme kernel module source in Linux 5.4.0. The following two commands are used:

sudo nvme io-passthru /dev/nvme1n1 --opcode=1 --namespace-id=1 --data-len=4096 --write --cdw10=5120 --cdw11=0 --cdw12=7 -s -i 123.txt
sudo nvme io-passthru /dev/nvme1n1 --opcode=2 --namespace-id=1 --data-len=4096 --read --cdw10=5120 --cdw11=0 --cdw12=7 -s -i 456.txt

They issue one write and one read command. Starting from the nvme_ioctl function, the write path is traced step by step below (a minimal user-space equivalent of what these commands do is sketched right after this).
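
A quick aside on what nvme io-passthru actually does: it fills a struct nvme_passthru_cmd and issues the NVME_IOCTL_IO_CMD ioctl on the namespace device, which is exactly where the nvme_ioctl function analyzed below picks the command up. A minimal user-space sketch (illustrative only; it mirrors the write command above and keeps error handling to a minimum):

/* minimal sketch: issue an NVMe Write through the passthru ioctl, like
 * "nvme io-passthru /dev/nvme1n1 --opcode=1 --data-len=4096 --cdw10=5120 --cdw12=7 -w" */
#include <fcntl.h>
#include <linux/nvme_ioctl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct nvme_passthru_cmd cmd;
	void *buf = NULL;
	int fd, err;

	fd = open("/dev/nvme1n1", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 0xab, 4096);                  /* payload to write */

	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode   = 0x01;                      /* NVMe Write */
	cmd.nsid     = 1;
	cmd.addr     = (uint64_t)(uintptr_t)buf;  /* user buffer, mapped by blk_rq_map_user later */
	cmd.data_len = 4096;
	cmd.cdw10    = 5120;                      /* SLBA, low 32 bits */
	cmd.cdw12    = 7;                         /* NLB = 7, i.e. 8 logical blocks of 512 B */

	err = ioctl(fd, NVME_IOCTL_IO_CMD, &cmd); /* enters nvme_ioctl() in the kernel */
	printf("ioctl returned %d, result 0x%x\n", err, cmd.result);

	free(buf);
	close(fd);
	return 0;
}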

2. Background

When transferring data, NVMe over TCP uses SGLs rather than PRPs; a sketch of the two ways the host driver fills the SGL descriptor follows.
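
For reference, the data pointer (dptr) of a fabrics command holds a single SGL descriptor. Roughly paraphrased from drivers/nvme/host/tcp.c in 5.4 (from memory, so not an exact quote), the host fills it in one of two ways, and both show up in the traces below: nvme_tcp_set_sg_inline for the write (data carried in-capsule, addr holds the controller's icdoff, the in-capsule data offset) and nvme_tcp_set_sg_host_data for the read (data stays in host memory, addr is 0, transport-specific descriptor type):

/* sketch, paraphrased from drivers/nvme/host/tcp.c (Linux 5.4); not an exact quote */
static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
		struct nvme_command *c, u32 data_len)
{
	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;

	sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);   /* in-capsule data offset */
	sg->length = cpu_to_le32(data_len);
	sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
}

static void nvme_tcp_set_sg_host_data(struct nvme_command *c, u32 data_len)
{
	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;

	sg->addr = 0;                                        /* data stays on the host */
	sg->length = cpu_to_le32(data_len);
	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | NVME_SGL_FMT_TRANSPORT_A;
}

The target-side check in nvmet_tcp_map_data (shown later) matches the first descriptor type to decide that inline data follows the command PDU.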

3. Host-side and target-side IO call stacks

I added a printk to every function of interest. When a write command is issued, the host-side and target-side call sequences are as follows:

Host side:

[  627.604513] nvme_dev_release
[ 709.055115] *****************************nvme_ioctl
[ 709.055116] nvme_get_ns_from_disk
[ 709.055117] nvme_user_cmd
[ 709.055120] nvme_passthru_start
[ 709.055120] nvme_to_user_ptr
[ 709.055121] nvme_to_user_ptr
[ 709.055121] nvme_submit_user_cmd
[ 709.055122] nvme_alloc_request
[ 709.055128] nvme_tcp_queue_rq
[ 709.055128] nvme_tcp_setup_cmd_pdu
[ 709.055129] nvme_setup_cmd
[ 709.055129] nvme_tcp_map_data
[ 709.055129] nvme_tcp_set_sg_inline
[ 709.055131] ##### nvme_tcp_queue_request
[ 709.055135] *************************************nvme_tcp_io_work
[ 709.055136] nvme_tcp_try_send
[ 709.055136] nvme_tcp_fetch_request
[ 709.055136] nvme_tcp_try_send_cmd_pdu
[ 709.055137] nvme_tcp_has_inline_data
[ 709.055146] nvme_tcp_init_iter
[ 709.055147] nvme_tcp_has_inline_data
[ 709.055147] nvme_tcp_try_send_data
[ 709.060325] nvme_tcp_data_ready
[ 709.060327] nvme_tcp_advance_req
[ 709.060328] nvme_tcp_try_recv
[ 709.060328] nvme_tcp_recv_skb
[ 709.060329] nvme_tcp_recv_pdu
[ 709.060329] nvme_tcp_init_recv_ctx
[ 709.060329] nvme_tcp_handle_comp
[ 709.060330] nvme_tcp_process_nvme_cqe
[ 709.060331] nvme_complete_rq
[ 709.060331] nvme_error_status
[ 709.060339] nvme_tcp_try_send
[ 709.060339] nvme_tcp_fetch_request
[ 709.060339] nvme_tcp_try_recv
[ 709.060340] *************************************nvme_tcp_io_work
[ 709.060341] nvme_tcp_try_send
[ 709.060341] nvme_tcp_fetch_request
[ 709.060341] nvme_tcp_try_recv

Target side:

[  587.920537] nvmet_tcp_data_ready
[ 587.920666] nvmet_tcp_io_work
[ 587.920667] ********************************nvmet_tcp_try_recv
[ 587.920667] nvmet_tcp_try_recv_one
[ 587.920667] nvmet_tcp_try_recv_pdu
[ 587.920672] nvmet_tcp_done_recv_pdu
[ 587.920672] nvmet_tcp_get_cmd
[ 587.920674] nvmet_req_init
[ 587.920674] nvmet_parse_io_cmd
[ 587.920675] nvmet_bdev_parse_io_cmd
[ 587.920677] nvmet_tcp_map_data
[ 587.920681] nvmet_tcp_map_pdu_iovec
[ 587.920681] nvmet_tcp_try_recv_data
[ 587.920683] nvmet_req_execute
[ 587.920684] nvmet_bdev_execute_rw
[ 587.920696] nvme_setup_cmd
[ 587.920697] nvme_setup_rw
[ 587.920711] nvmet_prepare_receive_pdu
[ 587.920711] nvmet_tcp_try_recv_one
[ 587.920711] nvmet_tcp_try_recv_pdu
[ 587.920712] nvmet_tcp_try_send
[ 587.920712] nvmet_tcp_try_send_one
[ 587.920713] nvmet_tcp_fetch_cmd
[ 587.920713] nvmet_tcp_process_resp_list
[ 587.922352] nvme_cleanup_cmd
[ 587.922353] nvme_complete_rq
[ 587.922354] nvme_error_status
[ 587.922356] nvmet_bio_done
[ 587.922357] blk_to_nvme_status
[ 587.922357] nvmet_req_complete
[ 587.922358] __nvmet_req_complete
[ 587.922359] nvmet_tcp_queue_response
[ 587.922370] nvmet_tcp_io_work
[ 587.922370] ********************************nvmet_tcp_try_recv
[ 587.922371] nvmet_tcp_try_recv_one
[ 587.922371] nvmet_tcp_try_recv_pdu
[ 587.922373] nvmet_tcp_try_send
[ 587.922374] nvmet_tcp_try_send_one
[ 587.922374] nvmet_tcp_fetch_cmd
[ 587.922374] nvmet_tcp_process_resp_list
[ 587.922375] nvmet_setup_response_pdu
[ 587.922375] nvmet_try_send_response
[ 587.922430] nvmet_tcp_try_send_one
[ 587.922431] nvmet_tcp_fetch_cmd
[ 587.922431] nvmet_tcp_process_resp_list

When a read command is issued, the call stacks are as follows:

Host side:

[10119.754240] *****************************nvme_ioctl
[10119.754241] nvme_get_ns_from_disk
[10119.754246] nvme_user_cmd
[10119.754248] nvme_passthru_start
[10119.754249] nvme_to_user_ptr
[10119.754250] nvme_to_user_ptr
[10119.754250] nvme_submit_user_cmd
[10119.754251] nvme_alloc_request
[10119.754264] nvme_tcp_queue_rq
[10119.754265] nvme_tcp_setup_cmd_pdu
[10119.754265] nvme_setup_cmd
[10119.754268] nvme_tcp_init_iter
[10119.754269] nvme_tcp_map_data
[10119.754269] nvme_tcp_set_sg_host_data
[10119.754271] ##### nvme_tcp_queue_request
[10119.754278] *************************************nvme_tcp_io_work
[10119.754278] nvme_tcp_try_send
[10119.754278] nvme_tcp_fetch_request
[10119.754279] nvme_tcp_try_send_cmd_pdu
[10119.754279] nvme_tcp_has_inline_data
[10119.754363] nvme_tcp_done_send_req
[10119.754364] nvme_tcp_has_inline_data
[10119.754364] nvme_tcp_try_recv
[10119.754365] nvme_tcp_try_send
[10119.754365] nvme_tcp_fetch_request
[10119.754366] nvme_tcp_try_recv
[10119.760851] nvme_tcp_data_ready
[10119.760860] nvme_tcp_data_ready
[10119.760864] *************************************nvme_tcp_io_work
[10119.760865] nvme_tcp_try_send
[10119.760865] nvme_tcp_fetch_request
[10119.760866] nvme_tcp_try_recv
[10119.762412] nvme_tcp_recv_skb
[10119.762412] nvme_tcp_recv_pdu
[10119.762413] nvme_tcp_handle_c2h_data
[10119.762415] nvme_tcp_recv_data
[10119.762417] nvme_tcp_init_recv_ctx
[10119.762417] nvme_tcp_recv_pdu
[10119.762417] nvme_tcp_init_recv_ctx
[10119.762418] nvme_tcp_handle_comp
[10119.762418] nvme_tcp_process_nvme_cqe
[10119.762419] nvme_complete_rq
[10119.762419] nvme_error_status
[10119.762426] nvme_tcp_try_send
[10119.762426] nvme_tcp_fetch_request
[10119.762427] nvme_tcp_try_recv

Target side:

[10113.557551] nvmet_tcp_io_work
[10113.557552] ********************************nvmet_tcp_try_recv
[10113.557552] nvmet_tcp_try_recv_one
[10113.557553] nvmet_tcp_try_recv_pdu
[10113.557562] nvmet_tcp_done_recv_pdu
[10113.557562] nvmet_tcp_get_cmd
[10113.557567] nvmet_req_init
[10113.557567] nvmet_parse_io_cmd
[10113.557576] nvmet_bdev_parse_io_cmd
[10113.557579] nvmet_tcp_map_data
[10113.557583] nvmet_req_execute
[10113.557583] nvmet_bdev_execute_rw
[10113.557607] nvme_setup_cmd
[10113.557607] nvme_setup_rw
[10113.557635] nvme_cleanup_cmd
[10113.557636] nvme_complete_rq
[10113.557637] nvme_error_status
[10113.557637] blk_mq_end_request
[10113.557638] nvmet_bio_done
[10113.557638] blk_to_nvme_status
[10113.557639] nvmet_req_complete
[10113.557639] __nvmet_req_complete
[10113.557640] nvmet_tcp_queue_response
[10113.557645] nvmet_prepare_receive_pdu
[10113.557645] nvmet_tcp_try_recv_one
[10113.557646] nvmet_tcp_try_recv_pdu
[10113.557646] nvmet_tcp_try_send
[10113.557647] nvmet_tcp_try_send_one
[10113.557647] nvmet_tcp_fetch_cmd
[10113.557647] nvmet_tcp_process_resp_list
[10113.557648] nvmet_setup_c2h_data_pdu
[10113.557649] nvmet_try_send_data_pdu
[10113.557653] nvmet_try_send_data
[10113.557685] nvmet_setup_response_pdu
[10113.557685] nvmet_try_send_response
[10113.557686] nvmet_tcp_put_cmd
[10113.557687] nvmet_tcp_try_send_one
[10113.557687] nvmet_tcp_fetch_cmd
[10113.557687] nvmet_tcp_process_resp_list
[10113.557687] llist_del_all=NULL
[10113.557688] nvmet_tcp_io_work
[10113.557688] ********************************nvmet_tcp_try_recv
[10113.557689] nvmet_tcp_try_recv_one
[10113.557689] nvmet_tcp_try_recv_pdu
[10113.557690] nvmet_tcp_try_send
[10113.557690] nvmet_tcp_try_send_one
[10113.557690] nvmet_tcp_fetch_cmd
[10113.557691] nvmet_tcp_process_resp_list

4. Write command call graph

5. Read command call graph

The call-graph figures for the write and read IO stacks are shown in the diagrams (hand-drawn and admittedly not very pretty); if they are hard to read, download them and zoom in.

Below, taking the write command as the example, the functions are analyzed in order (top to bottom, left to right, following the arrows); the read path is similar.

6. Function-by-function analysis

Host side

nvme_ioctl

Execution enters nvme_ioctl with the write command handed down by nvme-cli.

static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct nvme_ns_head *head = NULL;
void __user *argp = (void __user *)arg;
struct nvme_ns *ns;
int srcu_idx, ret;

// look up the namespace of the nvme device
ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
if (unlikely(!ns))
return -EWOULDBLOCK;

/*
* Handle ioctls that apply to the controller instead of the namespace
* seperately and drop the ns SRCU reference early. This avoids a
* deadlock when deleting namespaces using the passthrough interface.
*/
if (is_ctrl_ioctl(cmd)){
// controller (admin) ioctls take this branch; the ns SRCU reference is dropped early
return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
}

switch (cmd) {
case NVME_IOCTL_ID:
force_successful_syscall_return();
ret = ns->head->ns_id;
break;
case NVME_IOCTL_IO_CMD:
// IO commands take this branch
ret = nvme_user_cmd(ns->ctrl, ns, argp);
break;
case NVME_IOCTL_SUBMIT_IO:
ret = nvme_submit_io(ns, argp);
break;
case NVME_IOCTL_IO64_CMD:
ret = nvme_user_cmd64(ns->ctrl, ns, argp);
break;
default:
if (ns->ndev)
ret = nvme_nvm_ioctl(ns, cmd, arg);
else
ret = -ENOTTY;
}

nvme_put_ns_from_disk(head, srcu_idx);

return ret;
}

Since this is an IO (write) command, nvme_user_cmd is called to submit it.

nvme_user_cmd

static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct nvme_passthru_cmd __user *ucmd)
{
struct nvme_passthru_cmd cmd;
struct nvme_command c;
unsigned timeout = 0;
u32 effects;
u64 result;
int status;

if (!capable(CAP_SYS_ADMIN))
return -EACCES;
/* copy the command prepared in user space into the local cmd structure */
if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
return -EFAULT;
if (cmd.flags)
return -EINVAL;

/* initialize the nvme_command from the passthru fields */
memset(&c, 0, sizeof(c));
c.common.opcode = cmd.opcode;
c.common.flags = cmd.flags;
c.common.nsid = cpu_to_le32(cmd.nsid);
c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
c.common.cdw10 = cpu_to_le32(cmd.cdw10);
c.common.cdw11 = cpu_to_le32(cmd.cdw11);
c.common.cdw12 = cpu_to_le32(cmd.cdw12);
c.common.cdw13 = cpu_to_le32(cmd.cdw13);
c.common.cdw14 = cpu_to_le32(cmd.cdw14);
c.common.cdw15 = cpu_to_le32(cmd.cdw15);

/* set the timeout */
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);

effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
/* submit the nvme_command; with NVMe over TCP this goes through a TCP queue rather than a PCI queue */
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
nvme_to_user_ptr(cmd.addr), cmd.data_len,
nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
0, &result, timeout);
nvme_passthru_end(ctrl, effects);

if (status >= 0) {
/* return the result for admin or NVMe over Fabrics commands */
if (put_user(result, &ucmd->result)){
return -EFAULT;
}
}

return status;
}

nvme_submit_user_cmd

static int nvme_submit_user_cmd(struct request_queue *q,
struct nvme_command *cmd, void __user *ubuffer,
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
u32 meta_seed, u64 *result, unsigned timeout)
{
/* in an NVMe command, an opcode whose lowest bit is 1 is a write; otherwise it is a read */
bool write = nvme_is_write(cmd);
struct nvme_ns *ns = q->queuedata;
struct gendisk *disk = ns ? ns->disk : NULL;
struct request *req;
struct bio *bio = NULL;
void *meta = NULL;
int ret;

/* allocate a request for the command to execute */
req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
if (IS_ERR(req)){
return PTR_ERR(req);
}

req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
nvme_req(req)->flags |= NVME_REQ_USERCMD;

if (ubuffer && bufflen) {
/* map the user-space buffer ubuffer onto req->bio, so a write can take its payload straight
from req->bio and a read can land its data directly in the user-space buffer */
ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
GFP_KERNEL);
if (ret)
goto out;
bio = req->bio;
bio->bi_disk = disk;
if (disk && meta_buffer && meta_len) {
/* attach metadata */
meta = nvme_add_user_metadata(bio, meta_buffer, meta_len,
meta_seed, write);
if (IS_ERR(meta)) {
ret = PTR_ERR(meta);
goto out_unmap;
}
req->cmd_flags |= REQ_INTEGRITY;
}
}

/* blk_execute_rq hands the request to the block layer; since this is a TCP queue it ends up in nvme_tcp_queue_rq */
/*insert a request into queue for execution*/
blk_execute_rq(req->q, disk, req, 0);
if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
ret = -EINTR;
else
ret = nvme_req(req)->status;
if (result)
*result = le64_to_cpu(nvme_req(req)->result.u64);

if (meta && !ret && !write) {
/* for a read that carries metadata, copy the metadata back into meta_buffer */
if (copy_to_user(meta_buffer, meta, meta_len))
ret = -EFAULT;
}
kfree(meta);
out_unmap:
if (bio)
blk_rq_unmap_user(bio); /* undo the user-space bio mapping */
out:

blk_mq_free_request(req);
return ret;
}

nvme_tcp_queue_rq

static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct nvme_ns *ns = hctx->queue->queuedata;
struct nvme_tcp_queue *queue = hctx->driver_data;
struct request *rq = bd->rq;
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
blk_status_t ret;

if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);

/* set up the cmd PDU to be sent: build the nvme command and fill in the SGL */
ret = nvme_tcp_setup_cmd_pdu(ns, rq);
if (unlikely(ret))
return ret;

/* bookkeeping before the request starts executing */
blk_mq_start_request(rq);

/* queue the request for sending */
nvme_tcp_queue_request(req);

return BLK_STS_OK;
}

nvme_tcp_setup_cmd_pdu

static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
struct request *rq)
{
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
struct nvme_tcp_cmd_pdu *pdu = req->pdu;
struct nvme_tcp_queue *queue = req->queue;
u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
blk_status_t ret;

ret = nvme_setup_cmd(ns, rq, &pdu->cmd);
if (ret)
return ret;

/* initialize the nvme_tcp_request state; as blk_mq_rq_to_pdu(rq) implies, the struct request above and this driver pdu are adjacent in memory */
req->state = NVME_TCP_SEND_CMD_PDU;
req->offset = 0;
req->data_sent = 0;
req->pdu_len = 0;
req->pdu_sent = 0;
req->data_len = blk_rq_nr_phys_segments(rq) ?
blk_rq_payload_bytes(rq) : 0; /* length of the data to transfer */
req->curr_bio = rq->bio;

/* if this is a write and data_len fits within the queue's inline data size,
the data is sent in-capsule together with the command PDU, so pdu_len is set to data_len */
if (rq_data_dir(rq) == WRITE &&
req->data_len <= nvme_tcp_inline_data_size(queue))
req->pdu_len = req->data_len;
/* otherwise, if curr_bio is set (e.g. a read), initialize the iter into which data received from the socket will later be copied */
else if (req->curr_bio)
nvme_tcp_init_iter(req, READ);

/* initialize the header of the PDU about to be sent */
pdu->hdr.type = nvme_tcp_cmd;
pdu->hdr.flags = 0;
if (queue->hdr_digest)
pdu->hdr.flags |= NVME_TCP_F_HDGST;
if (queue->data_digest && req->pdu_len) {
pdu->hdr.flags |= NVME_TCP_F_DDGST;
ddgst = nvme_tcp_ddgst_len(queue);
}
pdu->hdr.hlen = sizeof(*pdu);
pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
pdu->hdr.plen =
cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);

/* set up the SGL (the role of addr = 0 vs. icdoff here puzzled me; see the descriptor sketch in section 2) */
ret = nvme_tcp_map_data(queue, rq);
if (unlikely(ret)) {
nvme_cleanup_cmd(rq);
dev_err(queue->ctrl->ctrl.device,
"Failed to map data (%d)\n", ret);
return ret;
}

return 0;
}
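
As a concrete example: for the 4 KiB inline write issued above, and assuming neither header nor data digest was negotiated, the header works out to hlen = sizeof(struct nvme_tcp_cmd_pdu) = 8 bytes of common PDU header + 64 bytes of NVMe SQE = 72, pdo = 72 (the data starts right after the header), and plen = 72 + 4096 = 4168 bytes for the whole PDU on the wire.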

nvme_tcp_queue_request

static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req)
{
printk("##### nvme_tcp_queue_request");
struct nvme_tcp_queue *queue = req->queue;

spin_lock(&queue->lock);
list_add_tail(&req->entry, &queue->send_list);
spin_unlock(&queue->lock);

/* kick off the io_work handler */
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}

nvme_tcp_io_work

static void nvme_tcp_io_work(struct work_struct *w)
{
struct nvme_tcp_queue *queue =
container_of(w, struct nvme_tcp_queue, io_work);
unsigned long deadline = jiffies + msecs_to_jiffies(1);

do {
bool pending = false;
int result;

/* send PDUs over TCP */
result = nvme_tcp_try_send(queue);
if (result > 0) {
pending = true;
} else if (unlikely(result < 0)) {
dev_err(queue->ctrl->ctrl.device,
"failed to send request %d\n", result);

/*
* Fail the request unless peer closed the connection,
* in which case error recovery flow will complete all.
*/
if ((result != -EPIPE) && (result != -ECONNRESET)){
nvme_tcp_fail_request(queue->request);
}

nvme_tcp_done_send_req(queue);
return;
}

/* receive completions (CQEs) */
result = nvme_tcp_try_recv(queue);
if (result > 0)
pending = true;

if (!pending){
return;
}

} while (!time_after(jiffies, deadline)); /* quota is exhausted */


queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}

nvme_tcp_try_send

static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_request *req;
int ret = 1;

if (!queue->request) {
/* take a request from the queue; if there is none, return immediately */
queue->request = nvme_tcp_fetch_request(queue);
if (!queue->request){
return 0;
}
}
req = queue->request;

/* note these are four consecutive ifs, not an if/else chain: req->state is advanced inside the send helpers,
so for this write the order is nvme_tcp_try_send_cmd_pdu -> nvme_tcp_try_send_data, with no DDGST */
if (req->state == NVME_TCP_SEND_CMD_PDU) {
ret = nvme_tcp_try_send_cmd_pdu(req);
if (ret <= 0)
goto done;
if (!nvme_tcp_has_inline_data(req)){
return ret;
}
}

if (req->state == NVME_TCP_SEND_H2C_PDU) {
ret = nvme_tcp_try_send_data_pdu(req);
if (ret <= 0)
goto done;
}

if (req->state == NVME_TCP_SEND_DATA) {
ret = nvme_tcp_try_send_data(req);
if (ret <= 0)
goto done;
}

if (req->state == NVME_TCP_SEND_DDGST)
ret = nvme_tcp_try_send_ddgst(req);
done:
if (ret == -EAGAIN)
ret = 0;
return ret;
}

nvme_tcp_try_send_cmd_pdu

static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
struct nvme_tcp_cmd_pdu *pdu = req->pdu;
bool inline_data = nvme_tcp_has_inline_data(req);
int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR);
u8 hdgst = nvme_tcp_hdgst_len(queue);
int len = sizeof(*pdu) + hdgst - req->offset;
int ret;


/* if header digest is enabled, compute it first */
if (queue->hdr_digest && !req->offset)
nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));

/* send the command PDU */
ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
offset_in_page(pdu) + req->offset, len, flags);
if (unlikely(ret <= 0))
return ret;

len -= ret;
if (!len) {
if (inline_data) {
/* if there is inline data to send, switch the state to NVME_TCP_SEND_DATA so nvme_tcp_try_send_data runs next */
req->state = NVME_TCP_SEND_DATA;
if (queue->data_digest)
crypto_ahash_init(queue->snd_hash);
/* initialize the iter over the request's bio pages; the SGL in the command only describes the inline data to the controller, while this iter drives the actual socket sends */
nvme_tcp_init_iter(req, WRITE);
} else {
nvme_tcp_done_send_req(queue);
}
return 1;
}
/* accumulate the offset */
req->offset += ret;

return -EAGAIN;
}

nvme_tcp_try_send_data

static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;

while (true) {
struct page *page = nvme_tcp_req_cur_page(req);
size_t offset = nvme_tcp_req_cur_offset(req);
size_t len = nvme_tcp_req_cur_length(req);
bool last = nvme_tcp_pdu_last_send(req, len);
int ret, flags = MSG_DONTWAIT;

/* on the last chunk with no data digest, flag EOR; otherwise MORE */
if (last && !queue->data_digest)
flags |= MSG_EOR;
else
flags |= MSG_MORE;

if (sendpage_ok(page)) {
ret = kernel_sendpage(queue->sock, page, offset, len,
flags);
} else {
ret = sock_no_sendpage(queue->sock, page, offset, len,
flags);
}
if (ret <= 0)
return ret;

/* advance the offsets for the next chunk; in effect each bio's data is sent in turn,
moving on to the next bio once the current one is exhausted (see the sketch of nvme_tcp_advance_req below) */
nvme_tcp_advance_req(req, ret);
if (queue->data_digest)
nvme_tcp_ddgst_update(queue->snd_hash, page,
offset, ret);

/* fully successful last write*/
if (last && ret == len) {
if (queue->data_digest) {
nvme_tcp_ddgst_final(queue->snd_hash,
&req->ddgst);
req->state = NVME_TCP_SEND_DDGST;
req->offset = 0;
} else {
/* all data sent */
nvme_tcp_done_send_req(queue);
}
return 1;
}
}
return -EAGAIN;
}
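
To the question in the comment above: yes, the iter covers the bvecs of the current bio only, and nvme_tcp_advance_req moves on to the next bio once the current one is consumed, so a multi-bio request is pushed out as a series of sendpage calls. A rough sketch of its logic (paraphrased from the 5.4 source, from memory):

/* rough sketch of nvme_tcp_advance_req, paraphrased from Linux 5.4 */
static void nvme_tcp_advance_req(struct nvme_tcp_request *req, int len)
{
	req->data_sent += len;
	req->pdu_sent += len;
	iov_iter_advance(&req->iter, len);
	if (!iov_iter_count(&req->iter) &&
	    req->data_sent < req->data_len) {
		/* current bio fully consumed: switch to the next one */
		req->curr_bio = req->curr_bio->bi_next;
		nvme_tcp_init_iter(req, WRITE);
	}
}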

At this point the host side has finished sending (no digest in this setup). Following the timeline, execution now moves to the target side.

Target side

nvmet_tcp_io_work

static void nvmet_tcp_io_work(struct work_struct *w)
{
struct nvmet_tcp_queue *queue =
container_of(w, struct nvmet_tcp_queue, io_work);
bool pending;
int ret, ops = 0;

do {
pending = false;

/* unlike the host-side io_work, the target-side io_work starts by trying to receive */
ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
if (ret > 0) {
pending = true;
} else if (ret < 0) {
if (ret == -EPIPE || ret == -ECONNRESET)
kernel_sock_shutdown(queue->sock, SHUT_RDWR);
else
nvmet_tcp_fatal_error(queue);
return;
}

/* after receiving, send whatever is queued back to the host based on the received data: possibly an R2T, possibly a CQE */
ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
if (ret > 0) {
/* transmitted message/data */
pending = true;
} else if (ret < 0) {
if (ret == -EPIPE || ret == -ECONNRESET)
kernel_sock_shutdown(queue->sock, SHUT_RDWR);
else
nvmet_tcp_fatal_error(queue);
return;
}

} while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);

/*
* We exahusted our budget, requeue our selves
*/
if (pending)
queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
}

nvmet_tcp_try_recv

static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
int budget, int *recvs)
{
int i, ret = 0;

for (i = 0; i < budget; i++) {
/* try to receive data from the socket */
ret = nvmet_tcp_try_recv_one(queue);
if (ret <= 0)
break;
(*recvs)++;
}

return ret;
}

nvmet_tcp_try_recv_one

static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
{
int result = 0;

if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
return 0;

/* branch on queue->rcv_state; for this write it goes NVMET_TCP_RECV_PDU -> NVMET_TCP_RECV_DATA.
queue->rcv_state is NVMET_TCP_RECV_PDU when the queue is first initialized */
if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
result = nvmet_tcp_try_recv_pdu(queue);
if (result != 0)
goto done_recv;
}

if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
result = nvmet_tcp_try_recv_data(queue);
if (result != 0)
goto done_recv;
}

if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
result = nvmet_tcp_try_recv_ddgst(queue);
if (result != 0)
goto done_recv;
}

done_recv:
if (result < 0) {
if (result == -EAGAIN)
return 0;
return result;
}
return 1;
}

nvmet_tcp_try_recv_pdu

static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
{
struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
int len;
struct kvec iov;
struct msghdr msg = { .msg_flags = MSG_DONTWAIT };

recv:
/* point the iovec at queue->pdu so data received from the socket lands directly in queue->pdu;
queue->left starts out as sizeof(struct nvme_tcp_hdr) */
iov.iov_base = (void *)&queue->pdu + queue->offset;
iov.iov_len = queue->left;
len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
iov.iov_len, msg.msg_flags);
if (unlikely(len < 0))
return len;

queue->offset += len;
queue->left -= len;
if (queue->left)
return -EAGAIN;

if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
u8 hdgst = nvmet_tcp_hdgst_len(queue);

if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
pr_err("unexpected pdu type %d\n", hdr->type);
nvmet_tcp_fatal_error(queue);
return -EIO;
}

if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
return -EIO;
}

queue->left = hdr->hlen - queue->offset + hdgst;
goto recv;
}

if (queue->hdr_digest &&
nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) {
nvmet_tcp_fatal_error(queue); /* fatal */
return -EPROTO;
}

if (queue->data_digest &&
nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
nvmet_tcp_fatal_error(queue); /* fatal */
return -EPROTO;
}

/* the PDU has now been fully received; process it */
return nvmet_tcp_done_recv_pdu(queue);
}

nvmet_tcp_done_recv_pdu

static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
{
struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
struct nvmet_req *req;
int ret;

if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
if (hdr->type != nvme_tcp_icreq) {
pr_err("unexpected pdu type (%d) before icreq\n",
hdr->type);
nvmet_tcp_fatal_error(queue);
return -EPROTO;
}
return nvmet_tcp_handle_icreq(queue);
}

if (hdr->type == nvme_tcp_h2c_data) {
ret = nvmet_tcp_handle_h2c_data_pdu(queue);
if (unlikely(ret))
return ret;
return 0;
}

/* take an nvmet_tcp_cmd from the queue's free list and initialize it */
queue->cmd = nvmet_tcp_get_cmd(queue);
if (unlikely(!queue->cmd)) {
/* This should never happen */
pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d",
queue->idx, queue->nr_cmds, queue->send_list_len,
nvme_cmd->common.opcode);
nvmet_tcp_fatal_error(queue);
return -ENOMEM;
}

req = &queue->cmd->req;
/* copy the received nvme_cmd into queue->cmd->req->cmd */
memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));

/* initialize nvmet_req: bind the execute handler according to the opcode, attach the sq/cq, etc. */
if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
&queue->nvme_sq, &nvmet_tcp_ops))) {
pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
req->cmd, req->cmd->common.command_id,
req->cmd->common.opcode,
le32_to_cpu(req->cmd->common.dptr.sgl.length));

nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
return -EAGAIN;
}

/* allocate the SGL array and the iov array */
ret = nvmet_tcp_map_data(queue->cmd);
if (unlikely(ret)) {
pr_err("queue %d: failed to map data\n", queue->idx);
if (nvmet_tcp_has_inline_data(queue->cmd))
nvmet_tcp_fatal_error(queue);
else
nvmet_req_complete(req, ret);
ret = -EAGAIN;
goto out;
}

/* map the iov entries onto the SGL pages allocated above and initialize cmd->recv_msg.msg_iter,
then set the state to NVMET_TCP_RECV_DATA to receive the host's inline data; control returns to nvmet_tcp_try_recv_one */
if (nvmet_tcp_need_data_in(queue->cmd)) {
if (nvmet_tcp_has_inline_data(queue->cmd)) {
queue->rcv_state = NVMET_TCP_RECV_DATA;
nvmet_tcp_map_pdu_iovec(queue->cmd);
return 0;
}
/* send back R2T */
nvmet_tcp_queue_response(&queue->cmd->req);
goto out;
}

nvmet_req_execute(&queue->cmd->req);
out:
nvmet_prepare_receive_pdu(queue);
return ret;
}
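
Worth noting for this example: the 4 KiB write payload arrives in-capsule because it is below the connection's inline data size (16 KiB by default for nvmet-tcp, if memory serves), so rcv_state goes straight to NVMET_TCP_RECV_DATA above. For a larger write the target would instead queue an R2T here; the host would then hit nvme_tcp_handle_r2t, switch the request to NVME_TCP_SEND_H2C_PDU, and ship the data in one or more H2CData PDUs.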

nvmet_req_init

bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops)
{
u8 flags = req->cmd->common.flags;
u16 status;

req->cq = cq;
req->sq = sq;
req->ops = ops;
req->sg = NULL;
req->sg_cnt = 0;
req->transfer_len = 0;
req->cqe->status = 0;
req->cqe->sq_head = 0;
req->ns = NULL;
req->error_loc = NVMET_NO_ERROR_LOC;
req->error_slba = 0;

/* no support for fused commands yet */
if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) || req->cmd->common.opcode == nvme_admin_ndp_execute) {
req->error_loc = offsetof(struct nvme_common_command, flags);
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
goto fail;
}

/*
* For fabrics, PSDT field shall describe metadata pointer (MPTR) that
* contains an address of a single contiguous physical buffer that is
* byte aligned.
*/
if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) {
req->error_loc = offsetof(struct nvme_common_command, flags);
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
printk("For fabrics, PSDT field shall describe metadata pointer (MPTR) status=%d",status);
goto fail;
}

if (unlikely(!req->sq->ctrl)){
/* will return an error for any Non-connect command: */
status = nvmet_parse_connect_cmd(req);
}
/* IO commands enter here for their initialization */
else if (likely(req->sq->qid != 0)){
status = nvmet_parse_io_cmd(req);
}
else if (nvme_is_fabrics(req->cmd)){
status = nvmet_parse_fabrics_cmd(req);
}
else if (req->sq->ctrl->subsys->type == NVME_NQN_DISC){
status = nvmet_parse_discovery_cmd(req);
}
/* admin commands enter here for their initialization */
else{
status = nvmet_parse_admin_cmd(req);
}


if (status)
goto fail;

trace_nvmet_req_init(req, req->cmd);

if (unlikely(!percpu_ref_tryget_live(&sq->ref))) {
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
printk("unlikely(!percpu_ref_tryget_live(&sq->ref)) status=%d",status);
goto fail;
}

if (sq->ctrl)
sq->ctrl->cmd_seen = true;

return true;

fail:
__nvmet_req_complete(req, status);
return false;
}
EXPORT_SYMBOL_GPL(nvmet_req_init);

nvmet_parse_io_cmd

static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
u16 ret;

ret = nvmet_check_ctrl_status(req, cmd);
if (unlikely(ret))
return ret;
/* after the status checks, bind the execute handler */
req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
if (unlikely(!req->ns)) {
req->error_loc = offsetof(struct nvme_common_command, nsid);
return NVME_SC_INVALID_NS | NVME_SC_DNR;
}
ret = nvmet_check_ana_state(req->port, req->ns);
if (unlikely(ret)) {
req->error_loc = offsetof(struct nvme_common_command, nsid);
return ret;
}
ret = nvmet_io_cmd_check_access(req);
if (unlikely(ret)) {
req->error_loc = offsetof(struct nvme_common_command, nsid);
return ret;
}

if (req->ns->file){
return nvmet_file_parse_io_cmd(req);
}
else{
/* a write against a block-device backend ends up here, where the execute handler is bound and data_len computed */
return nvmet_bdev_parse_io_cmd(req);
}
}

nvmet_bdev_parse_io_cmd

u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;

switch (cmd->common.opcode) {
case nvme_cmd_read:
/* bind the request's execute handler and compute, from the command's length field, how much data this request handles */
case nvme_cmd_write:
req->execute = nvmet_bdev_execute_rw;
req->data_len = nvmet_rw_len(req);
return 0;
case nvme_cmd_flush:
req->execute = nvmet_bdev_execute_flush;
req->data_len = 0;
return 0;
case nvme_cmd_dsm:
req->execute = nvmet_bdev_execute_dsm;
req->data_len = (le32_to_cpu(cmd->dsm.nr) + 1) *
sizeof(struct nvme_dsm_range);
return 0;
case nvme_cmd_write_zeroes:
req->execute = nvmet_bdev_execute_write_zeroes;
req->data_len = 0;
return 0;
default:
pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
req->sq->qid);
req->error_loc = offsetof(struct nvme_common_command, opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
}
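
For the example write, nvmet_rw_len works out to (the zero-based NLB field in cdw12 plus one) shifted by the namespace's block size: (7 + 1) << 9 = 4096 bytes on a 512-byte formatted namespace, matching the --data-len=4096 passed to nvme-cli.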

nvmet_tcp_map_data

static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
{
printk("nvmet_tcp_map_data");
struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
u32 len = le32_to_cpu(sgl->length);

if (!cmd->req.data_len)
return 0;

if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
NVME_SGL_FMT_OFFSET)) {
if (!nvme_is_write(cmd->req.cmd))
return NVME_SC_INVALID_FIELD | NVME_SC_DNR;

if (len > cmd->req.port->inline_data_size)
return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
cmd->pdu_len = len;
}
/* transfer_len ends up equal to sgl->length here */
cmd->req.transfer_len += len;

/* allocate the scatterlist that will hold the data */
cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
if (!cmd->req.sg)
return NVME_SC_INTERNAL;
cmd->cur_sg = cmd->req.sg;

/* if data comes along with the PDU (in-capsule), allocate the cmd->iov array */
if (nvmet_tcp_has_data_in(cmd)) {
cmd->iov = kmalloc_array(cmd->req.sg_cnt,
sizeof(*cmd->iov), GFP_KERNEL);
if (!cmd->iov)
goto err;
}

return 0;
err:
sgl_free(cmd->req.sg);
return NVME_SC_INTERNAL;
}

nvmet_tcp_try_recv_data

static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
{
struct nvmet_tcp_cmd *cmd = queue->cmd;
int ret;

/* the host sends non-contiguous pages with several sendpage calls, so the target keeps receiving until the whole message has arrived */
while (msg_data_left(&cmd->recv_msg)) {
ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
cmd->recv_msg.msg_flags);
if (ret <= 0)
return ret;

cmd->pdu_recv += ret;
cmd->rbytes_done += ret;
}

/* unmap the SGL/iov mapping; all data has been received at this point */
nvmet_tcp_unmap_pdu_iovec(cmd);

if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
cmd->rbytes_done == cmd->req.transfer_len) {
if (queue->data_digest) {
nvmet_tcp_prep_recv_ddgst(cmd);
return 0;
}
/* run the handler bound earlier, here nvmet_bdev_execute_rw */
nvmet_req_execute(&cmd->req);
}

nvmet_prepare_receive_pdu(queue);
return 0;
}

nvmet_bdev_execute_rw

static void nvmet_bdev_execute_rw(struct nvmet_req *req)
{
int sg_cnt = req->sg_cnt;
struct bio *bio;
struct scatterlist *sg;
sector_t sector;
int op, op_flags = 0, i;

if (!req->sg_cnt) {
nvmet_req_complete(req, 0);
return;
}

/* choose the block-layer op/flags the target uses to submit the data to its backing device (e.g. a local PCIe NVMe drive) */
if (req->cmd->rw.opcode == nvme_cmd_write) {
op = REQ_OP_WRITE;
op_flags = REQ_SYNC | REQ_IDLE;
if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
op_flags |= REQ_FUA;
} else {
op = REQ_OP_READ;
}

if (is_pci_p2pdma_page(sg_page(req->sg)))
op_flags |= REQ_NOMERGE;

sector = le64_to_cpu(req->cmd->rw.slba);
sector <<= (req->ns->blksize_shift - 9);

/* initialize or allocate the BIO */
if (req->data_len <= NVMET_MAX_INLINE_DATA_LEN) {
bio = &req->b.inline_bio;
bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
} else {
bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
}
bio_set_dev(bio, req->ns->bdev);
bio->bi_iter.bi_sector = sector;
bio->bi_private = req;
bio->bi_end_io = nvmet_bio_done;
bio_set_op_attrs(bio, op, op_flags);

/* walk the scatterlist: pages that can be merged go into the same BIO; when a page cannot be added, allocate a new BIO, chain it, and submit the previous one via submit_bio */
for_each_sg(req->sg, sg, req->sg_cnt, i) {
while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset)
!= sg->length) {
struct bio *prev = bio;

bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
bio_set_dev(bio, req->ns->bdev);
bio->bi_iter.bi_sector = sector;
bio_set_op_attrs(bio, op, op_flags);

bio_chain(bio, prev);
submit_bio(prev);
}

sector += sg->length >> 9;
sg_cnt--;
}

submit_bio(bio);
}
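
A quick sanity check of the sector math above for the example command: slba = 5120 and, with a 512-byte block size (blksize_shift = 9), the shift is zero, so the bio starts at 512-byte sector 5120; on a 4 KiB-formatted namespace (blksize_shift = 12) the same slba would become sector 5120 << 3 = 40960, since the block layer always counts in 512-byte sectors.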

nvmet_bio_done

Once all the BIOs have completed, bi_end_io fires, i.e. nvmet_bio_done. From there the path runs down to nvmet_tcp_queue_response, meaning the NVMe command has been processed and its result is ready to be sent back.

static void nvmet_bio_done(struct bio *bio)
{
struct nvmet_req *req = bio->bi_private;

/* call the registered completion path to prepare the CQE */
nvmet_req_complete(req, blk_to_nvme_status(req, bio->bi_status));
if (bio != &req->b.inline_bio)
bio_put(bio);
}

void nvmet_req_complete(struct nvmet_req *req, u16 status)
{
__nvmet_req_complete(req, status);
percpu_ref_put(&req->sq->ref);
}
EXPORT_SYMBOL_GPL(nvmet_req_complete);

static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
{
if (!req->sq->sqhd_disabled)
nvmet_update_sq_head(req);
/* fill in the CQE, to be sent to the host later */
req->cqe->sq_id = cpu_to_le16(req->sq->qid);
req->cqe->command_id = req->cmd->common.command_id;

if (unlikely(status))
nvmet_set_error(req, status);

trace_nvmet_req_complete(req);

if (req->ns)
nvmet_put_namespace(req->ns);
req->ops->queue_response(req);
}

static void nvmet_tcp_queue_response(struct nvmet_req *req)
{
struct nvmet_tcp_cmd *cmd =
container_of(req, struct nvmet_tcp_cmd, req);
struct nvmet_tcp_queue *queue = cmd->queue;

llist_add(&cmd->lentry, &queue->resp_list);
/* schedule io_work again to send the CQE */
queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work);
}

nvmet_tcp_try_send_one

static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
bool last_in_batch)
{
struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
int ret = 0;

/* no command is currently being sent, so fetch (and set up) the next one from the response list */
if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
cmd = nvmet_tcp_fetch_cmd(queue);
if (unlikely(!cmd))
return 0;
}

if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
ret = nvmet_try_send_data_pdu(cmd);
if (ret <= 0)
goto done_send;
}

if (cmd->state == NVMET_TCP_SEND_DATA) {
ret = nvmet_try_send_data(cmd, last_in_batch);
if (ret <= 0)
goto done_send;
}

if (cmd->state == NVMET_TCP_SEND_DDGST) {
ret = nvmet_try_send_ddgst(cmd);
if (ret <= 0)
goto done_send;
}

if (cmd->state == NVMET_TCP_SEND_R2T) {
ret = nvmet_try_send_r2t(cmd, last_in_batch);
if (ret <= 0)
goto done_send;
}

/* once set up, send the response (CQE) PDU via nvmet_try_send_response */
if (cmd->state == NVMET_TCP_SEND_RESPONSE)
ret = nvmet_try_send_response(cmd, last_in_batch);

done_send:
if (ret < 0) {
if (ret == -EAGAIN)
return 0;
return ret;
}

return 1;
}

static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
{
/* take the next command from resp_send_list */
queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
struct nvmet_tcp_cmd, entry);
if (!queue->snd_cmd) {
/* if resp_send_list is empty, drain the lockless resp_list (filled by nvmet_tcp_queue_response) into it and retry; see the sketch below */
nvmet_tcp_process_resp_list(queue);
queue->snd_cmd =
list_first_entry_or_null(&queue->resp_send_list,
struct nvmet_tcp_cmd, entry);
if (unlikely(!queue->snd_cmd))
return NULL;
}

/* remove it from the queue's send list */
list_del_init(&queue->snd_cmd->entry);
queue->send_list_len--;

if (nvmet_tcp_need_data_out(queue->snd_cmd))
nvmet_setup_c2h_data_pdu(queue->snd_cmd);
else if (nvmet_tcp_need_data_in(queue->snd_cmd))
nvmet_setup_r2t_pdu(queue->snd_cmd);
else
/* this write needs no data back, so nvmet_setup_response_pdu builds a plain response PDU */
nvmet_setup_response_pdu(queue->snd_cmd);

return queue->snd_cmd;
}
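
For completeness, nvmet_tcp_process_resp_list is the consumer side of the lockless resp_list that nvmet_tcp_queue_response fills with llist_add: it drains the llist and moves the completed commands onto resp_send_list, where nvmet_tcp_fetch_cmd picks them up. A rough sketch (paraphrased from the 5.4 source, from memory):

/* rough sketch of nvmet_tcp_process_resp_list, paraphrased from Linux 5.4 */
static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
{
	struct llist_node *node;

	/* grab everything queued by nvmet_tcp_queue_response in one shot */
	for (node = llist_del_all(&queue->resp_list); node; node = node->next) {
		struct nvmet_tcp_cmd *cmd =
			llist_entry(node, struct nvmet_tcp_cmd, lentry);

		list_add(&cmd->entry, &queue->resp_send_list);
		queue->send_list_len++;
	}
}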

nvmet_setup_response_pdu

static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
{
struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
struct nvmet_tcp_queue *queue = cmd->queue;
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);

cmd->offset = 0;
cmd->state = NVMET_TCP_SEND_RESPONSE;

/* initialize the response PDU header */
pdu->hdr.type = nvme_tcp_rsp;
pdu->hdr.flags = 0;
pdu->hdr.hlen = sizeof(*pdu);
pdu->hdr.pdo = 0;
pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
if (cmd->queue->hdr_digest) {
pdu->hdr.flags |= NVME_TCP_F_HDGST;
nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
}
}

static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
bool last_in_batch)
{
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
int flags = MSG_DONTWAIT;
int ret;

if (!last_in_batch && cmd->queue->send_list_len)
flags |= MSG_MORE;
else
flags |= MSG_EOR;

/* send the response PDU to the host */
ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
if (ret <= 0)
return ret;
cmd->offset += ret;
left -= ret;

if (left)
return -EAGAIN;

kfree(cmd->iov);
sgl_free(cmd->req.sg);
cmd->queue->snd_cmd = NULL;
nvmet_tcp_put_cmd(cmd);
return 1;
}

This concludes the target side's work. We now switch back to the host and follow its try_recv path.

Host side

nvme_tcp_try_recv

static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
{
struct socket *sock = queue->sock;
struct sock *sk = sock->sk;
read_descriptor_t rd_desc;
int consumed;

rd_desc.arg.data = queue;
rd_desc.count = 1;
lock_sock(sk);
queue->nr_cqe = 0;
/* process the received data with nvme_tcp_recv_skb */
consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
release_sock(sk);
return consumed;
}

static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
unsigned int offset, size_t len)
{
struct nvme_tcp_queue *queue = desc->arg.data;
size_t consumed = len;
int result;

while (len) {
switch (nvme_tcp_recv_state(queue)) {
case NVME_TCP_RECV_PDU:
/* first state: receive the PDU header via nvme_tcp_recv_pdu */
result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
break;
case NVME_TCP_RECV_DATA:
result = nvme_tcp_recv_data(queue, skb, &offset, &len);
break;
case NVME_TCP_RECV_DDGST:
result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
break;
default:
result = -EFAULT;
}
if (result) {
dev_err(queue->ctrl->ctrl.device,
"receive failed: %d\n", result);
queue->rd_enabled = false;
nvme_tcp_error_recovery(&queue->ctrl->ctrl);
return result;
}
}

return consumed;
}

static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
unsigned int *offset, size_t *len)
{
struct nvme_tcp_hdr *hdr;
char *pdu = queue->pdu;
size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
int ret;

/* copy the received PDU bytes into queue->pdu */
ret = skb_copy_bits(skb, *offset,
&pdu[queue->pdu_offset], rcv_len);
if (unlikely(ret))
return ret;

queue->pdu_remaining -= rcv_len;
queue->pdu_offset += rcv_len;
*offset += rcv_len;
*len -= rcv_len;
if (queue->pdu_remaining){
return 0;
}

hdr = queue->pdu;
if (queue->hdr_digest) {
ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
if (unlikely(ret))
return ret;
}

if (queue->data_digest) {
ret = nvme_tcp_check_ddgst(queue, queue->pdu);
if (unlikely(ret))
return ret;
}

switch (hdr->type) {
case nvme_tcp_c2h_data:
return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
case nvme_tcp_rsp:
/* a write command takes the branch below: reset the recv context and process the completion */
nvme_tcp_init_recv_ctx(queue);
return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
case nvme_tcp_r2t:
nvme_tcp_init_recv_ctx(queue);
return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
default:
dev_err(queue->ctrl->ctrl.device,
"unsupported pdu type (%d)\n", hdr->type);
return -EINVAL;
}
}
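
On the read path the same function takes the nvme_tcp_c2h_data branch instead: the target returns the payload in one or more C2HData PDUs, nvme_tcp_handle_c2h_data and nvme_tcp_recv_data copy it into the iov_iter that nvme_tcp_setup_cmd_pdu prepared over the request's bio, and the completion then arrives as a separate nvme_tcp_rsp PDU, which is exactly what the read-side traces in section 3 show.
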
static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
{
queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
nvme_tcp_hdgst_len(queue);
queue->pdu_offset = 0;
queue->data_remaining = -1;
queue->ddgst_remaining = 0;
}

static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
struct nvme_tcp_rsp_pdu *pdu)
{
struct nvme_completion *cqe = &pdu->cqe;
int ret = 0;

/*
* AEN requests are special as they don't time out and can
* survive any kind of queue freeze and often don't respond to
* aborts. We don't even bother to allocate a struct request
* for them but rather special case them here.
*/
if (unlikely(nvme_tcp_queue_id(queue) == 0 &&
cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
&cqe->result);
else
/* process the received CQE */
ret = nvme_tcp_process_nvme_cqe(queue, cqe);

return ret;
}

nvme_tcp_process_nvme_cqe

static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
struct nvme_completion *cqe)
{
struct request *rq;

rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
if (!rq) {
dev_err(queue->ctrl->ctrl.device,
"queue %d tag 0x%x not found\n",
nvme_tcp_queue_id(queue), cqe->command_id);
nvme_tcp_error_recovery(&queue->ctrl->ctrl);
return -EINVAL;
}

nvme_end_request(rq, cqe->status, cqe->result);
queue->nr_cqe++;

return 0;
}

static inline void nvme_end_request(struct request *req, __le16 status,
union nvme_result result)
{
struct nvme_request *rq = nvme_req(req);

rq->status = le16_to_cpu(status) >> 1;
rq->result = result;
/* inject error when permitted by fault injection framework */
nvme_should_fail(req);
/* invoke the completion callback bound to the TCP queue, which ends up in nvme_complete_rq */
blk_mq_complete_request(req);
}

void nvme_complete_rq(struct request *req)
{
blk_status_t status = nvme_error_status(nvme_req(req)->status);

trace_nvme_complete_rq(req);

if (nvme_req(req)->ctrl->kas)
nvme_req(req)->ctrl->comp_seen = true;

if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req))
return;

if (!blk_queue_dying(req->q)) {
nvme_retry_req(req);
return;
}
}

nvme_trace_bio_complete(req, status);
/* finish the request */
blk_mq_end_request(req, status);
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);

Control then returns to the original nvme_user_cmd function.

nvme_passthru_end

static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
{
/*
* Revalidate LBA changes prior to unfreezing. This is necessary to
* prevent memory corruption if a logical block size was changed by
* this command.
*/
if (effects & NVME_CMD_EFFECTS_LBCC)
nvme_update_formats(ctrl);
if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
nvme_unfreeze(ctrl);
nvme_mpath_unfreeze(ctrl->subsys);
mutex_unlock(&ctrl->subsys->lock);
nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
mutex_unlock(&ctrl->scan_lock);
}
if (effects & NVME_CMD_EFFECTS_CCC)
nvme_init_identify(ctrl);
if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC))
nvme_queue_scan(ctrl);
}

This completes one ioctl write command.