daos_obj_generate_oid
- 输入:
- daos_handle_t oh : 存储了一个unsigned int 64的cookie
- daos_obj_id_t oid : daos object id, 用两个64位int 存储 lo (low) 和 hi (high)
- enum DAOS_OT_KV_HASHED : flat KV (no akey) with integer dkey
- OC_SX :OC策略
- 0 :变长参数
- 0 :变长参数
- 实现,内部调用了
daos_obj_generate_oid2
- 输入:
- daos_handle_t
- daos_obj_id_t
- daos_otype_t
- daos_oclass_id_t : object class id, 32位int
- daos_oclass_hints_t :object class hint, 16位int
- args : 其他int参数
- dc_cont_hdl2pool_hdl
- 根据cookie获取pool handler
- rc = pl_map_query(pool->dp_pool, &attr)
- dc_pool_put(struct dc_pool *pool) // 将pool->dp_hlink放入了一个双向队列中?
- 按照选定的OC策略做相应的操作
- daos_obj_set_oid(oid, type, ord, nr_grp, args); // 就生成好了?
- 输入:
daos_kv_open
- 输入:
- daos_handle_t
- daos_obj_id_t
- mode : int 类型
- daos_handle_t
- daos_event_t : completion event(可挂在 event queue 上), 用于异步调用的完成通知; 传 NULL 时函数以阻塞方式执行
- 调用
rc = dc_task_create(dc_kv_open, NULL, ev, &task)
- 将参数包装成dc_kv_open结构体
- 生成一个kvOpen的task并调度
dts_buf_render(buf, BUFLEN)
- 输入buf和len
- 利用随机生成的字符填满buf
daos_kv_put
- 输入:
- daos_handle_t oh
- daos_handle_t th
- uint64_t flag
- char * key
- daos_size_t buf_size
- void * buf
- daos_event_t * ev
- 调用rc = dc_task_create(dc_kv_put, NULL, ev, &task);
- 将参数包装到task的args中
- 调用dc_task_schedule(task, true);
- task_is_valid(task)
- ev = task_ptr2args(task)->ta_ev;
- rc = daos_event_launch(ev);
- 输入:daos_event *ev
- rc = tse_task_schedule(task, instant);
kv_update
daos_io_0线程接收到io请求,crt_handle_rpc(void *arg)处理rpc
ds_obj_rw_handler(crt_rpc_t *rpc)处理rpc
obj_ioc_begin(orw->orw_oid.id_pub, orw->orw_map_ver,
orw->orw_pool_uuid, orw->orw_co_hdl,
orw->orw_co_uuid, opc_get(rpc->cr_opc),
orw->orw_flags, &ioc); // various check before access VOS
rc = process_epoch(&orw->orw_epoch, &orw->orw_epoch_first,
&orw->orw_flags); // 处理epoch相关
Object_Update
callstack
libvos_srv.so!dkey_update(daos_key_t * dkey, uint32_t pm_ver, struct vos_io_context * ioc) (/home/kuhan/daos/src/vos/vos_io.c:1803)
libvos_srv.so!vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t * dkey, int err, daos_size_t * size, struct dtx_handle * dth) (/home/kuhan/daos/src/vos/vos_io.c:2278)
libobj.so!obj_rw_complete(struct dtx_handle * dth, int status, daos_handle_t ioh, struct obj_io_context * ioc, crt_rpc_t * rpc) (/home/kuhan/daos/src/object/srv_obj.c:128)
libobj.so!obj_local_rw_internal(struct dtx_handle * dth, uint64_t * split_offs, struct dcs_iod_csums * split_csums, daos_iod_t * split_iods, struct obj_io_context * ioc, crt_rpc_t * rpc) (/home/kuhan/daos/src/object/srv_obj.c:1677)
libobj.so!obj_local_rw(crt_rpc_t * rpc, struct obj_io_context * ioc, daos_iod_t * split_iods, struct dcs_iod_csums * split_csums, uint64_t * split_offs, struct dtx_handle * dth, _Bool pin) (/home/kuhan/daos/src/object/srv_obj.c:1697)
libobj.so!obj_tgt_update(dtx_sub_comp_cb_t comp_cb, void * arg, struct dtx_leader_handle * dlh) (/home/kuhan/daos/src/object/srv_obj.c:2425)
libobj.so!obj_tgt_update(struct dtx_leader_handle * dlh, void * arg, int idx, dtx_sub_comp_cb_t comp_cb) (/home/kuhan/daos/src/object/srv_obj.c:2356)
libobj.so!ds_obj_rw_handler(crt_rpc_t * rpc) (/home/kuhan/daos/src/object/srv_obj.c:2664)
libcart.so.4!crt_handle_rpc(void * arg) (/home/kuhan/daos/src/cart/crt_rpc.c:1638)
libabt.so.1!ABTD_ythread_func_wrapper (未知源:0)
libabt.so.1!make_fcontext (未知源:0)
[Unknown/Just-In-Time compiled code] (未知源:0)
obj_local_rw_internal
rc = vos_update_begin(ioc->ioc_vos_coh, orw->orw_oid,
orw->orw_epoch, cond_flags, dkey,
orw->orw_nr, iods, iod_csums,
ioc->ioc_coc->sc_props.dcp_dedup_size,
&ioh, dth);
rc = vos_check_akeys(iod_nr, iods);
vos_ioc_create
rc = vos_space_hold(vos_cont2pool(ioc->ic_cont), flags, dkey, iod_nr,
iods, iods_csums, &ioc->ic_space_held[0]);
rc = dkey_update_begin(ioc);
循环rc = akey_update_begin(ioc);
获取dcs_csum_info
获取daos_iod_t,1k
iod
0x7f9a07a4dee0
iod_name
iod_type:DAOS_IOD_ARRAY
iod_size:1
iod_flags:0
iod_nr:1
iod_recxs:0x7f9a07e9fbf0
rx_idx:0
rx_nr:1024
for (i = 0; i < iod->iod_nr; i++) //循环
size = (iod->iod_type == DAOS_IOD_SINGLE) ? iod->iod_size :
iod->iod_recxs[i].rx_nr * iod->iod_size; //获取size,1k–1024
media = vos_media_select(vos_cont2pool(ioc->ic_cont),
iod->iod_type, size); //决定往哪个media上写,0 –> scm
/*
* A simple media selection policy embedded in VOS, which select media by
* akey type and record size.
*/
static inline uint16_t
vos_media_select(struct vos_pool *pool, daos_iod_type_t type, daos_size_t size)
{
	/*
	 * NVMe is chosen only when the pool has vp_vea_info set and the
	 * record is at least VOS_BLK_SZ bytes; every other case goes to SCM.
	 * The 'type' argument is accepted but not consulted by this policy.
	 */
	if (pool->vp_vea_info != NULL && size >= VOS_BLK_SZ)
		return DAOS_MEDIA_NVME;

	return DAOS_MEDIA_SCM;
}
// 首先做特判
// 如果大于等于4k,写到NVME上,否则写到SCM
iod->iod_type 为 DAOS_IOD_ARRAY类型:rc = vos_reserve_recx(ioc, media, size, recx_csum, csum_len);
为 struct bio_iov biov 分配内存
rc = reserve_space(ioc, media, size, &off);
ioc
0x7f9a0724ef00
ic_ent_array_alloc
ea_data
ea_ents:0x0
ea_ent_nr:0
ea_size:0
ea_max:0
ea_inob:0
ea_first_delete:0
ea_delete_nr:0
ea_embedded_ents
ea_embedded
ic_ent_array:0x0
ic_bound:509171074455830528
ic_epr
ic_oid
ic_cont:0x7f9a07e9eb60
ic_iods:0x7f9a07a4dee0
iod_csums:0x0
ic_obj:0x0
ic_biod:0x7f99c89c63b0
ic_ts_set:0x7f99c89c6f90
ic_biov_csums:0x7f99c87c76e0
ic_biov_csums_at:0
ic_biov_csums_nr:1
ic_dkey_info
ic_akey_info
ic_sgl_at:0
ic_iov_at:0
ic_rsrvd_scm:0x7f9a07e86980
ic_umoffs:0x7f9a07e9e6a0
ic_umoffs_cnt:0
ic_umoffs_at:0
ic_blk_exts
next:0x7f9a0724fb38
prev:0x7f9a0724fb38
ic_space_held
ic_iod_nr:1
ic_dedup_th:4096
ic_dedup_entries
ic_dedup_bsgls:0x0
ic_dedup_bufs:0x0
ic_io_size:0
ic_update:1
ic_size_fetch:0
ic_save_recx:0
ic_dedup:0
ic_dedup_verify:0
ic_read_ts_only:0
ic_check_existence:0
ic_remove:0
ic_skip_fetch:0
ic_ec:0
ic_shadows:0x0
re_nr
re_total
re_snapshot
re_ep_valid
re_items
ic_recx_lists:0x0
re_nr
re_total
re_snapshot
re_ep_valid
re_items
- 在SCM上申请内存
bio_addr_set(&biov.bi_addr, media, off);//设置偏移量
bio_iov_set_len(&biov, size); //设置长度
rc = iod_reserve(ioc, &biov);
*ioh = vos_ioc2ioh(ioc); //获取cookie?
biod = vos_ioh2desc(ioh); //获取io descriptor?
0x7f99c89c63b0
bd_ctxt:0x7f99c89b0650
bic_link
bic_umem:0x7f99c860ab58
bic_pmempool_uuid:12137328209446634291
bic_blob:0x7f99c89b1530
bic_xs_ctxt:0x7f9a04427370
bic_inflight_dmas:0
bic_io_unit:4096
bic_pool_id
bic_opening:0
bic_closing:0
bd_rsrvd
brd_regions:0x0
brd_rg_max:0
brd_rg_cnt:0
brd_dma_chks:0x0
brd_chk_max:0
brd_chk_cnt:0
bd_dma_done:0x10
bd_inflights:0
bd_result:0
bd_chk_type:0
bd_type:0
bd_buffer_prep:0
bd_dma_issued:0
bd_retry:0
bd_rdma:0
bd_bulk_hdls:0x0
bd_bulk_max:0
bd_bulk_cnt:0
bd_sgl_cnt:1
bd_sgls
rc = bio_iod_prep(biod, BIO_CHK_TYPE_IO, rma ? rpc->cr_ctx : NULL,CRT_BULK_RW);
rc = iterate_biov(biod, arg ? bulk_map_one : dma_map_one, arg);
for循环:rc = cb_fn(biod, biov, data); // 函数指针调用,此处写SCM
biod
0x7f99c89c63b0
bd_ctxt:0x7f99c89b0650
bic_link
bic_umem:0x7f99c860ab58
bic_pmempool_uuid:12137328209446634291
bic_blob:0x7f99c89b1530
bic_xs_ctxt:0x7f9a04427370
bic_inflight_dmas:0
bic_io_unit:4096
bic_pool_id
bic_opening:0
bic_closing:0
bd_rsrvd
bd_dma_done:0x10
bd_inflights:0
bd_result:0
bd_chk_type:0
bd_type:0
bd_buffer_prep:0
bd_dma_issued:0
bd_retry:0
bd_rdma:0
bd_bulk_hdls:0x0
bd_bulk_max:0
bd_bulk_cnt:0
bd_sgl_cnt:1
bd_sgls
--------------
biov
0x7f9a07e9eeb0
bi_buf:0x0
bi_data_len:1024
bi_addr
ba_off:4576720
ba_type:0 '\000'
ba_pad1:0 '\000'
ba_flags:0
ba_pad2:0
bi_prefix_len:0
bi_suffix_len:0
--------------
data
0x0
函数指针调用dma_map_one(struct bio_desc *biod, struct bio_iov *biov, void *arg)
// /* Convert offset of @biov into memory pointer */
- direct_scm_access(biod, biov):
- bio_iov_set_raw_buf(biov,umem_off2ptr(umem, bio_iov2raw_off(biov)));
- direct_scm_access(biod, biov):
rc = bio_iod_copy(biod, orw->orw_sgls.ca_arrays, orw->orw_nr);
- 将参数包装成bio_copy_args结构体
- iterate_biov(biod, copy_one, &arg);
rc = vos_dedup_verify(ioh);
/*
* Check if the dedup data is identical to the RDMA data in a temporal
* allocated DRAM extent, if memcmp fails, allocate a new SCM extent and
* update it's address in VOS tree, otherwise, keep using the original
* dedup data address in VOS tree.
*/
rc = obj_verify_bio_csum(orw->orw_oid.id_pub, iods, iod_csums,biod, ioc->ioc_coc->sc_csummer,orw->orw_iod_array.oia_iod_nr); //verify CSUM
rc = obj_rw_complete(rpc, ioc, ioh, rc, dth); // the callstack is deep inside this function…
rc = vos_update_end(ioh, ioc->ioc_map_ver,&orwi->orw_dkey, status,&ioc->ioc_io_size, dth);
一些dtx commit逻辑?
err = dkey_update(ioc, pm_ver, dkey, dtx_is_valid_handle(dth) ? dth->dth_op_seq : VOS_SUB_OP_MAX); // update tree index
rc = obj_tree_init(obj); // initialize tree for an object
rc = key_tree_prepare(obj, obj->obj_toh, VOS_BTR_DKEY, dkey,SUBTR_CREATE, DAOS_INTENT_UPDATE, &krec, &ak_toh,ioc->ic_ts_set);
/**
* Load the subtree roots embedded in the parent tree record.
*
* akey tree : all akeys under the same dkey
* recx tree : all record extents under the same akey, this function will
* load both btree and evtree root.
*/
/* NB: In order to avoid complexities of passing parameters to the
* multi-nested tree, tree operations are not nested, instead:
*
* - In the case of fetch, we load the subtree root stored in the
* parent tree leaf.
* - In the case of update/insert, we call dbtree_update() which may
* create the root for the subtree, or just return it if it's already
* there.
*/
rc = dbtree_fetch(toh, BTR_PROBE_EQ, intent, key,NULL, &riov);
/* use BTR_PROBE_BYPASS to avoid probe again */
rc = dbtree_upsert(toh, BTR_PROBE_BYPASS, intent, key, &riov);
// Update the value of the provided key, or insert it as a new key if there is no match.
tcx = btr_hdl2tcx(toh); // find the tree context of the handle
rc = btr_tx_begin(tcx); // begin transaction?
rc = btr_upsert(tcx, opc, intent, key, val);
- rc = btr_insert(tcx, key, val); // bypass策略,直接取前一次probe的结果,create a new record, insert it into tree leaf node.
- btr_hkey_gen(tcx, key, &rec->rec_hkey[0]); //生成hkey
- rc = btr_node_insert_rec(tcx, trace, rec);
- btr_node_insert_rec_only(tcx, trace, rec);
- rc = btr_insert(tcx, key, val); // bypass策略,直接取前一次probe的结果,create a new record, insert it into tree leaf node.
btr_tx_end(tcx, rc);
- rc = umem_tx_commit(btr_umm(tcx));
vos_ilog_ts_ignore(vos_obj2umm(obj), &krec->kr_ilog);
vos_ilog_ts_mark(ts_set, &krec->kr_ilog);
rc = vos_ilog_update(ioc->ic_cont, &krec->kr_ilog, &ioc->ic_epr,
ioc->ic_bound, &obj->obj_ilog_info,
&ioc->ic_dkey_info, update_cond, ioc->ic_ts_set); // update dkey log ?
for循环:rc = akey_update(ioc, pm_ver, ak_toh, minor_epc);
rc = key_tree_prepare(obj, ak_toh, VOS_BTR_AKEY,
&iod->iod_name, flags, DAOS_INTENT_UPDATE,
&krec, &toh, ioc->ic_ts_set);
….
vos_ts_set_check_conflict(ioc->ic_ts_set, ioc->ic_epr.epr_hi) // Now that we are past the existence checks, ensure there isn’t a read conflict
err = vos_tx_end(ioc->ic_cont, dth, &ioc->ic_rsrvd_scm,&ioc->ic_blk_exts, tx_started, err); // dtx operations,on scm
vos_ts_set_upgrade(ioc->ic_ts_set);
vos_space_unhold(vos_cont2pool(ioc->ic_cont), &ioc->ic_space_held[0]);
vos_ioc_destroy(ioc, err != 0); // memory free
- bio_iod_free(ioc->ic_biod);
- vos_obj_release(vos_obj_cache_current(), ioc->ic_obj, evict);
- vos_ioc_reserve_fini(ioc);
- vos_ilog_fetch_finish(&ioc->ic_dkey_info);
- vos_ilog_fetch_finish(&ioc->ic_akey_info);
- vos_cont_decref(ioc->ic_cont);
- vos_ts_set_free(ioc->ic_ts_set);
- D_FREE(ioc);
vos_dth_set(NULL);
update流程
api调用 --> task generate --> client rpc call --> server accept --> handle rpc --> decide type --> allocate memory --> write data --> update index --> free memory