Saturn

The devil is in the details.

0%

Dkey-Array Object update

daos_obj_generate_oid

  1. 输入:
    • daos_handle_t oh : 内部存储一个uint64_t类型的cookie
    • daos_obj_id_t oid : daos object id, 用两个64位整数存储 lo (low) 和 hi (high)
    • enum DAOS_OT_KV_HASHED : flat KV (no akey) with integer dkey
    • OC_SX :OC策略
    • 0 :变长参数
    • 0 :变长参数
  2. 实现,内部调用了daos_obj_generate_oid2
    • 输入:
      • daos_handle_t
      • daos_obj_id_t
      • daos_otype_t
      • daos_oclass_id_t : object class id, 32位int
      • daos_oclass_hints_t :object class hint, 16位int
      • args : 其他int参数
    • dc_cont_hdl2pool_hdl
      • 根据cookie获取pool handler
    • rc = pl_map_query(pool->dp_pool, &attr)
    • dc_pool_put(struct dc_pool *pool) // 将pool->dp_hlink放入了一个双向队列中?
    • 按照选定的OC策略做相应的操作
    • daos_obj_set_oid(oid, type, ord, nr_grp, args); // 就生成好了?

daos_kv_open

  1. 输入:
    • daos_handle_t
    • daos_obj_id_t
    • mode : int 类型
    • daos_handle_t
    • daos_event_t : event and event queue, maybe used for debug
  2. 调用rc = dc_task_create(dc_kv_open, NULL, ev, &task)
    • 将参数包装成dc_kv_open结构体
    • 生成一个kvOpen的task并调度

dts_buf_render(buf, BUFLEN)

  1. 输入buf和len
  2. 利用随机生成的字符填满buf

daos_kv_put

  1. 输入:
    • daos_handle_t oh
    • daos_handle_t th
    • uint64_t flag
    • char * key
    • daos_size_t buf_size
    • void * buf
    • daos_event_t * ev
  2. 调用rc = dc_task_create(dc_kv_put, NULL, ev, &task);
  3. 将参数包装到task的args中
  4. 调用dc_task_schedule(task, true);
    1. task_is_valid(task)
    2. ev = task_ptr2args(task)->ta_ev;
    3. rc = daos_event_launch(ev);
      • 输入:daos_event *ev
    4. rc = tse_task_schedule(task, instant);

kv_update

  1. daos_io_0线程接收到io请求,crt_handle_rpc(void *arg)处理rpc

  2. ds_obj_rw_handler(crt_rpc_t *rpc)处理rpc

    • obj_ioc_begin(orw->orw_oid.id_pub, orw->orw_map_ver,
                    orw->orw_pool_uuid, orw->orw_co_hdl,
                    orw->orw_co_uuid, opc_get(rpc->cr_opc),
                    orw->orw_flags, &ioc); // various checks before accessing VOS

  3. rc = process_epoch(&orw->orw_epoch, &orw->orw_epoch_first,
                        &orw->orw_flags); // 处理epoch相关

Object_Update

  1. callstack

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    libvos_srv.so!dkey_update(daos_key_t * dkey, uint32_t pm_ver, struct vos_io_context * ioc) (/home/kuhan/daos/src/vos/vos_io.c:1803)
    libvos_srv.so!vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t * dkey, int err, daos_size_t * size, struct dtx_handle * dth) (/home/kuhan/daos/src/vos/vos_io.c:2278)
    libobj.so!obj_rw_complete(struct dtx_handle * dth, int status, daos_handle_t ioh, struct obj_io_context * ioc, crt_rpc_t * rpc) (/home/kuhan/daos/src/object/srv_obj.c:128)
    libobj.so!obj_local_rw_internal(struct dtx_handle * dth, uint64_t * split_offs, struct dcs_iod_csums * split_csums, daos_iod_t * split_iods, struct obj_io_context * ioc, crt_rpc_t * rpc) (/home/kuhan/daos/src/object/srv_obj.c:1677)
    libobj.so!obj_local_rw(crt_rpc_t * rpc, struct obj_io_context * ioc, daos_iod_t * split_iods, struct dcs_iod_csums * split_csums, uint64_t * split_offs, struct dtx_handle * dth, _Bool pin) (/home/kuhan/daos/src/object/srv_obj.c:1697)
    libobj.so!obj_tgt_update(dtx_sub_comp_cb_t comp_cb, void * arg, struct dtx_leader_handle * dlh) (/home/kuhan/daos/src/object/srv_obj.c:2425)
    libobj.so!obj_tgt_update(struct dtx_leader_handle * dlh, void * arg, int idx, dtx_sub_comp_cb_t comp_cb) (/home/kuhan/daos/src/object/srv_obj.c:2356)
    libobj.so!ds_obj_rw_handler(crt_rpc_t * rpc) (/home/kuhan/daos/src/object/srv_obj.c:2664)
    libcart.so.4!crt_handle_rpc(void * arg) (/home/kuhan/daos/src/cart/crt_rpc.c:1638)
    libabt.so.1!ABTD_ythread_func_wrapper (未知源:0)
    libabt.so.1!make_fcontext (未知源:0)
    [Unknown/Just-In-Time compiled code] (未知源:0)

obj_local_rw_internal

  1. rc = vos_update_begin(ioc->ioc_vos_coh, orw->orw_oid,
                           orw->orw_epoch, cond_flags, dkey,
                           orw->orw_nr, iods, iod_csums,
                           ioc->ioc_coc->sc_props.dcp_dedup_size,
                           &ioh, dth);

    • rc = vos_check_akeys(iod_nr, iods);

    • vos_ioc_create

    • rc = vos_space_hold(vos_cont2pool(ioc->ic_cont), flags, dkey, iod_nr,

    ​ iods, iods_csums, &ioc->ic_space_held[0]);

    • rc = dkey_update_begin(ioc);

      • 循环rc = akey_update_begin(ioc);

        • 获取dcs_csum_info

        • 获取daos_iod_t,1k

          1
          2
          3
          4
          5
          6
          7
          8
          9
          10
          iod
          0x7f9a07a4dee0
          iod_name
          iod_type:DAOS_IOD_ARRAY
          iod_size:1
          iod_flags:0
          iod_nr:1
          iod_recxs:0x7f9a07e9fbf0
          rx_idx:0
          rx_nr:1024
        • for (i = 0; i < iod->iod_nr; i++) //循环

          • size = (iod->iod_type == DAOS_IOD_SINGLE) ? iod->iod_size :

            ​ iod->iod_recxs[i].rx_nr * iod->iod_size; //获取size,1k–1024

          • media = vos_media_select(vos_cont2pool(ioc->ic_cont),

            ​ iod->iod_type, size); //决定往哪个media上写,0 –> scm

            1
            2
            3
            4
            5
            6
            7
            8
            9
            10
            11
            12
            13
            14
            /*
            * A simple media selection policy embedded in VOS, which select media by
            * akey type and record size.
            */
            static inline uint16_t
            vos_media_select(struct vos_pool *pool, daos_iod_type_t type, daos_size_t size)
            {
            if (pool->vp_vea_info == NULL)
            return DAOS_MEDIA_SCM;

            return (size >= VOS_BLK_SZ) ? DAOS_MEDIA_NVME : DAOS_MEDIA_SCM;
            }
            // 首先做特判:如果 pool->vp_vea_info 为 NULL,直接返回 DAOS_MEDIA_SCM
            // 否则按大小选择:size 大于等于 VOS_BLK_SZ(4k)时写到NVME上,否则写到SCM
          • iod->iod_type
            为 DAOS_IOD_ARRAY类型:

            • rc = vos_reserve_recx(ioc, media, size, recx_csum, csum_len);

              • 为 struct bio_iov biov 分配内存

              • rc = reserve_space(ioc, media, size, &off);

                1
                2
                3
                4
                5
                6
                7
                8
                9
                10
                11
                12
                13
                14
                15
                16
                17
                18
                19
                20
                21
                22
                23
                24
                25
                26
                27
                28
                29
                30
                31
                32
                33
                34
                35
                36
                37
                38
                39
                40
                41
                42
                43
                44
                45
                46
                47
                48
                49
                50
                51
                52
                53
                54
                55
                56
                57
                58
                59
                60
                61
                62
                63
                64
                65
                66
                ioc
                0x7f9a0724ef00
                ic_ent_array_alloc
                ea_data
                ea_ents:0x0
                ea_ent_nr:0
                ea_size:0
                ea_max:0
                ea_inob:0
                ea_first_delete:0
                ea_delete_nr:0
                ea_embedded_ents
                ea_embedded
                ic_ent_array:0x0
                ic_bound:509171074455830528
                ic_epr
                ic_oid
                ic_cont:0x7f9a07e9eb60
                ic_iods:0x7f9a07a4dee0
                iod_csums:0x0
                ic_obj:0x0
                ic_biod:0x7f99c89c63b0
                ic_ts_set:0x7f99c89c6f90
                ic_biov_csums:0x7f99c87c76e0
                ic_biov_csums_at:0
                ic_biov_csums_nr:1
                ic_dkey_info
                ic_akey_info
                ic_sgl_at:0
                ic_iov_at:0
                ic_rsrvd_scm:0x7f9a07e86980
                ic_umoffs:0x7f9a07e9e6a0
                ic_umoffs_cnt:0
                ic_umoffs_at:0
                ic_blk_exts
                next:0x7f9a0724fb38
                prev:0x7f9a0724fb38
                ic_space_held
                ic_iod_nr:1
                ic_dedup_th:4096
                ic_dedup_entries
                ic_dedup_bsgls:0x0
                ic_dedup_bufs:0x0
                ic_io_size:0
                ic_update:1
                ic_size_fetch:0
                ic_save_recx:0
                ic_dedup:0
                ic_dedup_verify:0
                ic_read_ts_only:0
                ic_check_existence:0
                ic_remove:0
                ic_skip_fetch:0
                ic_ec:0
                ic_shadows:0x0
                re_nr
                re_total
                re_snapshot
                re_ep_valid
                re_items
                ic_recx_lists:0x0
                re_nr
                re_total
                re_snapshot
                re_ep_valid
                re_items
                • 在SCM上申请内存
              • bio_addr_set(&biov.bi_addr, media, off);//设置偏移量

              • bio_iov_set_len(&biov, size); //设置长度

              • rc = iod_reserve(ioc, &biov);

    • *ioh = vos_ioc2ioh(ioc); //获取cookie?

  2. biod = vos_ioh2desc(ioh); //获取io descriptor?

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    0x7f99c89c63b0
    bd_ctxt:0x7f99c89b0650
    bic_link
    bic_umem:0x7f99c860ab58
    bic_pmempool_uuid:12137328209446634291
    bic_blob:0x7f99c89b1530
    bic_xs_ctxt:0x7f9a04427370
    bic_inflight_dmas:0
    bic_io_unit:4096
    bic_pool_id
    bic_opening:0
    bic_closing:0
    bd_rsrvd
    brd_regions:0x0
    brd_rg_max:0
    brd_rg_cnt:0
    brd_dma_chks:0x0
    brd_chk_max:0
    brd_chk_cnt:0
    bd_dma_done:0x10
    bd_inflights:0
    bd_result:0
    bd_chk_type:0
    bd_type:0
    bd_buffer_prep:0
    bd_dma_issued:0
    bd_retry:0
    bd_rdma:0
    bd_bulk_hdls:0x0
    bd_bulk_max:0
    bd_bulk_cnt:0
    bd_sgl_cnt:1
    bd_sgls
  3. rc = bio_iod_prep(biod, BIO_CHK_TYPE_IO, rma ? rpc->cr_ctx : NULL,CRT_BULK_RW);

    • rc = iterate_biov(biod, arg ? bulk_map_one : dma_map_one, arg);

      • for循环:rc = cb_fn(biod, biov, data); // 函数指针调用,此处写SCM

        1
        2
        3
        4
        5
        6
        7
        8
        9
        10
        11
        12
        13
        14
        15
        16
        17
        18
        19
        20
        21
        22
        23
        24
        25
        26
        27
        28
        29
        30
        31
        32
        33
        34
        35
        36
        37
        38
        39
        40
        41
        42
        43
        44
        biod
        0x7f99c89c63b0
        bd_ctxt:0x7f99c89b0650
        bic_link
        bic_umem:0x7f99c860ab58
        bic_pmempool_uuid:12137328209446634291
        bic_blob:0x7f99c89b1530
        bic_xs_ctxt:0x7f9a04427370
        bic_inflight_dmas:0
        bic_io_unit:4096
        bic_pool_id
        bic_opening:0
        bic_closing:0
        bd_rsrvd
        bd_dma_done:0x10
        bd_inflights:0
        bd_result:0
        bd_chk_type:0
        bd_type:0
        bd_buffer_prep:0
        bd_dma_issued:0
        bd_retry:0
        bd_rdma:0
        bd_bulk_hdls:0x0
        bd_bulk_max:0
        bd_bulk_cnt:0
        bd_sgl_cnt:1
        bd_sgls
        --------------
        biov
        0x7f9a07e9eeb0
        bi_buf:0x0
        bi_data_len:1024
        bi_addr
        ba_off:4576720
        ba_type:0 '\000'
        ba_pad1:0 '\000'
        ba_flags:0
        ba_pad2:0
        bi_prefix_len:0
        bi_suffix_len:0
        --------------
        data
        0x0

        函数指针调用dma_map_one(struct bio_desc *biod, struct bio_iov *biov, void *arg)

        // /* Convert offset of @biov into memory pointer */

        • direct_scm_access(biod, biov):
          • bio_iov_set_raw_buf(biov,umem_off2ptr(umem, bio_iov2raw_off(biov)));
  4. rc = bio_iod_copy(biod, orw->orw_sgls.ca_arrays, orw->orw_nr);

    • 将参数包装成bio_copy_args结构体
    • iterate_biov(biod, copy_one, &arg);
  5. rc = vos_dedup_verify(ioh);

    1
    2
    3
    4
    5
    6
    /*
    * Check if the dedup data is identical to the RDMA data in a temporal
    * allocated DRAM extent, if memcmp fails, allocate a new SCM extent and
    * update it's address in VOS tree, otherwise, keep using the original
    * dedup data address in VOS tree.
    */
  6. rc = obj_verify_bio_csum(orw->orw_oid.id_pub, iods, iod_csums,biod, ioc->ioc_coc->sc_csummer,orw->orw_iod_array.oia_iod_nr); //verify CSUM

  7. rc = obj_rw_complete(rpc, ioc, ioh, rc, dth); // the callstack is deep inside this function…

    • rc = vos_update_end(ioh, ioc->ioc_map_ver,&orwi->orw_dkey, status,&ioc->ioc_io_size, dth);

      • 一些dtx commit逻辑?

      • err = dkey_update(ioc, pm_ver, dkey, dtx_is_valid_handle(dth) ? dth->dth_op_seq : VOS_SUB_OP_MAX); // update tree index

        • rc = obj_tree_init(obj); // initialize tree for an object

        • rc = key_tree_prepare(obj, obj->obj_toh, VOS_BTR_DKEY, dkey,SUBTR_CREATE, DAOS_INTENT_UPDATE, &krec, &ak_toh,ioc->ic_ts_set);

          1
          2
          3
          4
          5
          6
          7
          8
          9
          10
          11
          12
          13
          14
          15
          16
          /**
          * Load the subtree roots embedded in the parent tree record.
          *
          * akey tree : all akeys under the same dkey
          * recx tree : all record extents under the same akey, this function will
          * load both btree and evtree root.
          */
          /* NB: In order to avoid complexities of passing parameters to the
          * multi-nested tree, tree operations are not nested, instead:
          *
          * - In the case of fetch, we load the subtree root stored in the
          * parent tree leaf.
          * - In the case of update/insert, we call dbtree_update() which may
          * create the root for the subtree, or just return it if it's already
          * there.
          */
          • rc = dbtree_fetch(toh, BTR_PROBE_EQ, intent, key,NULL, &riov);

          • /* use BTR_PROBE_BYPASS to avoid probe again */

            rc = dbtree_upsert(toh, BTR_PROBE_BYPASS, intent, key, &riov);

            • Update the value of the provided key, or insert it as a new key if there is no match.
              
            • tcx = btr_hdl2tcx(toh); // find the tree context of the handle

            • rc = btr_tx_begin(tcx); // begin transaction?

            • rc = btr_upsert(tcx, opc, intent, key, val);

              • rc = btr_insert(tcx, key, val); // bypass策略,直接取前一次probe的结果,create a new record, insert it into tree leaf node.
                • btr_hkey_gen(tcx, key, &rec->rec_hkey[0]); //生成hkey
                • rc = btr_node_insert_rec(tcx, trace, rec);
                  • btr_node_insert_rec_only(tcx, trace, rec);
            • btr_tx_end(tcx, rc);

              • rc = umem_tx_commit(btr_umm(tcx));
          • vos_ilog_ts_ignore(vos_obj2umm(obj), &krec->kr_ilog);

          • vos_ilog_ts_mark(ts_set, &krec->kr_ilog);

        • rc = vos_ilog_update(ioc->ic_cont, &krec->kr_ilog, &ioc->ic_epr,

          ​ ioc->ic_bound, &obj->obj_ilog_info,

          ​ &ioc->ic_dkey_info, update_cond, ioc->ic_ts_set); // update dkey log ?

        • for循环:rc = akey_update(ioc, pm_ver, ak_toh, minor_epc);

          • ​ rc = key_tree_prepare(obj, ak_toh, VOS_BTR_AKEY,

            ​ &iod->iod_name, flags, DAOS_INTENT_UPDATE,

            ​ &krec, &toh, ioc->ic_ts_set);

          • ….

      • vos_ts_set_check_conflict(ioc->ic_ts_set, ioc->ic_epr.epr_hi) // Now that we are past the existence checks, ensure there isn’t a read conflict

      • err = vos_tx_end(ioc->ic_cont, dth, &ioc->ic_rsrvd_scm,&ioc->ic_blk_exts, tx_started, err); // dtx operations,on scm

      • vos_ts_set_upgrade(ioc->ic_ts_set);

      • vos_space_unhold(vos_cont2pool(ioc->ic_cont), &ioc->ic_space_held[0]);

      • vos_ioc_destroy(ioc, err != 0); // memory free

        • bio_iod_free(ioc->ic_biod);
        • vos_obj_release(vos_obj_cache_current(), ioc->ic_obj, evict);
        • vos_ioc_reserve_fini(ioc);
        • vos_ilog_fetch_finish(&ioc->ic_dkey_info);
        • vos_ilog_fetch_finish(&ioc->ic_akey_info);
        • vos_cont_decref(ioc->ic_cont);
        • vos_ts_set_free(ioc->ic_ts_set);
        • D_FREE(ioc);
      • vos_dth_set(NULL);

update流程

API调用 --> task generate --> client rpc call --> server accept --> handle rpc --> decide type --> allocate memory --> write data --> update index --> free memory