除了最基本的I/O via ppa-address以外,liblightnvm還包了一些其他的IO方式,如vblk,lba等.
(vblk與lba似乎實現(xiàn)都還有些不完整或沒有完全測試)

1
Intro
vblk: 把底下的若干個nvm_addr封裝成一個虛擬的塊(virtual block),用戶可以針對這個虛擬塊進(jìn)行:
- 擦除(擦除屬于這個塊的所有addr)
- 從這個塊的某個偏移處讀/寫若干字節(jié)(必須對齊)
- 從
nvm_vblk_alloc_line可以看出: vblk->nblks的單位是2個block.
- vblk I/O 用到的一些設(shè)置, 在open一個設(shè)備的時候進(jìn)行的預(yù)設(shè).(可以看出一個lun上的plane數(shù)只能是1,2,4)
nvm_dev.c:
line 219:
static int dev_attr_fill(struct nvm_dev *dev)
{
...
/* Derive a default plane mode */
switch(geo->nplanes) {
case 4:
dev->pmode = NVM_FLAG_PMODE_QUAD;
break;
case 2:
dev->pmode = NVM_FLAG_PMODE_DUAL;
break;
case 1:
dev->pmode = NVM_FLAG_PMODE_SNGL;
break;
default:
errno = EINVAL;
return -1;
}
dev->erase_naddrs_max = NVM_NADDR_MAX; //Macro at liblightnvm.h, line 42: #define NVM_NADDR_MAX 64
dev->write_naddrs_max = NVM_NADDR_MAX;
dev->read_naddrs_max = NVM_NADDR_MAX;
dev->meta_mode = NVM_META_MODE_NONE;
return 0;
}
vblk erase
nvm_vblk.c, line 116:
static inline int _cmd_nblks(int nblks, int cmd_nblks_max)//找到一個小于等于cmd_nblks_max并能整除nblks的最大整數(shù)作為并行化因子
{
int cmd_nblks = cmd_nblks_max;
while(nblks % cmd_nblks && cmd_nblks > 1) --cmd_nblks;
return cmd_nblks;
}
nvm_vblk.c, line 125:
ssize_t nvm_vblk_erase(struct nvm_vblk *vblk)
{
size_t nerr = 0;
const struct nvm_geo *geo = nvm_dev_get_geo(vblk->dev);
const int PMODE = vblk->dev->pmode;
const int BLK_NADDRS = geo->nplanes; //擦除的粒度單位,1個單位 = geo->nplanes個block地址
const int CMD_NBLKS = _cmd_nblks(vblk->nblks,
vblk->dev->erase_naddrs_max / BLK_NADDRS); //一個線程擦除單位數(shù)
//一個線程擦除的block數(shù) = 單位數(shù) * 每單位block數(shù) : CMD_NBLKS * geo->nplanes
const int NTHREADS = vblk->nblks < CMD_NBLKS ? 1 : vblk->nblks / CMD_NBLKS;//總共能出多少個線程
//這里可以看出當(dāng)nblks比較大時(大于vblk->dev->erase_naddrs_max / BLK_NADDRS),才可能出多線程。
#pragma omp parallel for num_threads(NTHREADS) schedule(static,1) reduction(+:nerr) ordered if (NTHREADS>1)
for (int off = 0; off < vblk->nblks; off += CMD_NBLKS) {
ssize_t err;
struct nvm_ret ret = {};
const int nblks = NVM_MIN(CMD_NBLKS, vblk->nblks - off);
const int naddrs = nblks * BLK_NADDRS;
struct nvm_addr addrs[naddrs];
for (int i = 0; i < naddrs; ++i) {
const int idx = off + (i / BLK_NADDRS);
addrs[i].ppa = vblk->blks[idx].ppa;
addrs[i].g.pl = i % geo->nplanes;
}
err = nvm_addr_erase(vblk->dev, addrs, naddrs, PMODE, &ret);//真正進(jìn)行擦除的地方(正如之前分析的Basic I/O)
if (err)
++nerr;
#pragma omp ordered
{}
}
if (nerr) {
errno = EIO;
return -1;
}
vblk->pos_write = 0;
vblk->pos_read = 0;
return vblk->nbytes;
}
這里的并行化采用了OpenMP, pthread vs OpenMP:
- 多數(shù)情況下openmp能很容易直接從串行程序改過來,并且在沒有多線程能力的情況下退回單線程執(zhí)行
- pthread得自己搞,而且如果沒有多線程執(zhí)行能力也會強制分時多線程,引入額外負(fù)擔(dān),好處是比openmp靈活
vblk write
由于NAND的特點,vblk包含的地址空間中的每個sector只能一次寫,vblk以pos_write維護(hù)目前寫了多少地址.(由此可知還剩多少sector可寫.
由于IO的單位是sector,因此這里寫多少個字節(jié)需要能整除sector_nbytes.
172:
static inline int _cmd_nspages(int nblks, int cmd_nspages_max)
{
int cmd_nspages = cmd_nspages_max;
while(nblks % cmd_nspages && cmd_nspages > 1) --cmd_nspages;
return cmd_nspages;
}
...
181:
ssize_t nvm_vblk_pwrite(struct nvm_vblk *vblk, const void *buf, size_t count,
size_t offset)//count與offset都必須能整除geo->nplanes * geo->nsectors,即`對齊`
{
size_t nerr = 0;
const int PMODE = nvm_dev_get_pmode(vblk->dev);
const struct nvm_geo *geo = nvm_dev_get_geo(vblk->dev);
const int SPAGE_NADDRS = geo->nplanes * geo->nsectors;//寫的粒度單位,1個單位 = geo->nplanes * geo->nsectors個sector地址 = geo->nplanes 個page大小
const int CMD_NSPAGES = _cmd_nspages(vblk->nblks,
vblk->dev->write_naddrs_max / SPAGE_NADDRS);//一個線程寫的單位數(shù)????這里可能有問題,multi-threading may not work. let‘s make a issue。
//一個線程I/O的sector數(shù) = 單位數(shù) * 每個單位的sector數(shù):CMD_NSPAGES * SPAGE_NADDRS
const int ALIGN = SPAGE_NADDRS * geo->sector_nbytes;//寫的粒度單位對應(yīng)的字節(jié)數(shù)
const int NTHREADS = vblk->nblks < CMD_NSPAGES ? 1 : vblk->nblks / CMD_NSPAGES;
const size_t bgn = offset / ALIGN;//從第幾個單位開始
const size_t end = bgn + (count / ALIGN); //到第幾個單位結(jié)束
char *padding_buf = NULL;
const size_t meta_tbytes = CMD_NSPAGES * SPAGE_NADDRS * geo->meta_nbytes;//一個線程最大可能I/O的metadata字節(jié)數(shù)
char *meta = NULL;
if (offset + count > vblk->nbytes) { // Check bounds
errno = EINVAL;
return -1;
}
if ((count % ALIGN) || (offset % ALIGN)) { // Check align
errno = EINVAL;
return -1;
}
if (!buf) { // Allocate and use a padding buffer
const size_t nbytes = CMD_NSPAGES * SPAGE_NADDRS * geo->sector_nbytes;
padding_buf = nvm_buf_alloc(geo, nbytes);
if (!padding_buf) {
errno = ENOMEM;
return -1;
}
nvm_buf_fill(padding_buf, nbytes);
}
if (vblk->dev->meta_mode != NVM_META_MODE_NONE) { // Meta
meta = nvm_buf_alloc(geo, meta_tbytes); // Alloc buf
if (!meta) {
errno = ENOMEM;
return -1;
}
switch(vblk->dev->meta_mode) { // Fill it
case NVM_META_MODE_ALPHA:
nvm_buf_fill(meta, meta_tbytes);
break;
case NVM_META_MODE_CONST:
for (size_t i = 0; i < meta_tbytes; ++i)
meta[i] = 65 + (meta_tbytes % 20);
break;
case NVM_META_MODE_NONE:
break;
}
}//目前版本的默認(rèn)模式是meta_none(master - 3b399c9c4bc315)
#pragma omp parallel for num_threads(NTHREADS) schedule(static,1) reduction(+:nerr) ordered if(NTHREADS>1)
for (size_t off = bgn; off < end; off += CMD_NSPAGES) {//每個線程CMD_NSPAGES個單位
struct nvm_ret ret = {};
const int nspages = NVM_MIN(CMD_NSPAGES, (int)(end - off));
const int naddrs = nspages * SPAGE_NADDRS;
struct nvm_addr addrs[naddrs];
const char *buf_off;
if (padding_buf)
buf_off = padding_buf;
else
buf_off = buf + (off - bgn) * geo->sector_nbytes * SPAGE_NADDRS;
for (int i = 0; i < naddrs; ++i) {
const int spg = off + (i / SPAGE_NADDRS);//目前寫到第spg個單位
!?。? const int idx = spg % vblk->nblks;//這個是比較關(guān)鍵的一句:保證了IO在lun之間均勻分布,而不是集中在某個lun某個ch上.
const int pg = (spg / vblk->nblks) % geo->npages;
addrs[i].ppa = vblk->blks[idx].ppa;
addrs[i].g.pg = pg;
addrs[i].g.pl = (i / geo->nsectors) % geo->nplanes;
addrs[i].g.sec = i % geo->nsectors;
}
const ssize_t err = nvm_addr_write(vblk->dev, addrs, naddrs,
buf_off, meta, PMODE, &ret);
if (err)
++nerr;
#pragma omp ordered
{}
}
free(padding_buf);
if (nerr) {
errno = EIO;
return -1;
}
return count;
}

vblk addressing.png
總結(jié)
- vblk:
- vblk->nblks值的單位 = geo->nplanes個block = geo->nplanes * geo->npages * geo->nsectors 個sector = geo->npages個寫粒度單位(頁)
- vblk擦除的粒度單位: geo->nplanes個block地址
- vblk寫的粒度單位: geo->nplanes * geo->nsectors個sector地址
- 擦除和讀寫的上限都是
NVM_NADDR_MAX個地址,但意義卻不同,擦出的地址單位是block地址,讀寫的地址單位是sector地址 - 使用OpenMP進(jìn)行并行優(yōu)化.
- 一個OCSSD的lun上的plane數(shù)目只能是1,2 或 4.(見dev_attr_fill(): nvm_dev.c)