DPDK igb_uio驅動分析

本文整理下之前的學習筆記,基于DPDK17.11版本源碼分析。主要分析一下igb_uio驅動源碼。

總線-設備-驅動

首先簡單介紹一下kernel中的總線-設備-驅動模型,以pci總線為例,pci總線上有兩個表,一個用于保存系統中的pci設備,一個用于保存pci設備對應的驅動。每當加載pci設備驅動時,就會遍歷pci總線上的pci設備進行匹配,每當插入pci設備到系統中時,熱插拔機制就會自動遍歷pci總線上的pci設備驅動進行匹配,如果匹配成功則使用此驅動初始化設備。

注冊pci總線
可以調用bus_register注冊總線。比如下面的pci總線,平臺總線和usb總線等。

//Register the PCI bus
struct bus_type pci_bus_type = {
    .name       = "pci",
    .match      = pci_bus_match,
    .uevent     = pci_uevent,
    .probe      = pci_device_probe,
    .remove     = pci_device_remove,
    .shutdown   = pci_device_shutdown,
    .dev_groups = pci_dev_groups,
    .bus_groups = pci_bus_groups,
    .drv_groups = pci_drv_groups,
    .pm     = PCI_PM_OPS_PTR,
};
bus_register(&pci_bus_type);

//Register the platform bus
struct bus_type platform_bus_type = {
    .name       = "platform",
    .dev_groups = platform_dev_groups,
    .match      = platform_match,
    .uevent     = platform_uevent,
    .pm     = &platform_dev_pm_ops,
};
bus_register(&platform_bus_type);

//Register the USB bus
struct bus_type usb_bus_type = {
    .name =     "usb",
    .match =    usb_device_match,
    .uevent =   usb_uevent,
};
bus_register(&usb_bus_type);

//Register the virtio bus
static struct bus_type virtio_bus = {
    .name  = "virtio",
    .match = virtio_dev_match,
    .dev_groups = virtio_dev_groups,
    .uevent = virtio_uevent,
    .probe = virtio_dev_probe,
    .remove = virtio_dev_remove,
};
bus_register(&virtio_bus);

注冊總線后,會在 /sys/bus 下生成總線目錄,比如 pci 總線會生成目錄 /sys/bus/pci

/**
 * bus_register - register a driver-core subsystem
 * @bus: bus to register
 *
 * Once we have that, we register the bus with the kobject
 * infrastructure, then register the children subsystems it has:
 * the devices and drivers that belong to the subsystem.
 */
int bus_register(struct bus_type *bus)
    struct subsys_private *priv;
    struct lock_class_key *key = &bus->lock_key;

    priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL);
    priv->bus = bus;
    bus->p = priv;
    kobject_set_name(&priv->subsys.kobj, "%s", bus->name);
    priv->subsys.kobj.kset = bus_kset;
    priv->subsys.kobj.ktype = &bus_ktype;
    kset_register(&priv->subsys);
    
    //When this is 1, loading a driver automatically probes the devices for a match
    priv->drivers_autoprobe = 1;
    
    bus_create_file(bus, &bus_attr_uevent);
    
    //Create the "devices" subdirectory under the bus directory; it holds one subdirectory per PCI device
    priv->devices_kset = kset_create_and_add("devices", NULL,
                         &priv->subsys.kobj);
    //Create the "drivers" subdirectory under the bus directory; it holds one subdirectory per driver
    priv->drivers_kset = kset_create_and_add("drivers", NULL,
                         &priv->subsys.kobj);
    //This list holds the PCI devices found on the bus
    klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put);
    //This list holds the loaded PCI device drivers
    klist_init(&priv->klist_drivers, NULL, NULL);
    
    //Create the drivers_probe and drivers_autoprobe files in sysfs
    //(the two indented calls below are the body of add_probe_files, shown inline)
    add_probe_files(bus);
        bus_create_file(bus, &bus_attr_drivers_probe);
        bus_create_file(bus, &bus_attr_drivers_autoprobe);
    
    bus_add_groups(bus, bus->bus_groups);

注冊總線后,會生成文件/sys/bus/pci/drivers_autoprobe,寫此文件時在kernel中會調用如下函數。如果寫入的值為1,表示 bus 支持自動探測 device,則加載驅動時,自動遍歷所有pci設備進行匹配。

store_drivers_autoprobe
/*
 * sysfs store handler for the bus "drivers_autoprobe" file.
 *
 * @bus:   bus whose autoprobe flag is updated
 * @buf:   data written by user space; only the first byte is examined
 * @count: number of bytes written
 *
 * Returns count, i.e. the whole write is always reported as consumed.
 */
static ssize_t store_drivers_autoprobe(struct bus_type *bus,
                       const char *buf, size_t count)
{
    /* Only a leading '0' turns autoprobe off; any other first byte turns it on. */
    bus->p->drivers_autoprobe = (buf[0] == '0') ? 0 : 1;

    return count;
}

注冊驅動到pci總線
結構體struct pci_driver表示一個pci設備驅動,其中id_table和dynids用來保存此驅動支持的設備id等信息,如果有匹配的設備,則調用probe函數。

struct pci_driver {
    struct list_head node;
    const char *name;
    //static table holding the device IDs this driver supports
    const struct pci_device_id *id_table;   /* must be non-NULL for probe to be called */
    int  (*probe)  (struct pci_dev *dev, const struct pci_device_id *id);   /* New device inserted */
    void (*remove) (struct pci_dev *dev);   /* Device removed (NULL if not a hot-plug capable driver) */
    int  (*suspend) (struct pci_dev *dev, pm_message_t state);  /* Device suspended */
    int  (*suspend_late) (struct pci_dev *dev, pm_message_t state);
    int  (*resume_early) (struct pci_dev *dev);
    int  (*resume) (struct pci_dev *dev);                   /* Device woken up */
    void (*shutdown) (struct pci_dev *dev);
    int (*sriov_configure) (struct pci_dev *dev, int num_vfs); /* PF pdev */
    const struct pci_error_handlers *err_handler;
    struct device_driver    driver;
    //dynamic table; IDs are added at run time by writing to the new_id file
    struct pci_dynids dynids;
};

調用函數pci_register_driver注冊pci設備驅動。

static struct pci_driver igbuio_pci_driver = {
    .name = "igb_uio",
    .id_table = NULL,  //the id_table of the drivers DPDK uses (igb_uio, vfio-pci, ...) is empty by default
    .probe = igbuio_pci_probe,
    .remove = igbuio_pci_remove,
};
pci_register_driver(&igbuio_pci_driver);


//static ID table of the regular kernel igb driver (excerpt)
static const struct pci_device_id igb_pci_tbl[] = {
    { PCI_VDEVICE(INTEL, E1000_DEV_ID_I354_BACKPLANE_1GBPS) },
    { PCI_VDEVICE(INTEL, E1000_DEV_ID_I354_SGMII) },
    ...
}

static struct pci_driver igb_driver = {
    .name     = igb_driver_name,
    .id_table = igb_pci_tbl,  //a regular kernel driver always has a static id_table
    .probe    = igb_probe,
    .remove   = igb_remove,
#ifdef CONFIG_PM
    .driver.pm = &igb_pm_ops,
#endif
    .shutdown = igb_shutdown,
    .sriov_configure = igb_pci_sriov_configure,
    .err_handler = &igb_err_handler
};
pci_register_driver(&igb_driver);

注冊驅動后,會在/sys/bus/pci/drivers目錄下創建以驅動名字命名的目錄,并在此目錄下創建new_id, bind和unbind等sys文件,可以通過這些文件動態修改驅動信息。

/*
 * pci_register_driver must be a macro so that KBUILD_MODNAME can be expanded
 */
#define pci_register_driver(driver)     \
    __pci_register_driver(driver, THIS_MODULE, KBUILD_MODNAME)

//Excerpt: the indented lines under a call show that callee's body inline.
int __pci_register_driver(struct pci_driver *drv, struct module *owner,
              const char *mod_name)
{
    /* initialize common driver fields */
    drv->driver.name = drv->name;
    //the bus is always pci_bus_type
    drv->driver.bus = &pci_bus_type;
    drv->driver.owner = owner;
    drv->driver.mod_name = mod_name;

    spin_lock_init(&drv->dynids.lock);
    INIT_LIST_HEAD(&drv->dynids.list);

    /* register with core */
    driver_register(&drv->driver);
        bus_add_driver(drv);
            struct bus_type *bus;
            struct driver_private *priv;
            
            bus = bus_get(drv->bus);
            priv = kzalloc(sizeof(*priv), GFP_KERNEL);
            klist_init(&priv->klist_devices, NULL, NULL);
            priv->driver = drv;
            drv->p = priv;
            priv->kobj.kset = bus->p->drivers_kset;
            kobject_init_and_add(&priv->kobj, &driver_ktype, NULL, "%s", drv->name);

            //add the driver to the PCI bus
            klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers);

            //if the PCI bus supports autoprobe, walk all PCI devices for a match when the driver is loaded
            if (drv->bus->p->drivers_autoprobe) {
                driver_attach(drv);
                    //walk all PCI devices and match each one against drv
                    bus_for_each_dev(drv->bus, NULL, drv, __driver_attach);
                        //match the device against the driver
                        driver_match_device(drv, dev)
                        //on a match, and if the device has no driver bound yet, use the current driver drv
                        if (!dev->driver)
                            driver_probe_device(drv, dev);
            }

            module_add_driver(drv->owner, drv);

            driver_create_file(drv, &driver_attr_uevent);

            //bus->drv_groups is pci_drv_groups,
            //this creates the new_id and remove_id files in sysfs
            driver_add_groups(drv, bus->drv_groups);
            
            //create the bind and unbind files in sysfs, used to bind the driver to / unbind it from a device
            if (!drv->suppress_bind_attrs) {
                add_bind_files(drv);
                    driver_create_file(drv, &driver_attr_unbind);
                    driver_create_file(drv, &driver_attr_bind);
            }
}

向new_id寫入"0x8086 0x1521"信息(0x8086表示vendor id,0x1521為device id)時,會調用kernel中的store_new_id,解析相關字段后,保存到動態鏈表dynids,然后遍歷當前所有的pci設備進行匹配。

//defines struct driver_attribute driver_attr_new_id
static DRIVER_ATTR(new_id, S_IWUSR, NULL, store_new_id);
//defines struct driver_attribute driver_attr_remove_id
static DRIVER_ATTR(remove_id, S_IWUSR, NULL, store_remove_id);

//defines the attribute groups pci_drv_groups (through ATTRIBUTE_GROUPS below)
static struct attribute *pci_drv_attrs[] = {
    &driver_attr_new_id.attr,
    &driver_attr_remove_id.attr,
    NULL,
};
ATTRIBUTE_GROUPS(pci_drv);

//sysfs store handler for new_id (excerpt): parses the ID fields from buf and
//appends them to the driver's dynamic ID list; the body of pci_add_dynid is shown inline.
static ssize_t store_new_id(struct device_driver *driver, const char *buf,size_t count)
    fields = sscanf(buf, "%x %x %x %x %x %x %lx",
            &vendor, &device, &subvendor, &subdevice,
            &class, &class_mask, &driver_data);
    //at least vendor and device must be supplied
    if (fields < 2)
        return -EINVAL;
        
    pci_add_dynid(pdrv, vendor, device, subvendor, subdevice, class, class_mask, driver_data);  
        struct pci_dynid *dynid;

        dynid = kzalloc(sizeof(*dynid), GFP_KERNEL);
        dynid->id.vendor = vendor;
        dynid->id.device = device;
        dynid->id.subvendor = subvendor;
        dynid->id.subdevice = subdevice;
        dynid->id.class = class;
        dynid->id.class_mask = class_mask;
        dynid->id.driver_data = driver_data;

        spin_lock(&drv->dynids.lock);
        list_add_tail(&dynid->node, &drv->dynids.list);
        spin_unlock(&drv->dynids.lock);

        //adding a new id also triggers automatic device matching
        return driver_attach(&drv->driver);

向bind文件寫入網卡的pci地址時,會調用kernel中的bind_store,將此網卡綁定到此驅動。
向unbind文件寫入網卡的pci地址時,會調用kernel中的unbind_store,將此網卡和此驅動解綁。

//defines struct driver_attribute driver_attr_bind; writing the file calls bind_store
static DRIVER_ATTR_WO(bind);
//defines struct driver_attribute driver_attr_unbind; writing the file calls unbind_store
static DRIVER_ATTR_WO(unbind);

/*
 * Manually attach a device to a driver.
 * Note: the driver must want to bind to the device,
 * it is not possible to override the driver's id table.
 */
static ssize_t bind_store(struct device_driver *drv, const char *buf, size_t count)
    //look the device up by the name written to the file
    dev = bus_find_device_by_name(bus, NULL, buf);
    //probe only when the device exists, has no driver yet and matches drv
    if (dev && dev->driver == NULL && driver_match_device(drv, dev)) {
        if (dev->parent)    /* Needed for USB */
            device_lock(dev->parent);
        device_lock(dev);
        err = driver_probe_device(drv, dev);
        device_unlock(dev);
        if (dev->parent)
            device_unlock(dev->parent);

        if (err > 0) {
            /* success */
            err = count;
        } else if (err == 0) {
            /* driver didn't accept device */
            err = -ENODEV;
        }
    }
    
/*
 * Manually detach a device from its associated driver.
 *
 * @drv:   driver whose "unbind" attribute was written
 * @buf:   name of the device to detach, looked up on drv's bus
 * @count: number of bytes written
 *
 * Returns count when the device was bound to drv and has been released,
 * -ENODEV otherwise.
 */
static ssize_t unbind_store(struct device_driver *drv, const char *buf, size_t count)
{
    struct bus_type *bus = bus_get(drv->bus);
    struct device *dev = bus_find_device_by_name(bus, NULL, buf);
    int err = -ENODEV;

    /* Nothing to release unless the device exists and is bound to this driver. */
    if (!dev || dev->driver != drv)
        goto out;

    if (dev->parent)    /* Needed for USB */
        device_lock(dev->parent);
    device_release_driver(dev);
    if (dev->parent)
        device_unlock(dev->parent);
    err = count;

out:
    /* Both puts run on every path, success or not. */
    put_device(dev);
    bus_put(bus);
    return err;
}

發現pci設備
系統啟動時會掃描所有的pci設備,以它們的pci地址為名字創建目錄,并在此目錄下創建相關的sys文件。并且會遍歷所有的pci設備驅動進行匹配。

pci_scan_root_bus
    pci_scan_child_bus(b);
        pci_scan_slot
            pci_scan_single_device
                pci_scan_device
                pci_device_add
                    device_add(&dev->dev);
                        bus_add_device(dev);
                            //bus->dev_groups is pci_dev_groups,
                            //this creates attribute files such as vendor and device under /sys/bus/pci/devices/'pci address'/
                            device_add_groups(dev, bus->dev_groups);
                            //add the device to the PCI bus device list
                            klist_add_tail(&dev->p->knode_bus, &bus->p->klist_devices);

    pci_bus_add_devices
        pci_bus_add_device
            pci_create_sysfs_dev_files(dev);
                //if the PCI config space is larger than PCI_CFG_SPACE_SIZE (256 bytes), create /sys/bus/pci/devices/0000:81:00.0/config
                //with a size of 4096 bytes
                if (pdev->cfg_size > PCI_CFG_SPACE_SIZE)
                    retval = sysfs_create_bin_file(&pdev->dev.kobj, &pcie_config_attr);
                else //otherwise the config file is 256 bytes
                    retval = sysfs_create_bin_file(&pdev->dev.kobj, &pci_config_attr);

                //create the resource files; user space can mmap resource0 to access the NIC registers
                pci_create_resource_files(pdev);
                    //create files such as /sys/bus/pci/devices/0000:81:00.0/resource0
                    /* Expose the PCI resources from this device as files */
                    for (i = 0; i < PCI_ROM_RESOURCE; i++) {
                        /* skip empty resources */
                        if (!pci_resource_len(pdev, i))
                            continue;

                        retval = pci_create_attr(pdev, i, 0);
                            struct bin_attribute *res_attr;
                            res_attr = kzalloc(sizeof(*res_attr) + name_len, GFP_ATOMIC);
                            sysfs_bin_attr_init(res_attr);
                            if (write_combine) {
                                pdev->res_attr_wc[num] = res_attr;
                                sprintf(res_attr_name, "resource%d_wc", num);
                                res_attr->mmap = pci_mmap_resource_wc;
                            } else {
                                pdev->res_attr[num] = res_attr;
                                sprintf(res_attr_name, "resource%d", num);
                                res_attr->mmap = pci_mmap_resource_uc;
                            }
                            if (pci_resource_flags(pdev, num) & IORESOURCE_IO) {
                                res_attr->read = pci_read_resource_io;
                                res_attr->write = pci_write_resource_io;
                            }
                            res_attr->attr.name = res_attr_name;
                            res_attr->attr.mode = S_IRUSR | S_IWUSR;
                            res_attr->size = pci_resource_len(pdev, num);
                            res_attr->private = &pdev->resource[num];
                            //create the sysfs file
                            sysfs_create_bin_file(&pdev->dev.kobj, res_attr);

                        /* for prefetchable resources, create a WC mappable file */
                        if (!retval && pdev->resource[i].flags & IORESOURCE_PREFETCH)
                            retval = pci_create_attr(pdev, i, 1);
                    }

            //try to match a driver
            device_attach(&dev->dev);
                //walk all drivers to see whether one matches this device
                bus_for_each_drv(dev->bus, NULL, dev, __device_attach);
                    //check whether the driver and the device match
                    driver_match_device
                        //pci_bus_match
                        drv->bus->match
                            pci_match_device(pci_drv, pci_dev);
                    //if one matches, call the driver's probe function
                    driver_probe_device
                        really_probe(dev, drv);
                            //pci_device_probe
                            dev->bus->probe
                                __pci_device_probe
                                    pci_call_probe
                                        local_pci_probe
                                            pci_drv->probe(pci_dev, ddi->id);

向設備的driver_override文件寫入驅動名字,表示此設備只能綁定到此驅動。

//sysfs store handler for a device's driver_override file (excerpt)
static ssize_t driver_override_store(struct device *dev,
                     struct device_attribute *attr,
                     const char *buf, size_t count)
    struct pci_dev *pdev = to_pci_dev(dev);
    //copy the name written by user space and remember it on the PCI device
    driver_override = kstrndup(buf, count, GFP_KERNEL);
    pdev->driver_override = driver_override;

如何匹配?
前面多次提到設備和驅動進行匹配,究竟如何匹配呢?

先看一下用來表示一個pci設備的結構體pci_dev,其中如下幾個成員變量表示此pci設備的類型,一般vendor和device就足夠,vendor表示此設備是哪個廠商的,device表示此設備的類型。

/* Excerpt: only the identification fields that driver matching compares. */
struct pci_dev {
    ...
    unsigned short  vendor;
    unsigned short  device;
    unsigned short  subsystem_vendor;
    unsigned short  subsystem_device;
    unsigned int    class;      /* 3 bytes: (base,sub,prog-if) */
    ...
}

再看一下用來表示設備驅動的pci_driver,其中id_table和dynids用來保存此驅動支持的設備類型,前者是靜態值,后者可以通過驅動目錄下的new_id動態添加。設備類型使用pci_device_id結構體來表示,其成員變量也是vendor,device等信息,和pci_dev中的信息是一樣的,所以可以使用這幾個字段進行匹配。

/* One device ID entry a driver can match; PCI_ANY_ID acts as a wildcard in the ID fields. */
struct pci_device_id {
    __u32 vendor, device;       /* Vendor and device ID or PCI_ANY_ID*/
    __u32 subvendor, subdevice; /* Subsystem ID's or PCI_ANY_ID */
    __u32 class, class_mask;    /* (class,subclass,prog-if) triplet */
    kernel_ulong_t driver_data; /* Data private to the driver */
};

struct pci_driver {
    struct pci_device_id *id_table
    struct pci_dynids dynids;
    ...
}

最終使用函數pci_match_device進行驅動和設備的匹配。

/* Wildcard ID; pci_match_device returns it when only driver_override produced the match. */
static const struct pci_device_id pci_device_id_any = {
    .vendor = PCI_ANY_ID,
    .device = PCI_ANY_ID,
    .subvendor = PCI_ANY_ID,
    .subdevice = PCI_ANY_ID,
};

static const struct pci_device_id *pci_match_device(struct pci_driver *drv, struct pci_dev *dev)
    //if the device has driver_override set, it can only bind to the driver named there;
    //if this is not that driver, return NULL right away
    /* When driver_override is set, only bind to the matching driver */
    if (dev->driver_override && strcmp(dev->driver_override, drv->name))
        return NULL;

    //first search the driver's dynamic ID list for a match with the device
    /* Look at the dynamic ids first, before the static ones */
    spin_lock(&drv->dynids.lock);
    list_for_each_entry(dynid, &drv->dynids.list, node) {
        if (pci_match_one_device(&dynid->id, dev)) {
            found_id = &dynid->id;
            break;
        }
    }
    spin_unlock(&drv->dynids.lock);

    //if nothing matched, search the driver's static table
    //(the indented loop below is the body of pci_match_id, shown inline)
    if (!found_id)
        found_id = pci_match_id(drv->id_table, dev);
            while (ids->vendor || ids->subvendor || ids->class_mask) {
                if (pci_match_one_device(ids, dev))
                    return ids;
                ids++;
            }

    //if still nothing matched but a driver was specified, force a match and return pci_device_id_any
    /* driver_override will always match, send a dummy id */
    if (!found_id && dev->driver_override)
        found_id = &pci_device_id_any;

    return found_id;

//The concrete matching rule: each ID field must be the PCI_ANY_ID wildcard or
//equal the device's value, and the class must agree on the bits set in class_mask.
//Returns id on a match, NULL otherwise.
static inline const struct pci_device_id *
pci_match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
{
    if (id->vendor != PCI_ANY_ID && id->vendor != dev->vendor)
        return NULL;
    if (id->device != PCI_ANY_ID && id->device != dev->device)
        return NULL;
    if (id->subvendor != PCI_ANY_ID && id->subvendor != dev->subsystem_vendor)
        return NULL;
    if (id->subdevice != PCI_ANY_ID && id->subdevice != dev->subsystem_device)
        return NULL;
    /* Any difference within the masked class bits rejects the device. */
    if ((id->class ^ dev->class) & id->class_mask)
        return NULL;
    return id;
}

綁定到 igb_uio 驅動

網卡如何綁定到igb_uio驅動呢?這里拿DPDK提供的腳本文件dpdk-devbind.py中的函數bind_one進行分析。

def bind_one(dev_id, driver, force):
    '''Bind the device given by "dev_id" to the driver "driver". If the device
    is already bound to a different driver, it will be unbound first'''
    dev = devices[dev_id]
    saved_driver = None  # used to rollback any unbind in case of failure

    # If the NIC is already bound to a driver, check whether it is the
    # requested one and return if so; otherwise unbind the current driver.
    # unbind_one only has to write the NIC's PCI address to that driver's
    # unbind file.
    # unbind any existing drivers we don't want
    if has_driver(dev_id):
        if dev["Driver_str"] == driver:
            print("%s already bound to driver %s, skipping\n"
                  % (dev_id, driver))
            return
        else:
            saved_driver = dev["Driver_str"]
            unbind_one(dev_id, force)
            dev["Driver_str"] = ""  # clear driver string

    # The binding method depends on the kernel version.
    # On kernels >= 3.15 the driver name is first written to the NIC's
    # driver_override file to select this driver.
    # On kernels < 3.15 the NIC's vendor and device id are written to the
    # driver's new_id file instead.
    # Why not use new_id on >= 3.15? Because on newer kernels new_id does not
    # only add the device type to the driver's dynamic list, it also walks all
    # devices and binds every device of that type to the driver. Binding all
    # NICs of the same type when only one was wanted would be awkward.
    # For kernels >= 3.15 driver_override can be used to specify the driver
    # for a device rather than relying on the driver to provide a positive
    # match of the device.  The existing process of looking up
    # the vendor and device ID, adding them to the driver new_id,
    # will erroneously bind other devices too which has the additional burden
    # of unbinding those devices
    if driver in dpdk_drivers:
        filename = "/sys/bus/pci/devices/%s/driver_override" % dev_id
        if os.path.exists(filename):
            try:
                f = open(filename, "w")
            except:
                print("Error: bind failed for %s - Cannot open %s"
                      % (dev_id, filename))
                return
            try:
                f.write("%s" % driver)
                f.close()
            except:
                print("Error: bind failed for %s - Cannot write driver %s to "
                      "PCI ID " % (dev_id, driver))
                return
        # For kernels < 3.15 use new_id to add PCI id's to the driver
        else:
            filename = "/sys/bus/pci/drivers/%s/new_id" % driver
            try:
                f = open(filename, "w")
            except:
                print("Error: bind failed for %s - Cannot open %s"
                      % (dev_id, filename))
                return
            try:
                # Convert Device and Vendor Id to int to write to new_id
                f.write("%04x %04x" % (int(dev["Vendor"],16),
                        int(dev["Device"], 16)))
                f.close()
            except:
                print("Error: bind failed for %s - Cannot write new PCI ID to "
                      "driver %s" % (dev_id, driver))
                return

    # The second step writes the NIC's PCI address to the driver's
    # /sys/bus/pci/drivers/%s/bind file, which binds the NIC to the driver.
    # do the bind by writing to /sys
    filename = "/sys/bus/pci/drivers/%s/bind" % driver
    try:
        f = open(filename, "a")
    except:
        print("Error: bind failed for %s - Cannot open %s"
              % (dev_id, filename))
        if saved_driver is not None:  # restore any previous driver
            bind_one(dev_id, saved_driver, force)
        return
    try:
        f.write(dev_id)
        f.close()
    except:
        # for some reason, closing dev_id after adding a new PCI ID to new_id
        # results in IOError. however, if the device was successfully bound,
        # we don't care for any errors and can safely ignore IOError
        tmp = get_pci_device_details(dev_id, True)
        if "Driver_str" in tmp and tmp["Driver_str"] == driver:
            return
        print("Error: bind failed for %s - Cannot bind to driver %s"
              % (dev_id, driver))
        if saved_driver is not None:  # restore any previous driver
            bind_one(dev_id, saved_driver, force)
        return

    # On kernels >= 3.15 the driver_override file must also be cleared
    # afterwards so that the device can later be bound to another driver.
    # For kernels >= 3.15 driver_override is used to bind a device to a driver.
    # Before unbinding it, overwrite driver_override with empty string so that
    # the device can be bound to any other driver
    filename = "/sys/bus/pci/devices/%s/driver_override" % dev_id
    if os.path.exists(filename):
        try:
            f = open(filename, "w")
        except:
            print("Error: unbind failed for %s - Cannot open %s"
                  % (dev_id, filename))
            sys.exit(1)
        try:
            f.write("\00")
            f.close()
        except:
            print("Error: unbind failed for %s - Cannot open %s"
                  % (dev_id, filename))
            sys.exit(1)

igb_uio驅動的id_table為空,則在加載此驅動時,是不會匹配到任何設備的。

static struct pci_driver igbuio_pci_driver = {
    .name = "igb_uio",
    .id_table = NULL,  //the id_table of the drivers DPDK uses (igb_uio, vfio-pci, ...) is empty by default
    .probe = igbuio_pci_probe,
    .remove = igbuio_pci_remove,
};

經過上面的分析,有三種方法可以將網卡綁定到驅動igb_uio

a. 如果kernel版本大于等于3.15,先向網卡的文件 /sys/bus/pci/devices/'pci address'/driver_override 寫入驅動名字igb_uio,再向驅動igb_uio的文件 /sys/bus/pci/drivers/igb_uio/bind寫入網卡的pci地址即可。
b. 如果kernel版本大于等于3.15,向驅動igb_uio的文件 /sys/bus/pci/drivers/igb_uio/new_id寫入網卡的vendor和device id,則會自動將所有此類型并且沒有綁定到任何驅動的網卡綁定到igb_uio。
c. 如果kernel版本小于3.15,先向驅動igb_uio的文件 /sys/bus/pci/drivers/igb_uio/new_id寫入網卡的vendor和device id,再向驅動igb_uio的文件 /sys/bus/pci/drivers/igb_uio/bind寫入網卡的pci地址即可。注意低版本的kernel,在向new_id寫入值時,只會將設備類型添加到此驅動的動態鏈表,而不會自動探測設備。

igb_uio probe
經過前面的分析,網卡綁定到了igb_uio驅動后,會調用驅動的probe函數igbuio_pci_probe,主要做了如下幾個事情:
a. 調用pci_enable_device使能pci設備
b. 設置DMA mask
c. 填充struct uio_info信息,注冊uio設備
d. 注冊中斷處理函數(新版本的DPDK中,推遲到用戶態open /dev/uiox時,在igbuio_pci_open中注冊)

static int
igbuio_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
    struct rte_uio_pci_dev *udev;
    dma_addr_t map_dma_addr;
    void *map_addr;

    udev = kzalloc(sizeof(struct rte_uio_pci_dev), GFP_KERNEL);
    
    //enable the PCI device
    /*
     * enable device: ask low-level code to enable I/O and
     * memory
     */
    pci_enable_device(dev);

    /* enable bus mastering on the device */
    pci_set_master(dev);
    
    //save the device's memory BARs into struct uio_info->mem,
    //and its I/O BARs into struct uio_info->port
    /* remap IO memory */
    igbuio_setup_bars(dev, &udev->info);
    
    /* set 64-bit DMA mask */
    pci_set_dma_mask(dev,  DMA_BIT_MASK(64));
    pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(64));

    //fill in the remaining struct uio_info fields
    /* fill uio infos */
    udev->info.name = "igb_uio";
    udev->info.version = "0.1";
    udev->info.irqcontrol = igbuio_pci_irqcontrol;
    udev->info.open = igbuio_pci_open;
    udev->info.release = igbuio_pci_release;
    udev->info.priv = udev;
    udev->pdev = dev;
    
    //create the /sys/bus/pci/devices/'pci address'/max_vf file;
    //writing to it creates VFs, so VFs can still be created
    //even when the NIC is bound to igb_uio.
    sysfs_create_group(&dev->dev.kobj, &dev_attr_grp);
    
    //register the uio device; this creates the /dev/uiox character device file
    //and the directory /sys/bus/pci/devices/'pci address'/uio/uiox
    /* register uio driver */
    uio_register_device(&dev->dev, &udev->info);

    //store the struct rte_uio_pci_dev in dev->driver_data
    pci_set_drvdata(dev, udev);
        dev_set_drvdata(&pdev->dev, data);
            dev->driver_data = data;

宏uio_register_device用來注冊uio設備。

/* use a define to avoid include chaining to get THIS_MODULE */
#define uio_register_device(parent, info) \
    __uio_register_device(THIS_MODULE, parent, info)

int __uio_register_device(struct module *owner,
              struct device *parent,
              struct uio_info *info)
    //build a uio_device from the uio_info
    struct uio_device *idev;
    idev = devm_kzalloc(parent, sizeof(*idev), GFP_KERNEL);

    idev->owner = owner;
    idev->info = info;
    init_waitqueue_head(&idev->wait);
    atomic_set(&idev->event, 0);

    //allocate the smallest unused id and store it in idev->minor
    uio_get_minor(idev);
    //create the character device /dev/uiox
    idev->dev = device_create(&uio_class, parent, MKDEV(uio_major, idev->minor), idev, "uio%d", idev->minor);
    
    //create the maps directory under /sys/class/uio/uiox/; based on struct uio_info->mem and port,
    //it gets mapx and portx directories holding the matching information: start address, name, offset and size.
    //User space could mmap the files under mapx to access the NIC registers.
    //DPDK does not do that, it mmaps /sys/bus/pci/devices/'pci address'/resource0 directly.
    uio_dev_add_attributes(idev);
    info->uio_dev = idev;
    
    //register the interrupt. In newer DPDK versions info->irq is not set at uio registration time,
    //so the interrupt is registered in igbuio_pci_open when user space opens /dev/uiox.
    if (info->irq && (info->irq != UIO_IRQ_CUSTOM)) {
        devm_request_irq(idev->dev, info->irq, uio_interrupt, info->irq_flags, info->name, idev);
    }

簡單總結一下,igb_uio是DPDK使用網卡的一個通用驅動,不只intel網卡可以用,其他廠商的網卡也可以用(有一個例外,mellanox的網卡不用綁定到igb_uio就能被DPDK使用),因為它只使能了pci設備,注冊uio,和注冊中斷處理函數,這些工作是不區分網卡類型的。
加載igb_uio時,不會自動探測pci設備,而是需要寫sys文件將設備綁定到igb_uio。

igb_uio依賴uio驅動,注冊uio設備后,會生成/dev/uiox,和網卡一一對應,用戶態可以poll /dev/uiox監聽中斷是否到來。
同時uio設備還會將網卡的BAR地址通過sys文件系統暴露出去,用戶態可以mmap sys文件后操作網卡寄存器。但是DPDK沒有采用這種方式,而是直接mmap網卡自身暴露出去的sys文件 /sys/bus/pci/devices/'pci address'/resource0。

參考

https://www.cnblogs.com/jungle1996/p/12398915.html
https://www.cnblogs.com/jungle1996/p/12452636.html

最后編輯于
©著作權歸作者所有,轉載或內容合作請聯系作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容