diff options
Diffstat (limited to 'drivers/vfio/pci/vfio_pci_core.c')
| -rw-r--r-- | drivers/vfio/pci/vfio_pci_core.c | 254 |
1 files changed, 155 insertions, 99 deletions
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 06b6f3594a13..a0d69ddaf90d 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -156,7 +156,7 @@ no_mmap: } struct vfio_pci_group_info; -static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set); +static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set); static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, struct vfio_pci_group_info *groups); @@ -217,6 +217,10 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat bool needs_restore = false, needs_save = false; int ret; + /* Prevent changing power state for PFs with VFs enabled */ + if (pci_num_vf(pdev) && state > PCI_D0) + return -EBUSY; + if (vdev->needs_pm_restore) { if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) { pci_save_state(pdev); @@ -255,6 +259,17 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat return ret; } +/* + * The dev_pm_ops needs to be provided to make pci-driver runtime PM working, + * so use structure without any callbacks. + * + * The pci-driver core runtime PM routines always save the device state + * before going into suspended state. If the device is going into low power + * state with only with runtime PM ops, then no explicit handling is needed + * for the devices which have NoSoftRst-. + */ +static const struct dev_pm_ops vfio_pci_core_pm_ops = { }; + int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; @@ -262,21 +277,23 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) u16 cmd; u8 msix_pos; - vfio_pci_set_power_state(vdev, PCI_D0); + if (!disable_idle_d3) { + ret = pm_runtime_resume_and_get(&pdev->dev); + if (ret < 0) + return ret; + } /* Don't allow our initial saved state to include busmaster */ pci_clear_master(pdev); ret = pci_enable_device(pdev); if (ret) - return ret; + goto out_power; /* If reset fails because of the device lock, fail this path entirely */ ret = pci_try_reset_function(pdev); - if (ret == -EAGAIN) { - pci_disable_device(pdev); - return ret; - } + if (ret == -EAGAIN) + goto out_disable_device; vdev->reset_works = !ret; pci_save_state(pdev); @@ -300,12 +317,8 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) } ret = vfio_config_init(vdev); - if (ret) { - kfree(vdev->pci_saved_state); - vdev->pci_saved_state = NULL; - pci_disable_device(pdev); - return ret; - } + if (ret) + goto out_free_state; msix_pos = pdev->msix_cap; if (msix_pos) { @@ -326,6 +339,16 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) return 0; + +out_free_state: + kfree(vdev->pci_saved_state); + vdev->pci_saved_state = NULL; +out_disable_device: + pci_disable_device(pdev); +out_power: + if (!disable_idle_d3) + pm_runtime_put(&pdev->dev); + return ret; } EXPORT_SYMBOL_GPL(vfio_pci_core_enable); @@ -433,8 +456,11 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) out: pci_disable_device(pdev); - if (!vfio_pci_dev_set_try_reset(vdev->vdev.dev_set) && !disable_idle_d3) - vfio_pci_set_power_state(vdev, PCI_D3hot); + vfio_pci_dev_set_try_reset(vdev->vdev.dev_set); + + /* Put the pm-runtime usage counter acquired during enable */ + if (!disable_idle_d3) + pm_runtime_put(&pdev->dev); } EXPORT_SYMBOL_GPL(vfio_pci_core_disable); @@ -556,7 +582,7 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) struct vfio_pci_group_info { int count; - struct vfio_group **groups; + struct file **files; }; static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) @@ -1018,10 +1044,10 @@ reset_info_exit: } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { struct vfio_pci_hot_reset hdr; int32_t *group_fds; - struct vfio_group **groups; + struct file **files; struct vfio_pci_group_info info; bool slot = false; - int group_idx, count = 0, ret = 0; + int file_idx, count = 0, ret = 0; minsz = offsetofend(struct vfio_pci_hot_reset, count); @@ -1054,17 +1080,17 @@ reset_info_exit: return -EINVAL; group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); - groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL); - if (!group_fds || !groups) { + files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL); + if (!group_fds || !files) { kfree(group_fds); - kfree(groups); + kfree(files); return -ENOMEM; } if (copy_from_user(group_fds, (void __user *)(arg + minsz), hdr.count * sizeof(*group_fds))) { kfree(group_fds); - kfree(groups); + kfree(files); return -EFAULT; } @@ -1073,22 +1099,22 @@ reset_info_exit: * user interface and store the group and iommu ID. This * ensures the group is held across the reset. */ - for (group_idx = 0; group_idx < hdr.count; group_idx++) { - struct vfio_group *group; - struct fd f = fdget(group_fds[group_idx]); - if (!f.file) { + for (file_idx = 0; file_idx < hdr.count; file_idx++) { + struct file *file = fget(group_fds[file_idx]); + + if (!file) { ret = -EBADF; break; } - group = vfio_group_get_external_user(f.file); - fdput(f); - if (IS_ERR(group)) { - ret = PTR_ERR(group); + /* Ensure the FD is a vfio group FD.*/ + if (!vfio_file_iommu_group(file)) { + fput(file); + ret = -EINVAL; break; } - groups[group_idx] = group; + files[file_idx] = file; } kfree(group_fds); @@ -1098,15 +1124,15 @@ reset_info_exit: goto hot_reset_release; info.count = hdr.count; - info.groups = groups; + info.files = files; ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); hot_reset_release: - for (group_idx--; group_idx >= 0; group_idx--) - vfio_group_put_external_user(groups[group_idx]); + for (file_idx--; file_idx >= 0; file_idx--) + fput(files[file_idx]); - kfree(groups); + kfree(files); return ret; } else if (cmd == VFIO_DEVICE_IOEVENTFD) { struct vfio_device_ioeventfd ioeventfd; @@ -1819,8 +1845,13 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_uninit_device); int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; + struct device *dev = &pdev->dev; int ret; + /* Drivers must set the vfio_pci_core_device to their drvdata */ + if (WARN_ON(vdev != dev_get_drvdata(dev))) + return -EINVAL; + if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) return -EINVAL; @@ -1860,19 +1891,21 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) vfio_pci_probe_power_state(vdev); - if (!disable_idle_d3) { - /* - * pci-core sets the device power state to an unknown value at - * bootup and after being removed from a driver. The only - * transition it allows from this unknown state is to D0, which - * typically happens when a driver calls pci_enable_device(). - * We're not ready to enable the device yet, but we do want to - * be able to get to D3. Therefore first do a D0 transition - * before going to D3. - */ - vfio_pci_set_power_state(vdev, PCI_D0); - vfio_pci_set_power_state(vdev, PCI_D3hot); - } + /* + * pci-core sets the device power state to an unknown value at + * bootup and after being removed from a driver. The only + * transition it allows from this unknown state is to D0, which + * typically happens when a driver calls pci_enable_device(). + * We're not ready to enable the device yet, but we do want to + * be able to get to D3. Therefore first do a D0 transition + * before enabling runtime PM. + */ + vfio_pci_set_power_state(vdev, PCI_D0); + + dev->driver->pm = &vfio_pci_core_pm_ops; + pm_runtime_allow(dev); + if (!disable_idle_d3) + pm_runtime_put(dev); ret = vfio_register_group_dev(&vdev->vdev); if (ret) @@ -1881,7 +1914,9 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) out_power: if (!disable_idle_d3) - vfio_pci_set_power_state(vdev, PCI_D0); + pm_runtime_get_noresume(dev); + + pm_runtime_forbid(dev); out_vf: vfio_pci_vf_uninit(vdev); return ret; @@ -1890,9 +1925,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_register_device); void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) { - struct pci_dev *pdev = vdev->pdev; - - vfio_pci_core_sriov_configure(pdev, 0); + vfio_pci_core_sriov_configure(vdev, 0); vfio_unregister_group_dev(&vdev->vdev); @@ -1900,21 +1933,16 @@ void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) vfio_pci_vga_uninit(vdev); if (!disable_idle_d3) - vfio_pci_set_power_state(vdev, PCI_D0); + pm_runtime_get_noresume(&vdev->pdev->dev); + + pm_runtime_forbid(&vdev->pdev->dev); } EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device); pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state) { - struct vfio_pci_core_device *vdev; - struct vfio_device *device; - - device = vfio_device_get_from_dev(&pdev->dev); - if (device == NULL) - return PCI_ERS_RESULT_DISCONNECT; - - vdev = container_of(device, struct vfio_pci_core_device, vdev); + struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); mutex_lock(&vdev->igate); @@ -1923,26 +1951,18 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, mutex_unlock(&vdev->igate); - vfio_device_put(device); - return PCI_ERS_RESULT_CAN_RECOVER; } EXPORT_SYMBOL_GPL(vfio_pci_core_aer_err_detected); -int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn) +int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev, + int nr_virtfn) { - struct vfio_pci_core_device *vdev; - struct vfio_device *device; + struct pci_dev *pdev = vdev->pdev; int ret = 0; device_lock_assert(&pdev->dev); - device = vfio_device_get_from_dev(&pdev->dev); - if (!device) - return -ENODEV; - - vdev = container_of(device, struct vfio_pci_core_device, vdev); - if (nr_virtfn) { mutex_lock(&vfio_pci_sriov_pfs_mutex); /* @@ -1957,22 +1977,42 @@ int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn) } list_add_tail(&vdev->sriov_pfs_item, &vfio_pci_sriov_pfs); mutex_unlock(&vfio_pci_sriov_pfs_mutex); - ret = pci_enable_sriov(pdev, nr_virtfn); + + /* + * The PF power state should always be higher than the VF power + * state. The PF can be in low power state either with runtime + * power management (when there is no user) or PCI_PM_CTRL + * register write by the user. If PF is in the low power state, + * then change the power state to D0 first before enabling + * SR-IOV. Also, this function can be called at any time, and + * userspace PCI_PM_CTRL write can race against this code path, + * so protect the same with 'memory_lock'. + */ + ret = pm_runtime_resume_and_get(&pdev->dev); if (ret) goto out_del; - ret = nr_virtfn; - goto out_put; + + down_write(&vdev->memory_lock); + vfio_pci_set_power_state(vdev, PCI_D0); + ret = pci_enable_sriov(pdev, nr_virtfn); + up_write(&vdev->memory_lock); + if (ret) { + pm_runtime_put(&pdev->dev); + goto out_del; + } + return nr_virtfn; } - pci_disable_sriov(pdev); + if (pci_num_vf(pdev)) { + pci_disable_sriov(pdev); + pm_runtime_put(&pdev->dev); + } out_del: mutex_lock(&vfio_pci_sriov_pfs_mutex); list_del_init(&vdev->sriov_pfs_item); out_unlock: mutex_unlock(&vfio_pci_sriov_pfs_mutex); -out_put: - vfio_device_put(device); return ret; } EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure); @@ -1988,7 +2028,7 @@ static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev, unsigned int i; for (i = 0; i < groups->count; i++) - if (groups->groups[i] == vdev->vdev.group) + if (vfio_file_has_dev(groups->files[i], &vdev->vdev)) return true; return false; } @@ -2041,6 +2081,27 @@ vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set) return pdev; } +static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set) +{ + struct vfio_pci_core_device *cur; + int ret; + + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { + ret = pm_runtime_resume_and_get(&cur->pdev->dev); + if (ret) + goto unwind; + } + + return 0; + +unwind: + list_for_each_entry_continue_reverse(cur, &dev_set->device_list, + vdev.dev_set_list) + pm_runtime_put(&cur->pdev->dev); + + return ret; +} + /* * We need to get memory_lock for each device, but devices can share mmap_lock, * therefore we need to zap and hold the vma_lock for each device, and only then @@ -2147,43 +2208,38 @@ static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set) * - At least one of the affected devices is marked dirty via * needs_reset (such as by lack of FLR support) * Then attempt to perform that bus or slot reset. - * Returns true if the dev_set was reset. */ -static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set) +static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set) { struct vfio_pci_core_device *cur; struct pci_dev *pdev; - int ret; + bool reset_done = false; if (!vfio_pci_dev_set_needs_reset(dev_set)) - return false; + return; pdev = vfio_pci_dev_set_resettable(dev_set); if (!pdev) - return false; + return; /* - * The pci_reset_bus() will reset all the devices in the bus. - * The power state can be non-D0 for some of the devices in the bus. - * For these devices, the pci_reset_bus() will internally set - * the power state to D0 without vfio driver involvement. - * For the devices which have NoSoftRst-, the reset function can - * cause the PCI config space reset without restoring the original - * state (saved locally in 'vdev->pm_save'). + * Some of the devices in the bus can be in the runtime suspended + * state. Increment the usage count for all the devices in the dev_set + * before reset and decrement the same after reset. */ - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) - vfio_pci_set_power_state(cur, PCI_D0); + if (!disable_idle_d3 && vfio_pci_dev_set_pm_runtime_get(dev_set)) + return; - ret = pci_reset_bus(pdev); - if (ret) - return false; + if (!pci_reset_bus(pdev)) + reset_done = true; list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { - cur->needs_reset = false; + if (reset_done) + cur->needs_reset = false; + if (!disable_idle_d3) - vfio_pci_set_power_state(cur, PCI_D3hot); + pm_runtime_put(&cur->pdev->dev); } - return true; } void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga, |
