diff options
author | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2016-09-11 04:34:46 -0300 |
---|---|---|
committer | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2016-09-11 04:34:46 -0300 |
commit | 863981e96738983919de841ec669e157e6bdaeb0 (patch) | |
tree | d6d89a12e7eb8017837c057935a2271290907f76 /drivers/nvme | |
parent | 8dec7c70575785729a6a9e6719a955e9c545bcab (diff) |
Linux-libre 4.7.1-gnupck-4.7.1-gnu
Diffstat (limited to 'drivers/nvme')
-rw-r--r-- | drivers/nvme/host/Kconfig | 2 | ||||
-rw-r--r-- | drivers/nvme/host/core.c | 329 | ||||
-rw-r--r-- | drivers/nvme/host/lightnvm.c | 82 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 92 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 281 |
5 files changed, 472 insertions, 314 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index c894841c6..d296fc3ae 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -18,7 +18,7 @@ config BLK_DEV_NVME_SCSI depends on NVME_CORE ---help--- This adds support for the SG_IO ioctl on the NVMe character - and block devices nodes, as well a a translation for a small + and block devices nodes, as well as a translation for a small number of selected SCSI commands to NVMe commands to the NVMe driver. If you don't know what this means you probably want to say N here, unless you run a distro that abuses the SCSI diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 643f45713..d5fb55c0a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -58,6 +58,64 @@ static DEFINE_SPINLOCK(dev_list_lock); static struct class *nvme_class; +bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, + enum nvme_ctrl_state new_state) +{ + enum nvme_ctrl_state old_state = ctrl->state; + bool changed = false; + + spin_lock_irq(&ctrl->lock); + switch (new_state) { + case NVME_CTRL_LIVE: + switch (old_state) { + case NVME_CTRL_RESETTING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + case NVME_CTRL_RESETTING: + switch (old_state) { + case NVME_CTRL_NEW: + case NVME_CTRL_LIVE: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + case NVME_CTRL_DELETING: + switch (old_state) { + case NVME_CTRL_LIVE: + case NVME_CTRL_RESETTING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + case NVME_CTRL_DEAD: + switch (old_state) { + case NVME_CTRL_DELETING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + default: + break; + } + spin_unlock_irq(&ctrl->lock); + + if (changed) + ctrl->state = new_state; + + return changed; +} +EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); + static void nvme_free_ns(struct kref *kref) { struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); @@ -138,6 +196,111 @@ struct request *nvme_alloc_request(struct request_queue *q, } EXPORT_SYMBOL_GPL(nvme_alloc_request); +static inline void nvme_setup_flush(struct nvme_ns *ns, + struct nvme_command *cmnd) +{ + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->common.opcode = nvme_cmd_flush; + cmnd->common.nsid = cpu_to_le32(ns->ns_id); +} + +static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd) +{ + struct nvme_dsm_range *range; + struct page *page; + int offset; + unsigned int nr_bytes = blk_rq_bytes(req); + + range = kmalloc(sizeof(*range), GFP_ATOMIC); + if (!range) + return BLK_MQ_RQ_QUEUE_BUSY; + + range->cattr = cpu_to_le32(0); + range->nlb = cpu_to_le32(nr_bytes >> ns->lba_shift); + range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->dsm.opcode = nvme_cmd_dsm; + cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); + cmnd->dsm.nr = 0; + cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + + req->completion_data = range; + page = virt_to_page(range); + offset = offset_in_page(range); + blk_add_request_payload(req, page, offset, sizeof(*range)); + + /* + * we set __data_len back to the size of the area to be discarded + * on disk. This allows us to report completion on the full amount + * of blocks described by the request. + */ + req->__data_len = nr_bytes; + + return 0; +} + +static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd) +{ + u16 control = 0; + u32 dsmgmt = 0; + + if (req->cmd_flags & REQ_FUA) + control |= NVME_RW_FUA; + if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + control |= NVME_RW_LR; + + if (req->cmd_flags & REQ_RAHEAD) + dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); + cmnd->rw.command_id = req->tag; + cmnd->rw.nsid = cpu_to_le32(ns->ns_id); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + + if (ns->ms) { + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + control |= NVME_RW_PRINFO_PRCHK_GUARD; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + control |= NVME_RW_PRINFO_PRCHK_GUARD | + NVME_RW_PRINFO_PRCHK_REF; + cmnd->rw.reftag = cpu_to_le32( + nvme_block_nr(ns, blk_rq_pos(req))); + break; + } + if (!blk_integrity_rq(req)) + control |= NVME_RW_PRINFO_PRACT; + } + + cmnd->rw.control = cpu_to_le16(control); + cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); +} + +int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmd) +{ + int ret = 0; + + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + memcpy(cmd, req->cmd, sizeof(*cmd)); + else if (req->cmd_flags & REQ_FLUSH) + nvme_setup_flush(ns, cmd); + else if (req->cmd_flags & REQ_DISCARD) + ret = nvme_setup_discard(ns, req, cmd); + else + nvme_setup_rw(ns, req, cmd); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_setup_cmd); + /* * Returns 0 on success. If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code @@ -566,10 +729,14 @@ static void nvme_init_integrity(struct nvme_ns *ns) switch (ns->pi_type) { case NVME_NS_DPS_PI_TYPE3: integrity.profile = &t10_pi_type3_crc; + integrity.tag_size = sizeof(u16) + sizeof(u32); + integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; break; case NVME_NS_DPS_PI_TYPE1: case NVME_NS_DPS_PI_TYPE2: integrity.profile = &t10_pi_type1_crc; + integrity.tag_size = sizeof(u16); + integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; break; default: integrity.profile = NULL; @@ -894,6 +1061,8 @@ EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, struct request_queue *q) { + bool vwc = false; + if (ctrl->max_hw_sectors) { u32 max_segments = (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; @@ -903,9 +1072,10 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, } if (ctrl->stripe_size) blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9); - if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) - blk_queue_flush(q, REQ_FLUSH | REQ_FUA); blk_queue_virt_boundary(q, ctrl->page_size - 1); + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + vwc = true; + blk_queue_write_cache(q, vwc, vwc); } /* @@ -1055,6 +1225,9 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, return ctrl->ops->reset_ctrl(ctrl); case NVME_IOCTL_SUBSYS_RESET: return nvme_reset_subsystem(ctrl); + case NVME_IOCTL_RESCAN: + nvme_queue_scan(ctrl); + return 0; default: return -ENOTTY; } @@ -1082,6 +1255,17 @@ static ssize_t nvme_sysfs_reset(struct device *dev, } static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); +static ssize_t nvme_sysfs_rescan(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + nvme_queue_scan(ctrl); + return count; +} +static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); + static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1185,6 +1369,7 @@ nvme_show_int_function(cntlid); static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, + &dev_attr_rescan_controller.attr, &dev_attr_model.attr, &dev_attr_serial.attr, &dev_attr_firmware_rev.attr, @@ -1209,19 +1394,22 @@ static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) return nsa->ns_id - nsb->ns_id; } -static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid) +static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) { - struct nvme_ns *ns; - - lockdep_assert_held(&ctrl->namespaces_mutex); + struct nvme_ns *ns, *ret = NULL; + mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) { - if (ns->ns_id == nsid) - return ns; + if (ns->ns_id == nsid) { + kref_get(&ns->kref); + ret = ns; + break; + } if (ns->ns_id > nsid) break; } - return NULL; + mutex_unlock(&ctrl->namespaces_mutex); + return ret; } static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) @@ -1230,8 +1418,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) struct gendisk *disk; int node = dev_to_node(ctrl->dev); - lockdep_assert_held(&ctrl->namespaces_mutex); - ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) return; @@ -1272,7 +1458,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (nvme_revalidate_disk(ns->disk)) goto out_free_disk; + mutex_lock(&ctrl->namespaces_mutex); list_add_tail(&ns->list, &ctrl->namespaces); + mutex_unlock(&ctrl->namespaces_mutex); + kref_get(&ctrl->kref); if (ns->type == NVME_NS_LIGHTNVM) return; @@ -1307,9 +1496,11 @@ static void nvme_ns_remove(struct nvme_ns *ns) blk_mq_abort_requeue_list(ns->queue); blk_cleanup_queue(ns->queue); } + mutex_lock(&ns->ctrl->namespaces_mutex); list_del_init(&ns->list); mutex_unlock(&ns->ctrl->namespaces_mutex); + nvme_put_ns(ns); } @@ -1317,10 +1508,11 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns; - ns = nvme_find_ns(ctrl, nsid); + ns = nvme_find_get_ns(ctrl, nsid); if (ns) { if (revalidate_disk(ns->disk)) nvme_ns_remove(ns); + nvme_put_ns(ns); } else nvme_alloc_ns(ctrl, nsid); } @@ -1349,9 +1541,11 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) nvme_validate_ns(ctrl, nsid); while (++prev < nsid) { - ns = nvme_find_ns(ctrl, prev); - if (ns) + ns = nvme_find_get_ns(ctrl, prev); + if (ns) { nvme_ns_remove(ns); + nvme_put_ns(ns); + } } } nn -= j; @@ -1361,13 +1555,11 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) return ret; } -static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn) +static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn) { struct nvme_ns *ns, *next; unsigned i; - lockdep_assert_held(&ctrl->namespaces_mutex); - for (i = 1; i <= nn; i++) nvme_validate_ns(ctrl, i); @@ -1377,38 +1569,118 @@ static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn) } } -void nvme_scan_namespaces(struct nvme_ctrl *ctrl) +static void nvme_scan_work(struct work_struct *work) { + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, scan_work); struct nvme_id_ctrl *id; unsigned nn; + if (ctrl->state != NVME_CTRL_LIVE) + return; + if (nvme_identify_ctrl(ctrl, &id)) return; - mutex_lock(&ctrl->namespaces_mutex); nn = le32_to_cpu(id->nn); if (ctrl->vs >= NVME_VS(1, 1) && !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) { if (!nvme_scan_ns_list(ctrl, nn)) goto done; } - __nvme_scan_namespaces(ctrl, le32_to_cpup(&id->nn)); + nvme_scan_ns_sequential(ctrl, nn); done: + mutex_lock(&ctrl->namespaces_mutex); list_sort(NULL, &ctrl->namespaces, ns_cmp); mutex_unlock(&ctrl->namespaces_mutex); kfree(id); + + if (ctrl->ops->post_scan) + ctrl->ops->post_scan(ctrl); +} + +void nvme_queue_scan(struct nvme_ctrl *ctrl) +{ + /* + * Do not queue new scan work when a controller is reset during + * removal. + */ + if (ctrl->state == NVME_CTRL_LIVE) + schedule_work(&ctrl->scan_work); } -EXPORT_SYMBOL_GPL(nvme_scan_namespaces); +EXPORT_SYMBOL_GPL(nvme_queue_scan); +/* + * This function iterates the namespace list unlocked to allow recovery from + * controller failure. It is up to the caller to ensure the namespace list is + * not modified by scan work while this function is executing. + */ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) { struct nvme_ns *ns, *next; + /* + * The dead states indicates the controller was not gracefully + * disconnected. In that case, we won't be able to flush any data while + * removing the namespaces' disks; fail all the queues now to avoid + * potentially having to clean up the failed sync later. + */ + if (ctrl->state == NVME_CTRL_DEAD) + nvme_kill_queues(ctrl); + list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) nvme_ns_remove(ns); } EXPORT_SYMBOL_GPL(nvme_remove_namespaces); +static void nvme_async_event_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, async_event_work); + + spin_lock_irq(&ctrl->lock); + while (ctrl->event_limit > 0) { + int aer_idx = --ctrl->event_limit; + + spin_unlock_irq(&ctrl->lock); + ctrl->ops->submit_async_event(ctrl, aer_idx); + spin_lock_irq(&ctrl->lock); + } + spin_unlock_irq(&ctrl->lock); +} + +void nvme_complete_async_event(struct nvme_ctrl *ctrl, + struct nvme_completion *cqe) +{ + u16 status = le16_to_cpu(cqe->status) >> 1; + u32 result = le32_to_cpu(cqe->result); + + if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) { + ++ctrl->event_limit; + schedule_work(&ctrl->async_event_work); + } + + if (status != NVME_SC_SUCCESS) + return; + + switch (result & 0xff07) { + case NVME_AER_NOTICE_NS_CHANGED: + dev_info(ctrl->device, "rescanning\n"); + nvme_queue_scan(ctrl); + break; + default: + dev_warn(ctrl->device, "async event result %08x\n", result); + } +} +EXPORT_SYMBOL_GPL(nvme_complete_async_event); + +void nvme_queue_async_events(struct nvme_ctrl *ctrl) +{ + ctrl->event_limit = NVME_NR_AERS; + schedule_work(&ctrl->async_event_work); +} +EXPORT_SYMBOL_GPL(nvme_queue_async_events); + static DEFINE_IDA(nvme_instance_ida); static int nvme_set_instance(struct nvme_ctrl *ctrl) @@ -1440,6 +1712,10 @@ static void nvme_release_instance(struct nvme_ctrl *ctrl) void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) { + flush_work(&ctrl->async_event_work); + flush_work(&ctrl->scan_work); + nvme_remove_namespaces(ctrl); + device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); spin_lock(&dev_list_lock); @@ -1475,12 +1751,16 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, { int ret; + ctrl->state = NVME_CTRL_NEW; + spin_lock_init(&ctrl->lock); INIT_LIST_HEAD(&ctrl->namespaces); mutex_init(&ctrl->namespaces_mutex); kref_init(&ctrl->kref); ctrl->dev = dev; ctrl->ops = ops; ctrl->quirks = quirks; + INIT_WORK(&ctrl->scan_work, nvme_scan_work); + INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); ret = nvme_set_instance(ctrl); if (ret) @@ -1522,9 +1802,6 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) { - if (!kref_get_unless_zero(&ns->kref)) - continue; - /* * Revalidating a dead namespace sets capacity to 0. This will * end buffered writers dirtying pages that can't be synced. @@ -1535,8 +1812,6 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) blk_set_queue_dying(ns->queue); blk_mq_abort_requeue_list(ns->queue); blk_mq_start_stopped_hw_queues(ns->queue, true); - - nvme_put_ns(ns); } mutex_unlock(&ctrl->namespaces_mutex); } @@ -1607,9 +1882,9 @@ int __init nvme_core_init(void) void nvme_core_exit(void) { - unregister_blkdev(nvme_major, "nvme"); class_destroy(nvme_class); __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + unregister_blkdev(nvme_major, "nvme"); } MODULE_LICENSE("GPL"); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 9461dd639..a0af05583 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -367,8 +367,8 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, entries, len); if (ret) { - dev_err(ns->ctrl->dev, "L2P table transfer failed (%d)\n", - ret); + dev_err(ns->ctrl->device, + "L2P table transfer failed (%d)\n", ret); ret = -EIO; goto out; } @@ -387,41 +387,16 @@ out: return ret; } -static void nvme_nvm_bb_tbl_fold(struct nvm_dev *nvmdev, - int nr_dst_blks, u8 *dst_blks, - int nr_src_blks, u8 *src_blks) -{ - int blk, offset, pl, blktype; - - for (blk = 0; blk < nr_dst_blks; blk++) { - offset = blk * nvmdev->plane_mode; - blktype = src_blks[offset]; - - /* Bad blocks on any planes take precedence over other types */ - for (pl = 0; pl < nvmdev->plane_mode; pl++) { - if (src_blks[offset + pl] & - (NVM_BLK_T_BAD|NVM_BLK_T_GRWN_BAD)) { - blktype = src_blks[offset + pl]; - break; - } - } - - dst_blks[blk] = blktype; - } -} - static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, - int nr_dst_blks, nvm_bb_update_fn *update_bbtbl, - void *priv) + u8 *blks) { struct request_queue *q = nvmdev->q; struct nvme_ns *ns = q->queuedata; struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_nvm_command c = {}; struct nvme_nvm_bb_tbl *bb_tbl; - u8 *dst_blks = NULL; - int nr_src_blks = nr_dst_blks * nvmdev->plane_mode; - int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_src_blks; + int nr_blks = nvmdev->blks_per_lun * nvmdev->plane_mode; + int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks; int ret = 0; c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; @@ -432,54 +407,43 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, if (!bb_tbl) return -ENOMEM; - dst_blks = kzalloc(nr_dst_blks, GFP_KERNEL); - if (!dst_blks) { - ret = -ENOMEM; - goto out; - } - ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c, bb_tbl, tblsz); if (ret) { - dev_err(ctrl->dev, "get bad block table failed (%d)\n", ret); + dev_err(ctrl->device, "get bad block table failed (%d)\n", ret); ret = -EIO; goto out; } if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' || bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') { - dev_err(ctrl->dev, "bbt format mismatch\n"); + dev_err(ctrl->device, "bbt format mismatch\n"); ret = -EINVAL; goto out; } if (le16_to_cpu(bb_tbl->verid) != 1) { ret = -EINVAL; - dev_err(ctrl->dev, "bbt version not supported\n"); + dev_err(ctrl->device, "bbt version not supported\n"); goto out; } - if (le32_to_cpu(bb_tbl->tblks) != nr_src_blks) { + if (le32_to_cpu(bb_tbl->tblks) != nr_blks) { ret = -EINVAL; - dev_err(ctrl->dev, "bbt unsuspected blocks returned (%u!=%u)", - le32_to_cpu(bb_tbl->tblks), nr_src_blks); + dev_err(ctrl->device, + "bbt unsuspected blocks returned (%u!=%u)", + le32_to_cpu(bb_tbl->tblks), nr_blks); goto out; } - nvme_nvm_bb_tbl_fold(nvmdev, nr_dst_blks, dst_blks, - nr_src_blks, bb_tbl->blk); - - ppa = dev_to_generic_addr(nvmdev, ppa); - ret = update_bbtbl(ppa, nr_dst_blks, dst_blks, priv); - + memcpy(blks, bb_tbl->blk, nvmdev->blks_per_lun * nvmdev->plane_mode); out: - kfree(dst_blks); kfree(bb_tbl); return ret; } -static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd, - int type) +static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas, + int nr_ppas, int type) { struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_nvm_command c = {}; @@ -487,14 +451,15 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd, c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl; c.set_bb.nsid = cpu_to_le32(ns->ns_id); - c.set_bb.spba = cpu_to_le64(rqd->ppa_addr.ppa); - c.set_bb.nlb = cpu_to_le16(rqd->nr_pages - 1); + c.set_bb.spba = cpu_to_le64(ppas->ppa); + c.set_bb.nlb = cpu_to_le16(nr_ppas - 1); c.set_bb.value = type; ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, NULL, 0); if (ret) - dev_err(ns->ctrl->dev, "set bad block table failed (%d)\n", ret); + dev_err(ns->ctrl->device, "set bad block table failed (%d)\n", + ret); return ret; } @@ -504,8 +469,9 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd, c->ph_rw.opcode = rqd->opcode; c->ph_rw.nsid = cpu_to_le32(ns->ns_id); c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa); + c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); c->ph_rw.control = cpu_to_le16(rqd->flags); - c->ph_rw.length = cpu_to_le16(rqd->nr_pages - 1); + c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD) c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns, @@ -576,7 +542,7 @@ static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd) c.erase.opcode = NVM_OP_ERASE; c.erase.nsid = cpu_to_le32(ns->ns_id); c.erase.spba = cpu_to_le64(rqd->ppa_addr.ppa); - c.erase.length = cpu_to_le16(rqd->nr_pages - 1); + c.erase.length = cpu_to_le16(rqd->nr_ppas - 1); return nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0); } @@ -601,10 +567,10 @@ static void *nvme_nvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, return dma_pool_alloc(pool, mem_flags, dma_handler); } -static void nvme_nvm_dev_dma_free(void *pool, void *ppa_list, +static void nvme_nvm_dev_dma_free(void *pool, void *addr, dma_addr_t dma_handler) { - dma_pool_free(pool, ppa_list, dma_handler); + dma_pool_free(pool, addr, dma_handler); } static struct nvm_dev_ops nvme_nvm_dev_ops = { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f846da4eb..1daa0482d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -67,7 +67,17 @@ enum nvme_quirks { NVME_QUIRK_DISCARD_ZEROES = (1 << 2), }; +enum nvme_ctrl_state { + NVME_CTRL_NEW, + NVME_CTRL_LIVE, + NVME_CTRL_RESETTING, + NVME_CTRL_DELETING, + NVME_CTRL_DEAD, +}; + struct nvme_ctrl { + enum nvme_ctrl_state state; + spinlock_t lock; const struct nvme_ctrl_ops *ops; struct request_queue *admin_q; struct device *dev; @@ -84,7 +94,7 @@ struct nvme_ctrl { char serial[20]; char model[40]; char firmware_rev[8]; - int cntlid; + u16 cntlid; u32 ctrl_config; @@ -99,6 +109,8 @@ struct nvme_ctrl { u32 vs; bool subsystem; unsigned long quirks; + struct work_struct scan_work; + struct work_struct async_event_work; }; /* @@ -136,9 +148,10 @@ struct nvme_ctrl_ops { int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); - bool (*io_incapable)(struct nvme_ctrl *ctrl); int (*reset_ctrl)(struct nvme_ctrl *ctrl); void (*free_ctrl)(struct nvme_ctrl *ctrl); + void (*post_scan)(struct nvme_ctrl *ctrl); + void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); }; static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) @@ -150,17 +163,6 @@ static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) return val & NVME_CSTS_RDY; } -static inline bool nvme_io_incapable(struct nvme_ctrl *ctrl) -{ - u32 val = 0; - - if (ctrl->ops->io_incapable(ctrl)) - return true; - if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val)) - return true; - return val & NVME_CSTS_CFS; -} - static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) { if (!ctrl->subsystem) @@ -173,57 +175,20 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) return (sector >> (ns->lba_shift - 9)); } -static inline void nvme_setup_flush(struct nvme_ns *ns, - struct nvme_command *cmnd) +static inline unsigned nvme_map_len(struct request *rq) { - memset(cmnd, 0, sizeof(*cmnd)); - cmnd->common.opcode = nvme_cmd_flush; - cmnd->common.nsid = cpu_to_le32(ns->ns_id); + if (rq->cmd_flags & REQ_DISCARD) + return sizeof(struct nvme_dsm_range); + else + return blk_rq_bytes(rq); } -static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, - struct nvme_command *cmnd) +static inline void nvme_cleanup_cmd(struct request *req) { - u16 control = 0; - u32 dsmgmt = 0; - - if (req->cmd_flags & REQ_FUA) - control |= NVME_RW_FUA; - if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) - control |= NVME_RW_LR; - - if (req->cmd_flags & REQ_RAHEAD) - dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - - memset(cmnd, 0, sizeof(*cmnd)); - cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); - cmnd->rw.command_id = req->tag; - cmnd->rw.nsid = cpu_to_le32(ns->ns_id); - cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); - cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); - - if (ns->ms) { - switch (ns->pi_type) { - case NVME_NS_DPS_PI_TYPE3: - control |= NVME_RW_PRINFO_PRCHK_GUARD; - break; - case NVME_NS_DPS_PI_TYPE1: - case NVME_NS_DPS_PI_TYPE2: - control |= NVME_RW_PRINFO_PRCHK_GUARD | - NVME_RW_PRINFO_PRCHK_REF; - cmnd->rw.reftag = cpu_to_le32( - nvme_block_nr(ns, blk_rq_pos(req))); - break; - } - if (!blk_integrity_rq(req)) - control |= NVME_RW_PRINFO_PRACT; - } - - cmnd->rw.control = cpu_to_le16(control); - cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); + if (req->cmd_flags & REQ_DISCARD) + kfree(req->completion_data); } - static inline int nvme_error_status(u16 status) { switch (status & 0x7ff) { @@ -242,6 +207,8 @@ static inline bool nvme_req_needs_retry(struct request *req, u16 status) (jiffies - req->start_time) < req->timeout; } +bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, + enum nvme_ctrl_state new_state); int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); @@ -251,9 +218,14 @@ void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); void nvme_put_ctrl(struct nvme_ctrl *ctrl); int nvme_init_identify(struct nvme_ctrl *ctrl); -void nvme_scan_namespaces(struct nvme_ctrl *ctrl); +void nvme_queue_scan(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); +#define NVME_NR_AERS 1 +void nvme_complete_async_event(struct nvme_ctrl *ctrl, + struct nvme_completion *cqe); +void nvme_queue_async_events(struct nvme_ctrl *ctrl); + void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); void nvme_kill_queues(struct nvme_ctrl *ctrl); @@ -261,6 +233,8 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl); struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags); void nvme_requeue_req(struct request *req); +int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmd); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4fd733ff7..befac5b19 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -54,8 +54,7 @@ * We handle AEN commands ourselves and don't even let the * block layer know about them. */ -#define NVME_NR_AEN_COMMANDS 1 -#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) +#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AERS) static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); @@ -92,9 +91,7 @@ struct nvme_dev { struct msix_entry *entry; void __iomem *bar; struct work_struct reset_work; - struct work_struct scan_work; struct work_struct remove_work; - struct work_struct async_work; struct timer_list watchdog_timer; struct mutex shutdown_lock; bool subsystem; @@ -102,11 +99,6 @@ struct nvme_dev { dma_addr_t cmb_dma_addr; u64 cmb_size; u32 cmbsz; - unsigned long flags; - -#define NVME_CTRL_RESETTING 0 -#define NVME_CTRL_REMOVING 1 - struct nvme_ctrl ctrl; struct completion ioq_wait; }; @@ -271,40 +263,6 @@ static int nvme_init_request(void *data, struct request *req, return 0; } -static void nvme_queue_scan(struct nvme_dev *dev) -{ - /* - * Do not queue new scan work when a controller is reset during - * removal. - */ - if (test_bit(NVME_CTRL_REMOVING, &dev->flags)) - return; - queue_work(nvme_workq, &dev->scan_work); -} - -static void nvme_complete_async_event(struct nvme_dev *dev, - struct nvme_completion *cqe) -{ - u16 status = le16_to_cpu(cqe->status) >> 1; - u32 result = le32_to_cpu(cqe->result); - - if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) { - ++dev->ctrl.event_limit; - queue_work(nvme_workq, &dev->async_work); - } - - if (status != NVME_SC_SUCCESS) - return; - - switch (result & 0xff07) { - case NVME_AER_NOTICE_NS_CHANGED: - dev_info(dev->ctrl.device, "rescanning\n"); - nvme_queue_scan(dev); - default: - dev_warn(dev->ctrl.device, "async event result %08x\n", result); - } -} - /** * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use @@ -334,16 +292,11 @@ static __le64 **iod_list(struct request *req) return (__le64 **)(iod->sg + req->nr_phys_segments); } -static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) +static int nvme_init_iod(struct request *rq, unsigned size, + struct nvme_dev *dev) { struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); int nseg = rq->nr_phys_segments; - unsigned size; - - if (rq->cmd_flags & REQ_DISCARD) - size = sizeof(struct nvme_dsm_range); - else - size = blk_rq_bytes(rq); if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); @@ -368,6 +321,8 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req) __le64 **list = iod_list(req); dma_addr_t prp_dma = iod->first_dma; + nvme_cleanup_cmd(req); + if (iod->npages == 0) dma_pool_free(dev->prp_small_pool, list[0], prp_dma); for (i = 0; i < iod->npages; i++) { @@ -529,7 +484,7 @@ static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req, } static int nvme_map_data(struct nvme_dev *dev, struct request *req, - struct nvme_command *cmnd) + unsigned size, struct nvme_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct request_queue *q = req->q; @@ -546,7 +501,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req, if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir)) goto out; - if (!nvme_setup_prps(dev, req, blk_rq_bytes(req))) + if (!nvme_setup_prps(dev, req, size)) goto out_unmap; ret = BLK_MQ_RQ_QUEUE_ERROR; @@ -596,37 +551,6 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) } /* - * We reuse the small pool to allocate the 16-byte range here as it is not - * worth having a special pool for these or additional cases to handle freeing - * the iod. - */ -static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct request *req, struct nvme_command *cmnd) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_dsm_range *range; - - range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, - &iod->first_dma); - if (!range) - return BLK_MQ_RQ_QUEUE_BUSY; - iod_list(req)[0] = (__le64 *)range; - iod->npages = 0; - - range->cattr = cpu_to_le32(0); - range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); - range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); - - memset(cmnd, 0, sizeof(*cmnd)); - cmnd->dsm.opcode = nvme_cmd_dsm; - cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); - cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); - cmnd->dsm.nr = 0; - cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - return BLK_MQ_RQ_QUEUE_OK; -} - -/* * NOTE: ns is NULL when called on the admin queue. */ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, @@ -637,6 +561,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; struct nvme_command cmnd; + unsigned map_len; int ret = BLK_MQ_RQ_QUEUE_OK; /* @@ -652,23 +577,17 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, } } - ret = nvme_init_iod(req, dev); + map_len = nvme_map_len(req); + ret = nvme_init_iod(req, map_len, dev); if (ret) return ret; - if (req->cmd_flags & REQ_DISCARD) { - ret = nvme_setup_discard(nvmeq, ns, req, &cmnd); - } else { - if (req->cmd_type == REQ_TYPE_DRV_PRIV) - memcpy(&cmnd, req->cmd, sizeof(cmnd)); - else if (req->cmd_flags & REQ_FLUSH) - nvme_setup_flush(ns, &cmnd); - else - nvme_setup_rw(ns, req, &cmnd); + ret = nvme_setup_cmd(ns, req, &cmnd); + if (ret) + goto out; - if (req->nr_phys_segments) - ret = nvme_map_data(dev, req, &cmnd); - } + if (req->nr_phys_segments) + ret = nvme_map_data(dev, req, map_len, &cmnd); if (ret) goto out; @@ -764,7 +683,7 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) */ if (unlikely(nvmeq->qid == 0 && cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) { - nvme_complete_async_event(nvmeq->dev, &cqe); + nvme_complete_async_event(&nvmeq->dev->ctrl, &cqe); continue; } @@ -833,21 +752,18 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) return 0; } -static void nvme_async_event_work(struct work_struct *work) +static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx) { - struct nvme_dev *dev = container_of(work, struct nvme_dev, async_work); + struct nvme_dev *dev = to_nvme_dev(ctrl); struct nvme_queue *nvmeq = dev->queues[0]; struct nvme_command c; memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; + c.common.command_id = NVME_AQ_BLKMQ_DEPTH + aer_idx; spin_lock_irq(&nvmeq->q_lock); - while (dev->ctrl.event_limit > 0) { - c.common.command_id = NVME_AQ_BLKMQ_DEPTH + - --dev->ctrl.event_limit; - __nvme_submit_cmd(nvmeq, &c); - } + __nvme_submit_cmd(nvmeq, &c); spin_unlock_irq(&nvmeq->q_lock); } @@ -939,7 +855,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) * cancellation error. All outstanding requests are completed on * shutdown, so we return BLK_EH_HANDLED. */ - if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) { + if (dev->ctrl.state == NVME_CTRL_RESETTING) { dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); @@ -1003,16 +919,15 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) return BLK_EH_RESET_TIMER; } -static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved) +static void nvme_cancel_io(struct request *req, void *data, bool reserved) { - struct nvme_queue *nvmeq = data; int status; if (!blk_mq_request_started(req)) return; - dev_dbg_ratelimited(nvmeq->dev->ctrl.device, - "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid); + dev_dbg_ratelimited(((struct nvme_dev *) data)->ctrl.device, + "Cancelling I/O %d", req->tag); status = NVME_SC_ABORT_REQ; if (blk_queue_dying(req->q)) @@ -1069,14 +984,6 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) return 0; } -static void nvme_clear_queue(struct nvme_queue *nvmeq) -{ - spin_lock_irq(&nvmeq->q_lock); - if (nvmeq->tags && *nvmeq->tags) - blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq); - spin_unlock_irq(&nvmeq->q_lock); -} - static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { struct nvme_queue *nvmeq = dev->queues[0]; @@ -1350,22 +1257,44 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } +static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) +{ + + /* If true, indicates loss of adapter communication, possibly by a + * NVMe Subsystem reset. + */ + bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); + + /* If there is a reset ongoing, we shouldn't reset again. */ + if (work_busy(&dev->reset_work)) + return false; + + /* We shouldn't reset unless the controller is on fatal error state + * _or_ if we lost the communication with it. + */ + if (!(csts & NVME_CSTS_CFS) && !nssro) + return false; + + /* If PCI error recovery process is happening, we cannot reset or + * the recovery mechanism will surely fail. + */ + if (pci_channel_offline(to_pci_dev(dev->dev))) + return false; + + return true; +} + static void nvme_watchdog_timer(unsigned long data) { struct nvme_dev *dev = (struct nvme_dev *)data; u32 csts = readl(dev->bar + NVME_REG_CSTS); - /* - * Skip controllers currently under reset. - */ - if (!work_pending(&dev->reset_work) && !work_busy(&dev->reset_work) && - ((csts & NVME_CSTS_CFS) || - (dev->subsystem && (csts & NVME_CSTS_NSSRO)))) { - if (queue_work(nvme_workq, &dev->reset_work)) { + /* Skip controllers under certain specific conditions. */ + if (nvme_should_reset(dev, csts)) { + if (queue_work(nvme_workq, &dev->reset_work)) dev_warn(dev->dev, "Failed status: 0x%x, reset controller.\n", csts); - } return; } @@ -1465,7 +1394,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) struct pci_dev *pdev = to_pci_dev(dev->dev); int result, i, vecs, nr_io_queues, size; - nr_io_queues = num_possible_cpus(); + nr_io_queues = num_online_cpus(); result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) return result; @@ -1551,8 +1480,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) return result; } -static void nvme_set_irq_hints(struct nvme_dev *dev) +static void nvme_pci_post_scan(struct nvme_ctrl *ctrl) { + struct nvme_dev *dev = to_nvme_dev(ctrl); struct nvme_queue *nvmeq; int i; @@ -1567,16 +1497,6 @@ static void nvme_set_irq_hints(struct nvme_dev *dev) } } -static void nvme_dev_scan(struct work_struct *work) -{ - struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work); - - if (!dev->tagset.tags) - return; - nvme_scan_namespaces(&dev->ctrl); - nvme_set_irq_hints(dev); -} - static void nvme_del_queue_end(struct request *req, int error) { struct nvme_queue *nvmeq = req->end_io_data; @@ -1592,7 +1512,13 @@ static void nvme_del_cq_end(struct request *req, int error) if (!error) { unsigned long flags; - spin_lock_irqsave(&nvmeq->q_lock, flags); + /* + * We might be called with the AQ q_lock held + * and the I/O queue q_lock should always + * nest inside the AQ one. + */ + spin_lock_irqsave_nested(&nvmeq->q_lock, flags, + SINGLE_DEPTH_NESTING); nvme_process_cq(nvmeq); spin_unlock_irqrestore(&nvmeq->q_lock, flags); } @@ -1625,12 +1551,12 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) static void nvme_disable_io_queues(struct nvme_dev *dev) { - int pass; + int pass, queues = dev->online_queues - 1; unsigned long timeout; u8 opcode = nvme_admin_delete_sq; for (pass = 0; pass < 2; pass++) { - int sent = 0, i = dev->queue_count - 1; + int sent = 0, i = queues; reinit_completion(&dev->ioq_wait); retry: @@ -1684,7 +1610,6 @@ static int nvme_dev_add(struct nvme_dev *dev) nvme_free_queues(dev, dev->online_queues); } - nvme_queue_scan(dev); return 0; } @@ -1754,9 +1679,14 @@ static int nvme_pci_enable(struct nvme_dev *dev) static void nvme_dev_unmap(struct nvme_dev *dev) { + struct pci_dev *pdev = to_pci_dev(dev->dev); + int bars; + if (dev->bar) iounmap(dev->bar); - pci_release_regions(to_pci_dev(dev->dev)); + + bars = pci_select_bars(pdev, IORESOURCE_MEM); + pci_release_selected_regions(pdev, bars); } static void nvme_pci_disable(struct nvme_dev *dev) @@ -1797,8 +1727,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) } nvme_pci_disable(dev); - for (i = dev->queue_count - 1; i >= 0; i--) - nvme_clear_queue(dev->queues[i]); + blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_io, dev); + blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_io, dev); mutex_unlock(&dev->shutdown_lock); } @@ -1854,7 +1784,7 @@ static void nvme_reset_work(struct work_struct *work) struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); int result = -ENODEV; - if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags))) + if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING)) goto out; /* @@ -1864,11 +1794,9 @@ static void nvme_reset_work(struct work_struct *work) if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) nvme_dev_disable(dev, false); - if (test_bit(NVME_CTRL_REMOVING, &dev->flags)) + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) goto out; - set_bit(NVME_CTRL_RESETTING, &dev->flags); - result = nvme_pci_enable(dev); if (result) goto out; @@ -1890,8 +1818,14 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; - dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS; - queue_work(nvme_workq, &dev->async_work); + /* + * A controller that can not execute IO typically requires user + * intervention to correct. For such degraded controllers, the driver + * should not submit commands the user did not request, so skip + * registering for asynchronous event notification on this condition. + */ + if (dev->online_queues > 1) + nvme_queue_async_events(&dev->ctrl); mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ)); @@ -1901,13 +1835,20 @@ static void nvme_reset_work(struct work_struct *work) */ if (dev->online_queues < 2) { dev_warn(dev->ctrl.device, "IO queues not created\n"); + nvme_kill_queues(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); } else { nvme_start_queues(&dev->ctrl); nvme_dev_add(dev); } - clear_bit(NVME_CTRL_RESETTING, &dev->flags); + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { + dev_warn(dev->ctrl.device, "failed to mark controller live\n"); + goto out; + } + + if (dev->online_queues > 1) + nvme_queue_scan(&dev->ctrl); return; out: @@ -1921,7 +1862,7 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work) nvme_kill_queues(&dev->ctrl); if (pci_get_drvdata(pdev)) - pci_stop_and_remove_bus_device_locked(pdev); + device_release_driver(&pdev->dev); nvme_put_ctrl(&dev->ctrl); } @@ -1955,13 +1896,6 @@ static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) return 0; } -static bool nvme_pci_io_incapable(struct nvme_ctrl *ctrl) -{ - struct nvme_dev *dev = to_nvme_dev(ctrl); - - return !dev->bar || dev->online_queues < 2; -} - static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl) { return nvme_reset(to_nvme_dev(ctrl)); @@ -1972,9 +1906,10 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, .reg_read64 = nvme_pci_reg_read64, - .io_incapable = nvme_pci_io_incapable, .reset_ctrl = nvme_pci_reset_ctrl, .free_ctrl = nvme_pci_free_ctrl, + .post_scan = nvme_pci_post_scan, + .submit_async_event = nvme_pci_submit_async_event, }; static int nvme_dev_map(struct nvme_dev *dev) @@ -1994,7 +1929,7 @@ static int nvme_dev_map(struct nvme_dev *dev) return 0; release: - pci_release_regions(pdev); + pci_release_selected_regions(pdev, bars); return -ENODEV; } @@ -2026,10 +1961,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto free; - INIT_WORK(&dev->scan_work, nvme_dev_scan); INIT_WORK(&dev->reset_work, nvme_reset_work); INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); - INIT_WORK(&dev->async_work, nvme_async_event_work); setup_timer(&dev->watchdog_timer, nvme_watchdog_timer, (unsigned long)dev); mutex_init(&dev->shutdown_lock); @@ -2086,15 +2019,16 @@ static void nvme_remove(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - set_bit(NVME_CTRL_REMOVING, &dev->flags); + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + pci_set_drvdata(pdev, NULL); - flush_work(&dev->async_work); + + if (!pci_device_is_present(pdev)) + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); + flush_work(&dev->reset_work); - flush_work(&dev->scan_work); - nvme_remove_namespaces(&dev->ctrl); nvme_uninit_ctrl(&dev->ctrl); nvme_dev_disable(dev, true); - flush_work(&dev->reset_work); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); nvme_release_cmb(dev); @@ -2135,14 +2069,17 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, * shutdown the controller to quiesce. The controller will be restarted * after the slot reset through driver's slot_reset callback. */ - dev_warn(dev->ctrl.device, "error detected: state:%d\n", state); switch (state) { case pci_channel_io_normal: return PCI_ERS_RESULT_CAN_RECOVER; case pci_channel_io_frozen: + dev_warn(dev->ctrl.device, + "frozen state error detected, reset controller\n"); nvme_dev_disable(dev, false); return PCI_ERS_RESULT_NEED_RESET; case pci_channel_io_perm_failure: + dev_warn(dev->ctrl.device, + "failure state error detected, request disconnect\n"); return PCI_ERS_RESULT_DISCONNECT; } return PCI_ERS_RESULT_NEED_RESET; @@ -2177,6 +2114,12 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_VDEVICE(INTEL, 0x0953), .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DISCARD_ZEROES, }, + { PCI_VDEVICE(INTEL, 0x0a53), + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DISCARD_ZEROES, }, + { PCI_VDEVICE(INTEL, 0x0a54), + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DISCARD_ZEROES, }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, |