Diffstat (limited to 'drivers/nvme/host/pci.c')
| -rw-r--r-- | drivers/nvme/host/pci.c | 3354 |
1 file changed, 3354 insertions, 0 deletions
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
new file mode 100644
index 000000000000..a526696d684d
--- /dev/null
+++ b/drivers/nvme/host/pci.c
@@ -0,0 +1,3354 @@
+/*
+ * NVM Express device driver
+ * Copyright (c) 2011-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/hdreg.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kdev_t.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/list_sort.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/poison.h>
+#include <linux/ptrace.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/t10-pi.h>
+#include <linux/types.h>
+#include <scsi/sg.h>
+#include <asm-generic/io-64-nonatomic-lo-hi.h>
+
+#include <uapi/linux/nvme_ioctl.h>
+#include "nvme.h"
+
+#define NVME_MINORS		(1U << MINORBITS)
+#define NVME_Q_DEPTH		1024
+#define NVME_AQ_DEPTH		256
+#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
+#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
+#define ADMIN_TIMEOUT		(admin_timeout * HZ)
+#define SHUTDOWN_TIMEOUT	(shutdown_timeout * HZ)
+
+static unsigned char admin_timeout = 60;
+module_param(admin_timeout, byte, 0644);
+MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
+
+unsigned char nvme_io_timeout = 30;
+module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
+MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
+
+static unsigned char shutdown_timeout = 5;
+module_param(shutdown_timeout, byte, 0644);
+MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
+
+static int nvme_major;
+module_param(nvme_major, int, 0);
+
+static int nvme_char_major;
+module_param(nvme_char_major, int, 0);
+
+static int use_threaded_interrupts;
+module_param(use_threaded_interrupts, int, 0);
+
+static bool use_cmb_sqes = true;
+module_param(use_cmb_sqes, bool, 0644);
+MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
+
+static DEFINE_SPINLOCK(dev_list_lock);
+static LIST_HEAD(dev_list);
+static struct task_struct *nvme_thread;
+static struct workqueue_struct *nvme_workq;
+static wait_queue_head_t nvme_kthread_wait;
+
+static struct class *nvme_class;
+
+static int __nvme_reset(struct nvme_dev *dev);
+static int nvme_reset(struct nvme_dev *dev);
+static int nvme_process_cq(struct nvme_queue *nvmeq);
+static void nvme_dead_ctrl(struct nvme_dev *dev);
+
+struct async_cmd_info {
+	struct kthread_work work;
+	struct kthread_worker *worker;
+	struct request *req;
+	u32 result;
+	int status;
+	void *ctx;
+};
+
+/*
+ * An NVM Express queue.  Each device has at least two (one for admin
+ * commands and one for I/O commands).
+ */
+struct nvme_queue {
+	struct device *q_dmadev;
+	struct nvme_dev *dev;
+	char irqname[24];	/* nvme4294967295-65535\0 */
+	spinlock_t q_lock;
+	struct nvme_command *sq_cmds;
+	struct nvme_command __iomem *sq_cmds_io;
+	volatile struct nvme_completion *cqes;
+	struct blk_mq_tags **tags;
+	dma_addr_t sq_dma_addr;
+	dma_addr_t cq_dma_addr;
+	u32 __iomem *q_db;
+	u16 q_depth;
+	s16 cq_vector;
+	u16 sq_head;
+	u16 sq_tail;
+	u16 cq_head;
+	u16 qid;
+	u8 cq_phase;
+	u8 cqe_seen;
+	struct async_cmd_info cmdinfo;
+};
+
+/*
+ * Check we didn't inadvertently grow the command struct
+ */
+static inline void _nvme_check_size(void)
+{
+	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
+	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+}
+
+typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
+						struct nvme_completion *);
+
+struct nvme_cmd_info {
+	nvme_completion_fn fn;
+	void *ctx;
+	int aborted;
+	struct nvme_queue *nvmeq;
+	struct nvme_iod iod[0];
+};
+
+/*
+ * Max size of iod being embedded in the request payload
+ */
+#define NVME_INT_PAGES		2
+#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->page_size)
+#define NVME_INT_MASK		0x01
+
+/*
+ * Will slightly overestimate the number of pages needed.  This is OK
+ * as it only leads to a small amount of wasted memory for the lifetime of
+ * the I/O.
+ */
+static int nvme_npages(unsigned size, struct nvme_dev *dev)
+{
+	unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
+	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+}
+
+static unsigned int nvme_cmd_size(struct nvme_dev *dev)
+{
+	unsigned int ret = sizeof(struct nvme_cmd_info);
+
+	ret += sizeof(struct nvme_iod);
+	ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
+	ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
+
+	return ret;
+}
+
+static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+				unsigned int hctx_idx)
+{
+	struct nvme_dev *dev = data;
+	struct nvme_queue *nvmeq = dev->queues[0];
+
+	WARN_ON(hctx_idx != 0);
+	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
+	WARN_ON(nvmeq->tags);
+
+	hctx->driver_data = nvmeq;
+	nvmeq->tags = &dev->admin_tagset.tags[0];
+	return 0;
+}
+
+static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+	struct nvme_queue *nvmeq = hctx->driver_data;
+
+	nvmeq->tags = NULL;
+}
+
+static int nvme_admin_init_request(void *data, struct request *req,
+				unsigned int hctx_idx, unsigned int rq_idx,
+				unsigned int numa_node)
+{
+	struct nvme_dev *dev = data;
+	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+	struct nvme_queue *nvmeq = dev->queues[0];
+
+	BUG_ON(!nvmeq);
+	cmd->nvmeq = nvmeq;
+	return 0;
+}
+
+static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+			  unsigned int hctx_idx)
+{
+	struct nvme_dev *dev = data;
+	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
+
+	if (!nvmeq->tags)
+		nvmeq->tags = &dev->tagset.tags[hctx_idx];
+
+	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
+	hctx->driver_data = nvmeq;
+	return 0;
+}
+
+static int nvme_init_request(void *data, struct request *req,
+				unsigned int hctx_idx, unsigned int rq_idx,
+				unsigned int numa_node)
+{
+	struct nvme_dev *dev = data;
+	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
+
+	BUG_ON(!nvmeq);
+	cmd->nvmeq = nvmeq;
+	return 0;
+}
+
+static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
+				nvme_completion_fn handler)
+{
+	cmd->fn = handler;
+	cmd->ctx = ctx;
+	cmd->aborted = 0;
+	blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
+}
+
+static void *iod_get_private(struct nvme_iod *iod)
+{
+	return (void *) (iod->private & ~0x1UL);
+}
+
+/*
+ * If bit 0 is set, the iod is embedded in the request payload.
+ */
+static bool iod_should_kfree(struct nvme_iod *iod)
+{
+	return (iod->private & NVME_INT_MASK) == 0;
+}
+
+/* Special values must be less than 0x1000 */
+#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
+#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
+#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
+#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)
+
+static void special_completion(struct nvme_queue *nvmeq, void *ctx,
+						struct nvme_completion *cqe)
+{
+	if (ctx == CMD_CTX_CANCELLED)
+		return;
+	if (ctx == CMD_CTX_COMPLETED) {
+		dev_warn(nvmeq->q_dmadev,
+				"completed id %d twice on queue %d\n",
+				cqe->command_id, le16_to_cpup(&cqe->sq_id));
+		return;
+	}
+	if (ctx == CMD_CTX_INVALID) {
+		dev_warn(nvmeq->q_dmadev,
+				"invalid id %d completed on queue %d\n",
+				cqe->command_id, le16_to_cpup(&cqe->sq_id));
+		return;
+	}
+	dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
+}
+
+static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
+{
+	void *ctx;
+
+	if (fn)
+		*fn = cmd->fn;
+	ctx = cmd->ctx;
+	cmd->fn = special_completion;
+	cmd->ctx = CMD_CTX_CANCELLED;
+	return ctx;
+}
+
+static void async_req_completion(struct nvme_queue *nvmeq, void *ctx,
+						struct nvme_completion *cqe)
+{
+	u32 result = le32_to_cpup(&cqe->result);
+	u16 status = le16_to_cpup(&cqe->status) >> 1;
+
+	if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
+		++nvmeq->dev->event_limit;
+	if (status != NVME_SC_SUCCESS)
+		return;
+
+	switch (result & 0xff07) {
+	case NVME_AER_NOTICE_NS_CHANGED:
+		dev_info(nvmeq->q_dmadev, "rescanning\n");
+		schedule_work(&nvmeq->dev->scan_work);
+	default:
+		dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result);
+	}
+}
+
+static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
+						struct nvme_completion *cqe)
+{
+	struct request *req = ctx;
+
+	u16 status = le16_to_cpup(&cqe->status) >> 1;
+	u32 result = le32_to_cpup(&cqe->result);
+
+	blk_mq_free_request(req);
+
+	dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
+	++nvmeq->dev->abort_limit;
+}
+
+static void async_completion(struct nvme_queue *nvmeq, void *ctx,
+						struct nvme_completion *cqe)
+{
+	struct async_cmd_info *cmdinfo = ctx;
+	cmdinfo->result = le32_to_cpup(&cqe->result);
+	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
+	queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
+	blk_mq_free_request(cmdinfo->req);
+}
+
+static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
+				  unsigned int tag)
+{
+	struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag);
+
+	return blk_mq_rq_to_pdu(req);
+}
+
+/*
+ * Called with local interrupts disabled and the q_lock held.  May not sleep.
+ */
+static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
+						nvme_completion_fn *fn)
+{
+	struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag);
+	void *ctx;
+	if (tag >= nvmeq->q_depth) {
+		*fn = special_completion;
+		return CMD_CTX_INVALID;
+	}
+	if (fn)
+		*fn = cmd->fn;
+	ctx = cmd->ctx;
+	cmd->fn = special_completion;
+	cmd->ctx = CMD_CTX_COMPLETED;
+	return ctx;
+}
+
+/**
+ * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
+ * @nvmeq: The queue to use
+ * @cmd: The command to send
+ *
+ * Safe to use from interrupt context
+ */
+static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
+						struct nvme_command *cmd)
+{
+	u16 tail = nvmeq->sq_tail;
+
+	if (nvmeq->sq_cmds_io)
+		memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
+	else
+		memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+
+	if (++tail == nvmeq->q_depth)
+		tail = 0;
+	writel(tail, nvmeq->q_db);
+	nvmeq->sq_tail = tail;
+}
+
+static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&nvmeq->q_lock, flags);
+	__nvme_submit_cmd(nvmeq, cmd);
+	spin_unlock_irqrestore(&nvmeq->q_lock, flags);
+}
+
+static __le64 **iod_list(struct nvme_iod *iod)
+{
+	return ((void *)iod) + iod->offset;
+}
+
+static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
+			    unsigned nseg, unsigned long private)
+{
+	iod->private = private;
+	iod->offset = offsetof(struct nvme_iod, sg[nseg]);
+	iod->npages = -1;
+	iod->length = nbytes;
+	iod->nents = 0;
+}
+
+static struct nvme_iod *
+__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
+		 unsigned long priv, gfp_t gfp)
+{
+	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
+				sizeof(__le64 *) * nvme_npages(bytes, dev) +
+				sizeof(struct scatterlist) * nseg, gfp);
+
+	if (iod)
+		iod_init(iod, bytes, nseg, priv);
+
+	return iod;
+}
+
+static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
+				       gfp_t gfp)
+{
+	unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
+						sizeof(struct nvme_dsm_range);
+	struct nvme_iod *iod;
+
+	if (rq->nr_phys_segments <= NVME_INT_PAGES &&
+	    size <= NVME_INT_BYTES(dev)) {
+		struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
+
+		iod = cmd->iod;
+		iod_init(iod, size, rq->nr_phys_segments,
+				(unsigned long) rq | NVME_INT_MASK);
+		return iod;
+	}
+
+	return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
+				(unsigned long) rq, gfp);
+}
+
+static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
+{
+	const int last_prp = dev->page_size / 8 - 1;
+	int i;
+	__le64 **list = iod_list(iod);
+	dma_addr_t prp_dma = iod->first_dma;
+
+	if (iod->npages == 0)
+		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
+	for (i = 0; i < iod->npages; i++) {
+		__le64 *prp_list = list[i];
+		dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
+		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
+		prp_dma = next_prp_dma;
+	}
+
+	if (iod_should_kfree(iod))
+		kfree(iod);
+}
+
+static int nvme_error_status(u16 status)
+{
+	switch (status & 0x7ff) {
+	case NVME_SC_SUCCESS:
+		return 0;
+	case NVME_SC_CAP_EXCEEDED:
+		return -ENOSPC;
+	default:
+		return -EIO;
+	}
+}
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
+{
+	if (be32_to_cpu(pi->ref_tag) == v)
+		pi->ref_tag = cpu_to_be32(p);
+}
+
+static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
+{
+	if (be32_to_cpu(pi->ref_tag) == p)
+		pi->ref_tag = cpu_to_be32(v);
+}
+
+/**
+ * nvme_dif_remap - remaps ref tags to bip seed and physical lba
+ *
+ * The virtual start sector is the one that was originally submitted by the
+ * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical
+ * start sector may be different. Remap protection information to match the
+ * physical LBA on writes, and back to the original seed on reads.
+ *
+ * Type 0 and 3 do not have a ref tag, so no remapping required.
+ */
+static void nvme_dif_remap(struct request *req,
+			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
+{
+	struct nvme_ns *ns = req->rq_disk->private_data;
+	struct bio_integrity_payload *bip;
+	struct t10_pi_tuple *pi;
+	void *p, *pmap;
+	u32 i, nlb, ts, phys, virt;
+
+	if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
+		return;
+
+	bip = bio_integrity(req->bio);
+	if (!bip)
+		return;
+
+	pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;
+
+	p = pmap;
+	virt = bip_get_seed(bip);
+	phys = nvme_block_nr(ns, blk_rq_pos(req));
+	nlb = (blk_rq_bytes(req) >> ns->lba_shift);
+	ts = ns->disk->integrity->tuple_size;
+
+	for (i = 0; i < nlb; i++, virt++, phys++) {
+		pi = (struct t10_pi_tuple *)p;
+		dif_swap(phys, virt, pi);
+		p += ts;
+	}
+	kunmap_atomic(pmap);
+}
+
+static int nvme_noop_verify(struct blk_integrity_iter *iter)
+{
+	return 0;
+}
+
+static int nvme_noop_generate(struct blk_integrity_iter *iter)
+{
+	return 0;
+}
+
+struct blk_integrity nvme_meta_noop = {
+	.name			= "NVME_META_NOOP",
+	.generate_fn		= nvme_noop_generate,
+	.verify_fn		= nvme_noop_verify,
+};
+
+static void nvme_init_integrity(struct nvme_ns *ns)
+{
+	struct blk_integrity integrity;
+
+	switch (ns->pi_type) {
+	case NVME_NS_DPS_PI_TYPE3:
+		integrity = t10_pi_type3_crc;
+		break;
+	case NVME_NS_DPS_PI_TYPE1:
+	case NVME_NS_DPS_PI_TYPE2:
+		integrity = t10_pi_type1_crc;
+		break;
+	default:
+		integrity = nvme_meta_noop;
+		break;
+	}
+	integrity.tuple_size = ns->ms;
+	blk_integrity_register(ns->disk, &integrity);
+	blk_queue_max_integrity_segments(ns->queue, 1);
+}
+#else /* CONFIG_BLK_DEV_INTEGRITY */
+static void nvme_dif_remap(struct request *req,
+			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
+{
+}
+static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
+{
+}
+static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
+{
+}
+static void nvme_init_integrity(struct nvme_ns *ns)
+{
+}
+#endif
+
+static void req_completion(struct nvme_queue *nvmeq, void *ctx,
+						struct nvme_completion *cqe)
+{
+	struct nvme_iod *iod = ctx;
+	struct request *req = iod_get_private(iod);
+	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
+
+	u16 status = le16_to_cpup(&cqe->status) >> 1;
+
+	if (unlikely(status)) {
+		if (!(status & NVME_SC_DNR || blk_noretry_request(req))
+		    && (jiffies - req->start_time) < req->timeout) {
+			unsigned long flags;
+
+			blk_mq_requeue_request(req);
+			spin_lock_irqsave(req->q->queue_lock, flags);
+			if (!blk_queue_stopped(req->q))
+				blk_mq_kick_requeue_list(req->q);
+			spin_unlock_irqrestore(req->q->queue_lock, flags);
+			return;
+		}
+
+		if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
+			if (cmd_rq->ctx == CMD_CTX_CANCELLED)
+				status = -EINTR;
+		} else {
+			status = nvme_error_status(status);
+		}
+	}
+
+	if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
+		u32 result = le32_to_cpup(&cqe->result);
+		req->special = (void *)(uintptr_t)result;
+	}
+
+	if (cmd_rq->aborted)
+		dev_warn(nvmeq->dev->dev,
+			"completing aborted command with status:%04x\n",
+			status);
+
+	if (iod->nents) {
+		dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
+			rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		if (blk_integrity_rq(req)) {
+			if (!rq_data_dir(req))
+				nvme_dif_remap(req, nvme_dif_complete);
+			dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1,
+				rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		}
+	}
+	nvme_free_iod(nvmeq->dev, iod);
+
+	blk_mq_complete_request(req, status);
+}
+
+/* length is in bytes. gfp flags indicate whether we may sleep. */
+static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
+		int total_len, gfp_t gfp)
+{
+	struct dma_pool *pool;
+	int length = total_len;
+	struct scatterlist *sg = iod->sg;
+	int dma_len = sg_dma_len(sg);
+	u64 dma_addr = sg_dma_address(sg);
+	u32 page_size = dev->page_size;
+	int offset = dma_addr & (page_size - 1);
+	__le64 *prp_list;
+	__le64 **list = iod_list(iod);
+	dma_addr_t prp_dma;
+	int nprps, i;
+
+	length -= (page_size - offset);
+	if (length <= 0)
+		return total_len;
+
+	dma_len -= (page_size - offset);
+	if (dma_len) {
+		dma_addr += (page_size - offset);
+	} else {
+		sg = sg_next(sg);
+		dma_addr = sg_dma_address(sg);
+		dma_len = sg_dma_len(sg);
+	}
+
+	if (length <= page_size) {
+		iod->first_dma = dma_addr;
+		return total_len;
+	}
+
+	nprps = DIV_ROUND_UP(length, page_size);
+	if (nprps <= (256 / 8)) {
+		pool = dev->prp_small_pool;
+		iod->npages = 0;
+	} else {
+		pool = dev->prp_page_pool;
+		iod->npages = 1;
+	}
+
+	prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
+	if (!prp_list) {
+		iod->first_dma = dma_addr;
+		iod->npages = -1;
+		return (total_len - length) + page_size;
+	}
+	list[0] = prp_list;
+	iod->first_dma = prp_dma;
+	i = 0;
+	for (;;) {
+		if (i == page_size >> 3) {
+			__le64 *old_prp_list = prp_list;
+			prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
+			if (!prp_list)
+				return total_len - length;
+			list[iod->npages++] = prp_list;
+			prp_list[0] = old_prp_list[i - 1];
+			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
+			i = 1;
+		}
+		prp_list[i++] = cpu_to_le64(dma_addr);
+		dma_len -= page_size;
+		dma_addr += page_size;
+		length -= page_size;
+		if (length <= 0)
+			break;
+		if (dma_len > 0)
+			continue;
+		BUG_ON(dma_len < 0);
+		sg = sg_next(sg);
+		dma_addr = sg_dma_address(sg);
+		dma_len = sg_dma_len(sg);
+	}
+
+	return total_len;
+}
+
+static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req,
+		struct nvme_iod *iod)
+{
+	struct nvme_command cmnd;
+
+	memcpy(&cmnd, req->cmd, sizeof(cmnd));
+	cmnd.rw.command_id = req->tag;
+	if (req->nr_phys_segments) {
+		cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+		cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
+	}
+
+	__nvme_submit_cmd(nvmeq, &cmnd);
+}
+
+/*
+ * We reuse the small pool to allocate the 16-byte range here as it is not
+ * worth having a special pool for these or additional cases to handle freeing
+ * the iod.
+ */
+static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+		struct request *req, struct nvme_iod *iod)
+{
+	struct nvme_dsm_range *range =
+				(struct nvme_dsm_range *)iod_list(iod)[0];
+	struct nvme_command cmnd;
+
+	range->cattr = cpu_to_le32(0);
+	range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
+	range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+
+	memset(&cmnd, 0, sizeof(cmnd));
+	cmnd.dsm.opcode = nvme_cmd_dsm;
+	cmnd.dsm.command_id = req->tag;
+	cmnd.dsm.nsid = cpu_to_le32(ns->ns_id);
+	cmnd.dsm.prp1 = cpu_to_le64(iod->first_dma);
+	cmnd.dsm.nr = 0;
+	cmnd.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
+
+	__nvme_submit_cmd(nvmeq, &cmnd);
+}
+
+static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+								int cmdid)
+{
+	struct nvme_command cmnd;
+
+	memset(&cmnd, 0, sizeof(cmnd));
+	cmnd.common.opcode = nvme_cmd_flush;
+	cmnd.common.command_id = cmdid;
+	cmnd.common.nsid = cpu_to_le32(ns->ns_id);
+
+	__nvme_submit_cmd(nvmeq, &cmnd);
+}
+
+static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
+							struct nvme_ns *ns)
+{
+	struct request *req = iod_get_private(iod);
+	struct nvme_command cmnd;
+	u16 control = 0;
+	u32 dsmgmt = 0;
+
+	if (req->cmd_flags & REQ_FUA)
+		control |= NVME_RW_FUA;
+	if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
+		control |= NVME_RW_LR;
+
+	if (req->cmd_flags & REQ_RAHEAD)
+		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
+
+	memset(&cmnd, 0, sizeof(cmnd));
+	cmnd.rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
+	cmnd.rw.command_id = req->tag;
+	cmnd.rw.nsid = cpu_to_le32(ns->ns_id);
+	cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+	cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
+	cmnd.rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+	cmnd.rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+
+	if (ns->ms) {
+		switch (ns->pi_type) {
+		case NVME_NS_DPS_PI_TYPE3:
+			control |= NVME_RW_PRINFO_PRCHK_GUARD;
+			break;
+		case NVME_NS_DPS_PI_TYPE1:
+		case NVME_NS_DPS_PI_TYPE2:
+			control |= NVME_RW_PRINFO_PRCHK_GUARD |
+					NVME_RW_PRINFO_PRCHK_REF;
+			cmnd.rw.reftag = cpu_to_le32(
+					nvme_block_nr(ns, blk_rq_pos(req)));
+			break;
+		}
+		if (blk_integrity_rq(req))
+			cmnd.rw.metadata =
+				cpu_to_le64(sg_dma_address(iod->meta_sg));
+		else
+			control |= NVME_RW_PRINFO_PRACT;
+	}
+
+	cmnd.rw.control = cpu_to_le16(control);
+	cmnd.rw.dsmgmt = cpu_to_le32(dsmgmt);
+
+	__nvme_submit_cmd(nvmeq, &cmnd);
+
+	return 0;
+}
+
+/*
+ * NOTE: ns is NULL when called on the admin queue.
+ */
+static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
+			 const struct blk_mq_queue_data *bd)
+{
+	struct nvme_ns *ns = hctx->queue->queuedata;
+	struct nvme_queue *nvmeq = hctx->driver_data;
+	struct nvme_dev *dev = nvmeq->dev;
+	struct request *req = bd->rq;
+	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+	struct nvme_iod *iod;
+	enum dma_data_direction dma_dir;
+
+	/*
+	 * If formatted with metadata, require the block layer to provide a
+	 * buffer unless this namespace is formatted such that the metadata
+	 * can be stripped/generated by the controller with PRACT=1.
+	 */
+	if (ns && ns->ms && !blk_integrity_rq(req)) {
+		if (!(ns->pi_type && ns->ms == 8) &&
+					req->cmd_type != REQ_TYPE_DRV_PRIV) {
+			blk_mq_complete_request(req, -EFAULT);
+			return BLK_MQ_RQ_QUEUE_OK;
+		}
+	}
+
+	iod = nvme_alloc_iod(req, dev, GFP_ATOMIC);
+	if (!iod)
+		return BLK_MQ_RQ_QUEUE_BUSY;
+
+	if (req->cmd_flags & REQ_DISCARD) {
+		void *range;
+		/*
+		 * We reuse the small pool to allocate the 16-byte range here
+		 * as it is not worth having a special pool for these or
+		 * additional cases to handle freeing the iod.
+		 */
+		range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC,
+						&iod->first_dma);
+		if (!range)
+			goto retry_cmd;
+		iod_list(iod)[0] = (__le64 *)range;
+		iod->npages = 0;
+	} else if (req->nr_phys_segments) {
+		dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+
+		sg_init_table(iod->sg, req->nr_phys_segments);
+		iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
+		if (!iod->nents)
+			goto error_cmd;
+
+		if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
+			goto retry_cmd;
+
+		if (blk_rq_bytes(req) !=
+		    nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
+			dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
+			goto retry_cmd;
+		}
+		if (blk_integrity_rq(req)) {
+			if (blk_rq_count_integrity_sg(req->q, req->bio) != 1)
+				goto error_cmd;
+
+			sg_init_table(iod->meta_sg, 1);
+			if (blk_rq_map_integrity_sg(
+					req->q, req->bio, iod->meta_sg) != 1)
+				goto error_cmd;
+
+			if (rq_data_dir(req))
+				nvme_dif_remap(req, nvme_dif_prep);
+
+			if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir))
+				goto error_cmd;
+		}
+	}
+
+	nvme_set_info(cmd, iod, req_completion);
+	spin_lock_irq(&nvmeq->q_lock);
+	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+		nvme_submit_priv(nvmeq, req, iod);
+	else if (req->cmd_flags & REQ_DISCARD)
+		nvme_submit_discard(nvmeq, ns, req, iod);
+	else if (req->cmd_flags & REQ_FLUSH)
+		nvme_submit_flush(nvmeq, ns, req->tag);
+	else
+		nvme_submit_iod(nvmeq, iod, ns);
+
+	nvme_process_cq(nvmeq);
+	spin_unlock_irq(&nvmeq->q_lock);
+	return BLK_MQ_RQ_QUEUE_OK;
+
+ error_cmd:
+	nvme_free_iod(dev, iod);
+	return BLK_MQ_RQ_QUEUE_ERROR;
+ retry_cmd:
+	nvme_free_iod(dev, iod);
+	return BLK_MQ_RQ_QUEUE_BUSY;
+}
+
+static int nvme_process_cq(struct nvme_queue *nvmeq)
+{
+	u16 head, phase;
+
+	head = nvmeq->cq_head;
+	phase = nvmeq->cq_phase;
+
+	for (;;) {
+		void *ctx;
+		nvme_completion_fn fn;
+		struct nvme_completion cqe = nvmeq->cqes[head];
+		if ((le16_to_cpu(cqe.status) & 1) != phase)
+			break;
+		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
+		if (++head == nvmeq->q_depth) {
+			head = 0;
+			phase = !phase;
+		}
+		ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
+		fn(nvmeq, ctx, &cqe);
+	}
+
+	/* If the controller ignores the cq head doorbell and continuously
+	 * writes to the queue, it is theoretically possible to wrap around
+	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
+	 * requires that 0.1% of your interrupts are handled, so this isn't
+	 * a big problem.
+	 */
+	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
+		return 0;
+
+	writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
+	nvmeq->cq_head = head;
+	nvmeq->cq_phase = phase;
+
+	nvmeq->cqe_seen = 1;
+	return 1;
+}
+
+static irqreturn_t nvme_irq(int irq, void *data)
+{
+	irqreturn_t result;
+	struct nvme_queue *nvmeq = data;
+	spin_lock(&nvmeq->q_lock);
+	nvme_process_cq(nvmeq);
+	result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
+	nvmeq->cqe_seen = 0;
+	spin_unlock(&nvmeq->q_lock);
+	return result;
+}
+
+static irqreturn_t nvme_irq_check(int irq, void *data)
+{
+	struct nvme_queue *nvmeq = data;
+	struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
+	if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
+		return IRQ_NONE;
+	return IRQ_WAKE_THREAD;
+}
+
+/*
+ * Returns 0 on success.  If the result is negative, it's a Linux error code;
+ * if the result is positive, it's an NVM Express status code
+ */
+int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buffer, void __user *ubuffer, unsigned bufflen,
+		u32 *result, unsigned timeout)
+{
+	bool write = cmd->common.opcode & 1;
+	struct bio *bio = NULL;
+	struct request *req;
+	int ret;
+
+	req = blk_mq_alloc_request(q, write, GFP_KERNEL, false);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	req->cmd_type = REQ_TYPE_DRV_PRIV;
+	req->cmd_flags |= REQ_FAILFAST_DRIVER;
+	req->__data_len = 0;
+	req->__sector = (sector_t) -1;
+	req->bio = req->biotail = NULL;
+
+	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
+
+	req->cmd = (unsigned char *)cmd;
+	req->cmd_len = sizeof(struct nvme_command);
+	req->special = (void *)0;
+
+	if (buffer && bufflen) {
+		ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT);
+		if (ret)
+			goto out;
+	} else if (ubuffer && bufflen) {
+		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT);
+		if (ret)
+			goto out;
+		bio = req->bio;
+	}
+
+	blk_execute_rq(req->q, NULL, req, 0);
+	if (bio)
+		blk_rq_unmap_user(bio);
+	if (result)
+		*result = (u32)(uintptr_t)req->special;
+	ret = req->errors;
+ out:
+	blk_mq_free_request(req);
+	return ret;
+}
+
+int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buffer, unsigned bufflen)
+{
+	return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0);
+}
+
+static int nvme_submit_async_admin_req(struct nvme_dev *dev)
+{
+	struct nvme_queue *nvmeq = dev->queues[0];
+	struct nvme_command c;
+	struct nvme_cmd_info *cmd_info;
+	struct request *req;
+
+	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, true);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	req->cmd_flags |= REQ_NO_TIMEOUT;
+	cmd_info = blk_mq_rq_to_pdu(req);
+	nvme_set_info(cmd_info, NULL, async_req_completion);
+
+	memset(&c, 0, sizeof(c));
+	c.common.opcode = nvme_admin_async_event;
+	c.common.command_id = req->tag;
+
+	blk_mq_free_request(req);
+	__nvme_submit_cmd(nvmeq, &c);
+	return 0;
+}
+
+static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
+			struct nvme_command *cmd,
+			struct async_cmd_info *cmdinfo, unsigned timeout)
+{
+	struct nvme_queue *nvmeq = dev->queues[0];
+	struct request *req;
+	struct nvme_cmd_info *cmd_rq;
+
+	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	req->timeout = timeout;
+	cmd_rq = blk_mq_rq_to_pdu(req);
+	cmdinfo->req = req;
+	nvme_set_info(cmd_rq, cmdinfo, async_completion);
+	cmdinfo->status = -EINTR;
+
+	cmd->common.command_id = req->tag;
+
+	nvme_submit_cmd(nvmeq, cmd);
+	return 0;
+}
+
+static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
+{
+	struct nvme_command c;
+
+	memset(&c, 0, sizeof(c));
+	c.delete_queue.opcode = opcode;
+	c.delete_queue.qid = cpu_to_le16(id);
+
+	return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
+}
+
+static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
+						struct nvme_queue *nvmeq)
+{
+	struct nvme_command c;
+	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
+
+	/*
+	 * Note: we (ab)use the fact that the prp fields survive if no data
+	 * is attached to the request.
+	 */
+	memset(&c, 0, sizeof(c));
+	c.create_cq.opcode = nvme_admin_create_cq;
+	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
+	c.create_cq.cqid = cpu_to_le16(qid);
+	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
+	c.create_cq.cq_flags = cpu_to_le16(flags);
+	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
+
+	return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
+}
+
+static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
+						struct nvme_queue *nvmeq)
+{
+	struct nvme_command c;
+	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
+
+	/*
+	 * Note: we (ab)use the fact that the prp fields survive if no data
+	 * is attached to the request.
+	 */
+	memset(&c, 0, sizeof(c));
+	c.create_sq.opcode = nvme_admin_create_sq;
+	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
+	c.create_sq.sqid = cpu_to_le16(qid);
+	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
+	c.create_sq.sq_flags = cpu_to_le16(flags);
+	c.create_sq.cqid = cpu_to_l
