summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--block/bio-integrity.c13
-rw-r--r--block/blk-core.c8
-rw-r--r--block/blk-mq.c13
-rw-r--r--block/blk-timeout.c11
-rw-r--r--block/blk.h2
-rw-r--r--drivers/nvme/host/Kconfig11
-rw-r--r--drivers/nvme/host/Makefile5
-rw-r--r--drivers/nvme/host/core.c1472
-rw-r--r--drivers/nvme/host/lightnvm.c35
-rw-r--r--drivers/nvme/host/nvme.h242
-rw-r--r--drivers/nvme/host/pci.c2700
-rw-r--r--drivers/nvme/host/scsi.c212
-rw-r--r--drivers/target/target_core_iblock.c4
-rw-r--r--include/linux/aer.h1
-rw-r--r--include/linux/bio.h32
-rw-r--r--include/linux/blk_types.h2
-rw-r--r--include/linux/blkdev.h1
-rw-r--r--include/linux/nvme.h27
-rw-r--r--include/uapi/linux/Kbuild2
19 files changed, 2596 insertions, 2197 deletions
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index f6325d573c10..711e4d8de6fa 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -66,7 +66,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
}
if (unlikely(!bip))
- return NULL;
+ return ERR_PTR(-ENOMEM);
memset(bip, 0, sizeof(*bip));
@@ -89,7 +89,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
return bip;
err:
mempool_free(bip, bs->bio_integrity_pool);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(bio_integrity_alloc);
@@ -298,10 +298,10 @@ int bio_integrity_prep(struct bio *bio)
/* Allocate bio integrity payload and integrity vectors */
bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
- if (unlikely(bip == NULL)) {
+ if (IS_ERR(bip)) {
printk(KERN_ERR "could not allocate data integrity bioset\n");
kfree(buf);
- return -EIO;
+ return PTR_ERR(bip);
}
bip->bip_flags |= BIP_BLOCK_INTEGRITY;
@@ -465,9 +465,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
BUG_ON(bip_src == NULL);
bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
-
- if (bip == NULL)
- return -EIO;
+ if (IS_ERR(bip))
+ return PTR_ERR(bip);
memcpy(bip->bip_vec, bip_src->bip_vec,
bip_src->bip_vcnt * sizeof(struct bio_vec));
diff --git a/block/blk-core.c b/block/blk-core.c
index 476244d59309..ab51685988c2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -680,6 +680,13 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
wake_up_all(&q->mq_freeze_wq);
}
+static void blk_rq_timed_out_timer(unsigned long data)
+{
+ struct request_queue *q = (struct request_queue *)data;
+
+ kblockd_schedule_work(&q->timeout_work);
+}
+
struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
{
struct request_queue *q;
@@ -841,6 +848,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
goto fail;
+ INIT_WORK(&q->timeout_work, blk_timeout_work);
q->request_fn = rfn;
q->prep_rq_fn = NULL;
q->unprep_rq_fn = NULL;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6889d7183a2a..4c0622fae413 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -603,8 +603,6 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
blk_mq_complete_request(rq, -EIO);
return;
}
- if (rq->cmd_flags & REQ_NO_TIMEOUT)
- return;
if (time_after_eq(jiffies, rq->deadline)) {
if (!blk_mark_rq_complete(rq))
@@ -615,15 +613,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
}
}
-static void blk_mq_rq_timer(unsigned long priv)
+static void blk_mq_timeout_work(struct work_struct *work)
{
- struct request_queue *q = (struct request_queue *)priv;
+ struct request_queue *q =
+ container_of(work, struct request_queue, timeout_work);
struct blk_mq_timeout_data data = {
.next = 0,
.next_set = 0,
};
int i;
+ if (blk_queue_enter(q, true))
+ return;
+
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
if (data.next_set) {
@@ -638,6 +640,7 @@ static void blk_mq_rq_timer(unsigned long priv)
blk_mq_tag_idle(hctx);
}
}
+ blk_queue_exit(q);
}
/*
@@ -2008,7 +2011,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
hctxs[i]->queue_num = i;
}
- setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
+ INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
q->nr_queues = nr_cpu_ids;
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 3610af561748..a30441a200c0 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -127,13 +127,16 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout
}
}
-void blk_rq_timed_out_timer(unsigned long data)
+void blk_timeout_work(struct work_struct *work)
{
- struct request_queue *q = (struct request_queue *) data;
+ struct request_queue *q =
+ container_of(work, struct request_queue, timeout_work);
unsigned long flags, next = 0;
struct request *rq, *tmp;
int next_set = 0;
+ if (blk_queue_enter(q, true))
+ return;
spin_lock_irqsave(q->queue_lock, flags);
list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
@@ -143,6 +146,7 @@ void blk_rq_timed_out_timer(unsigned long data)
mod_timer(&q->timeout, round_jiffies_up(next));
spin_unlock_irqrestore(q->queue_lock, flags);
+ blk_queue_exit(q);
}
/**
@@ -193,9 +197,6 @@ void blk_add_timer(struct request *req)
struct request_queue *q = req->q;
unsigned long expiry;
- if (req->cmd_flags & REQ_NO_TIMEOUT)
- return;
-
/* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */
if (!q->mq_ops && !q->rq_timed_out_fn)
return;
diff --git a/block/blk.h b/block/blk.h
index c43926d3d74d..70e4aee9cdcb 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -93,7 +93,7 @@ static inline void blk_flush_integrity(void)
}
#endif
-void blk_rq_timed_out_timer(unsigned long data);
+void blk_timeout_work(struct work_struct *work);
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
void blk_delete_timer(struct request *);
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 002a94abdbc4..5d6237391dcd 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -8,3 +8,14 @@ config BLK_DEV_NVME
To compile this driver as a module, choose M here: the
module will be called nvme.
+
+config BLK_DEV_NVME_SCSI
+ bool "SCSI emulation for NVMe device nodes"
+ depends on BLK_DEV_NVME
+ ---help---
+ This adds support for the SG_IO ioctl on the NVMe character
+ and block device nodes, as well as a translation for a small
+ number of selected SCSI commands to NVMe commands to the NVMe
+ driver. If you don't know what this means you probably want
+ to say N here, and if you know what it means you probably
+ want to say N as well.
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index a5fe23952586..51bf90871549 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -1,5 +1,6 @@
obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
-lightnvm-$(CONFIG_NVM) := lightnvm.o
-nvme-y += pci.o scsi.o $(lightnvm-y)
+lightnvm-$(CONFIG_NVM) := lightnvm.o
+nvme-y += core.o pci.o $(lightnvm-y)
+nvme-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
new file mode 100644
index 000000000000..c5bf001af559
--- /dev/null
+++ b/drivers/nvme/host/core.c
@@ -0,0 +1,1472 @@
+/*
+ * NVM Express device driver
+ * Copyright (c) 2011-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/hdreg.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list_sort.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/pr.h>
+#include <linux/ptrace.h>
+#include <linux/nvme_ioctl.h>
+#include <linux/t10-pi.h>
+#include <scsi/sg.h>
+#include <asm/unaligned.h>
+
+#include "nvme.h"
+
+#define NVME_MINORS (1U << MINORBITS)
+
+static int nvme_major;
+module_param(nvme_major, int, 0);
+
+static int nvme_char_major;
+module_param(nvme_char_major, int, 0);
+
+static LIST_HEAD(nvme_ctrl_list);
+DEFINE_SPINLOCK(dev_list_lock);
+
+static struct class *nvme_class;
+
+/*
+ * kref release callback for a namespace; nvme_put_ns() passes it to
+ * kref_put(). Unregisters a LightNVM namespace, detaches the nvme_ns from
+ * its gendisk, drops the controller and disk references and finally frees
+ * the nvme_ns itself.
+ */
+static void nvme_free_ns(struct kref *kref)
+{
+ struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
+
+ if (ns->type == NVME_NS_LIGHTNVM)
+ nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
+
+ /*
+ * private_data is cleared under dev_list_lock, the same lock
+ * nvme_get_ns_from_disk() holds while it reads the pointer.
+ */
+ spin_lock(&dev_list_lock);
+ ns->disk->private_data = NULL;
+ spin_unlock(&dev_list_lock);
+
+ nvme_put_ctrl(ns->ctrl);
+ put_disk(ns->disk);
+ kfree(ns);
+}
+
+static void nvme_put_ns(struct nvme_ns *ns)
+{
+ kref_put(&ns->kref, nvme_free_ns);
+}
+
+/*
+ * Look up the namespace attached to @disk and take a reference on it.
+ *
+ * Returns the nvme_ns with its kref raised, or NULL when the disk has no
+ * private_data or the namespace refcount has already dropped to zero.
+ * The lookup and the kref_get_unless_zero() both run under dev_list_lock.
+ * A non-NULL result must be balanced with nvme_put_ns().
+ */
+static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
+{
+ struct nvme_ns *ns;
+
+ spin_lock(&dev_list_lock);
+ ns = disk->private_data;
+ if (ns && !kref_get_unless_zero(&ns->kref))
+ ns = NULL;
+ spin_unlock(&dev_list_lock);
+
+ return ns;
+}
+
+/*
+ * Requeue @req via blk_mq_requeue_request() and, unless its queue is
+ * stopped, kick the queue's requeue list. The stopped check and the kick
+ * are done with the queue lock held and interrupts saved/disabled.
+ */
+void nvme_requeue_req(struct request *req)
+{
+ unsigned long flags;
+
+ blk_mq_requeue_request(req);
+ spin_lock_irqsave(req->q->queue_lock, flags);
+ if (!blk_queue_stopped(req->q))
+ blk_mq_kick_requeue_list(req->q);
+ spin_unlock_irqrestore(req->q->queue_lock, flags);
+}
+
+/*
+ * Allocate a blk-mq request on @q that carries the NVMe command @cmd as a
+ * driver-private (REQ_TYPE_DRV_PRIV) request.
+ *
+ * @flags is passed straight through to blk_mq_alloc_request(). Returns the
+ * request, or the ERR_PTR() value from blk_mq_alloc_request() on failure.
+ * @cmd is not copied: req->cmd points at the caller's structure, so it has
+ * to stay valid for as long as the request is in use.
+ */
+struct request *nvme_alloc_request(struct request_queue *q,
+ struct nvme_command *cmd, unsigned int flags)
+{
+ /* Bit 0 of the opcode is used as the write/direction argument. */
+ bool write = cmd->common.opcode & 1;
+ struct request *req;
+
+ req = blk_mq_alloc_request(q, write, flags);
+ if (IS_ERR(req))
+ return req;
+
+ req->cmd_type = REQ_TYPE_DRV_PRIV;
+ req->cmd_flags |= REQ_FAILFAST_DRIVER;
+ req->__data_len = 0;
+ req->__sector = (sector_t) -1;
+ req->bio = req->biotail = NULL;
+
+ req->cmd = (unsigned char *)cmd;
+ req->cmd_len = sizeof(struct nvme_command);
+ /* Cleared here; the submit helpers read the command result from it. */
+ req->special = (void *)0;
+
+ return req;
+}
+
+/*
+ * Submit @cmd on @q and wait for it to complete.
+ *
+ * @buffer/@bufflen: optional kernel data buffer, mapped into the request
+ * only when both are non-zero.
+ * @result: optional; receives the value left in req->special, truncated
+ * to 32 bits.
+ * @timeout: request timeout; 0 selects ADMIN_TIMEOUT.
+ *
+ * Returns 0 on success. If the result is negative, it's a Linux error code;
+ * if the result is positive, it's an NVM Express status code
+ */
+int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+ void *buffer, unsigned bufflen, u32 *result, unsigned timeout)
+{
+ struct request *req;
+ int ret;
+
+ req = nvme_alloc_request(q, cmd, 0);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
+
+ if (buffer && bufflen) {
+ ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
+ if (ret)
+ goto out;
+ }
+
+ blk_execute_rq(req->q, NULL, req, 0);
+ /* *result is stored regardless of whether the command failed. */
+ if (result)
+ *result = (u32)(uintptr_t)req->special;
+ ret = req->errors;
+ out:
+ /* Reached on both the mapping-failure and the normal path. */
+ blk_mq_free_request(req);
+ return ret;
+}
+
+int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+ void *buffer, unsigned bufflen)
+{
+ return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0);
+}
+
+int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
+ void __user *ubuffer, unsigned bufflen,
+ void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
+ u32 *result, unsigned timeout)
+{
+ bool write = cmd->common.opcode & 1;
+ struct nvme_ns *ns = q->queuedata;
+ struct gendisk *disk = ns ? ns->disk : NULL;
+ struct request *req;
+ struct bio *bio = NULL;
+ void *meta = NULL;
+ int ret;
+
+ req = nvme_alloc_request(q, cmd, 0);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
+
+ if (ubuffer && bufflen) {
+ ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
+ GFP_KERNEL);
+ if (ret)
+ goto out;
+ bio = req->bio;
+
+ if (!disk)
+ goto submit;
+ bio->bi_bdev = bdget_disk(disk, 0);
+ if (!bio->bi_bdev) {
+ ret = -ENODEV;
+ goto out_unmap;
+ }
+
+ if (meta_buffer) {
+ struct bio_integrity_payload *bip;
+
+ meta = kmalloc(meta_len, GFP_KERNEL);
+ if (!meta) {
+ ret = -ENOMEM;
+ goto out_unmap;
+ }
+
+ if (write) {
+ if (copy_from_user(meta, meta_buffer,
+ meta_len)) {
+ ret = -EFAULT;
+ goto out_free_meta;
+ }
+ }
+
+ bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
+ if (IS_ERR(bip)) {
+ ret = PTR_ERR(bip);
+ goto out_free_meta;
+ }
+
+ bip->bip_iter.bi_size = meta_len;
+ bip->bip_iter.bi_sector = meta_seed;
+
+ ret = bio_integrity_add_page(bio, virt_to_page(meta),
+ meta_len, offset_in_page(meta));
+ if (ret != meta_len) {
+ ret = -ENOMEM;
+ goto out_free_meta;
+ }
+ }
+ }
+ submit:
+ blk_execute_rq(req->q, disk, req, 0);
+ ret = req->errors;
+ if (result)
+ *result = (u32)(uintptr_t)req->special;
+ if (meta && !ret && !write) {
+ if (copy_to_user(meta_buffer, meta, meta_len))
+ ret = -EFAULT;
+ }
+ out_free_meta:
+ kfree(meta);
+ out_unmap:
+ if (bio) {
+ if (disk && bio->bi_bdev)
+ bdput(bio->bi_bdev);
+ blk_rq_unmap_user(bio);
+ }
+ out:
+ blk_mq_free_request(req);
+ return ret;
+}
+
+int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
+ void __user *ubuffer, unsigned bufflen, u32 *result,
+ unsigned timeout)
+{
+ return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0,
+ result, timeout);
+}
+
+/*
+ * Issue an Identify command with CNS = 1 on the admin queue of @dev.
+ *
+ * On success (return 0) *@id points to a kmalloc()ed struct nvme_id_ctrl
+ * that the caller must kfree(). Returns -ENOMEM if the buffer cannot be
+ * allocated, otherwise whatever nvme_submit_sync_cmd() returned.
+ */
+int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
+{
+ struct nvme_command c = { };
+ int error;
+
+ /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
+ c.identify.opcode = nvme_admin_identify;
+ c.identify.cns = cpu_to_le32(1);
+
+ *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
+ if (!*id)
+ return -ENOMEM;
+
+ error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+ sizeof(struct nvme_id_ctrl));
+ /* The buffer is freed on error but *id is left pointing at it. */
+ if (error)
+ kfree(*id);
+ return error;
+}
+
+static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
+{
+ struct nvme_command c = { };
+
+ c.identify.opcode = nvme_admin_identify;
+ c.identify.cns = cpu_to_le32(2);
+ c.identify.nsid = cpu_to_le32(nsid);
+ return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
+}
+
+/*
+ * Issue an Identify command for namespace @nsid on the admin queue of @dev.
+ *
+ * On success (return 0) *@id points to a kmalloc()ed struct nvme_id_ns that
+ * the caller must kfree(). On failure the buffer has already been freed
+ * here and *@id must not be used. Returns -ENOMEM if the buffer cannot be
+ * allocated, otherwise whatever nvme_submit_sync_cmd() returned (negative
+ * errno or positive NVMe status).
+ */
+int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
+		struct nvme_id_ns **id)
+{
+	struct nvme_command c = { };
+	int error;
+
+	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.nsid = cpu_to_le32(nsid);
+
+	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
+	if (!*id)
+		return -ENOMEM;
+
+	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+			sizeof(struct nvme_id_ns));
+	if (error)
+		kfree(*id);
+	return error;
+}
+
+int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
+ dma_addr_t dma_addr, u32 *result)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+ c.features.opcode = nvme_admin_get_features;
+ c.features.nsid = cpu_to_le32(nsid);
+ c.features.prp1 = cpu_to_le64(dma_addr);
+ c.features.fid = cpu_to_le32(fid);
+
+ return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
+}
+
+int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
+ dma_addr_t dma_addr, u32 *result)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+ c.features.opcode = nvme_admin_set_features;
+ c.features.prp1 = cpu_to_le64(dma_addr);
+ c.features.fid = cpu_to_le32(fid);
+ c.features.dword11 = cpu_to_le32(dword11);
+
+ return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
+}
+
+/*
+ * Read the SMART log page (NVME_LOG_SMART) from @dev's admin queue, using
+ * nsid 0xFFFFFFFF.
+ *
+ * On success (return 0) *@log points to a kmalloc()ed struct
+ * nvme_smart_log that the caller must kfree(). On failure the buffer has
+ * already been freed here and *@log must not be used. Returns -ENOMEM if
+ * the buffer cannot be allocated, otherwise whatever
+ * nvme_submit_sync_cmd() returned.
+ */
+int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log)
+{
+	struct nvme_command c = { };
+	int error;
+
+	c.common.opcode = nvme_admin_get_log_page;
+	c.common.nsid = cpu_to_le32(0xFFFFFFFF);
+	/* Bits 16+: log size in dwords, minus one; low bits: log page id. */
+	c.common.cdw10[0] = cpu_to_le32(
+			(((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
+			NVME_LOG_SMART);
+
+	*log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
+	if (!*log)
+		return -ENOMEM;
+
+	error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
+			sizeof(struct nvme_smart_log));
+	if (error)
+		kfree(*log);
+	return error;
+}
+
+/*
+ * Ask the controller for *@count I/O queues through the
+ * NVME_FEAT_NUM_QUEUES feature.
+ *
+ * The request places (*@count - 1) in both 16-bit halves of dword 11. The
+ * reply likewise carries two zero-based counts; the smaller of them, plus
+ * one, caps *@count. Returns 0 on success or the nvme_set_features()
+ * status, in which case *@count is left untouched.
+ */
+int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
+{
+	int wanted = *count - 1;
+	u32 dword11 = wanted | (wanted << 16);
+	u32 granted;
+	int err, usable;
+
+	err = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, dword11, 0,
+			&granted);
+	if (err)
+		return err;
+
+	usable = min(granted & 0xffff, granted >> 16) + 1;
+	*count = min(*count, usable);
+	return 0;
+}
+
+static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
+{
+ struct nvme_user_io io;
+ struct nvme_command c;
+ unsigned length, meta_len;
+ void __user *metadata;
+
+ if (copy_from_user(&io, uio, sizeof(io)))
+ return -EFAULT;
+
+ switch (io.opcode) {
+ case nvme_cmd_write:
+ case nvme_cmd_read:
+ case nvme_cmd_compare:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ length = (io.nblocks + 1) << ns->lba_shift;
+ meta_len = (io.nblocks + 1) * ns->ms;
+ metadata = (void __user *)(uintptr_t)io.metadata;
+
+ if (ns->ext) {
+ length += meta_len;
+ meta_len = 0;
+ } else if (meta_len) {
+ if ((io.metadata & 3) || !io.metadata)
+ return -EINVAL;
+ }
+
+ memset(&c, 0, sizeof(c));
+ c.rw.opcode = io.opcode;
+ c.rw.flags = io.flags;
+ c.rw.nsid = cpu_to_le32(ns->ns_id);
+ c.rw.slba = cpu_to_le64(io.slba);
+ c.rw.length = cpu_to_le16(io.nblocks);
+ c.rw.control = cpu_to_le16(io.control);
+ c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
+ c.rw.reftag = cpu_to_le32(io.reftag);
+ c.rw.apptag = cpu_to_le16(io.apptag);
+ c.rw.appmask = cpu_to_le16(io.appmask);
+
+ return __nvme_submit_user_cmd(ns->queue, &c,
+ (void __user *)(uintptr_t)io.addr, length,
+ metadata, meta_len, io.slba, NULL, 0);
+}
+
+static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+ struct nvme_passthru_cmd __user *ucmd)
+{
+ struct nvme_passthru_cmd cmd;
+ struct nvme_command c;
+ unsigned timeout = 0;
+ int status;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
+ return -EFAULT;
+
+ memset(&c, 0, sizeof(c));
+ c.common.opcode = cmd.opcode;
+ c.common.flags = cmd.flags;
+ c.common.nsid = cpu_to_le32(cmd.nsid);
+ c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
+ c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
+ c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
+ c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
+ c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
+ c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
+ c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
+ c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
+
+ if (cmd.timeout_ms)
+ timeout = msecs_to_jiffies(cmd.timeout_ms);
+
+ status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
+ (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
+ &cmd.result, timeout);
+ if (status >= 0) {
+ if (put_user(cmd.result, &ucmd->result))
+ return -EFAULT;
+ }
+
+ return status;
+}
+
+static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct nvme_ns *ns = bdev->bd_disk->private_data;
+
+ switch (cmd) {
+ case NVME_IOCTL_ID:
+ force_successful_syscall_return();
+ return ns->ns_id;
+ case NVME_IOCTL_ADMIN_CMD:
+ return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
+ case NVME_IOCTL_IO_CMD:
+ return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
+ case NVME_IOCTL_SUBMIT_IO:
+ return nvme_submit_io(ns, (void __user *)arg);
+#ifdef CONFIG_BLK_DEV_NVME_SCSI
+ case SG_GET_VERSION_NUM:
+ return nvme_sg_get_version_num((void __user *)arg);
+ case SG_IO:
+ return nvme_sg_io(ns, (void __user *)arg);
+#endif
+ default:
+ return -ENOTTY;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case SG_IO:
+ return -ENOIOCTLCMD;
+ }
+ return nvme_ioctl(bdev, mode, cmd, arg);
+}
+#else
+#define nvme_compat_ioctl NULL
+#endif
+
+/*
+ * Block device open: succeeds only if a namespace reference can be taken
+ * for the disk; that reference is dropped again in nvme_release().
+ * Returns 0, or -ENXIO when no live namespace is attached to the disk.
+ */
+static int nvme_open(struct block_device *bdev, fmode_t mode)
+{
+	if (!nvme_get_ns_from_disk(bdev->bd_disk))
+		return -ENXIO;
+	return 0;
+}
+
+static void nvme_release(struct gendisk *disk, fmode_t mode)
+{
+ nvme_put_ns(disk->private_data);
+}
+
+/*
+ * Report a made-up disk geometry: 64 heads and 32 sectors per track, with
+ * the cylinder count derived from the capacity (capacity / (64 * 32)).
+ * Always returns 0.
+ */
+static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	sector_t capacity = get_capacity(bdev->bd_disk);
+
+	/* some standard values */
+	geo->heads = 64;
+	geo->sectors = 32;
+	geo->cylinders = capacity >> 11;
+	return 0;
+}
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+/*
+ * Register a block-layer integrity profile for @ns, picked from the
+ * namespace's protection information type: the T10 type 3 CRC profile for
+ * PI type 3, the type 1 CRC profile for PI types 1 and 2, and no profile
+ * otherwise. The tuple size is the namespace metadata size, and the queue
+ * is limited to a single integrity segment.
+ */
+static void nvme_init_integrity(struct nvme_ns *ns)
+{
+	struct blk_integrity integrity;
+
+	/*
+	 * Only profile and tuple_size are assigned below, yet the whole
+	 * template is handed to blk_integrity_register(); zero it first so
+	 * the remaining fields are not uninitialized stack contents.
+	 */
+	memset(&integrity, 0, sizeof(integrity));
+
+	switch (ns->pi_type) {
+	case NVME_NS_DPS_PI_TYPE3:
+		integrity.profile = &t10_pi_type3_crc;
+		break;
+	case NVME_NS_DPS_PI_TYPE1:
+	case NVME_NS_DPS_PI_TYPE2:
+		integrity.profile = &t10_pi_type1_crc;
+		break;
+	default:
+		integrity.profile = NULL;
+		break;
+	}
+	integrity.tuple_size = ns->ms;
+	blk_integrity_register(ns->disk, &integrity);
+	blk_queue_max_integrity_segments(ns->queue, 1);
+}
+#else
+/* Integrity support is compiled out: nothing to register. */
+static void nvme_init_integrity(struct nvme_ns *ns)
+{
+}
+#endif /* CONFIG_BLK_DEV_INTEGRITY */
+
+/*
+ * Enable discard on the namespace queue: alignment and granularity are
+ * both one logical block, discarded data is not assumed to read back as
+ * zeroes, and the maximum discard size is 0xffffffff sectors.
+ */
+static void nvme_config_discard(struct nvme_ns *ns)
+{
+	struct request_queue *q = ns->queue;
+	u32 lbs = queue_logical_block_size(q);
+
+	q->limits.discard_zeroes_data = 0;
+	q->limits.discard_alignment = lbs;
+	q->limits.discard_granularity = lbs;
+	blk_queue_max_discard_sectors(q, 0xffffffff);
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+}
+
+static int nvme_revalidate_disk(struct gendisk *disk)
+{
+ struct nvme_ns *ns = disk->private_data;
+ struct nvme_id_ns *id;
+ u8 lbaf, pi_type;
+ u16 old_ms;
+ unsigned short bs;
+
+ if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
+ dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n",
+ __func__, ns->ctrl->instance, ns->ns_id);
+ return -ENODEV;
+ }
+ if (id->ncap == 0) {
+ kfree(id);
+ return -ENODEV;
+ }
+
+ if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
+ if (nvme_nvm_register(ns->queue, disk->disk_name)) {
+ dev_warn(ns->ctrl->dev,
+ "%s: LightNVM init failure\n", __func__);
+ kfree(id);
+ return -ENODEV;
+ }
+ ns->type = NVME_NS_LIGHTNVM;
+ }
+
+ if (ns->ctrl->vs >= NVME_VS(1, 1))
+ memcpy(ns->eui, id->eui64, sizeof(ns->eui));
+ if (ns->ctrl->vs >= NVME_VS(1, 2))
+ memcpy(ns->uuid, id->nguid, sizeof(ns->uuid));
+
+ old_ms = ns->ms;
+ lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
+ ns->lba_shift = id->lbaf[lbaf].ds;
+ ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+ ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
+
+ /*
+ * If identify namespace failed, use default 512 byte block size so
+ * block layer can use before failing read/write for 0 capacity.
+ */
+ if (ns->lba_shift == 0)
+ ns->lba_shift = 9;
+ bs = 1 << ns->lba_shift;
+ /* XXX: PI implementation requires metadata equal t10 pi tuple size */
+ pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
+ id->dps & NVME_NS_DPS_PI_MASK : 0;
+
+ blk_mq_freeze_queue(disk->queue);
+ if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
+ ns->ms != old_ms ||
+ bs != queue_logical_block_size(disk->queue) ||
+ (ns->ms && ns->ext)))
+ blk_integrity_unregister(disk);
+
+ ns->pi_type = pi_type;
+ blk_queue_logical_block_size(ns->queue, bs);
+
+ if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
+ nvme_init_integrity(ns);
+ if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
+ set_capacity(disk, 0);
+ else
+ set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
+
+ if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
+ nvme_config_discard(ns);
+ blk_mq_unfreeze_queue(disk->queue);
+
+ kfree(id);
+ return 0;
+}
+
+/*
+ * Translate a block-layer persistent reservation type into the numeric
+ * reservation type code (1-6) that the nvme_pr_* callers shift into bits
+ * 8 and up of command dword 10. Unknown types map to 0.
+ */
+static char nvme_pr_type(enum pr_type type)
+{
+	switch (type) {
+	case PR_WRITE_EXCLUSIVE:
+		return 1;
+	case PR_EXCLUSIVE_ACCESS:
+		return 2;
+	case PR_WRITE_EXCLUSIVE_REG_ONLY:
+		return 3;
+	case PR_EXCLUSIVE_ACCESS_REG_ONLY:
+		return 4;
+	case PR_WRITE_EXCLUSIVE_ALL_REGS:
+		return 5;
+	case PR_EXCLUSIVE_ACCESS_ALL_REGS:
+		return 6;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Build and synchronously submit a reservation command with opcode @op on
+ * the namespace queue of @bdev.
+ *
+ * @cdw10 is stored as command dword 10. The 16-byte data payload holds
+ * @key in bytes 0-7 and @sa_key in bytes 8-15, both little-endian.
+ * Returns the result of nvme_submit_sync_cmd().
+ */
+static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
+ u64 key, u64 sa_key, u8 op)
+{
+ struct nvme_ns *ns = bdev->bd_disk->private_data;
+ struct nvme_command c;
+ u8 data[16] = { 0, };
+
+ put_unaligned_le64(key, &data[0]);
+ put_unaligned_le64(sa_key, &data[8]);
+
+ memset(&c, 0, sizeof(c));
+ c.common.opcode = op;
+ c.common.nsid = cpu_to_le32(ns->ns_id);
+ c.common.cdw10[0] = cpu_to_le32(cdw10);
+
+ /* NOTE(review): the payload is an on-stack buffer - confirm that is OK for blk_rq_map_kern(). */
+ return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
+}
+
+/*
+ * pr_ops->pr_register: send a reservation register command carrying the
+ * @old and @new keys.
+ *
+ * Only PR_FL_IGNORE_KEY is accepted in @flags; anything else yields
+ * -EOPNOTSUPP. Dword 10 gets the value 2 in its low bits when @old is
+ * non-zero (0 otherwise), bit 3 when PR_FL_IGNORE_KEY is set, and bits
+ * 30-31 always. Returns the result of nvme_pr_command().
+ */
+static int nvme_pr_register(struct block_device *bdev, u64 old,
+		u64 new, unsigned flags)
+{
+	u32 cdw10;
+
+	if (flags & ~PR_FL_IGNORE_KEY)
+		return -EOPNOTSUPP;
+
+	cdw10 = old ? 2 : 0;
+	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
+	/* Unsigned constants: 1 << 31 would shift into the sign bit of int. */
+	cdw10 |= (1U << 30) | (1U << 31); /* PTPL=1 */
+	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
+}
+
+static int nvme_pr_reserve(struct block_device *bdev, u64 key,
+ enum pr_type type, unsigned flags)
+{
+ u32 cdw10;
+
+ if (flags & ~PR_FL_IGNORE_KEY)
+ return -EOPNOTSUPP;
+
+ cdw10 = nvme_pr_type(type) << 8;
+ cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
+ return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
+}
+
+/*
+ * pr_ops->pr_preempt: send a reservation acquire command that preempts
+ * the reservation held under @old, using @new as the second key.
+ *
+ * Dword 10 carries the reservation type in bits 8 and up and the action
+ * in the low bits: 2 when @abort is set, 1 otherwise. Returns the result
+ * of nvme_pr_command().
+ */
+static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
+		enum pr_type type, bool abort)
+{
+	/*
+	 * The conditional must be parenthesized: ?: binds more loosely than
+	 * |, so without the parentheses the type bits become part of the
+	 * condition and are lost from the value.
+	 */
+	u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
+
+	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
+}
+
+static int nvme_pr_clear(struct block_device *bdev, u64 key)
+{
+ u32 cdw10 = 1 | (key ? 1 << 3 : 0);
+ return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
+}
+
+/*
+ * pr_ops->pr_release: send a reservation release command for @key.
+ *
+ * Dword 10 carries the reservation type in bits 8 and up, plus bit 3 when
+ * @key is non-zero. Returns the result of nvme_pr_command().
+ */
+static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
+{
+	/*
+	 * The conditional must be parenthesized: ?: binds more loosely than
+	 * |, so without the parentheses the type bits become part of the
+	 * condition and are lost from the value.
+	 */
+	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
+
+	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
+}
+
+static const struct pr_ops nvme_pr_ops = {
+ .pr_register = nvme_pr_register,
+ .pr_reserve = nvme_pr_reserve,
+ .pr_release = nvme_pr_release,
+ .pr_preempt = nvme_pr_preempt,
+ .pr_clear = nvme_pr_clear,
+};
+
+static const struct block_device_operations nvme_fops = {
+ .owner = THIS_MODULE,
+ .ioctl = nvme_ioctl,
+ .compat_ioctl = nvme_compat_ioctl,
+ .open = nvme_open,
+ .release = nvme_release,
+ .getgeo = nvme_getgeo,
+ .revalidate_disk= nvme_revalidate_disk,
+ .pr_ops = &nvme_pr_ops,
+};
+
+/*
+ * Poll the CSTS register until its RDY bit matches @enabled.
+ *
+ * The deadline is (NVME_CAP_TIMEOUT(@cap) + 1) half-seconds from now, and
+ * the register is re-read every 100 ms through ctrl->ops->reg_read32().
+ *
+ * Returns 0 once RDY has the wanted value, the non-zero reg_read32()
+ * result if a register read fails, -EINTR if a fatal signal is pending,
+ * or -ENODEV when the deadline passes first.
+ */
+static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
+{
+ unsigned long timeout =
+ ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
+ /* bit is the value (csts & NVME_CSTS_RDY) must reach. */
+ u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
+ int ret;
+
+ while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
+ if ((csts & NVME_CSTS_RDY) == bit)
+ break;
+
+ msleep(100);
+ if (fatal_signal_pending(current))
+ return -EINTR;
+ if (time_after(jiffies, timeout)) {
+ dev_err(ctrl->dev,
+ "Device not ready; aborting %s\n", enabled ?
+ "initialisation" : "reset");
+ return -ENODEV;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * If the device has been passed off to us in an enabled state, just clear
+ * the enabled bit. The spec says we should set the 'shutdown notification
+ * bits', but doing so may cause the device to complete commands to the
+ * admin queue ... and we don't know what memory that might be pointing at!
+ *
+ * Clears the shutdown-notification and enable bits in the cached
+ * ctrl_config, writes it to the CC register and waits for CSTS.RDY to
+ * clear. Returns 0 on success, or the error from the register write or
+ * from nvme_wait_ready().
+ */
+int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
+{
+ int ret;
+
+ ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
+ ctrl->ctrl_config &= ~NVME_CC_ENABLE;
+
+ ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
+ if (ret)
+ return ret;
+ return nvme_wait_ready(ctrl, cap, false);
+}
+
+int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
+{
+ /*
+ * Default to a 4K page size, with the intention to update this
+ * path in the future to accommodate architectures with differing
+ * kernel and IO page sizes.
+ */
+ unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
+ int ret;
+
+ if (page_shift < dev_page_min) {
+ dev_err(ctrl->dev,
+ "Minimum device page size %u too large for host (%u)\n",
+ 1 << dev_page_min, 1 << page_shift);
+ return -ENODEV;
+ }
+
+ ctrl->page_size = 1 << page_shift;
+
+ ctrl->ctrl_config = NVME_CC_CSS_NVM;
+ ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
+ ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
+ ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
+ ctrl->ctrl_config |= NVME_CC_ENABLE;
+
+ ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
+ if (ret)
+ return ret;
+ return nvme_wait_ready(ctrl, cap, true);