diff options
Diffstat (limited to 'drivers/block')
38 files changed, 2121 insertions, 2490 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 3ccef9eba..1b8094d4d 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -404,18 +404,6 @@ config BLK_DEV_RAM_DAX and will prevent RAM block device backing store memory from being allocated from highmem (only a problem for highmem systems). -config BLK_DEV_PMEM - tristate "Persistent memory block device support" - depends on HAS_IOMEM - help - Saying Y here will allow you to use a contiguous range of reserved - memory as one or more persistent block devices. - - To compile this driver as a module, choose M here: the module will be - called 'pmem'. - - If unsure, say N. - config CDROM_PKTCDVD tristate "Packet writing on CD/DVD media" depends on !UML diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 9cc6c18a1..02b688d14 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -14,7 +14,6 @@ obj-$(CONFIG_PS3_VRAM) += ps3vram.o obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o obj-$(CONFIG_BLK_DEV_RAM) += brd.o -obj-$(CONFIG_BLK_DEV_PMEM) += pmem.o obj-$(CONFIG_BLK_DEV_LOOP) += loop.o obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index ff20f192b..0422c4726 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -139,8 +139,6 @@ static struct board_type products[] = { {0x3214103C, "Smart Array E200i", &SA5_access}, {0x3215103C, "Smart Array E200i", &SA5_access}, {0x3237103C, "Smart Array E500", &SA5_access}, - {0x3223103C, "Smart Array P800", &SA5_access}, - {0x3234103C, "Smart Array P400", &SA5_access}, {0x323D103C, "Smart Array P700m", &SA5_access}, }; @@ -574,8 +572,6 @@ static void cciss_procinit(ctlr_info_t *h) /* List of controllers which cannot be hard reset on kexec with reset_devices */ static u32 unresettable_controller[] = { - 0x324a103C, /* Smart Array P712m */ - 0x324b103C, /* SmartArray P711m */ 0x3223103C, /* Smart Array 
P800 */ 0x3234103C, /* Smart Array P400 */ 0x3235103C, /* Smart Array P400i */ @@ -586,12 +582,32 @@ static u32 unresettable_controller[] = { 0x3215103C, /* Smart Array E200i */ 0x3237103C, /* Smart Array E500 */ 0x323D103C, /* Smart Array P700m */ + 0x40800E11, /* Smart Array 5i */ 0x409C0E11, /* Smart Array 6400 */ 0x409D0E11, /* Smart Array 6400 EM */ + 0x40700E11, /* Smart Array 5300 */ + 0x40820E11, /* Smart Array 532 */ + 0x40830E11, /* Smart Array 5312 */ + 0x409A0E11, /* Smart Array 641 */ + 0x409B0E11, /* Smart Array 642 */ + 0x40910E11, /* Smart Array 6i */ }; /* List of controllers which cannot even be soft reset */ static u32 soft_unresettable_controller[] = { + 0x40800E11, /* Smart Array 5i */ + 0x40700E11, /* Smart Array 5300 */ + 0x40820E11, /* Smart Array 532 */ + 0x40830E11, /* Smart Array 5312 */ + 0x409A0E11, /* Smart Array 641 */ + 0x409B0E11, /* Smart Array 642 */ + 0x40910E11, /* Smart Array 6i */ + /* Exclude 640x boards. These are two pci devices in one slot + * which share a battery backed cache module. One controls the + * cache, the other accesses the cache through the one that controls + * it. If we reset the one controlling the cache, the other will + * likely not be happy. Just forbid resetting this conjoined mess. 
+ */ 0x409C0E11, /* Smart Array 6400 */ 0x409D0E11, /* Smart Array 6400 EM */ }; @@ -4667,8 +4683,7 @@ static int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) */ cciss_lookup_board_id(pdev, &board_id); if (!ctlr_is_resettable(board_id)) { - dev_warn(&pdev->dev, "Cannot reset Smart Array 640x " - "due to shared cache module."); + dev_warn(&pdev->dev, "Controller not resettable\n"); return -ENODEV; } diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index ecd845cd2..1537302e5 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -84,7 +84,6 @@ static struct scsi_host_template cciss_driver_template = { .show_info = cciss_scsi_show_info, .queuecommand = cciss_scsi_queue_command, .this_id = 7, - .cmd_per_lun = 1, .use_clustering = DISABLE_CLUSTERING, /* Can't have eh_bus_reset_handler or eh_host_reset_handler for cciss */ .eh_device_reset_handler= cciss_eh_device_reset_handler, diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index a6ee3d750..6b88a35fb 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -419,14 +419,6 @@ static int in_flight_summary_show(struct seq_file *m, void *pos) return 0; } -/* simple_positive(file->f_path.dentry) respectively debugfs_positive(), - * but neither is "reachable" from here. - * So we have our own inline version of it above. :-( */ -static inline int debugfs_positive(struct dentry *dentry) -{ - return d_really_is_positive(dentry) && !d_unhashed(dentry); -} - /* make sure at *open* time that the respective object won't go away. 
*/ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *), void *data, struct kref *kref, @@ -444,7 +436,7 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo /* serialize with d_delete() */ mutex_lock(&d_inode(parent)->i_mutex); /* Make sure the object is still alive */ - if (debugfs_positive(file->f_path.dentry) + if (simple_positive(file->f_path.dentry) && kref_get_unless_zero(kref)) ret = 0; mutex_unlock(&d_inode(parent)->i_mutex); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index b905e9888..efd19c2da 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -38,6 +38,7 @@ #include <linux/mutex.h> #include <linux/major.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/genhd.h> #include <linux/idr.h> #include <net/tcp.h> diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 81fde9ef7..a1518539b 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2359,7 +2359,7 @@ static void drbd_cleanup(void) * @congested_data: User data * @bdi_bits: Bits the BDI flusher thread is currently interested in * - * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested. + * Returns 1<<WB_async_congested and/or 1<<WB_sync_congested if we are congested. */ static int drbd_congested(void *congested_data, int bdi_bits) { @@ -2376,14 +2376,14 @@ static int drbd_congested(void *congested_data, int bdi_bits) } if (test_bit(CALLBACK_PENDING, &first_peer_device(device)->connection->flags)) { - r |= (1 << BDI_async_congested); + r |= (1 << WB_async_congested); /* Without good local data, we would need to read from remote, * and that would need the worker thread as well, which is * currently blocked waiting for that usermode helper to * finish. 
*/ if (!get_ldev_if_state(device, D_UP_TO_DATE)) - r |= (1 << BDI_sync_congested); + r |= (1 << WB_sync_congested); else put_ldev(device); r &= bdi_bits; @@ -2399,9 +2399,9 @@ static int drbd_congested(void *congested_data, int bdi_bits) reason = 'b'; } - if (bdi_bits & (1 << BDI_async_congested) && + if (bdi_bits & (1 << WB_async_congested) && test_bit(NET_CONGESTED, &first_peer_device(device)->connection->flags)) { - r |= (1 << BDI_async_congested); + r |= (1 << WB_async_congested); reason = reason == 'b' ? 'a' : 'n'; } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index cee20354a..c097909c5 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -598,7 +598,7 @@ static struct socket *drbd_try_connect(struct drbd_connection *connection) memcpy(&peer_in6, &connection->peer_addr, peer_addr_len); what = "sock_create_kern"; - err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family, + err = sock_create_kern(&init_net, ((struct sockaddr *)&src_in6)->sa_family, SOCK_STREAM, IPPROTO_TCP, &sock); if (err < 0) { sock = NULL; @@ -693,7 +693,7 @@ static int prepare_listen_socket(struct drbd_connection *connection, struct acce memcpy(&my_addr, &connection->my_addr, my_addr_len); what = "sock_create_kern"; - err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family, + err = sock_create_kern(&init_net, ((struct sockaddr *)&my_addr)->sa_family, SOCK_STREAM, IPPROTO_TCP, &s_listen); if (err) { s_listen = NULL; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a5b343d20..f7a4c9d7f 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -474,6 +474,28 @@ static int loop_flush(struct loop_device *lo) return loop_switch(lo, NULL); } +static void loop_reread_partitions(struct loop_device *lo, + struct block_device *bdev) +{ + int rc; + + /* + * bd_mutex has been held already in release path, so don't + * acquire it if this function is called in such case. 
+ * + * If the reread partition isn't from release path, lo_refcnt + * must be at least one and it can only become zero when the + * current holder is released. + */ + if (!atomic_read(&lo->lo_refcnt)) + rc = __blkdev_reread_part(bdev); + else + rc = blkdev_reread_part(bdev); + if (rc) + pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", + __func__, lo->lo_number, lo->lo_file_name, rc); +} + /* * loop_change_fd switched the backing store of a loopback device to * a new file. This is useful for operating system installers to free up @@ -522,7 +544,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, fput(old_file); if (lo->lo_flags & LO_FLAGS_PARTSCAN) - ioctl_by_bdev(bdev, BLKRRPART, 0); + loop_reread_partitions(lo, bdev); return 0; out_putf: @@ -538,24 +560,6 @@ static inline int is_loop_device(struct file *file) return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR; } -/* - * for AUFS - * no get/put for file. - */ -struct file *loop_backing_file(struct super_block *sb) -{ - struct file *ret; - struct loop_device *l; - - ret = NULL; - if (MAJOR(sb->s_dev) == LOOP_MAJOR) { - l = sb->s_bdev->bd_disk->private_data; - ret = l->lo_backing_file; - } - return ret; -} -EXPORT_SYMBOL(loop_backing_file); - /* loop sysfs attributes */ static ssize_t loop_attr_show(struct device *dev, char *page, @@ -584,7 +588,7 @@ static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf) spin_lock_irq(&lo->lo_lock); if (lo->lo_backing_file) - p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1); + p = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1); spin_unlock_irq(&lo->lo_lock); if (IS_ERR_OR_NULL(p)) @@ -777,7 +781,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; if (lo->lo_flags & LO_FLAGS_PARTSCAN) - ioctl_by_bdev(bdev, BLKRRPART, 0); + loop_reread_partitions(lo, bdev); /* Grab the block_device to prevent its destruction after we * put 
/dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev). @@ -849,7 +853,7 @@ static int loop_clr_fd(struct loop_device *lo) * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d * command to fail with EBUSY. */ - if (lo->lo_refcnt > 1) { + if (atomic_read(&lo->lo_refcnt) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; mutex_unlock(&lo->lo_ctl_mutex); return 0; @@ -858,6 +862,9 @@ static int loop_clr_fd(struct loop_device *lo) if (filp == NULL) return -EINVAL; + /* freeze request queue during the transition */ + blk_mq_freeze_queue(lo->lo_queue); + spin_lock_irq(&lo->lo_lock); lo->lo_state = Lo_rundown; lo->lo_backing_file = NULL; @@ -889,8 +896,10 @@ static int loop_clr_fd(struct loop_device *lo) lo->lo_state = Lo_unbound; /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); + blk_mq_unfreeze_queue(lo->lo_queue); + if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) - ioctl_by_bdev(bdev, BLKRRPART, 0); + loop_reread_partitions(lo, bdev); lo->lo_flags = 0; if (!part_shift) lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; @@ -967,7 +976,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { lo->lo_flags |= LO_FLAGS_PARTSCAN; lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; - ioctl_by_bdev(lo->lo_device, BLKRRPART, 0); + loop_reread_partitions(lo, lo->lo_device); } lo->lo_encrypt_key_size = info->lo_encrypt_key_size; @@ -1348,9 +1357,7 @@ static int lo_open(struct block_device *bdev, fmode_t mode) goto out; } - mutex_lock(&lo->lo_ctl_mutex); - lo->lo_refcnt++; - mutex_unlock(&lo->lo_ctl_mutex); + atomic_inc(&lo->lo_refcnt); out: mutex_unlock(&loop_index_mutex); return err; @@ -1361,11 +1368,10 @@ static void lo_release(struct gendisk *disk, fmode_t mode) struct loop_device *lo = disk->private_data; int err; - mutex_lock(&lo->lo_ctl_mutex); - - if (--lo->lo_refcnt) - goto out; + if (atomic_dec_return(&lo->lo_refcnt)) + return; + mutex_lock(&lo->lo_ctl_mutex); if (lo->lo_flags & 
LO_FLAGS_AUTOCLEAR) { /* * In autoclear mode, stop the loop thread @@ -1382,7 +1388,6 @@ static void lo_release(struct gendisk *disk, fmode_t mode) loop_flush(lo); } -out: mutex_unlock(&lo->lo_ctl_mutex); } @@ -1619,6 +1624,7 @@ static int loop_add(struct loop_device **l, int i) disk->flags |= GENHD_FL_NO_PART_SCAN; disk->flags |= GENHD_FL_EXT_DEVT; mutex_init(&lo->lo_ctl_mutex); + atomic_set(&lo->lo_refcnt, 0); lo->lo_number = i; spin_lock_init(&lo->lo_lock); disk->major = LOOP_MAJOR; @@ -1736,7 +1742,7 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd, mutex_unlock(&lo->lo_ctl_mutex); break; } - if (lo->lo_refcnt > 0) { + if (atomic_read(&lo->lo_refcnt) > 0) { ret = -EBUSY; mutex_unlock(&lo->lo_ctl_mutex); break; diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 49564edf5..25e8997ed 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -28,7 +28,7 @@ struct loop_func_table; struct loop_device { int lo_number; - int lo_refcnt; + atomic_t lo_refcnt; loff_t lo_offset; loff_t lo_sizelimit; int lo_flags; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 3bd7ca985..f504232c1 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -163,12 +163,6 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev) else dev_warn(&dd->pdev->dev, "%s: dd->queue is NULL\n", __func__); - if (dd->port) { - set_bit(MTIP_PF_SR_CLEANUP_BIT, &dd->port->flags); - wake_up_interruptible(&dd->port->svc_wait); - } else - dev_warn(&dd->pdev->dev, - "%s: dd->port is NULL\n", __func__); return true; /* device removed */ } @@ -269,8 +263,11 @@ static int mtip_hba_reset(struct driver_data *dd) /* Flush */ readl(dd->mmio + HOST_CTL); - /* Spin for up to 2 seconds, waiting for reset acknowledgement */ - timeout = jiffies + msecs_to_jiffies(2000); + /* + * Spin for up to 10 seconds waiting for reset acknowledgement. 
Spec + * is 1 sec but in LUN failure conditions, up to 10 secs are required + */ + timeout = jiffies + msecs_to_jiffies(10000); do { mdelay(10); if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) @@ -623,8 +620,7 @@ static void mtip_handle_tfe(struct driver_data *dd) set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); - if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && - test_bit(MTIP_TAG_INTERNAL, port->allocated)) { + if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n"); @@ -896,6 +892,10 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data) /* Acknowledge the interrupt status on the port.*/ port_stat = readl(port->mmio + PORT_IRQ_STAT); + if (unlikely(port_stat == 0xFFFFFFFF)) { + mtip_check_surprise_removal(dd->pdev); + return IRQ_HANDLED; + } writel(port_stat, port->mmio + PORT_IRQ_STAT); /* Demux port status */ @@ -991,15 +991,10 @@ static bool mtip_pause_ncq(struct mtip_port *port, reply = port->rxfis + RX_FIS_D2H_REG; task_file_data = readl(port->mmio+PORT_TFDATA); - if (fis->command == ATA_CMD_SEC_ERASE_UNIT) - clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); - if ((task_file_data & 1)) return false; if (fis->command == ATA_CMD_SEC_ERASE_PREP) { - set_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); - set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); port->ic_pause_timer = jiffies; return true; } else if ((fis->command == ATA_CMD_DOWNLOAD_MICRO) && @@ -1011,8 +1006,10 @@ static bool mtip_pause_ncq(struct mtip_port *port, ((fis->command == 0xFC) && (fis->features == 0x27 || fis->features == 0x72 || fis->features == 0x62 || fis->features == 0x26))) { + clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); /* Com reset after secure erase or lowlevel format */ mtip_restart_port(port); + clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); return false; } @@ -1112,9 +1109,10 @@ static int mtip_exec_internal_command(struct mtip_port *port, 
int_cmd = mtip_get_int_command(dd); set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); - port->ic_pause_timer = 0; - clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); + if (fis->command == ATA_CMD_SEC_ERASE_PREP) + set_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); + clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags); if (atomic == GFP_KERNEL) { @@ -1251,11 +1249,11 @@ static int mtip_exec_internal_command(struct mtip_port *port, exec_ic_exit: /* Clear the allocated and active bits for the internal command. */ mtip_put_int_command(dd, int_cmd); + clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); if (rv >= 0 && mtip_pause_ncq(port, fis)) { /* NCQ paused */ return rv; } - clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); wake_up_interruptible(&port->svc_wait); return rv; @@ -2625,18 +2623,6 @@ static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf, readl(dd->mmio + HOST_IRQ_STAT)); size += sprintf(&buf[size], "\n"); - size += sprintf(&buf[size], "L/ Allocated : [ 0x"); - - for (n = dd->slot_groups-1; n >= 0; n--) { - if (sizeof(long) > sizeof(u32)) - group_allocated = - dd->port->allocated[n/2] >> (32*(n&1)); - else - group_allocated = dd->port->allocated[n]; - size += sprintf(&buf[size], "%08X ", group_allocated); - } - size += sprintf(&buf[size], "]\n"); - size += sprintf(&buf[size], "L/ Commands in Q : [ 0x"); for (n = dd->slot_groups-1; n >= 0; n--) { @@ -2780,48 +2766,6 @@ static void mtip_hw_debugfs_exit(struct driver_data *dd) debugfs_remove_recursive(dd->dfs_node); } -static int mtip_free_orphan(struct driver_data *dd) -{ - struct kobject *kobj; - - if (dd->bdev) { - if (dd->bdev->bd_holders >= 1) - return -2; - - bdput(dd->bdev); - dd->bdev = NULL; - } - - mtip_hw_debugfs_exit(dd); - - spin_lock(&rssd_index_lock); - ida_remove(&rssd_index_ida, dd->index); - spin_unlock(&rssd_index_lock); - - if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag) && - test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) { - put_disk(dd->disk); - } else { - if (dd->disk) { - kobj = 
kobject_get(&disk_to_dev(dd->disk)->kobj); - if (kobj) { - mtip_hw_sysfs_exit(dd, kobj); - kobject_put(kobj); - } - del_gendisk(dd->disk); - dd->disk = NULL; - } - if (dd->queue) { - dd->queue->queuedata = NULL; - blk_cleanup_queue(dd->queue); - blk_mq_free_tag_set(&dd->tags); - dd->queue = NULL; - } - } - kfree(dd); - return 0; -} - /* * Perform any init/resume time hardware setup * @@ -2944,7 +2888,6 @@ static int mtip_ftl_rebuild_poll(struct driver_data *dd) mtip_block_initialize(dd); return 0; } - ssleep(10); } while (time_before(jiffies, timeout)); /* Check for timeout */ @@ -2969,7 +2912,6 @@ static int mtip_service_thread(void *data) unsigned long slot, slot_start, slot_wrap; unsigned int num_cmd_slots = dd->slot_groups * 32; struct mtip_port *port = dd->port; - int ret; while (1) { if (kthread_should_stop() || @@ -2990,10 +2932,6 @@ static int mtip_service_thread(void *data) test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) goto st_out; - /* If I am an orphan, start self cleanup */ - if (test_bit(MTIP_PF_SR_CLEANUP_BIT, &port->flags)) - break; - if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) goto st_out; @@ -3047,26 +2985,6 @@ restart_eh: } } - /* wait for pci remove to exit */ - while (1) { - if (test_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag)) - break; - msleep_interruptible(1000); - if (kthread_should_stop()) - goto st_out; - } - - while (1) { - ret = mtip_free_orphan(dd); - if (!ret) { - /* NOTE: All data structures are invalid, do not - * access any here */ - return 0; - } - msleep_interruptible(1000); - if (kthread_should_stop()) - goto st_out; - } st_out: return 0; } @@ -3394,6 +3312,7 @@ static int mtip_hw_exit(struct driver_data *dd) /* Release the IRQ. 
*/ irq_set_affinity_hint(dd->pdev->irq, NULL); devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); + msleep(1000); /* Free dma regions */ mtip_dma_free(dd); @@ -3699,6 +3618,26 @@ static const struct block_device_operations mtip_block_ops = { .owner = THIS_MODULE }; +static inline bool is_se_active(struct driver_data *dd) +{ + if (unlikely(test_bit(MTIP_PF_SE_ACTIVE_BIT, &dd->port->flags))) { + if (dd->port->ic_pause_timer) { + unsigned long to = dd->port->ic_pause_timer + + msecs_to_jiffies(1000); + if (time_after(jiffies, to)) { + clear_bit(MTIP_PF_SE_ACTIVE_BIT, + &dd->port->flags); + clear_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag); + dd->port->ic_pause_timer = 0; + wake_up_interruptible(&dd->port->svc_wait); + return false; + } + } + return true; + } + return false; +} + /* * Block layer make request function. * @@ -3716,6 +3655,9 @@ static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq) struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); unsigned int nents; + if (is_se_active(dd)) + return -ENODATA; + if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { @@ -3814,6 +3756,14 @@ static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx, struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64; + /* + * For flush requests, request_idx starts at the end of the + * tag space. Since we don't support FLUSH/FUA, simply return + * 0 as there's nothing to be done. 
+ */ + if (request_idx >= MTIP_MAX_COMMAND_SLOTS) + return 0; + cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, &cmd->command_dma, GFP_KERNEL); if (!cmd->command) @@ -3900,7 +3850,8 @@ static int mtip_block_initialize(struct driver_data *dd) dd->disk->driverfs_dev = &dd->pdev->dev; dd->disk->major = dd->major; - dd->disk->first_minor = dd->instance * MTIP_MAX_MINORS; + dd->disk->first_minor = index * MTIP_MAX_MINORS; + dd->disk->minors = MTIP_MAX_MINORS; dd->disk->fops = &mtip_block_ops; dd->disk->private_data = dd; dd->index = index; @@ -4066,52 +4017,51 @@ static int mtip_block_remove(struct driver_data *dd) { struct kobject *kobj; - if (!dd->sr) { - mtip_hw_debugfs_exit(dd); + mtip_hw_debugfs_exit(dd); - if (dd->mtip_svc_handler) { - set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags); - wake_up_interruptible(&dd->port->svc_wait); - kthread_stop(dd->mtip_svc_handler); - } + if (dd->mtip_svc_handler) { + set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags); + wake_up_interruptible(&dd->port->svc_wait); + kthread_stop(dd->mtip_svc_handler); + } - /* Clean up the sysfs attributes, if created */ - if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) { - kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); - if (kobj) { - mtip_hw_sysfs_exit(dd, kobj); - kobject_put(kobj); - } + /* Clean up the sysfs attributes, if created */ + if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) { + kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); + if (kobj) { + mtip_hw_sysfs_exit(dd, kobj); + kobject_put(kobj); } + } + if (!dd->sr) mtip_standby_drive(dd); - - /* - * Delete our gendisk structure. 
This also removes the device - * from /dev - */ - if (dd->bdev) { - bdput(dd->bdev); - dd->bdev = NULL; - } - if (dd->disk) { - if (dd->disk->queue) { - del_gendisk(dd->disk); - blk_cleanup_queue(dd->queue); - blk_mq_free_tag_set(&dd->tags); - dd->queue = NULL; - } else - put_disk(dd->disk); - } - dd->disk = NULL; - - spin_lock(&rssd_index_lock); - ida_remove(&rssd_index_ida, dd->index); - spin_unlock(&rssd_index_lock); - } else { + else dev_info(&dd->pdev->dev, "device %s surprise removal\n", dd->disk->disk_name); + + /* + * Delete our gendisk structure. This also removes the device + * from /dev + */ + if (dd->bdev) { + bdput(dd->bdev); + dd->bdev = NULL; + } + if (dd->disk) { + del_gendisk(dd->disk); + if (dd->disk->queue) { + blk_cleanup_queue(dd->queue); + blk_mq_free_tag_set(&dd->tags); + dd->queue = NULL; + } + put_disk(dd->disk); } + dd->disk = NULL; + + spin_lock(&rssd_index_lock); + ida_remove(&rssd_index_ida, dd->index); + spin_unlock(&rssd_index_lock); /* De-initialize the protocol layer. */ mtip_hw_exit(dd); @@ -4140,12 +4090,12 @@ static int mtip_block_shutdown(struct driver_data *dd) dev_info(&dd->pdev->dev, "Shutting down %s ...\n", dd->disk->disk_name); + del_gendisk(dd->disk); if (dd->disk->queue) { - del_gendisk(dd->disk); blk_cleanup_queue(dd->queue); blk_mq_free_tag_set(&dd->tags); - } else - put_disk(dd->disk); + } + put_disk(dd->disk); dd->disk = NULL; dd->queue = NULL; } @@ -4507,6 +4457,7 @@ static void mtip_pci_remove(struct pci_dev *pdev) "Completion workers still active!\n"); } + blk_mq_stop_hw_queues(dd->queue); /* Clean up the block layer. 
*/ mtip_block_remove(dd); @@ -4524,10 +4475,7 @@ static void mtip_pci_remove(struct pci_dev *pdev) list_del_init(&dd->remove_list); spin_unlock_irqrestore(&dev_lock, flags); - if (!dd->sr) - kfree(dd); - else - set_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag); + kfree(dd); pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); pci_set_drvdata(pdev, NULL); diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index ba1b31ee2..327478400 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -142,7 +142,6 @@ enum { MTIP_PF_SVC_THD_ACTIVE_BIT = 4, MTIP_PF_ISSUE_CMDS_BIT = 5, MTIP_PF_REBUILD_BIT = 6, - MTIP_PF_SR_CLEANUP_BIT = 7, MTIP_PF_SVC_THD_STOP_BIT = 8, /* below are bit numbers in 'dd_flag' defined in driver_data */ @@ -150,7 +149,6 @@ enum { MTIP_DDF_REMOVE_PENDING_BIT = 1, MTIP_DDF_OVER_TEMP_BIT = 2, MTIP_DDF_WRITE_PROTECT_BIT = 3, - MTIP_DDF_REMOVE_DONE_BIT = 4, MTIP_DDF_CLEANUP_BIT = 5, MTIP_DDF_RESUME_BIT = 6, MTIP_DDF_INIT_DONE_BIT = 7, @@ -412,19 +410,13 @@ struct mtip_port { * by the DMA when the driver issues internal commands. */ dma_addr_t sector_buffer_dma; - /* - * Bit significant, used to determine if a command slot has - * been allocated. i.e. the slot is in use. Bits are cleared - * when the command slot and all associated data structures - * are no longer needed. 
- */ + u16 *log_buf; dma_addr_t log_buf_dma; u8 *smart_buf; dma_addr_t smart_buf_dma; - unsigned long allocated[SLOTBITS_IN_LONGS]; /* * used to queue commands when an internal command is in progress * or error handling is active diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 39e5f7fae..0e385d8e9 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -230,29 +230,40 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) int result, flags; struct nbd_request request; unsigned long size = blk_rq_bytes(req); + u32 type; + + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + type = NBD_CMD_DISC; + else if (req->cmd_flags & REQ_DISCARD) + type = NBD_CMD_TRIM; + else if (req->cmd_flags & REQ_FLUSH) + type = NBD_CMD_FLUSH; + else if (rq_data_dir(req) == WRITE) + type = NBD_CMD_WRITE; + else + type = NBD_CMD_READ; memset(&request, 0, sizeof(request)); request.magic = htonl(NBD_REQUEST_MAGIC); - request.type = htonl(nbd_cmd(req)); - - if (nbd_cmd(req) != NBD_CMD_FLUSH && nbd_cmd(req) != NBD_CMD_DISC) { + request.type = htonl(type); + if (type != NBD_CMD_FLUSH && type != NBD_CMD_DISC) { request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); request.len = htonl(size); } memcpy(request.handle, &req, sizeof(req)); dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n", - req, nbdcmd_to_ascii(nbd_cmd(req)), + req, nbdcmd_to_ascii(type), (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); result = sock_xmit(nbd, 1, &request, sizeof(request), - (nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0); + (type == NBD_CMD_WRITE) ? 
MSG_MORE : 0); if (result <= 0) { dev_err(disk_to_dev(nbd->disk), "Send control failed (result %d)\n", result); return -EIO; } - if (nbd_cmd(req) == NBD_CMD_WRITE) { + if (type == NBD_CMD_WRITE) { struct req_iterator iter; struct bio_vec bvec; /* @@ -352,7 +363,7 @@ static struct request *nbd_read_stat(struct nbd_device *nbd) } dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req); - if (nbd_cmd(req) == NBD_CMD_READ) { + if (rq_data_dir(req) != WRITE) { struct req_iterator iter; struct bio_vec bvec; @@ -452,23 +463,11 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req) if (req->cmd_type != REQ_TYPE_FS) goto error_out; - nbd_cmd(req) = NBD_CMD_READ; - if (rq_data_dir(req) == WRITE) { - if ((req->cmd_flags & REQ_DISCARD)) { - WARN_ON(!(nbd->flags & NBD_FLAG_SEND_TRIM)); - nbd_cmd(req) = NBD_CMD_TRIM; - } else - nbd_cmd(req) = NBD_CMD_WRITE; - if (nbd->flags & NBD_FLAG_READ_ONLY) { - dev_err(disk_to_dev(nbd->disk), - "Write on read-only\n"); - goto error_out; - } - } - - if (req->cmd_flags & REQ_FLUSH) { - BUG_ON(unlikely(blk_rq_sectors(req))); - nbd_cmd(req) = NBD_CMD_FLUSH; + if (rq_data_dir(req) == WRITE && + (nbd->flags & NBD_FLAG_READ_ONLY)) { + dev_err(disk_to_dev(nbd->disk), + "Write on read-only\n"); + goto error_out; } req->errors = 0; @@ -592,8 +591,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, fsync_bdev(bdev); mutex_lock(&nbd->tx_lock); blk_rq_init(NULL, &sreq); - sreq.cmd_type = REQ_TYPE_SPECIAL; - nbd_cmd(&sreq) = NBD_CMD_DISC; + sreq.cmd_type = REQ_TYPE_DRV_PRIV; /* Check again after getting mutex back. 
*/ if (!nbd->sock) @@ -713,7 +711,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, bdev->bd_inode->i_size = 0; set_capacity(nbd->disk, 0); if (max_part > 0) - ioctl_by_bdev(bdev, BLKRRPART, 0); + blkdev_reread_part(bdev); if (nbd->disconnect) /* user requested, ignore socket errors */ return 0; return nbd->harderror; diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 65cd61a41..3177b245d 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -99,7 +99,7 @@ static int null_set_queue_mode(const char *str, const struct kernel_param *kp) return null_param_store_val(str, &queue_mode, NULL_Q_BIO, NULL_Q_MQ); } -static struct kernel_param_ops null_queue_mode_param_ops = { +static const struct kernel_param_ops null_queue_mode_param_ops = { .set = null_set_queue_mode, .get = param_get_int, }; @@ -127,7 +127,7 @@ static int null_set_irqmode(const char *str, const struct kernel_param *kp) NULL_IRQ_TIMER); } -static struct kernel_param_ops null_irqmode_param_ops = { +static const struct kernel_param_ops null_irqmode_param_ops = { .set = null_set_irqmode, .get = param_get_int, }; @@ -240,9 +240,20 @@ static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) while ((entry = llist_del_all(&cq->list)) != NULL) { entry = llist_reverse_order(entry); do { + struct request_queue *q = NULL; + cmd = container_of(entry, struct nullb_cmd, ll_list); entry = entry->next; + if (cmd->rq) + q = cmd->rq->q; end_cmd(cmd); + + if (q && !q->mq_ops && blk_queue_stopped(q)) { + spin_lock(q->queue_lock); + if (blk_queue_stopped(q)) + blk_start_queue(q); + spin_unlock(q->queue_lock); + } } while (entry); } @@ -257,7 +268,7 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd) if (llist_add(&cmd->ll_list, &cq->list)) { ktime_t kt = ktime_set(0, completion_nsec); - hrtimer_start(&cq->timer, kt, HRTIMER_MODE_REL); + hrtimer_start(&cq->timer, kt, HRTIMER_MODE_REL_PINNED); } put_cpu(); @@ -334,6 +345,7 @@ static int 
null_rq_prep_fn(struct request_queue *q, struct request *req) req->special = cmd; return BLKPREP_OK; } + blk_stop_queue(q); return BLKPREP_DEFER; } diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 683dff272..7920c2741 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -29,6 +29,7 @@ #include <linux/kdev_t.h> #include <linux/kthread.h> #include <linux/kernel.h> +#include <linux/list_sort.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/moduleparam.h> @@ -80,6 +81,7 @@ static wait_queue_head_t nvme_kthread_wait; static struct class *nvme_class; static void nvme_reset_failed_dev(struct work_struct *ws); +static int nvme_reset(struct nvme_dev *dev); static int nvme_process_cq(struct nvme_queue *nvmeq); struct async_cmd_info { @@ -102,6 +104,7 @@ struct nvme_queue { spinlock_t q_lock; struct nvme_command *sq_cmds; volatile struct nvme_completion *cqes; + struct blk_mq_tags **tags; dma_addr_t sq_dma_addr; dma_addr_t cq_dma_addr; u32 __iomem *q_db; @@ -114,7 +117,6 @@ struct nvme_queue { u8 cq_phase; u8 cqe_seen; struct async_cmd_info cmdinfo; - struct blk_mq_hw_ctx *hctx; }; /* @@ -182,12 +184,22 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, struct nvme_dev *dev = data; struct nvme_queue *nvmeq = dev->queues[0]; - WARN_ON(nvmeq->hctx); - nvmeq->hctx = hctx; + WARN_ON(hctx_idx != 0); + WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); + WARN_ON(nvmeq->tags); + hctx->driver_data = nvmeq; + nvmeq->tags = &dev->admin_tagset.tags[0]; return 0; } +static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + struct nvme_queue *nvmeq = hctx->driver_data; + + nvmeq->tags = NULL; +} + static int nvme_admin_init_request(void *data, struct request *req, unsigned int hctx_idx, unsigned int rq_idx, unsigned int numa_node) @@ -201,27 +213,16 @@ static int nvme_admin_init_request(void *data, struct request *req, return 0; } -static void nvme_exit_hctx(struct 
blk_mq_hw_ctx *hctx, unsigned int hctx_idx) -{ - struct nvme_queue *nvmeq = hctx->driver_data; - - nvmeq->hctx = NULL; -} - static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { struct nvme_dev *dev = data; - struct nvme_queue *nvmeq = dev->queues[ - (hctx_idx % dev->queue_count) + 1]; - - if (!nvmeq->hctx) - nvmeq->hctx = hctx; + struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; - /* nvmeq queues are shared between namespaces. We assume here that - * blk-mq map the tags so they match up with the nvme queue tags. */ - WARN_ON(nvmeq->hctx->tags != hctx->tags); + if (!nvmeq->tags) + nvmeq->tags = &dev->tagset.tags[hctx_idx]; + WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags); hctx->driver_data = nvmeq; return 0; } @@ -307,9 +308,16 @@ static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) ++nvmeq->dev->event_limit; - if (status == NVME_SC_SUCCESS) - dev_warn(nvmeq->q_dmadev, - "async event result %08x\n", result); + if (status != NVME_SC_SUCCESS) + return; + + switch (result & 0xff07) { + case NVME_AER_NOTICE_NS_CHANGED: + dev_info(nvmeq->q_dmadev, "rescanning\n"); + schedule_work(&nvmeq->dev->scan_work); + default: + dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result); + } } static void abort_completion(struct nvme_queue *nvmeq, void *ctx, @@ -320,7 +328,7 @@ static void abort_completion(struct nvme_queue *nvmeq, void *ctx, u16 status = le16_to_cpup(&cqe->status) >> 1; u32 result = le32_to_cpup(&cqe->result); - blk_mq_free_hctx_request(nvmeq->hctx, req); + blk_mq_free_request(req); dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); ++nvmeq->dev->abort_limit; @@ -333,14 +341,13 @@ static void async_completion(struct nvme_queue *nvmeq, void *ctx, cmdinfo->result = le32_to_cpup(&cqe->result); cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; queue_kthread_work(cmdinfo->worker, &cmdinfo->work); - 
blk_mq_free_hctx_request(nvmeq->hctx, cmdinfo->req); + blk_mq_free_request(cmdinfo->req); } static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, unsigned int tag) { - struct blk_mq_hw_ctx *hctx = nvmeq->hctx; - struct request *req = blk_mq_tag_to_rq(hctx->tags, tag); + struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag); return blk_mq_rq_to_pdu(req); } @@ -445,7 +452,7 @@ static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev, (unsigned long) rq, gfp); } -void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) +static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { const int last_prp = dev->page_size / 8 - 1; int i; @@ -605,22 +612,33 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, spin_unlock_irqrestore(req->q->queue_lock, flags); return; } - req->errors = nvme_error_status(status); + if (req->cmd_type == REQ_TYPE_DRV_PRIV) { + if (cmd_rq->ctx == CMD_CTX_CANCELLED) + req->errors = -EINTR; + else + req->errors = status; + } else { + req->errors = nvme_error_status(status); + } } else req->errors = 0; + if (req->cmd_type == REQ_TYPE_DRV_PRIV) { + u32 result = le32_to_cpup(&cqe->result); + req->special = (void *)(uintptr_t)result; + } if (cmd_rq->aborted) - dev_warn(&nvmeq->dev->pci_dev->dev, + dev_warn(nvmeq->dev->dev, "completing aborted command with status:%04x\n", status); if (iod->nents) { - dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents, + dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents, rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (blk_integrity_rq(req)) { if (!rq_data_dir(req)) nvme_dif_remap(req, nvme_dif_complete); - dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->meta_sg, 1, + dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1, rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); } } @@ -630,8 +648,8 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, } /* length is in bytes. gfp flags indicates whether we may sleep. 
*/ -int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, - gfp_t gfp) +static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, + int total_len, gfp_t gfp) { struct dma_pool *pool; int length = total_len; @@ -709,6 +727,23 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, return total_len; } +static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req, + struct nvme_iod *iod) +{ + struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + + memcpy(cmnd, req->cmd, sizeof(struct nvme_command)); + cmnd->rw.command_id = req->tag; + if (req->nr_phys_segments) { + cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); + } + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); +} + /* * We reuse the small pool to allocate the 16-byte range here as it is not * worth having a special pool for these or additional cases to handle freeing @@ -807,11 +842,15 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, return 0; } +/* + * NOTE: ns is NULL when called on the admin queue. + */ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nvme_ns *ns = hctx->queue->queuedata; struct nvme_queue *nvmeq = hctx->driver_data; + struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); struct nvme_iod *iod; @@ -822,15 +861,16 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, * unless this namespace is formated such that the metadata can be * stripped/generated by the controller with PRACT=1. 
*/ - if (ns->ms && !blk_integrity_rq(req)) { - if (!(ns->pi_type && ns->ms == 8)) { + if (ns && ns->ms && !blk_integrity_rq(req)) { + if (!(ns->pi_type && ns->ms == 8) && + req->cmd_type != REQ_TYPE_DRV_PRIV) { req->errors = -EFAULT; blk_mq_complete_request(req); return BLK_MQ_RQ_QUEUE_OK; } } - iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC); + iod = nvme_alloc_iod(req, dev, GFP_ATOMIC); if (!iod) return BLK_MQ_RQ_QUEUE_BUSY; @@ -841,8 +881,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, * as it is not worth having a special pool for these or * additional cases to handle freeing the iod. */ - range = dma_pool_alloc(nvmeq->dev->prp_small_pool, - GFP_ATOMIC, + range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &iod->first_dma); if (!range) goto retry_cmd; @@ -860,9 +899,8 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, goto retry_cmd; if (blk_rq_bytes(req) != - nvme_setup_prps(nvmeq->dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) { - dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, - iod->nents, dma_dir); + nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) { + dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); goto retry_cmd; } if (blk_integrity_rq(req)) { @@ -884,7 +922,9 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, nvme_set_info(cmd, iod, req_completion); spin_lock_irq(&nvmeq->q_lock); - if (req->cmd_flags & REQ_DISCARD) + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + nvme_submit_priv(nvmeq, req, iod); + else if (req->cmd_flags & REQ_DISCARD) nvme_submit_discard(nvmeq, ns, req, iod); else if (req->cmd_flags & REQ_FLUSH) nvme_submit_flush(nvmeq, ns, req->tag); @@ -896,10 +936,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_MQ_RQ_QUEUE_OK; error_cmd: - nvme_free_iod(nvmeq->dev, iod); + nvme_free_iod(dev, iod); return BLK_MQ_RQ_QUEUE_ERROR; retry_cmd: - nvme_free_iod(nvmeq->dev, iod); + nvme_free_iod(dev, iod); return BLK_MQ_RQ_QUEUE_BUSY; } @@ -942,15 +982,6 @@ static int nvme_process_cq(struct nvme_queue 
*nvmeq) return 1; } -/* Admin queue isn't initialized as a request queue. If at some point this - * happens anyway, make sure to notify the user */ -static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) -{ - WARN_ON_ONCE(1); - return BLK_MQ_RQ_QUEUE_ERROR; -} - static irqreturn_t nvme_irq(int irq, void *data) { irqreturn_t result; @@ -972,46 +1003,61 @@ static irqreturn_t nvme_irq_check(int irq, void *data) return IRQ_WAKE_THREAD; } -struct sync_cmd_info { - struct task_struct *task; - u32 result; - int status; -}; - -static void sync_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct sync_cmd_info *cmdinfo = ctx; - cmdinfo->result = le32_to_cpup(&cqe->result); - cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; - wake_up_process(cmdinfo->task); -} - /* * Returns 0 on success. If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code */ -static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd, - u32 *result, unsigned timeout) +int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, void __user *ubuffer, unsigned bufflen, + u32 *result, unsigned timeout) { - struct sync_cmd_info cmdinfo; - struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = cmd_rq->nvmeq; + bool write = cmd->common.opcode & 1; + struct bio *bio = NULL; + struct request *req; + int ret; - cmdinfo.task = current; - cmdinfo.status = -EINTR; + req = blk_mq_alloc_request(q, write, GFP_KERNEL, false); + if (IS_ERR(req)) + return PTR_ERR(req); - cmd->common.command_id = req->tag; + req->cmd_type = REQ_TYPE_DRV_PRIV; + req->cmd_flags |= REQ_FAILFAST_DRIVER; + req->__data_len = 0; + req->__sector = (sector_t) -1; + req->bio = req->biotail = NULL; + + req->timeout = timeout ? 
timeout : ADMIN_TIMEOUT; - nvme_set_info(cmd_rq, &cmdinfo, sync_completion); + req->cmd = (unsigned char *)cmd; + req->cmd_len = sizeof(struct nvme_command); + req->special = (void *)0; - set_current_state(TASK_UNINTERRUPTIBLE); - nvme_submit_cmd(nvmeq, cmd); - schedule(); + if (buffer && bufflen) { + ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT); + if (ret) + goto out; + } else if (ubuffer && bufflen) { + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT); + if (ret) + goto out; + bio = req->bio; + } + blk_execute_rq(req->q, NULL, req, 0); + if (bio) + blk_rq_unmap_user(bio); if (result) - *result = cmdinfo.result; - return cmdinfo.status; + *result = (u32)(uintptr_t)req->special; + ret = req->errors; + out: + blk_mq_free_request(req); + return ret; +} + +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, unsigned bufflen) +{ + return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0); } static int nvme_submit_async_admin_req(struct nvme_dev *dev) @@ -1033,7 +1079,7 @@ static int nvme_submit_async_admin_req(struct nvme_dev *dev) c.common.opcode = nvme_admin_async_event; c.common.command_id = req->tag; - blk_mq_free_hctx_request(nvmeq->hctx, req); + blk_mq_free_request(req); return __nvme_submit_cmd(nvmeq, &c); } @@ -1060,41 +1106,6 @@ static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, return nvme_submit_cmd(nvmeq, cmd); } -static int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, - u32 *result, unsigned timeout) -{ - int res; - struct request *req; - - req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); - if (IS_ERR(req)) - return PTR_ERR(req); - res = nvme_submit_sync_cmd(req, cmd, result, timeout); - blk_mq_free_request(req); - return res; -} - -int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, - u32 *result) -{ - return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT); -} - -int 
nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns, - struct nvme_command *cmd, u32 *result) -{ - int res; - struct request *req; - - req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT), - false); - if (IS_ERR(req)) - return PTR_ERR(req); - res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT); - blk_mq_free_request(req); - return res; -} - static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) { struct nvme_command c; @@ -1103,7 +1114,7 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) c.delete_queue.opcode = opcode; c.delete_queue.qid = cpu_to_le16(id); - return nvme_submit_admin_cmd(dev, &c, NULL); + return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); } static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, @@ -1112,6 +1123,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, struct nvme_command c; int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; + /* + * Note: we (ab)use the fact that the prp fields survive if no data + * is attached to the request. + */ memset(&c, 0, sizeof(c)); c.create_cq.opcode = nvme_admin_create_cq; c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); @@ -1120,7 +1135,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, c.create_cq.cq_flags = cpu_to_le16(flags); c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); - return nvme_submit_admin_cmd(dev, &c, NULL); + return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); } static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, @@ -1129,6 +1144,10 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, struct nvme_command c; int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; + /* + * Note: we (ab)use the fact that the prp fields survive if no data + * is attached to the request. 
+ */ memset(&c, 0, sizeof(c)); c.create_sq.opcode = nvme_admin_create_sq; c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); @@ -1137,7 +1156,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, c.create_sq.sq_flags = cpu_to_le16(flags); c.create_sq.cqid = cpu_to_le16(qid); - return nvme_submit_admin_cmd(dev, &c, NULL); + return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); } static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) @@ -1150,18 +1169,45 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, - dma_addr_t dma_addr) +int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id) { - struct nvme_command c; + struct nvme_command c = { }; + int error; - memset(&c, 0, sizeof(c)); + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ c.identify.opcode = nvme_admin_identify; - c.identify.nsid = cpu_to_le32(nsid); - c.identify.prp1 = cpu_to_le64(dma_addr); - c.identify.cns = cpu_to_le32(cns); + c.identify.cns = cpu_to_le32(1); + + *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); + if (!*id) + return -ENOMEM; - return nvme_submit_admin_cmd(dev, &c, NULL); + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, + sizeof(struct nvme_id_ctrl)); + if (error) + kfree(*id); + return error; +} + +int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, + struct nvme_id_ns **id) +{ + struct nvme_command c = { }; + int error; + + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify, + c.identify.nsid = cpu_to_le32(nsid), + + *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, + sizeof(struct nvme_id_ns)); + if (error) + kfree(*id); + return error; } int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, @@ 
-1175,7 +1221,8 @@ int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, c.features.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); - return nvme_submit_admin_cmd(dev, &c, result); + return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, + result, 0); } int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, @@ -1189,7 +1236,30 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, c.features.fid = cpu_to_le32(fid); c.features.dword11 = cpu_to_le32(dword11); - return nvme_submit_admin_cmd(dev, &c, result); + return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, + result, 0); +} + +int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log) +{ + struct nvme_command c = { }; + int error; + + c.common.opcode = nvme_admin_get_log_page, + c.common.nsid = cpu_to_le32(0xFFFFFFFF), + c.common.cdw10[0] = cpu_to_le32( + (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) | + NVME_LOG_SMART), + + *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); + if (!*log) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *log, + sizeof(struct nvme_smart_log)); + if (error) + kfree(*log); + return error; } /** @@ -1214,8 +1284,7 @@ static void nvme_abort_req(struct request *req) if (work_busy(&dev->reset_work)) goto out; list_del_init(&dev->node); - dev_warn(&dev->pci_dev->dev, - "I/O %d QID %d timeout, reset controller\n", + dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n", req->tag, nvmeq->qid); dev->reset_workfn = nvme_reset_failed_dev; queue_work(nvme_workq, &dev->reset_work); @@ -1254,8 +1323,7 @@ static void nvme_abort_req(struct request *req) } } -static void nvme_cancel_queue_ios(struct blk_mq_hw_ctx *hctx, - struct request *req, void *data, bool reserved) +static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved) { struct nvme_queue *nvmeq = data; void *ctx; @@ -1352,11 +1420,9 @@ static int 
nvme_suspend_queue(struct nvme_queue *nvmeq) static void nvme_clear_queue(struct nvme_queue *nvmeq) { - struct blk_mq_hw_ctx *hctx = nvmeq->hctx; - spin_lock_irq(&nvmeq->q_lock); - if (hctx && hctx->tags) - blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq); + if (nvmeq->tags && *nvmeq->tags) + blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq); spin_unlock_irq(&nvmeq->q_lock); } @@ -1384,22 +1450,21 @@ static void nvme_disable_queue(struct nvme_dev *dev, int qid) static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) { - struct device *dmadev = &dev->pci_dev->dev; struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL); if (!nvmeq) return NULL; - nvmeq->cqes = dma_zalloc_coherent(dmadev, CQ_SIZE(depth), + nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), &nvmeq->cq_dma_addr, GFP_KERNEL); if (!nvmeq->cqes) goto free_nvmeq; - nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), + nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), &nvmeq->sq_dma_addr, GFP_KERNEL); if (!nvmeq->sq_cmds) goto free_cqdma; - nvmeq->q_dmadev = dmadev; + nvmeq->q_dmadev = dev->dev; nvmeq->dev = dev; snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", dev->instance, qid); @@ -1409,13 +1474,17 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; nvmeq->q_depth = depth; nvmeq->qid = qid; - dev->queue_count++; + nvmeq->cq_vector = -1; dev->queues[qid] = nvmeq; + /* make sure queue descriptor is set before queue count, for kthread */ + mb(); + dev->queue_count++; + return nvmeq; free_cqdma: - dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, + dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); free_nvmeq: kfree(nvmeq); @@ -1487,7 +1556,7 @@ static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) if (fatal_signal_pending(current)) return -EINTR; if 
(time_after(jiffies, timeout)) { - dev_err(&dev->pci_dev->dev, + dev_err(dev->dev, "Device not ready; aborting %s\n", enabled ? "initialisation" : "reset"); return -ENODEV; @@ -1537,7 +1606,7 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev) if (fatal_signal_pending(current)) return -EINTR; if (time_after(jiffies, timeout)) { - dev_err(&dev->pci_dev->dev, + dev_err(dev->dev, "Device shutdown incomplete; abort shutdown\n"); return -ENODEV; } @@ -1547,10 +1616,10 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev) } static struct blk_mq_ops nvme_mq_admin_ops = { - .queue_rq = nvme_admin_queue_rq, + .queue_rq = nvme_queue_rq, .map_queue = blk_mq_map_queue, .init_hctx = nvme_admin_init_hctx, - .exit_hctx = nvme_exit_hctx, + .exit_hctx = nvme_admin_exit_hctx, .init_request = nvme_admin_init_request, .timeout = nvme_timeout, }; @@ -1559,7 +1628,6 @@ static struct blk_mq_ops nvme_mq_ops = { .queue_rq = nvme_queue_rq, .map_queue = blk_mq_map_queue, .init_hctx = nvme_init_hctx, - .exit_hctx = nvme_exit_hctx, .init_request = nvme_init_request, .timeout = nvme_timeout, }; @@ -1580,7 +1648,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; dev->admin_tagset.reserved_tags = 1; dev->admin_tagset.timeout = ADMIN_TIMEOUT; - dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev); + dev->admin_tagset.numa_node = dev_to_node(dev->dev); dev->admin_tagset.cmd_size = nvme_cmd_size(dev); dev->admin_tagset.driver_data = dev; @@ -1594,6 +1662,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) } if (!blk_get_queue(dev->admin_q)) { nvme_dev_remove_admin(dev); + dev->admin_q = NULL; return -ENODEV; } } else @@ -1613,14 +1682,14 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12; if (page_shift < dev_page_min) { - dev_err(&dev->pci_dev->dev, + dev_err(dev->dev, "Minimum device page size (%u) too large for " "host (%u)\n", 1 << dev_page_min, 1 << 
page_shift); return -ENODEV; } if (page_shift > dev_page_max) { - dev_info(&dev->pci_dev->dev, + dev_info(dev->dev, "Device maximum page size (%u) smaller than " "host (%u); enabling work-around\n", 1 << dev_page_max, 1 << page_shift); @@ -1658,8 +1727,10 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) nvmeq->cq_vector = 0; result = queue_request_irq(dev, nvmeq, nvmeq->irqname); - if (result) + if (result) { + nvmeq->cq_vector = -1; goto free_nvmeq; + } return result; @@ -1668,126 +1739,43 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, - unsigned long addr, unsigned length) -{ - int i, err, count, nents, offset; - struct scatterlist *sg; - struct page **pages; - struct nvme_iod *iod; - - if (addr & 3) - return ERR_PTR(-EINVAL); - if (!length || length > INT_MAX - PAGE_SIZE) - return ERR_PTR(-EINVAL); - - offset = offset_in_page(addr); - count = DIV_ROUND_UP(offset + length, PAGE_SIZE); - pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); - if (!pages) - return ERR_PTR(-ENOMEM); - - err = get_user_pages_fast(addr, count, 1, pages); - if (err < count) { - count = err; - err = -EFAULT; - goto put_pages; - } - - err = -ENOMEM; - iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL); - if (!iod) - goto put_pages; - - sg = iod->sg; - sg_init_table(sg, count); - for (i = 0; i < count; i++) { - sg_set_page(&sg[i], pages[i], - min_t(unsigned, length, PAGE_SIZE - offset), - offset); - length -= (PAGE_SIZE - offset); - offset = 0; - } - sg_mark_end(&sg[i - 1]); - iod->nents = count; - - nents = dma_map_sg(&dev->pci_dev->dev, sg, count, - write ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE); - if (!nents) - goto free_iod; - - kfree(pages); - return iod; - - free_iod: - kfree(iod); - put_pages: - for (i = 0; i < count; i++) - put_page(pages[i]); - kfree(pages); - return ERR_PTR(err); -} - -void nvme_unmap_user_pages(struct nvme_dev *dev, int write, - struct nvme_iod *iod) -{ - int i; - - dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, - write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - - for (i = 0; i < iod->nents; i++) - put_page(sg_page(&iod->sg[i])); -} - static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { struct nvme_dev *dev = ns->dev; struct nvme_user_io io; struct nvme_command c; - unsigned length, meta_len, prp_len; + unsigned length, meta_len; int status, write; - struct nvme_iod *iod; dma_addr_t meta_dma = 0; void *meta = NULL; void __user *metadata; if (copy_from_user(&io, uio, sizeof(io))) return -EFAULT; - length = (io.nblocks + 1) << ns->lba_shift; - meta_len = (io.nblocks + 1) * ns->ms; - - if (meta_len && ((io.metadata & 3) || !io.metadata) && !ns->ext) - return -EINVAL; - else if (meta_len && ns->ext) { - length += meta_len; - meta_len = 0; - } - - metadata = (void __user *)(unsigned long)io.metadata; - - write = io.opcode & 1; switch (io.opcode) { case nvme_cmd_write: case nvme_cmd_read: case nvme_cmd_compare: - iod = nvme_map_user_pages(dev, write, io.addr, length); break; default: return -EINVAL; } - if (IS_ERR(iod)) - return PTR_ERR(iod); + length = (io.nblocks + 1) << ns->lba_shift; + meta_len = (io.nblocks + 1) * ns->ms; + metadata = (void __user *)(unsigned long)io.metadata; + write = io.opcode & 1; - prp_len = nvme_setup_prps(dev, iod, length, GFP_KERNEL); - if (length != prp_len) { - status = -ENOMEM; - goto unmap; + if (ns->ext) { + length += meta_len; + meta_len = 0; } if (meta_len) { - meta = dma_alloc_coherent(&dev->pci_dev->dev, meta_len, + if (((io.metadata & 3) || !io.metadata) && !ns->ext) + return -EINVAL; + + meta = dma_alloc_coherent(dev->dev, meta_len, 
&meta_dma, GFP_KERNEL); if (!meta) { @@ -1813,19 +1801,17 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.reftag = cpu_to_le32(io.reftag); c.rw.apptag = cpu_to_le16(io.apptag); c.rw.appmask = cpu_to_le16(io.appmask); - c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - c.rw.prp2 = cpu_to_le64(iod->first_dma); c.rw.metadata = cpu_to_le64(meta_dma); - status = nvme_submit_io_cmd(dev, ns, &c, NULL); + + status = __nvme_submit_sync_cmd(ns->queue, &c, NULL, + (void __user *)io.addr, length, NULL, 0); unmap: - nvme_unmap_user_pages(dev, write, iod); - nvme_free_iod(dev, iod); if (meta) { if (status == NVME_SC_SUCCESS && !write) { if (copy_to_user(metadata, meta, meta_len)) status = -EFAULT; } - dma_free_coherent(&dev->pci_dev->dev, meta_len, meta, meta_dma); + dma_free_coherent(dev->dev, meta_len, meta, meta_dma); } return status; } @@ -1835,9 +1821,8 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, { struct nvme_passthru_cmd cmd; struct nvme_command c; - int status, length; - struct nvme_iod *uninitialized_var(iod); - unsigned timeout; + unsigned timeout = 0; + int status; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -1857,46 +1842,17 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); - length = cmd.data_len; - if (cmd.data_len) { - iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr, - length); - if (IS_ERR(iod)) - return PTR_ERR(iod); - length = nvme_setup_prps(dev, iod, length, GFP_KERNEL); - c.common.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - c.common.prp2 = cpu_to_le64(iod->first_dma); - } - - timeout = cmd.timeout_ms ? 
msecs_to_jiffies(cmd.timeout_ms) : - ADMIN_TIMEOUT; - - if (length != cmd.data_len) - status = -ENOMEM; - else if (ns) { - struct request *req; - - req = blk_mq_alloc_request(ns->queue, WRITE, - (GFP_KERNEL|__GFP_WAIT), false); - if (IS_ERR(req)) - status = PTR_ERR(req); - else { - status = nvme_submit_sync_cmd(req, &c, &cmd.result, - timeout); - blk_mq_free_request(req); - } - } else - status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout); + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); - if (cmd.data_len) { - nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); - nvme_free_iod(dev, iod); + status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c, + NULL, (void __user *)cmd.addr, cmd.data_len, + &cmd.result, timeout); + if (status >= 0) { + if (put_user(cmd.result, &ucmd->result)) + return -EFAULT; } - if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result, - sizeof(cmd.result))) - status = -EFAULT; - return status; } @@ -1988,23 +1944,18 @@ static int nvme_revalidate_disk(struct gendisk *disk) struct nvme_ns *ns = disk->private_data; struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id; - dma_addr_t dma_addr; u8 lbaf, pi_type; u16 old_ms; unsigned short bs; - id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, - GFP_KERNEL); - if (!id) { - dev_warn(&dev->pci_dev->dev, "%s: Memory alocation failure\n", - __func__); - return 0; + if (nvme_identify_ns(dev, ns->ns_id, &id)) { + dev_warn(dev->dev, "%s: Identify failure nvme%dn%d\n", __func__, + dev->instance, ns->ns_id); + return -ENODEV; } - if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) { - dev_warn(&dev->pci_dev->dev, - "identify failed ns:%d, setting capacity to 0\n", - ns->ns_id); - memset(id, 0, sizeof(*id)); + if (id->ncap == 0) { + kfree(id); + return -ENODEV; } old_ms = ns->ms; @@ -2038,7 +1989,7 @@ static int nvme_revalidate_disk(struct gendisk *disk) !ns->ext) nvme_init_integrity(ns); - if (id->ncap == 0 || (ns->ms && !blk_get_integrity(disk))) + if 
(ns->ms && !blk_get_integrity(disk)) set_capacity(disk, 0); else set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); @@ -2046,7 +1997,7 @@ static int nvme_revalidate_disk(struct gendisk *disk) if (dev->oncs & NVME_CTRL_ONCS_DSM) nvme_config_discard(ns); - dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); + kfree(id); return 0; } @@ -2073,7 +2024,7 @@ static int nvme_kthread(void *data) if (work_busy(&dev->reset_work)) continue; list_del_init(&dev->node); - dev_warn(&dev->pci_dev->dev, + dev_warn(dev->dev, "Failed status: %x, reset controller\n", readl(&dev->bar->csts)); dev->reset_workfn = nvme_reset_failed_dev; @@ -2105,7 +2056,7 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) { struct nvme_ns *ns; struct gendisk *disk; - int node = dev_to_node(&dev->pci_dev->dev); + int node = dev_to_node(dev->dev); ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) @@ -2153,11 +2104,25 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) * requires it. 
*/ set_capacity(disk, 0); - nvme_revalidate_disk(ns->disk); + if (nvme_revalidate_disk(ns->disk)) + goto out_free_disk; + add_disk(ns->disk); - if (ns->ms) - revalidate_disk(ns->disk); + if (ns->ms) { + struct block_device *bd = bdget_disk(ns->disk, 0); + if (!bd) + return; + if (blkdev_get(bd, FMODE_READ, NULL)) { + bdput(bd); + return; + } + blkdev_reread_part(bd); + blkdev_put(bd, FMODE_READ); + } return; + out_free_disk: + kfree(disk); + list_del(&ns->list); out_free_queue: blk_cleanup_queue(ns->queue); out_free_ns: @@ -2188,8 +2153,7 @@ static int set_queue_count(struct nvme_dev *dev, int count) if (status < 0) return status; if (status > 0) { - dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n", - status); + dev_err(dev->dev, "Could not set queue count (%d)\n", status); return 0; } return min(result & 0xffff, result >> 16) + 1; @@ -2203,7 +2167,7 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) static int nvme_setup_io_queues(struct nvme_dev *dev) { struct nvme_queue *adminq = dev->queues[0]; - struct pci_dev *pdev = dev->pci_dev; + struct pci_dev *pdev = to_pci_dev(dev->dev); int result, i, vecs, nr_io_queues, size; nr_io_queues = num_possible_cpus(); @@ -2261,8 +2225,10 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) dev->max_qid = nr_io_queues; result = queue_request_irq(dev, adminq, adminq->irqname); - if (result) + if (result) { + adminq->cq_vector = -1; goto free_queues; + } /* Free previously allocated queues that are no longer usable */ nvme_free_queues(dev, nr_io_queues + 1); @@ -2275,6 +2241,99 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) return result; } +static void nvme_free_namespace(struct nvme_ns *ns) +{ + list_del(&ns->list); + + spin_lock(&dev_list_lock); + ns->disk->private_data = NULL; + spin_unlock(&dev_list_lock); + + put_disk(ns->disk); + kfree(ns); +} + +static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct nvme_ns *nsa = container_of(a, struct 
nvme_ns, list); + struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); + + return nsa->ns_id - nsb->ns_id; +} + +static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid) +{ + struct nvme_ns *ns; + + list_for_each_entry(ns, &dev->namespaces, list) { + if (ns->ns_id == nsid) + return ns; + if (ns->ns_id > nsid) + break; + } + return NULL; +} + +static inline bool nvme_io_incapable(struct nvme_dev *dev) +{ + return (!dev->bar || readl(&dev->bar->csts) & NVME_CSTS_CFS || + dev->online_queues < 2); +} + +static void nvme_ns_remove(struct nvme_ns *ns) +{ + bool kill = nvme_io_incapable(ns->dev) && !blk_queue_dying(ns->queue); + + if (kill) + blk_set_queue_dying(ns->queue); + if (ns->disk->flags & GENHD_FL_UP) { + if (blk_get_integrity(ns->disk)) + blk_integrity_unregister(ns->disk); + del_gendisk(ns->disk); + } + if (kill || !blk_queue_dying(ns->queue)) { + blk_mq_abort_requeue_list(ns->queue); + blk_cleanup_queue(ns->queue); + } +} + +static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn) +{ + struct nvme_ns *ns, *next; + unsigned i; + + for (i = 1; i <= nn; i++) { + ns = nvme_find_ns(dev, i); + if (ns) { + if (revalidate_disk(ns->disk)) { + nvme_ns_remove(ns); + nvme_free_namespace(ns); + } + } else + nvme_alloc_ns(dev, i); + } + list_for_each_entry_safe(ns, next, &dev->namespaces, list) { + if (ns->ns_id > nn) { + nvme_ns_remove(ns); + nvme_free_namespace(ns); + } + } + list_sort(NULL, &dev->namespaces, ns_cmp); +} + +static void nvme_dev_scan(struct work_struct *work) +{ + struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work); + struct nvme_id_ctrl *ctrl; + + if (!dev->tagset.tags) + return; + if (nvme_identify_ctrl(dev, &ctrl)) + return; + nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn)); + kfree(ctrl); +} + /* * Return: error value if an error occurred setting up the queues or calling * Identify Device. 
0 if these succeeded, even if adding some of the @@ -2283,26 +2342,18 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) */ static int nvme_dev_add(struct nvme_dev *dev) { - struct pci_dev *pdev = dev->pci_dev; + struct pci_dev *pdev = to_pci_dev(dev->dev); int res; - unsigned nn, i; + unsigned nn; struct nvme_id_ctrl *ctrl; - void *mem; - dma_addr_t dma_addr; int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; - mem = dma_alloc_coherent(&pdev->dev, 4096, &dma_addr, GFP_KERNEL); - if (!mem) - return -ENOMEM; - - res = nvme_identify(dev, 0, 1, dma_addr); + res = nvme_identify_ctrl(dev, &ctrl); if (res) { - dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res); - dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr); + dev_err(dev->dev, "Identify Controller failed (%d)\n", res); return -EIO; } - ctrl = mem; nn = le32_to_cpup(&ctrl->nn); dev->oncs = le16_to_cpup(&ctrl->oncs); dev->abort_limit = ctrl->acl + 1; @@ -2324,24 +2375,23 @@ static int nvme_dev_add(struct nvme_dev *dev) } else dev->max_hw_sectors = max_hw_sectors; } - dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr); + kfree(ctrl); - dev->tagset.ops = &nvme_mq_ops; - dev->tagset.nr_hw_queues = dev->online_queues - 1; - dev->tagset.timeout = NVME_IO_TIMEOUT; - dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev); - dev->tagset.queue_depth = + if (!dev->tagset.tags) { + dev->tagset.ops = &nvme_mq_ops; + dev->tagset.nr_hw_queues = dev->online_queues - 1; + dev->tagset.timeout = NVME_IO_TIMEOUT; + dev->tagset.numa_node = dev_to_node(dev->dev); + dev->tagset.queue_depth = min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; - dev->tagset.cmd_size = nvme_cmd_size(dev); - dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; - dev->tagset.driver_data = dev; - - if (blk_mq_alloc_tag_set(&dev->tagset)) - return 0; - - for (i = 1; i <= nn; i++) - nvme_alloc_ns(dev, i); + dev->tagset.cmd_size = nvme_cmd_size(dev); + dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; + dev->tagset.driver_data = dev; + if 
(blk_mq_alloc_tag_set(&dev->tagset)) + return 0; + } + schedule_work(&dev->scan_work); return 0; } @@ -2349,7 +2399,7 @@ static int nvme_dev_map(struct nvme_dev *dev) { u64 cap; int bars, result = -ENOMEM; - struct pci_dev *pdev = dev->pci_dev; + struct pci_dev *pdev = to_pci_dev(dev->dev); if (pci_enable_device_mem(pdev)) return result; @@ -2363,8 +2413,8 @@ static int nvme_dev_map(struct nvme_dev *dev) if (pci_request_selected_regions(pdev, bars, "nvme")) goto disable_pci; - if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) && - dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32))) + if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && + dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) goto disable; dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); @@ -2405,19 +2455,21 @@ static int nvme_dev_map(struct nvme_dev *dev) static void nvme_dev_unmap(struct nvme_dev *dev) { - if (dev->pci_dev->msi_enabled) - pci_disable_msi(dev->pci_dev); - else if (dev->pci_dev->msix_enabled) - pci_disable_msix(dev->pci_dev); + struct pci_dev *pdev = to_pci_dev(dev->dev); + + if (pdev->msi_enabled) + pci_disable_msi(pdev); + else if (pdev->msix_enabled) + pci_disable_msix(pdev); if (dev->bar) { iounmap(dev->bar); dev->bar = NULL; - pci_release_regions(dev->pci_dev); + pci_release_regions(pdev); } - if (pci_is_enabled(dev->pci_dev)) - pci_disable_device(dev->pci_dev); + if (pci_is_enabled(pdev)) + pci_disable_device(pdev); } struct nvme_delq_ctx { @@ -2536,7 +2588,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) &worker, "nvme%d", dev->instance); if (IS_ERR(kworker_task)) { - dev_err(&dev->pci_dev->dev, + dev_err(dev->dev, "Failed to create queue del task\n"); for (i = dev->queue_count - 1; i > 0; i--) nvme_disable_queue(dev, i); @@ -2587,9 +2639,9 @@ static void nvme_freeze_queues(struct nvme_dev *dev) list_for_each_entry(ns, &dev->namespaces, list) { blk_mq_freeze_queue_start(ns->queue); - spin_lock(ns->queue->queue_lock); + 
spin_lock_irq(ns->queue->queue_lock); queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); - spin_unlock(ns->queue->queue_lock); + spin_unlock_irq(ns->queue->queue_lock); blk_mq_cancel_requeue_work(ns->queue); blk_mq_stop_hw_queues(ns->queue); @@ -2639,29 +2691,19 @@ static void nvme_dev_remove(struct nvme_dev *dev) { struct nvme_ns *ns; - list_for_each_entry(ns, &dev->namespaces, list) { - if (ns->disk->flags & GENHD_FL_UP) { - if (blk_get_integrity(ns->disk)) - blk_integrity_unregister(ns->disk); - del_gendisk(ns->disk); - } - if (!blk_queue_dying(ns->queue)) { - blk_mq_abort_requeue_list(ns->queue); - blk_cleanup_queue(ns->queue); - } - } + list_for_each_entry(ns, &dev->namespaces, list) + nvme_ns_remove(ns); } static int nvme_setup_prp_pools(struct nvme_dev *dev) { - struct device *dmadev = &dev->pci_dev->dev; - dev->prp_page_pool = dma_pool_create("prp list page", dmadev, + dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, PAGE_SIZE, PAGE_SIZE, 0); if (!dev->prp_page_pool) return -ENOMEM; /* Optimisation for I/Os between 4k and 128k */ - dev->prp_small_pool = dma_pool_create("prp list 256", dmadev, + dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev, 256, 256, 0); if (!dev->prp_small_pool) { dma_pool_destroy(dev->prp_page_pool); @@ -2709,28 +2751,22 @@ static void nvme_free_namespaces(struct nvme_dev *dev) { struct nvme_ns *ns, *next; - list_for_each_entry_safe(ns, next, &dev->namespaces, list) { - list_del(&ns->list); - - spin_lock(&dev_list_lock); - ns->disk->private_data = NULL; - spin_unlock(&dev_list_lock); - - put_disk(ns->disk); - kfree(ns); - } + list_for_each_entry_safe(ns, next, &dev->namespaces, list) + nvme_free_namespace(ns); } static void nvme_free_dev(struct kref *kref) { struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); - pci_dev_put(dev->pci_dev); + put_device(dev->dev); put_device(dev->device); nvme_free_namespaces(dev); nvme_release_instance(dev); - blk_mq_free_tag_set(&dev->tagset); - 
blk_put_queue(dev->admin_q); + if (dev->tagset.tags) + blk_mq_free_tag_set(&dev->tagset); + if (dev->admin_q) + blk_put_queue(dev->admin_q); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -2781,6 +2817,9 @@ static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) return -ENOTTY; ns = list_first_entry(&dev->namespaces, struct nvme_ns, list); return nvme_user_cmd(dev, ns, (void __user *)arg); + case NVME_IOCTL_RESET: + dev_warn(dev->dev, "resetting controller\n"); + return nvme_reset(dev); default: return -ENOTTY; } @@ -2802,11 +2841,11 @@ static void nvme_set_irq_hints(struct nvme_dev *dev) for (i = 0; i < dev->online_queues; i++) { nvmeq = dev->queues[i]; - if (!nvmeq->hctx) + if (!nvmeq->tags || !(*nvmeq->tags)) continue; irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, - nvmeq->hctx->cpumask); + blk_mq_tags_cpumask(*nvmeq->tags)); } } @@ -2858,6 +2897,9 @@ static int nvme_dev_start(struct nvme_dev *dev) free_tags: nvme_dev_remove_admin(dev); + blk_put_queue(dev->admin_q); + dev->admin_q = NULL; + dev->queues[0]->tags = NULL; disable: nvme_disable_queue(dev, 0); nvme_dev_list_remove(dev); @@ -2869,7 +2911,7 @@ static int nvme_dev_start(struct nvme_dev *dev) static int nvme_remove_dead_ctrl(void *arg) { struct nvme_dev *dev = (struct nvme_dev *)arg; - struct pci_dev *pdev = dev->pci_dev; + struct pci_dev *pdev = to_pci_dev(dev->dev); if (pci_get_drvdata(pdev)) pci_stop_and_remove_bus_device_locked(pdev); @@ -2899,24 +2941,43 @@ static int nvme_dev_resume(struct nvme_dev *dev) spin_unlock(&dev_list_lock); } else { nvme_unfreeze_queues(dev); + nvme_dev_add(dev); nvme_set_irq_hints(dev); } return 0; } +static void nvme_dead_ctrl(struct nvme_dev *dev) +{ + dev_warn(dev->dev, "Device failed to resume\n"); + kref_get(&dev->kref); + if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", + dev->instance))) { + dev_err(dev->dev, + "Failed to start controller remove task\n"); + kref_put(&dev->kref, nvme_free_dev); + } +} + static 
void nvme_dev_reset(struct nvme_dev *dev) { + bool in_probe = work_busy(&dev->probe_work); + nvme_dev_shutdown(dev); - if (nvme_dev_resume(dev)) { - dev_warn(&dev->pci_dev->dev, "Device failed to resume\n"); - kref_get(&dev->kref); - if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", - dev->instance))) { - dev_err(&dev->pci_dev->dev, - "Failed to start controller remove task\n"); - kref_put(&dev->kref, nvme_free_dev); - } + + /* Synchronize with device probe so that work will see failure status + * and exit gracefully without trying to schedule another reset */ + flush_work(&dev->probe_work); + + /* Fail this device if reset occured during probe to avoid + * infinite initialization loops. */ + if (in_probe) { + nvme_dead_ctrl(dev); + return; } + /* Schedule device resume asynchronously so the reset work is available + * to cleanup errors that may occur during reinitialization */ + schedule_work(&dev->probe_work); } static void nvme_reset_failed_dev(struct work_struct *ws) @@ -2931,6 +2992,45 @@ static void nvme_reset_workfn(struct work_struct *work) dev->reset_workfn(work); } +static int nvme_reset(struct nvme_dev *dev) +{ + int ret = -EBUSY; + + if (!dev->admin_q || blk_queue_dying(dev->admin_q)) + return -ENODEV; + + spin_lock(&dev_list_lock); + if (!work_pending(&dev->reset_work)) { + dev->reset_workfn = nvme_reset_failed_dev; + queue_work(nvme_workq, &dev->reset_work); + ret = 0; + } + spin_unlock(&dev_list_lock); + + if (!ret) { + flush_work(&dev->reset_work); + flush_work(&dev->probe_work); + return 0; + } + + return ret; +} + +static ssize_t nvme_sysfs_reset(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_dev *ndev = dev_get_drvdata(dev); + int ret; + + ret = nvme_reset(ndev); + if (ret < 0) + return ret; + + return count; +} +static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); + static void nvme_async_probe(struct work_struct *work); static int nvme_probe(struct pci_dev 
*pdev, const struct pci_device_id *id) { @@ -2956,7 +3056,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&dev->namespaces); dev->reset_workfn = nvme_reset_failed_dev; INIT_WORK(&dev->reset_work, nvme_reset_workfn); - dev->pci_dev = pci_dev_get(pdev); + dev->dev = get_device(&pdev->dev); pci_set_drvdata(pdev, dev); result = nvme_set_instance(dev); if (result) @@ -2975,18 +3075,27 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto release_pools; } get_device(dev->device); + dev_set_drvdata(dev->device, dev); + + result = device_create_file(dev->device, &dev_attr_reset_controller); + if (result) + goto put_dev; INIT_LIST_HEAD(&dev->node); + INIT_WORK(&dev->scan_work, nvme_dev_scan); INIT_WORK(&dev->probe_work, nvme_async_probe); schedule_work(&dev->probe_work); return 0; + put_dev: + device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); + put_device(dev->device); release_pools: nvme_release_prp_pools(dev); release: nvme_release_instance(dev); put_pci: - pci_dev_put(dev->pci_dev); + put_device(dev->dev); free: kfree(dev->queues); kfree(dev->entry); @@ -2997,24 +3106,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) static void nvme_async_probe(struct work_struct *work) { struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work); - int result; - - result = nvme_dev_start(dev); - if (result) - goto reset; - if (dev->online_queues > 1) - result = nvme_dev_add(dev); - if (result) - goto reset; - - nvme_set_irq_hints(dev); - return; - reset: - if (!work_busy(&dev->reset_work)) { - dev->reset_workfn = nvme_reset_failed_dev; - queue_work(nvme_workq, &dev->reset_work); - } + if (nvme_dev_resume(dev) && !work_busy(&dev->reset_work)) + nvme_dead_ctrl(dev); } static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) @@ -3044,8 +3138,10 @@ static void nvme_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); 
flush_work(&dev->probe_work); flush_work(&dev->reset_work); - nvme_dev_shutdown(dev); + flush_work(&dev->scan_work); + device_remove_file(dev->device, &dev_attr_reset_controller); nvme_dev_remove(dev); + nvme_dev_shutdown(dev); nvme_dev_remove_admin(dev); device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); nvme_free_queues(dev, 0); diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 44f2514fb..e5a63f06f 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -41,15 +41,13 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/types.h> +#include <asm/unaligned.h> #include <scsi/sg.h> #include <scsi/scsi.h> static int sg_version_num = 30534; /* 2 digits for each component */ -#define SNTI_TRANSLATION_SUCCESS 0 -#define SNTI_INTERNAL_ERROR 1 - /* VPD Page Codes */ #define VPD_SUPPORTED_PAGES 0x00 #define VPD_SERIAL_NUMBER 0x80 @@ -58,49 +56,14 @@ static int sg_version_num = 30534; /* 2 digits for each component */ #define VPD_BLOCK_LIMITS 0xB0 #define VPD_BLOCK_DEV_CHARACTERISTICS 0xB1 -/* CDB offsets */ -#define REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET 6 -#define REPORT_LUNS_SR_OFFSET 2 -#define READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET 10 -#define REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET 4 -#define REQUEST_SENSE_DESC_OFFSET 1 -#define REQUEST_SENSE_DESC_MASK 0x01 -#define DESCRIPTOR_FORMAT_SENSE_DATA_TYPE 1 -#define INQUIRY_EVPD_BYTE_OFFSET 1 -#define INQUIRY_PAGE_CODE_BYTE_OFFSET 2 -#define INQUIRY_EVPD_BIT_MASK 1 -#define INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET 3 -#define START_STOP_UNIT_CDB_IMMED_OFFSET 1 -#define START_STOP_UNIT_CDB_IMMED_MASK 0x1 -#define START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET 3 -#define START_STOP_UNIT_CDB_POWER_COND_MOD_MASK 0xF -#define START_STOP_UNIT_CDB_POWER_COND_OFFSET 4 -#define START_STOP_UNIT_CDB_POWER_COND_MASK 0xF0 -#define START_STOP_UNIT_CDB_NO_FLUSH_OFFSET 4 -#define START_STOP_UNIT_CDB_NO_FLUSH_MASK 0x4 -#define START_STOP_UNIT_CDB_START_OFFSET 4 -#define 
START_STOP_UNIT_CDB_START_MASK 0x1 -#define WRITE_BUFFER_CDB_MODE_OFFSET 1 -#define WRITE_BUFFER_CDB_MODE_MASK 0x1F -#define WRITE_BUFFER_CDB_BUFFER_ID_OFFSET 2 -#define WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET 3 -#define WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET 6 -#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET 1 -#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK 0xC0 -#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT 6 -#define FORMAT_UNIT_CDB_LONG_LIST_OFFSET 1 -#define FORMAT_UNIT_CDB_LONG_LIST_MASK 0x20 -#define FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET 1 -#define FORMAT_UNIT_CDB_FORMAT_DATA_MASK 0x10 +/* format unit paramter list offsets */ #define FORMAT_UNIT_SHORT_PARM_LIST_LEN 4 #define FORMAT_UNIT_LONG_PARM_LIST_LEN 8 #define FORMAT_UNIT_PROT_INT_OFFSET 3 #define FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET 0 #define FORMAT_UNIT_PROT_FIELD_USAGE_MASK 0x07 -#define UNMAP_CDB_PARAM_LIST_LENGTH_OFFSET 7 /* Misc. defines */ -#define NIBBLE_SHIFT 4 #define FIXED_SENSE_DATA 0x70 #define DESC_FORMAT_SENSE_DATA 0x72 #define FIXED_SENSE_DATA_ADD_LENGTH 10 @@ -144,27 +107,6 @@ static int sg_version_num = 30534; /* 2 digits for each component */ #define EXTENDED_INQUIRY_DATA_PAGE_LENGTH 0x3C #define RESERVED_FIELD 0 -/* SCSI READ/WRITE Defines */ -#define IO_CDB_WP_MASK 0xE0 -#define IO_CDB_WP_SHIFT 5 -#define IO_CDB_FUA_MASK 0x8 -#define IO_6_CDB_LBA_OFFSET 0 -#define IO_6_CDB_LBA_MASK 0x001FFFFF -#define IO_6_CDB_TX_LEN_OFFSET 4 -#define IO_6_DEFAULT_TX_LEN 256 -#define IO_10_CDB_LBA_OFFSET 2 -#define IO_10_CDB_TX_LEN_OFFSET 7 -#define IO_10_CDB_WP_OFFSET 1 -#define IO_10_CDB_FUA_OFFSET 1 -#define IO_12_CDB_LBA_OFFSET 2 -#define IO_12_CDB_TX_LEN_OFFSET 6 -#define IO_12_CDB_WP_OFFSET 1 -#define IO_12_CDB_FUA_OFFSET 1 -#define IO_16_CDB_FUA_OFFSET 1 -#define IO_16_CDB_WP_OFFSET 1 -#define IO_16_CDB_LBA_OFFSET 2 -#define IO_16_CDB_TX_LEN_OFFSET 10 - /* Mode Sense/Select defines */ #define MODE_PAGE_INFO_EXCEP 0x1C #define MODE_PAGE_CACHING 0x08 @@ -179,23 +121,14 @@ static int sg_version_num 
= 30534; /* 2 digits for each component */ #define MODE_PAGE_INF_EXC_LEN 0x0C #define MODE_PAGE_ALL_LEN 0x54 #define MODE_SENSE6_MPH_SIZE 4 -#define MODE_SENSE6_ALLOC_LEN_OFFSET 4 -#define MODE_SENSE_PAGE_CONTROL_OFFSET 2 #define MODE_SENSE_PAGE_CONTROL_MASK 0xC0 #define MODE_SENSE_PAGE_CODE_OFFSET 2 #define MODE_SENSE_PAGE_CODE_MASK 0x3F -#define MODE_SENSE_LLBAA_OFFSET 1 #define MODE_SENSE_LLBAA_MASK 0x10 #define MODE_SENSE_LLBAA_SHIFT 4 -#define MODE_SENSE_DBD_OFFSET 1 #define MODE_SENSE_DBD_MASK 8 #define MODE_SENSE_DBD_SHIFT 3 #define MODE_SENSE10_MPH_SIZE 8 -#define MODE_SENSE10_ALLOC_LEN_OFFSET 7 -#define MODE_SELECT_CDB_PAGE_FORMAT_OFFSET 1 -#define MODE_SELECT_CDB_SAVE_PAGES_OFFSET 1 -#define MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET 4 -#define MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET 7 #define MODE_SELECT_CDB_PAGE_FORMAT_MASK 0x10 #define MODE_SELECT_CDB_SAVE_PAGES_MASK 0x1 #define MODE_SELECT_6_BD_OFFSET 3 @@ -221,14 +154,11 @@ static int sg_version_num = 30534; /* 2 digits for each component */ #define LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH 0x07 #define LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE 0x2F #define LOG_PAGE_TEMPERATURE_PAGE 0x0D -#define LOG_SENSE_CDB_SP_OFFSET 1 #define LOG_SENSE_CDB_SP_NOT_ENABLED 0 -#define LOG_SENSE_CDB_PC_OFFSET 2 #define LOG_SENSE_CDB_PC_MASK 0xC0 #define LOG_SENSE_CDB_PC_SHIFT 6 #define LOG_SENSE_CDB_PC_CUMULATIVE_VALUES 1 #define LOG_SENSE_CDB_PAGE_CODE_MASK 0x3F -#define LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET 7 #define REMAINING_INFO_EXCP_PAGE_LENGTH 0x8 #define LOG_INFO_EXCP_PAGE_LENGTH 0xC #define REMAINING_TEMP_PAGE_LENGTH 0xC @@ -278,77 +208,11 @@ static int sg_version_num = 30534; /* 2 digits for each component */ #define SCSI_ASCQ_POWER_LOSS_EXPECTED 0x08 #define SCSI_ASCQ_INVALID_LUN_ID 0x09 -/** - * DEVICE_SPECIFIC_PARAMETER in mode parameter header (see sbc2r16) to - * enable DPOFUA support type 0x10 value. 
- */ -#define DEVICE_SPECIFIC_PARAMETER 0 -#define VPD_ID_DESCRIPTOR_LENGTH sizeof(VPD_IDENTIFICATION_DESCRIPTOR) - -/* MACROs to extract information from CDBs */ - -#define GET_OPCODE(cdb) cdb[0] - -#define GET_U8_FROM_CDB(cdb, index) (cdb[index] << 0) - -#define GET_U16_FROM_CDB(cdb, index) ((cdb[index] << 8) | (cdb[index + 1] << 0)) - -#define GET_U24_FROM_CDB(cdb, index) ((cdb[index] << 16) | \ -(cdb[index + 1] << 8) | \ -(cdb[index + 2] << 0)) - -#define GET_U32_FROM_CDB(cdb, index) ((cdb[index] << 24) | \ -(cdb[index + 1] << 16) | \ -(cdb[index + 2] << 8) | \ -(cdb[index + 3] << 0)) - -#define GET_U64_FROM_CDB(cdb, index) ((((u64)cdb[index]) << 56) | \ -(((u64)cdb[index + 1]) << 48) | \ -(((u64)cdb[index + 2]) << 40) | \ -(((u64)cdb[index + 3]) << 32) | \ -(((u64)cdb[index + 4]) << 24) | \ -(((u64)cdb[index + 5]) << 16) | \ -(((u64)cdb[index + 6]) << 8) | \ -(((u64)cdb[index + 7]) << 0)) - -/* Inquiry Helper Macros */ -#define GET_INQ_EVPD_BIT(cdb) \ -((GET_U8_FROM_CDB(cdb, INQUIRY_EVPD_BYTE_OFFSET) & \ -INQUIRY_EVPD_BIT_MASK) ? 1 : 0) - -#define GET_INQ_PAGE_CODE(cdb) \ -(GET_U8_FROM_CDB(cdb, INQUIRY_PAGE_CODE_BYTE_OFFSET)) - -#define GET_INQ_ALLOC_LENGTH(cdb) \ -(GET_U16_FROM_CDB(cdb, INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET)) - -/* Report LUNs Helper Macros */ -#define GET_REPORT_LUNS_ALLOC_LENGTH(cdb) \ -(GET_U32_FROM_CDB(cdb, REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET)) - -/* Read Capacity Helper Macros */ -#define GET_READ_CAP_16_ALLOC_LENGTH(cdb) \ -(GET_U32_FROM_CDB(cdb, READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET)) - -#define IS_READ_CAP_16(cdb) \ -((cdb[0] == SERVICE_ACTION_IN_16 && cdb[1] == SAI_READ_CAPACITY_16) ? 
1 : 0) - -/* Request Sense Helper Macros */ -#define GET_REQUEST_SENSE_ALLOC_LENGTH(cdb) \ -(GET_U8_FROM_CDB(cdb, REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET)) - -/* Mode Sense Helper Macros */ -#define GET_MODE_SENSE_DBD(cdb) \ -((GET_U8_FROM_CDB(cdb, MODE_SENSE_DBD_OFFSET) & MODE_SENSE_DBD_MASK) >> \ -MODE_SENSE_DBD_SHIFT) - -#define GET_MODE_SENSE_LLBAA(cdb) \ -((GET_U8_FROM_CDB(cdb, MODE_SENSE_LLBAA_OFFSET) & \ -MODE_SENSE_LLBAA_MASK) >> MODE_SENSE_LLBAA_SHIFT) - -#define GET_MODE_SENSE_MPH_SIZE(cdb10) \ -(cdb10 ? MODE_SENSE10_MPH_SIZE : MODE_SENSE6_MPH_SIZE) - +/* copied from drivers/usb/gadget/function/storage_common.h */ +static inline u32 get_unaligned_be24(u8 *buf) +{ + return 0xffffff & (u32) get_unaligned_be32(buf - 1); +} /* Struct to gather data that needs to be extracted from a SCSI CDB. Not conforming to any particular CDB variant, but compatible with all. */ @@ -369,8 +233,6 @@ struct nvme_trans_io_cdb { static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from, unsigned long n) { - int res = SNTI_TRANSLATION_SUCCESS; - unsigned long not_copied; int i; void *index = from; size_t remaining = n; @@ -380,29 +242,25 @@ static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from, struct sg_iovec sgl; for (i = 0; i < hdr->iovec_count; i++) { - not_copied = copy_from_user(&sgl, hdr->dxferp + + if (copy_from_user(&sgl, hdr->dxferp + i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec)); - if (not_copied) + sizeof(struct sg_iovec))) return -EFAULT; xfer_len = min(remaining, sgl.iov_len); - not_copied = copy_to_user(sgl.iov_base, index, - xfer_len); - if (not_copied) { - res = -EFAULT; - break; - } + if (copy_to_user(sgl.iov_base, index, xfer_len)) + return -EFAULT; + index += xfer_len; remaining -= xfer_len; if (remaining == 0) break; } - return res; + return 0; } - not_copied = copy_to_user(hdr->dxferp, from, n); - if (not_copied) - res = -EFAULT; - return res; + + if (copy_to_user(hdr->dxferp, from, n)) + return -EFAULT; + return 0; } /* 
Copy data from userspace memory */ @@ -410,8 +268,6 @@ static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from, static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to, unsigned long n) { - int res = SNTI_TRANSLATION_SUCCESS; - unsigned long not_copied; int i; void *index = to; size_t remaining = n; @@ -421,30 +277,24 @@ static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to, struct sg_iovec sgl; for (i = 0; i < hdr->iovec_count; i++) { - not_copied = copy_from_user(&sgl, hdr->dxferp + + if (copy_from_user(&sgl, hdr->dxferp + i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec)); - if (not_copied) + sizeof(struct sg_iovec))) return -EFAULT; xfer_len = min(remaining, sgl.iov_len); - not_copied = copy_from_user(index, sgl.iov_base, - xfer_len); - if (not_copied) { - res = -EFAULT; - break; - } + if (copy_from_user(index, sgl.iov_base, xfer_len)) + return -EFAULT; index += xfer_len; remaining -= xfer_len; if (remaining == 0) break; } - return res; + return 0; } - not_copied = copy_from_user(to, hdr->dxferp, n); - if (not_copied) - res = -EFAULT; - return res; + if (copy_from_user(to, hdr->dxferp, n)) + return -EFAULT; + return 0; } /* Status/Sense Buffer Writeback */ @@ -452,7 +302,6 @@ static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to, static int nvme_trans_completion(struct sg_io_hdr *hdr, u8 status, u8 sense_key, u8 asc, u8 ascq) { - int res = SNTI_TRANSLATION_SUCCESS; u8 xfer_len; u8 resp[DESC_FMT_SENSE_DATA_SIZE]; @@ -477,25 +326,29 @@ static int nvme_trans_completion(struct sg_io_hdr *hdr, u8 status, u8 sense_key, xfer_len = min_t(u8, hdr->mx_sb_len, DESC_FMT_SENSE_DATA_SIZE); hdr->sb_len_wr = xfer_len; if (copy_to_user(hdr->sbp, resp, xfer_len) > 0) - res = -EFAULT; + return -EFAULT; } - return res; + return 0; } +/* + * Take a status code from a lowlevel routine, and if it was a positive NVMe + * error code update the sense data based on it. 
In either case the passed + * in value is returned again, unless an -EFAULT from copy_to_user overrides + * it. + */ static int nvme_trans_status_code(struct sg_io_hdr *hdr, int nvme_sc) { u8 status, sense_key, asc, ascq; - int res = SNTI_TRANSLATION_SUCCESS; + int res; /* For non-nvme (Linux) errors, simply return the error code */ if (nvme_sc < 0) return nvme_sc; /* Mask DNR, More, and reserved fields */ - nvme_sc &= 0x7FF; - - switch (nvme_sc) { + switch (nvme_sc & 0x7FF) { /* Generic Command Status */ case NVME_SC_SUCCESS: status = SAM_STAT_GOOD; @@ -662,8 +515,7 @@ static int nvme_trans_status_code(struct sg_io_hdr *hdr, int nvme_sc) } res = nvme_trans_completion(hdr, status, sense_key, asc, ascq); - - return res; + return res ? res : nvme_sc; } /* INQUIRY Helper Functions */ @@ -673,10 +525,8 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, int alloc_len) { struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - void *mem; struct nvme_id_ns *id_ns; - int res = SNTI_TRANSLATION_SUCCESS; + int res; int nvme_sc; int xfer_len; u8 resp_data_format = 0x02; @@ -684,31 +534,17 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, u8 cmdque = 0x01 << 1; u8 fw_offset = sizeof(dev->firmware_rev); - mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out_dma; - } - /* nvme ns identify - use DPS value for PROTECT field */ - nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); - /* - * If nvme_sc was -ve, res will be -ve here. - * If nvme_sc was +ve, the status would bace been translated, and res - * can only be 0 or -ve. - * - If 0 && nvme_sc > 0, then go into next if where res gets nvme_sc - * - If -ve, return because its a Linux error. - */ if (res) - goto out_free; - if (nvme_sc) { - res = nvme_sc; - goto out_free; - } - id_ns = mem; - (id_ns->dps) ? 
(protect = 0x01) : (protect = 0); + return res; + + if (id_ns->dps) + protect = 0x01; + else + protect = 0; + kfree(id_ns); memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); inq_response[2] = VERSION_SPC_4; @@ -725,20 +561,13 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, strncpy(&inq_response[32], dev->firmware_rev + fw_offset, 4); xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); - res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); - - out_free: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, - dma_addr); - out_dma: - return res; + return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); } static int nvme_trans_supported_vpd_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) { - int res = SNTI_TRANSLATION_SUCCESS; int xfer_len; memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); @@ -752,9 +581,7 @@ static int nvme_trans_supported_vpd_pages(struct nvme_ns *ns, inq_response[9] = INQ_BDEV_LIMITS_PAGE; xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); - res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); - - return res; + return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); } static int nvme_trans_unit_serial_page(struct nvme_ns *ns, @@ -762,7 +589,6 @@ static int nvme_trans_unit_serial_page(struct nvme_ns *ns, int alloc_len) { struct nvme_dev *dev = ns->dev; - int res = SNTI_TRANSLATION_SUCCESS; int xfer_len; memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); @@ -771,53 +597,42 @@ static int nvme_trans_unit_serial_page(struct nvme_ns *ns, strncpy(&inq_response[4], dev->serial, INQ_SERIAL_NUMBER_LENGTH); xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); - res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); - - return res; + return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); } static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) { struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - 
void *mem; - int res = SNTI_TRANSLATION_SUCCESS; + int res; int nvme_sc; int xfer_len; __be32 tmp_id = cpu_to_be32(ns->ns_id); - mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out_dma; - } - memset(inq_response, 0, alloc_len); inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; /* Page Code */ if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) { - struct nvme_id_ns *id_ns = mem; - void *eui = id_ns->eui64; - int len = sizeof(id_ns->eui64); + struct nvme_id_ns *id_ns; + void *eui; + int len; - nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) - goto out_free; - if (nvme_sc) { - res = nvme_sc; - goto out_free; - } + return res; + eui = id_ns->eui64; + len = sizeof(id_ns->eui64); if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) { if (bitmap_empty(eui, len * 8)) { eui = id_ns->nguid; len = sizeof(id_ns->nguid); } } - if (bitmap_empty(eui, len * 8)) + if (bitmap_empty(eui, len * 8)) { + kfree(id_ns); goto scsi_string; + } inq_response[3] = 4 + len; /* Page Length */ /* Designation Descriptor start */ @@ -826,14 +641,14 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, inq_response[6] = 0x00; /* Rsvd */ inq_response[7] = len; /* Designator Length */ memcpy(&inq_response[8], eui, len); + kfree(id_ns); } else { scsi_string: if (alloc_len < 72) { - res = nvme_trans_completion(hdr, + return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_free; } inq_response[3] = 0x48; /* Page Length */ /* Designation Descriptor start */ @@ -842,30 +657,22 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, inq_response[6] = 0x00; /* Rsvd */ inq_response[7] = 0x44; /* Designator Length */ - sprintf(&inq_response[8], "%04x", dev->pci_dev->vendor); + 
sprintf(&inq_response[8], "%04x", to_pci_dev(dev->dev)->vendor); memcpy(&inq_response[12], dev->model, sizeof(dev->model)); sprintf(&inq_response[52], "%04x", tmp_id); memcpy(&inq_response[56], dev->serial, sizeof(dev->serial)); } xfer_len = alloc_len; - res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); - - out_free: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, - dma_addr); - out_dma: - return res; + return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); } static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, int alloc_len) { u8 *inq_response; - int res = SNTI_TRANSLATION_SUCCESS; + int res; int nvme_sc; struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - void *mem; struct nvme_id_ctrl *id_ctrl; struct nvme_id_ns *id_ns; int xfer_len; @@ -878,45 +685,32 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 luiclr = 0x01; inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); - if (inq_response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out_dma; - } + if (inq_response == NULL) + return -ENOMEM; - /* nvme ns identify */ - nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) - goto out_free; - if (nvme_sc) { - res = nvme_sc; - goto out_free; - } - id_ns = mem; - spt = spt_lut[(id_ns->dpc) & 0x07] << 3; - (id_ns->dps) ? 
(protect = 0x01) : (protect = 0); + goto out_free_inq; + + spt = spt_lut[id_ns->dpc & 0x07] << 3; + if (id_ns->dps) + protect = 0x01; + else + protect = 0; + kfree(id_ns); + grd_chk = protect << 2; app_chk = protect << 1; ref_chk = protect; - /* nvme controller identify */ - nvme_sc = nvme_identify(dev, 0, 1, dma_addr); + nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); res = nvme_trans_status_code(hdr, nvme_sc); if (res) - goto out_free; - if (nvme_sc) { - res = nvme_sc; - goto out_free; - } - id_ctrl = mem; + goto out_free_inq; + v_sup = id_ctrl->vwc; + kfree(id_ctrl); memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); inq_response[1] = INQ_EXTENDED_INQUIRY_DATA_PAGE; /* Page Code */ @@ -932,12 +726,8 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); - out_free: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, - dma_addr); - out_dma: + out_free_inq: kfree(inq_response); - out_mem: return res; } @@ -965,7 +755,7 @@ static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, int alloc_len) { u8 *inq_response; - int res = SNTI_TRANSLATION_SUCCESS; + int res; int xfer_len; inq_response = kzalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); @@ -994,7 +784,7 @@ static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, int alloc_len) { - int res = SNTI_TRANSLATION_SUCCESS; + int res; int xfer_len; u8 *log_response; @@ -1022,47 +812,30 @@ static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, struct sg_io_hdr *hdr, int alloc_len) { - int res = SNTI_TRANSLATION_SUCCESS; + int res; int xfer_len; u8 *log_response; - struct nvme_command c; struct nvme_dev *dev = ns->dev; struct 
nvme_smart_log *smart_log; - dma_addr_t dma_addr; - void *mem; u8 temp_c; u16 temp_k; log_response = kzalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL); - if (log_response == NULL) { - res = -ENOMEM; - goto out_mem; - } + if (log_response == NULL) + return -ENOMEM; - mem = dma_alloc_coherent(&dev->pci_dev->dev, - sizeof(struct nvme_smart_log), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out_dma; - } + res = nvme_get_log_page(dev, &smart_log); + if (res < 0) + goto out_free_response; - /* Get SMART Log Page */ - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_admin_get_log_page; - c.common.nsid = cpu_to_le32(0xFFFFFFFF); - c.common.prp1 = cpu_to_le64(dma_addr); - c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) / - BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART); - res = nvme_submit_admin_cmd(dev, &c, NULL); if (res != NVME_SC_SUCCESS) { temp_c = LOG_TEMP_UNKNOWN; } else { - smart_log = mem; temp_k = (smart_log->temperature[1] << 8) + (smart_log->temperature[0]); temp_c = temp_k - KELVIN_TEMP_FACTOR; } + kfree(smart_log); log_response[0] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; /* Subpage=0x00, Page Length MSB=0 */ @@ -1078,59 +851,39 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, xfer_len = min(alloc_len, LOG_INFO_EXCP_PAGE_LENGTH); res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log), - mem, dma_addr); - out_dma: + out_free_response: kfree(log_response); - out_mem: return res; } static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, int alloc_len) { - int res = SNTI_TRANSLATION_SUCCESS; + int res; int xfer_len; u8 *log_response; - struct nvme_command c; struct nvme_dev *dev = ns->dev; struct nvme_smart_log *smart_log; - dma_addr_t dma_addr; - void *mem; u32 feature_resp; u8 temp_c_cur, temp_c_thresh; u16 temp_k; log_response = kzalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL); - if (log_response == NULL) { - res 
= -ENOMEM; - goto out_mem; - } + if (log_response == NULL) + return -ENOMEM; - mem = dma_alloc_coherent(&dev->pci_dev->dev, - sizeof(struct nvme_smart_log), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out_dma; - } + res = nvme_get_log_page(dev, &smart_log); + if (res < 0) + goto out_free_response; - /* Get SMART Log Page */ - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_admin_get_log_page; - c.common.nsid = cpu_to_le32(0xFFFFFFFF); - c.common.prp1 = cpu_to_le64(dma_addr); - c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) / - BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART); - res = nvme_submit_admin_cmd(dev, &c, NULL); if (res != NVME_SC_SUCCESS) { temp_c_cur = LOG_TEMP_UNKNOWN; } else { - smart_log = mem; temp_k = (smart_log->temperature[1] << 8) + (smart_log->temperature[0]); temp_c_cur = temp_k - KELVIN_TEMP_FACTOR; } + kfree(smart_log); /* Get Features for Temp Threshold */ res = nvme_get_features(dev, NVME_FEAT_TEMP_THRESH, 0, 0, @@ -1159,11 +912,8 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, xfer_len = min(alloc_len, LOG_TEMP_PAGE_LENGTH); res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log), - mem, dma_addr); - out_dma: + out_free_response: kfree(log_response); - out_mem: return res; } @@ -1174,59 +924,45 @@ static int nvme_trans_fill_mode_parm_hdr(u8 *resp, int len, u8 cdb10, u8 llbaa, { /* Quick check to make sure I don't stomp on my own memory... 
*/ if ((cdb10 && len < 8) || (!cdb10 && len < 4)) - return SNTI_INTERNAL_ERROR; + return -EINVAL; if (cdb10) { resp[0] = (mode_data_length & 0xFF00) >> 8; resp[1] = (mode_data_length & 0x00FF); - /* resp[2] and [3] are zero */ + resp[3] = 0x10 /* DPOFUA */; resp[4] = llbaa; resp[5] = RESERVED_FIELD; resp[6] = (blk_desc_len & 0xFF00) >> 8; resp[7] = (blk_desc_len & 0x00FF); } else { resp[0] = (mode_data_length & 0x00FF); - /* resp[1] and [2] are zero */ + resp[2] = 0x10 /* DPOFUA */; resp[3] = (blk_desc_len & 0x00FF); } - return SNTI_TRANSLATION_SUCCESS; + return 0; } static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *resp, int len, u8 llbaa) { - int res = SNTI_TRANSLATION_SUCCESS; + int res; int nvme_sc; struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - void *mem; struct nvme_id_ns *id_ns; u8 flbas; u32 lba_length; if (llbaa == 0 && len < MODE_PAGE_BLK_DES_LEN) - return SNTI_INTERNAL_ERROR; + return -EINVAL; else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN) - return SNTI_INTERNAL_ERROR; - - mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out; - } + return -EINVAL; - /* nvme ns identify */ - nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) - goto out_dma; - if (nvme_sc) { - res = nvme_sc; - goto out_dma; - } - id_ns = mem; + return res; + flbas = (id_ns->flbas) & 0x0F; lba_length = (1 << (id_ns->lbaf[flbas].ds)); @@ -1246,10 +982,7 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, memcpy(&resp[12], &tmp_len, sizeof(u32)); } - out_dma: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, - dma_addr); - out: + kfree(id_ns); return res; } @@ -1258,7 +991,7 @@ static int nvme_trans_fill_control_page(struct nvme_ns *ns, int len) { if (len < MODE_PAGE_CONTROL_LEN) - return 
SNTI_INTERNAL_ERROR; + return -EINVAL; resp[0] = MODE_PAGE_CONTROL; resp[1] = MODE_PAGE_CONTROL_LEN_FIELD; @@ -1272,78 +1005,69 @@ static int nvme_trans_fill_control_page(struct nvme_ns *ns, resp[9] = 0xFF; /* Bytes 10,11: Extended selftest completion time = 0x0000 */ - return SNTI_TRANSLATION_SUCCESS; + return 0; } static int nvme_trans_fill_caching_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *resp, int len) { - int res = SNTI_TRANSLATION_SUCCESS; + int res = 0; int nvme_sc; struct nvme_dev *dev = ns->dev; u32 feature_resp; u8 vwc; if (len < MODE_PAGE_CACHING_LEN) - return SNTI_INTERNAL_ERROR; + return -EINVAL; nvme_sc = nvme_get_features(dev, NVME_FEAT_VOLATILE_WC, 0, 0, &feature_resp); res = nvme_trans_status_code(hdr, nvme_sc); if (res) - goto out; - if (nvme_sc) { - res = nvme_sc; - goto out; - } + return res; + vwc = feature_resp & 0x00000001; resp[0] = MODE_PAGE_CACHING; resp[1] = MODE_PAGE_CACHING_LEN_FIELD; resp[2] = vwc << 2; - - out: - return res; + return 0; } static int nvme_trans_fill_pow_cnd_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *resp, int len) { - int res = SNTI_TRANSLATION_SUCCESS; - if (len < MODE_PAGE_POW_CND_LEN) - return SNTI_INTERNAL_ERROR; + return -EINVAL; resp[0] = MODE_PAGE_POWER_CONDITION; resp[1] = MODE_PAGE_POW_CND_LEN_FIELD; /* All other bytes are zero */ - return res; + return 0; } static int nvme_trans_fill_inf_exc_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *resp, int len) { - int res = SNTI_TRANSLATION_SUCCESS; - if (len < MODE_PAGE_INF_EXC_LEN) - return SNTI_INTERNAL_ERROR; + return -EINVAL; resp[0] = MODE_PAGE_INFO_EXCEP; resp[1] = MODE_PAGE_INF_EXC_LEN_FIELD; resp[2] = 0x88; /* All other bytes are zero */ - return res; + return 0; } static int nvme_trans_fill_all_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *resp, int len) { - int res = SNTI_TRANSLATION_SUCCESS; + int res; u16 mode_pages_offset_1 = 0; u16 mode_pages_offset_2, mode_pages_offset_3, mode_pages_offset_4; @@ -1353,23 +1077,18 @@ 
static int nvme_trans_fill_all_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, res = nvme_trans_fill_caching_page(ns, hdr, &resp[mode_pages_offset_1], MODE_PAGE_CACHING_LEN); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out; + if (res) + return res; res = nvme_trans_fill_control_page(ns, hdr, &resp[mode_pages_offset_2], MODE_PAGE_CONTROL_LEN); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out; + if (res) + return res; res = nvme_trans_fill_pow_cnd_page(ns, hdr, &resp[mode_pages_offset_3], MODE_PAGE_POW_CND_LEN); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out; - res = nvme_trans_fill_inf_exc_page(ns, hdr, &resp[mode_pages_offset_4], + if (res) + return res; + return nvme_trans_fill_inf_exc_page(ns, hdr, &resp[mode_pages_offset_4], MODE_PAGE_INF_EXC_LEN); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out; - - out: - return res; } static inline int nvme_trans_get_blk_desc_len(u8 dbd, u8 llbaa) @@ -1390,7 +1109,7 @@ static int nvme_trans_mode_page_create(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *, int), u16 mode_pages_tot_len) { - int res = SNTI_TRANSLATION_SUCCESS; + int res; int xfer_len; u8 *response; u8 dbd, llbaa; @@ -1399,9 +1118,10 @@ static int nvme_trans_mode_page_create(struct nvme_ns *ns, u16 mode_pages_offset_1; u16 blk_desc_len, blk_desc_offset, mode_data_length; - dbd = GET_MODE_SENSE_DBD(cmd); - llbaa = GET_MODE_SENSE_LLBAA(cmd); - mph_size = GET_MODE_SENSE_MPH_SIZE(cdb10); + dbd = (cmd[1] & MODE_SENSE_DBD_MASK) >> MODE_SENSE_DBD_SHIFT; + llbaa = (cmd[1] & MODE_SENSE_LLBAA_MASK) >> MODE_SENSE_LLBAA_SHIFT; + mph_size = cdb10 ? 
MODE_SENSE10_MPH_SIZE : MODE_SENSE6_MPH_SIZE; + blk_desc_len = nvme_trans_get_blk_desc_len(dbd, llbaa); resp_size = mph_size + blk_desc_len + mode_pages_tot_len; @@ -1419,18 +1139,18 @@ static int nvme_trans_mode_page_create(struct nvme_ns *ns, res = nvme_trans_fill_mode_parm_hdr(&response[0], mph_size, cdb10, llbaa, mode_data_length, blk_desc_len); - if (res != SNTI_TRANSLATION_SUCCESS) + if (res) goto out_free; if (blk_desc_len > 0) { res = nvme_trans_fill_blk_desc(ns, hdr, &response[blk_desc_offset], blk_desc_len, llbaa); - if (res != SNTI_TRANSLATION_SUCCESS) + if (res) goto out_free; } res = mode_page_fill_func(ns, hdr, &response[mode_pages_offset_1], mode_pages_tot_len); - if (res != SNTI_TRANSLATION_SUCCESS) + if (res) goto out_free; xfer_len = min(alloc_len, resp_size); @@ -1485,33 +1205,20 @@ static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns, static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 pc, u8 pcmod, u8 start) { - int res = SNTI_TRANSLATION_SUCCESS; + int res; int nvme_sc; struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - void *mem; struct nvme_id_ctrl *id_ctrl; int lowest_pow_st; /* max npss = lowest power consumption */ unsigned ps_desired = 0; - /* NVMe Controller Identify */ - mem = dma_alloc_coherent(&dev->pci_dev->dev, - sizeof(struct nvme_id_ctrl), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out; - } - nvme_sc = nvme_identify(dev, 0, 1, dma_addr); + nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); res = nvme_trans_status_code(hdr, nvme_sc); if (res) - goto out_dma; - if (nvme_sc) { - res = nvme_sc; - goto out_dma; - } - id_ctrl = mem; + return res; + lowest_pow_st = max(POWER_STATE_0, (int)(id_ctrl->npss - 1)); + kfree(id_ctrl); switch (pc) { case NVME_POWER_STATE_START_VALID: @@ -1551,79 +1258,48 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, } nvme_sc = nvme_set_features(dev, NVME_FEAT_POWER_MGMT, ps_desired, 0, NULL); - res 
= nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_dma; - if (nvme_sc) - res = nvme_sc; - out_dma: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem, - dma_addr); - out: - return res; + return nvme_trans_status_code(hdr, nvme_sc); } -/* Write Buffer Helper Functions */ -/* Also using this for Format Unit with hdr passed as NULL, and buffer_id, 0 */ +static int nvme_trans_send_activate_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 buffer_id) +{ + struct nvme_command c; + int nvme_sc; -static int nvme_trans_send_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_activate_fw; + c.common.cdw10[0] = cpu_to_le32(buffer_id | NVME_FWACT_REPL_ACTV); + + nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, NULL, 0); + return nvme_trans_status_code(hdr, nvme_sc); +} + +static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 opcode, u32 tot_len, u32 offset, u8 buffer_id) { - int res = SNTI_TRANSLATION_SUCCESS; int nvme_sc; struct nvme_dev *dev = ns->dev; struct nvme_command c; - struct nvme_iod *iod = NULL; - unsigned length; - memset(&c, 0, sizeof(c)); - c.common.opcode = opcode; - if (opcode == nvme_admin_download_fw) { - if (hdr->iovec_count > 0) { - /* Assuming SGL is not allowed for this command */ - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - iod = nvme_map_user_pages(dev, DMA_TO_DEVICE, - (unsigned long)hdr->dxferp, tot_len); - if (IS_ERR(iod)) { - res = PTR_ERR(iod); - goto out; - } - length = nvme_setup_prps(dev, iod, tot_len, GFP_KERNEL); - if (length != tot_len) { - res = -ENOMEM; - goto out_unmap; - } - - c.dlfw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - c.dlfw.prp2 = cpu_to_le64(iod->first_dma); - c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); - c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); - } else if 
(opcode == nvme_admin_activate_fw) { - u32 cdw10 = buffer_id | NVME_FWACT_REPL_ACTV; - c.common.cdw10[0] = cpu_to_le32(cdw10); + if (hdr->iovec_count > 0) { + /* Assuming SGL is not allowed for this command */ + return nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); } - nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_unmap; - if (nvme_sc) - res = nvme_sc; - - out_unmap: - if (opcode == nvme_admin_download_fw) { - nvme_unmap_user_pages(dev, DMA_TO_DEVICE, iod); - nvme_free_iod(dev, iod); - } - out: - return res; + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_download_fw; + c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); + c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); + + nvme_sc = __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, + hdr->dxferp, tot_len, NULL, 0); + return nvme_trans_status_code(hdr, nvme_sc); } /* Mode Select Helper Functions */ @@ -1686,7 +1362,7 @@ static void nvme_trans_modesel_save_bd(struct nvme_ns *ns, u8 *parm_list, static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *mode_page, u8 page_code) { - int res = SNTI_TRANSLATION_SUCCESS; + int res = 0; int nvme_sc; struct nvme_dev *dev = ns->dev; unsigned dword11; @@ -1697,12 +1373,6 @@ static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, nvme_sc = nvme_set_features(dev, NVME_FEAT_VOLATILE_WC, dword11, 0, NULL); res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - break; - if (nvme_sc) { - res = nvme_sc; - break; - } break; case MODE_PAGE_CONTROL: break; @@ -1714,8 +1384,6 @@ static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - if (!res) - res = SNTI_INTERNAL_ERROR; break; } break; @@ -1723,8 +1391,6 @@ static int nvme_trans_modesel_get_mp(struct nvme_ns 
*ns, struct sg_io_hdr *hdr, res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - if (!res) - res = SNTI_INTERNAL_ERROR; break; } @@ -1735,7 +1401,7 @@ static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd, u16 parm_list_len, u8 pf, u8 sp, u8 cdb10) { - int res = SNTI_TRANSLATION_SUCCESS; + int res; u8 *parm_list; u16 bd_len; u8 llbaa = 0; @@ -1751,7 +1417,7 @@ static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr, } res = nvme_trans_copy_from_user(hdr, parm_list, parm_list_len); - if (res != SNTI_TRANSLATION_SUCCESS) + if (res) goto out_mem; nvme_trans_modesel_get_bd_len(parm_list, cdb10, &bd_len, &llbaa); @@ -1789,7 +1455,7 @@ static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr, mp_size = parm_list[index + 1] + 2; res = nvme_trans_modesel_get_mp(ns, hdr, &parm_list[index], page_code); - if (res != SNTI_TRANSLATION_SUCCESS) + if (res) break; index += mp_size; } while (index < parm_list_len); @@ -1805,12 +1471,9 @@ static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr, static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, struct sg_io_hdr *hdr) { - int res = SNTI_TRANSLATION_SUCCESS; + int res = 0; int nvme_sc; struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - void *mem; - struct nvme_id_ns *id_ns; u8 flbas; /* @@ -1821,22 +1484,12 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, */ if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) { - mem = dma_alloc_coherent(&dev->pci_dev->dev, - sizeof(struct nvme_id_ns), &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out; - } - /* nvme ns identify */ - nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + struct nvme_id_ns *id_ns; + + nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) - goto out_dma; - if (nvme_sc) 
{ - res = nvme_sc; - goto out_dma; - } - id_ns = mem; + return res; if (ns->mode_select_num_blocks == 0) ns->mode_select_num_blocks = le64_to_cpu(id_ns->ncap); @@ -1845,18 +1498,17 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, ns->mode_select_block_len = (1 << (id_ns->lbaf[flbas].ds)); } - out_dma: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), - mem, dma_addr); + + kfree(id_ns); } - out: - return res; + + return 0; } static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len, u8 format_prot_info, u8 *nvme_pf_code) { - int res = SNTI_TRANSLATION_SUCCESS; + int res; u8 *parm_list; u8 pf_usage, pf_code; @@ -1866,7 +1518,7 @@ static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len, goto out; } res = nvme_trans_copy_from_user(hdr, parm_list, len); - if (res != SNTI_TRANSLATION_SUCCESS) + if (res) goto out_mem; if ((parm_list[FORMAT_UNIT_IMMED_OFFSET] & @@ -1916,11 +1568,9 @@ static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len, static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 prot_info) { - int res = SNTI_TRANSLATION_SUCCESS; + int res; int nvme_sc; struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - void *mem; struct nvme_id_ns *id_ns; u8 i; u8 flbas, nlbaf; @@ -1929,22 +1579,11 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, struct nvme_command c; /* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */ - mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out; - } - /* nvme ns identify */ - nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) - goto out_dma; - if (nvme_sc) { - res = nvme_sc; - goto out_dma; - } - id_ns = mem; + return res; + flbas = (id_ns->flbas) & 0x0F; nlbaf = id_ns->nlbaf; @@ -1972,69 
+1611,13 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.format.nsid = cpu_to_le32(ns->ns_id); c.format.cdw10 = cpu_to_le32(cdw10); - nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL); + nvme_sc = nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_dma; - if (nvme_sc) - res = nvme_sc; - out_dma: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, - dma_addr); - out: + kfree(id_ns); return res; } -/* Read/Write Helper Functions */ - -static inline void nvme_trans_get_io_cdb6(u8 *cmd, - struct nvme_trans_io_cdb *cdb_info) -{ - cdb_info->fua = 0; - cdb_info->prot_info = 0; - cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_6_CDB_LBA_OFFSET) & - IO_6_CDB_LBA_MASK; - cdb_info->xfer_len = GET_U8_FROM_CDB(cmd, IO_6_CDB_TX_LEN_OFFSET); - - /* sbc3r27 sec 5.32 - TRANSFER LEN of 0 implies a 256 Block transfer */ - if (cdb_info->xfer_len == 0) - cdb_info->xfer_len = IO_6_DEFAULT_TX_LEN; -} - -static inline void nvme_trans_get_io_cdb10(u8 *cmd, - struct nvme_trans_io_cdb *cdb_info) -{ - cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_10_CDB_FUA_OFFSET) & - IO_CDB_FUA_MASK; - cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_10_CDB_WP_OFFSET) & - IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT; - cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_10_CDB_LBA_OFFSET); - cdb_info->xfer_len = GET_U16_FROM_CDB(cmd, IO_10_CDB_TX_LEN_OFFSET); -} - -static inline void nvme_trans_get_io_cdb12(u8 *cmd, - struct nvme_trans_io_cdb *cdb_info) -{ - cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_12_CDB_FUA_OFFSET) & - IO_CDB_FUA_MASK; - cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_12_CDB_WP_OFFSET) & - IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT; - cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_12_CDB_LBA_OFFSET); - cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_12_CDB_TX_LEN_OFFSET); -} - -static inline void nvme_trans_get_io_cdb16(u8 *cmd, - struct nvme_trans_io_cdb *cdb_info) -{ - cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_16_CDB_FUA_OFFSET) & - 
IO_CDB_FUA_MASK; - cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_16_CDB_WP_OFFSET) & - IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT; - cdb_info->lba = GET_U64_FROM_CDB(cmd, IO_16_CDB_LBA_OFFSET); - cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_16_CDB_TX_LEN_OFFSET); -} - static inline u32 nvme_trans_io_get_num_cmds(struct sg_io_hdr *hdr, struct nvme_trans_io_cdb *cdb_info, u32 max_blocks) @@ -2064,11 +1647,8 @@ static u16 nvme_trans_io_get_control(struct nvme_ns *ns, static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, struct nvme_trans_io_cdb *cdb_info, u8 is_write) { - int res = SNTI_TRANSLATION_SUCCESS; - int nvme_sc; - struct nvme_dev *dev = ns->dev; + int nvme_sc = NVME_SC_SUCCESS; u32 num_cmds; - struct nvme_iod *iod; u64 unit_len; u64 unit_num_blocks; /* Number of blocks to xfer in each nvme cmd */ u32 retcode; @@ -2119,45 +1699,20 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, control = nvme_trans_io_get_control(ns, cdb_info); c.rw.control = cpu_to_le16(control); - iod = nvme_map_user_pages(dev, - (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, - (unsigned long)next_mapping_addr, unit_len); - if (IS_ERR(iod)) { - res = PTR_ERR(iod); - goto out; - } - retcode = nvme_setup_prps(dev, iod, unit_len, GFP_KERNEL); - if (retcode != unit_len) { - nvme_unmap_user_pages(dev, - (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, - iod); - nvme_free_iod(dev, iod); - res = -ENOMEM; - goto out; + if (get_capacity(ns->disk) - unit_num_blocks < + cdb_info->lba + nvme_offset) { + nvme_sc = NVME_SC_LBA_RANGE; + break; } - c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - c.rw.prp2 = cpu_to_le64(iod->first_dma); + nvme_sc = __nvme_submit_sync_cmd(ns->queue, &c, NULL, + next_mapping_addr, unit_len, NULL, 0); + if (nvme_sc) + break; nvme_offset += unit_num_blocks; - - nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL); - if (nvme_sc != NVME_SC_SUCCESS) { - nvme_unmap_user_pages(dev, - (is_write) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE, - iod); - nvme_free_iod(dev, iod); - res = nvme_trans_status_code(hdr, nvme_sc); - goto out; - } - nvme_unmap_user_pages(dev, - (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, - iod); - nvme_free_iod(dev, iod); } - res = nvme_trans_status_code(hdr, NVME_SC_SUCCESS); - out: - return res; + return nvme_trans_status_code(hdr, nvme_sc); } @@ -2166,8 +1721,8 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write, u8 *cmd) { - int res = SNTI_TRANSLATION_SUCCESS; - struct nvme_trans_io_cdb cdb_info; + int res = 0; + struct nvme_trans_io_cdb cdb_info = { 0, }; u8 opcode = cmd[0]; u64 xfer_bytes; u64 sum_iov_len = 0; @@ -2175,27 +1730,52 @@ static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write, int i; size_t not_copied; - /* Extract Fields from CDB */ + /* + * The FUA and WPROTECT fields are not supported in 6-byte CDBs, + * but always in the same place for all others. 
+ */ + switch (opcode) { + case WRITE_6: + case READ_6: + break; + default: + cdb_info.fua = cmd[1] & 0x8; + cdb_info.prot_info = (cmd[1] & 0xe0) >> 5; + if (cdb_info.prot_info && !ns->pi_type) { + return nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + } + switch (opcode) { case WRITE_6: case READ_6: - nvme_trans_get_io_cdb6(cmd, &cdb_info); + cdb_info.lba = get_unaligned_be24(&cmd[1]); + cdb_info.xfer_len = cmd[4]; + if (cdb_info.xfer_len == 0) + cdb_info.xfer_len = 256; break; case WRITE_10: case READ_10: - nvme_trans_get_io_cdb10(cmd, &cdb_info); + cdb_info.lba = get_unaligned_be32(&cmd[2]); + cdb_info.xfer_len = get_unaligned_be16(&cmd[7]); break; case WRITE_12: case READ_12: - nvme_trans_get_io_cdb12(cmd, &cdb_info); + cdb_info.lba = get_unaligned_be32(&cmd[2]); + cdb_info.xfer_len = get_unaligned_be32(&cmd[6]); break; case WRITE_16: case READ_16: - nvme_trans_get_io_cdb16(cmd, &cdb_info); + cdb_info.lba = get_unaligned_be64(&cmd[2]); + cdb_info.xfer_len = get_unaligned_be32(&cmd[10]); break; default: /* Will never really reach here */ - res = SNTI_INTERNAL_ERROR; + res = -EIO; goto out; } @@ -2237,7 +1817,7 @@ static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write, /* Send NVMe IO Command(s) */ res = nvme_trans_do_nvme_io(ns, hdr, &cdb_info, is_write); - if (res != SNTI_TRANSLATION_SUCCESS) + if (res) goto out; out: @@ -2247,15 +1827,15 @@ static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write, static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd) { - int res = SNTI_TRANSLATION_SUCCESS; + int res = 0; u8 evpd; u8 page_code; int alloc_len; u8 *inq_response; - evpd = GET_INQ_EVPD_BIT(cmd); - page_code = GET_INQ_PAGE_CODE(cmd); - alloc_len = GET_INQ_ALLOC_LENGTH(cmd); + evpd = cmd[1] & 0x01; + page_code = cmd[2]; + alloc_len = get_unaligned_be16(&cmd[3]); inq_response = kmalloc(max(alloc_len, 
STANDARD_INQUIRY_LENGTH), GFP_KERNEL); @@ -2316,29 +1896,27 @@ static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr, static int nvme_trans_log_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd) { - int res = SNTI_TRANSLATION_SUCCESS; + int res; u16 alloc_len; - u8 sp; u8 pc; u8 page_code; - sp = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_SP_OFFSET); - if (sp != LOG_SENSE_CDB_SP_NOT_ENABLED) { + if (cmd[1] != LOG_SENSE_CDB_SP_NOT_ENABLED) { res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); goto out; } - pc = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_PC_OFFSET); - page_code = pc & LOG_SENSE_CDB_PAGE_CODE_MASK; - pc = (pc & LOG_SENSE_CDB_PC_MASK) >> LOG_SENSE_CDB_PC_SHIFT; + + page_code = cmd[2] & LOG_SENSE_CDB_PAGE_CODE_MASK; + pc = (cmd[2] & LOG_SENSE_CDB_PC_MASK) >> LOG_SENSE_CDB_PC_SHIFT; if (pc != LOG_SENSE_CDB_PC_CUMULATIVE_VALUES) { res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); goto out; } - alloc_len = GET_U16_FROM_CDB(cmd, LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET); + alloc_len = get_unaligned_be16(&cmd[7]); switch (page_code) { case LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE: res = nvme_trans_log_supp_pages(ns, hdr, alloc_len); @@ -2363,24 +1941,18 @@ static int nvme_trans_log_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, static int nvme_trans_mode_select(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd) { - int res = SNTI_TRANSLATION_SUCCESS; u8 cdb10 = 0; u16 parm_list_len; u8 page_format; u8 save_pages; - page_format = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_PAGE_FORMAT_OFFSET); - page_format &= MODE_SELECT_CDB_PAGE_FORMAT_MASK; + page_format = cmd[1] & MODE_SELECT_CDB_PAGE_FORMAT_MASK; + save_pages = cmd[1] & MODE_SELECT_CDB_SAVE_PAGES_MASK; - save_pages = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_SAVE_PAGES_OFFSET); - save_pages &= MODE_SELECT_CDB_SAVE_PAGES_MASK; - - if (GET_OPCODE(cmd) == 
MODE_SELECT) { - parm_list_len = GET_U8_FROM_CDB(cmd, - MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET); + if (cmd[0] == MODE_SELECT) { + parm_list_len = cmd[4]; } else { - parm_list_len = GET_U16_FROM_CDB(cmd, - MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET); + parm_list_len = cmd[7]; cdb10 = 1; } @@ -2389,42 +1961,36 @@ static int nvme_trans_mode_select(struct nvme_ns *ns, struct sg_io_hdr *hdr, * According to SPC-4 r24, a paramter list length field of 0 * shall not be considered an error */ - res = nvme_trans_modesel_data(ns, hdr, cmd, parm_list_len, + return nvme_trans_modesel_data(ns, hdr, cmd, parm_list_len, page_format, save_pages, cdb10); } - return res; + return 0; } static int nvme_trans_mode_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd) { - int res = SNTI_TRANSLATION_SUCCESS; + int res = 0; u16 alloc_len; u8 cdb10 = 0; - u8 page_code; - u8 pc; - if (GET_OPCODE(cmd) == MODE_SENSE) { - alloc_len = GET_U8_FROM_CDB(cmd, MODE_SENSE6_ALLOC_LEN_OFFSET); + if (cmd[0] == MODE_SENSE) { + alloc_len = cmd[4]; } else { - alloc_len = GET_U16_FROM_CDB(cmd, - MODE_SENSE10_ALLOC_LEN_OFFSET); + alloc_len = get_unaligned_be16(&cmd[7]); cdb10 = 1; } - pc = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CONTROL_OFFSET) & - MODE_SENSE_PAGE_CONTROL_MASK; - if (pc != MODE_SENSE_PC_CURRENT_VALUES) { + if ((cmd[2] & MODE_SENSE_PAGE_CONTROL_MASK) != + MODE_SENSE_PC_CURRENT_VALUES) { res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); goto out; } - page_code = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CODE_OFFSET) & - MODE_SENSE_PAGE_CODE_MASK; - switch (page_code) { + switch (cmd[2] & MODE_SENSE_PAGE_CODE_MASK) { case MODE_PAGE_CACHING: res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, cdb10, @@ -2467,47 +2033,34 @@ static int nvme_trans_mode_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, } static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) + u8 *cmd, u8 cdb16) { 
- int res = SNTI_TRANSLATION_SUCCESS; + int res; int nvme_sc; - u32 alloc_len = READ_CAP_10_RESP_SIZE; - u32 resp_size = READ_CAP_10_RESP_SIZE; + u32 alloc_len; + u32 resp_size; u32 xfer_len; - u8 cdb16; struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - void *mem; struct nvme_id_ns *id_ns; u8 *response; - cdb16 = IS_READ_CAP_16(cmd); if (cdb16) { - alloc_len = GET_READ_CAP_16_ALLOC_LENGTH(cmd); + alloc_len = get_unaligned_be32(&cmd[10]); resp_size = READ_CAP_16_RESP_SIZE; + } else { + alloc_len = READ_CAP_10_RESP_SIZE; + resp_size = READ_CAP_10_RESP_SIZE; } - mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out; - } - /* nvme ns identify */ - nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) - goto out_dma; - if (nvme_sc) { - res = nvme_sc; - goto out_dma; - } - id_ns = mem; + return res; response = kzalloc(resp_size, GFP_KERNEL); if (response == NULL) { res = -ENOMEM; - goto out_dma; + goto out_free_id; } nvme_trans_fill_read_cap(response, id_ns, cdb16); @@ -2515,72 +2068,53 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, res = nvme_trans_copy_to_user(hdr, response, xfer_len); kfree(response); - out_dma: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, - dma_addr); - out: + out_free_id: + kfree(id_ns); return res; } static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd) { - int res = SNTI_TRANSLATION_SUCCESS; + int res; int nvme_sc; u32 alloc_len, xfer_len, resp_size; - u8 select_report; u8 *response; struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - void *mem; struct nvme_id_ctrl *id_ctrl; u32 ll_length, lun_id; u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET; __be32 tmp_len; - alloc_len = GET_REPORT_LUNS_ALLOC_LENGTH(cmd); - select_report = GET_U8_FROM_CDB(cmd, 
REPORT_LUNS_SR_OFFSET); - - if ((select_report != ALL_LUNS_RETURNED) && - (select_report != ALL_WELL_KNOWN_LUNS_RETURNED) && - (select_report != RESTRICTED_LUNS_RETURNED)) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + switch (cmd[2]) { + default: + return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } else { - /* NVMe Controller Identify */ - mem = dma_alloc_coherent(&dev->pci_dev->dev, - sizeof(struct nvme_id_ctrl), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out; - } - nvme_sc = nvme_identify(dev, 0, 1, dma_addr); + case ALL_LUNS_RETURNED: + case ALL_WELL_KNOWN_LUNS_RETURNED: + case RESTRICTED_LUNS_RETURNED: + nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); res = nvme_trans_status_code(hdr, nvme_sc); if (res) - goto out_dma; - if (nvme_sc) { - res = nvme_sc; - goto out_dma; - } - id_ctrl = mem; + return res; + ll_length = le32_to_cpu(id_ctrl->nn) * LUN_ENTRY_SIZE; resp_size = ll_length + LUN_DATA_HEADER_SIZE; + alloc_len = get_unaligned_be32(&cmd[6]); if (alloc_len < resp_size) { res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_dma; + goto out_free_id; } response = kzalloc(resp_size, GFP_KERNEL); if (response == NULL) { res = -ENOMEM; - goto out_dma; + goto out_free_id; } /* The first LUN ID will always be 0 per the SAM spec */ @@ -2601,24 +2135,21 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, res = nvme_trans_copy_to_user(hdr, response, xfer_len); kfree(response); - out_dma: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem, - dma_addr); - out: + out_free_id: + kfree(id_ctrl); return res; } static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd) { - int res = SNTI_TRANSLATION_SUCCESS; + int res; u8 alloc_len, xfer_len, resp_size; u8 desc_format; u8 
*response; - alloc_len = GET_REQUEST_SENSE_ALLOC_LENGTH(cmd); - desc_format = GET_U8_FROM_CDB(cmd, REQUEST_SENSE_DESC_OFFSET); - desc_format &= REQUEST_SENSE_DESC_MASK; + desc_format = cmd[1] & 0x01; + alloc_len = cmd[4]; resp_size = ((desc_format) ? (DESC_FMT_SENSE_DATA_SIZE) : (FIXED_FMT_SENSE_DATA_SIZE)); @@ -2628,7 +2159,7 @@ static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, goto out; } - if (desc_format == DESCRIPTOR_FORMAT_SENSE_DATA_TYPE) { + if (desc_format) { /* Descriptor Format Sense Data */ response[0] = DESC_FORMAT_SENSE_DATA; response[1] = NO_SENSE; @@ -2667,95 +2198,58 @@ static int nvme_trans_security_protocol(struct nvme_ns *ns, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); } -static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) +static int nvme_trans_synchronize_cache(struct nvme_ns *ns, + struct sg_io_hdr *hdr) { - int res = SNTI_TRANSLATION_SUCCESS; int nvme_sc; struct nvme_command c; - u8 immed, pcmod, pc, no_flush, start; - immed = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_IMMED_OFFSET); - pcmod = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET); - pc = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_OFFSET); - no_flush = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_NO_FLUSH_OFFSET); - start = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_START_OFFSET); + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_cmd_flush; + c.common.nsid = cpu_to_le32(ns->ns_id); - immed &= START_STOP_UNIT_CDB_IMMED_MASK; - pcmod &= START_STOP_UNIT_CDB_POWER_COND_MOD_MASK; - pc = (pc & START_STOP_UNIT_CDB_POWER_COND_MASK) >> NIBBLE_SHIFT; - no_flush &= START_STOP_UNIT_CDB_NO_FLUSH_MASK; - start &= START_STOP_UNIT_CDB_START_MASK; + nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, NULL, 0); + return nvme_trans_status_code(hdr, nvme_sc); +} + +static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + u8 immed, pcmod, pc, no_flush, start; + + immed = cmd[1] & 0x01; + pcmod = cmd[3] 
& 0x0f; + pc = (cmd[4] & 0xf0) >> 4; + no_flush = cmd[4] & 0x04; + start = cmd[4] & 0x01; if (immed != 0) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); } else { if (no_flush == 0) { /* Issue NVME FLUSH command prior to START STOP UNIT */ - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_cmd_flush; - c.common.nsid = cpu_to_le32(ns->ns_id); - - nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL); - res = nvme_trans_status_code(hdr, nvme_sc); + int res = nvme_trans_synchronize_cache(ns, hdr); if (res) - goto out; - if (nvme_sc) { - res = nvme_sc; - goto out; - } + return res; } /* Setup the expected power state transition */ - res = nvme_trans_power_state(ns, hdr, pc, pcmod, start); + return nvme_trans_power_state(ns, hdr, pc, pcmod, start); } - - out: - return res; -} - -static int nvme_trans_synchronize_cache(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *cmd) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int nvme_sc; - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_cmd_flush; - c.common.nsid = cpu_to_le32(ns->ns_id); - - nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL); - - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out; - if (nvme_sc) - res = nvme_sc; - - out: - return res; } static int nvme_trans_format_unit(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd) { - int res = SNTI_TRANSLATION_SUCCESS; + int res; u8 parm_hdr_len = 0; u8 nvme_pf_code = 0; u8 format_prot_info, long_list, format_data; - format_prot_info = GET_U8_FROM_CDB(cmd, - FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET); - long_list = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_LONG_LIST_OFFSET); - format_data = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET); - - format_prot_info = (format_prot_info & - FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK) >> - FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT; - long_list &= 
FORMAT_UNIT_CDB_LONG_LIST_MASK; - format_data &= FORMAT_UNIT_CDB_FORMAT_DATA_MASK; + format_prot_info = (cmd[1] & 0xc0) >> 6; + long_list = cmd[1] & 0x20; + format_data = cmd[1] & 0x10; if (format_data != 0) { if (format_prot_info != 0) { @@ -2779,16 +2273,16 @@ static int nvme_trans_format_unit(struct nvme_ns *ns, struct sg_io_hdr *hdr, if (parm_hdr_len > 0) { res = nvme_trans_fmt_get_parm_header(hdr, parm_hdr_len, format_prot_info, &nvme_pf_code); - if (res != SNTI_TRANSLATION_SUCCESS) + if (res) goto out; } /* Attempt to activate any previously downloaded firmware image */ - res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, 0, 0, 0); + res = nvme_trans_send_activate_fw_cmd(ns, hdr, 0); /* Determine Block size and count and send format command */ res = nvme_trans_fmt_set_blk_size_count(ns, hdr); - if (res != SNTI_TRANSLATION_SUCCESS) + if (res) goto out; res = nvme_trans_fmt_send_cmd(ns, hdr, nvme_pf_code); @@ -2801,28 +2295,24 @@ static int nvme_trans_test_unit_ready(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd) { - int res = SNTI_TRANSLATION_SUCCESS; struct nvme_dev *dev = ns->dev; if (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, NOT_READY, SCSI_ASC_LUN_NOT_READY, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); else - res = nvme_trans_completion(hdr, SAM_STAT_GOOD, NO_SENSE, 0, 0); - - return res; + return nvme_trans_completion(hdr, SAM_STAT_GOOD, NO_SENSE, 0, 0); } static int nvme_trans_write_buffer(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd) { - int res = SNTI_TRANSLATION_SUCCESS; + int res = 0; u32 buffer_offset, parm_list_length; u8 buffer_id, mode; - parm_list_length = - GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET); + parm_list_length = get_unaligned_be24(&cmd[6]); if (parm_list_length % BYTES_TO_DWORDS != 0) { /* NVMe expects Firmware file to be a whole number of DWORDS */ res = nvme_trans_completion(hdr, 
SAM_STAT_CHECK_CONDITION, @@ -2830,38 +2320,32 @@ static int nvme_trans_write_buffer(struct nvme_ns *ns, struct sg_io_hdr *hdr, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); goto out; } - buffer_id = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_ID_OFFSET); + buffer_id = cmd[2]; if (buffer_id > NVME_MAX_FIRMWARE_SLOT) { res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); goto out; } - mode = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_MODE_OFFSET) & - WRITE_BUFFER_CDB_MODE_MASK; - buffer_offset = - GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET); + mode = cmd[1] & 0x1f; + buffer_offset = get_unaligned_be24(&cmd[3]); switch (mode) { case DOWNLOAD_SAVE_ACTIVATE: - res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw, + res = nvme_trans_send_download_fw_cmd(ns, hdr, nvme_admin_download_fw, parm_list_length, buffer_offset, buffer_id); - if (res != SNTI_TRANSLATION_SUCCESS) + if (res) goto out; - res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, - parm_list_length, buffer_offset, - buffer_id); + res = nvme_trans_send_activate_fw_cmd(ns, hdr, buffer_id); break; case DOWNLOAD_SAVE_DEFER_ACTIVATE: - res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw, + res = nvme_trans_send_download_fw_cmd(ns, hdr, nvme_admin_download_fw, parm_list_length, buffer_offset, buffer_id); break; case ACTIVATE_DEFERRED_MICROCODE: - res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, - parm_list_length, buffer_offset, - buffer_id); + res = nvme_trans_send_activate_fw_cmd(ns, hdr, buffer_id); break; default: res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, @@ -2890,15 +2374,13 @@ struct scsi_unmap_parm_list { static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd) { - struct nvme_dev *dev = ns->dev; struct scsi_unmap_parm_list *plist; struct nvme_dsm_range *range; struct nvme_command c; - int i, nvme_sc, res = -ENOMEM; + int i, nvme_sc, res; u16 ndesc, 
list_len; - dma_addr_t dma_addr; - list_len = GET_U16_FROM_CDB(cmd, UNMAP_CDB_PARAM_LIST_LENGTH_OFFSET); + list_len = get_unaligned_be16(&cmd[7]); if (!list_len) return -EINVAL; @@ -2907,7 +2389,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, return -ENOMEM; res = nvme_trans_copy_from_user(hdr, plist, list_len); - if (res != SNTI_TRANSLATION_SUCCESS) + if (res) goto out; ndesc = be16_to_cpu(plist->unmap_blk_desc_data_len) >> 4; @@ -2916,10 +2398,11 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, goto out; } - range = dma_alloc_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range), - &dma_addr, GFP_KERNEL); - if (!range) + range = kcalloc(ndesc, sizeof(*range), GFP_KERNEL); + if (!range) { + res = -ENOMEM; goto out; + } for (i = 0; i < ndesc; i++) { range[i].nlb = cpu_to_le32(be32_to_cpu(plist->desc[i].nlb)); @@ -2930,15 +2413,14 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, memset(&c, 0, sizeof(c)); c.dsm.opcode = nvme_cmd_dsm; c.dsm.nsid = cpu_to_le32(ns->ns_id); - c.dsm.prp1 = cpu_to_le64(dma_addr); c.dsm.nr = cpu_to_le32(ndesc - 1); c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL); + nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, range, + ndesc * sizeof(*range)); res = nvme_trans_status_code(hdr, nvme_sc); - dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range), - range, dma_addr); + kfree(range); out: kfree(plist); return res; @@ -2993,13 +2475,16 @@ static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) retcode = nvme_trans_mode_sense(ns, hdr, cmd); break; case READ_CAPACITY: - retcode = nvme_trans_read_capacity(ns, hdr, cmd); + retcode = nvme_trans_read_capacity(ns, hdr, cmd, 0); break; case SERVICE_ACTION_IN_16: - if (IS_READ_CAP_16(cmd)) - retcode = nvme_trans_read_capacity(ns, hdr, cmd); - else + switch (cmd[1]) { + case SAI_READ_CAPACITY_16: + retcode = nvme_trans_read_capacity(ns, hdr, cmd, 1); + break; 
+ default: goto out; + } break; case REPORT_LUNS: retcode = nvme_trans_report_luns(ns, hdr, cmd); @@ -3015,7 +2500,7 @@ static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) retcode = nvme_trans_start_stop(ns, hdr, cmd); break; case SYNCHRONIZE_CACHE: - retcode = nvme_trans_synchronize_cache(ns, hdr, cmd); + retcode = nvme_trans_synchronize_cache(ns, hdr); break; case FORMAT_UNIT: retcode = nvme_trans_format_unit(ns, hdr, cmd); @@ -3053,15 +2538,16 @@ int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr) if (hdr.cmd_len > BLK_MAX_CDB) return -EINVAL; + /* + * A positive return code means a NVMe status, which has been + * translated to sense data. + */ retcode = nvme_scsi_translate(ns, &hdr); if (retcode < 0) return retcode; - if (retcode > 0) - retcode = SNTI_TRANSLATION_SUCCESS; if (copy_to_user(u_hdr, &hdr, sizeof(sg_io_hdr_t)) > 0) return -EFAULT; - - return retcode; + return 0; } int nvme_sg_get_version_num(int __user *ip) diff --git a/drivers/block/paride/paride.c b/drivers/block/paride/paride.c index 48c50f11f..0e287993b 100644 --- a/drivers/block/paride/paride.c +++ b/drivers/block/paride/paride.c @@ -30,6 +30,7 @@ #include <linux/wait.h> #include <linux/sched.h> /* TASK_* */ #include <linux/parport.h> +#include <linux/slab.h> #include "paride.h" @@ -244,17 +245,19 @@ void paride_unregister(PIP * pr) EXPORT_SYMBOL(paride_unregister); -static int pi_register_parport(PIA * pi, int verbose) +static int pi_register_parport(PIA *pi, int verbose, int unit) { struct parport *port; + struct pardev_cb par_cb; port = parport_find_base(pi->port); if (!port) return 0; - - pi->pardev = parport_register_device(port, - pi->device, NULL, - pi_wake_up, NULL, 0, (void *) pi); + memset(&par_cb, 0, sizeof(par_cb)); + par_cb.wakeup = pi_wake_up; + par_cb.private = (void *)pi; + pi->pardev = parport_register_dev_model(port, pi->device, &par_cb, + unit); parport_put_port(port); if (!pi->pardev) return 0; @@ -311,7 +314,7 @@ static int 
pi_probe_unit(PIA * pi, int unit, char *scratch, int verbose) e = pi->proto->max_units; } - if (!pi_register_parport(pi, verbose)) + if (!pi_register_parport(pi, verbose, s)) return 0; if (pi->proto->test_port) { @@ -432,3 +435,45 @@ int pi_init(PIA * pi, int autoprobe, int port, int mode, } EXPORT_SYMBOL(pi_init); + +static int pi_probe(struct pardevice *par_dev) +{ + struct device_driver *drv = par_dev->dev.driver; + int len = strlen(drv->name); + + if (strncmp(par_dev->name, drv->name, len)) + return -ENODEV; + + return 0; +} + +void *pi_register_driver(char *name) +{ + struct parport_driver *parp_drv; + int ret; + + parp_drv = kzalloc(sizeof(*parp_drv), GFP_KERNEL); + if (!parp_drv) + return NULL; + + parp_drv->name = name; + parp_drv->probe = pi_probe; + parp_drv->devmodel = true; + + ret = parport_register_driver(parp_drv); + if (ret) { + kfree(parp_drv); + return NULL; + } + return (void *)parp_drv; +} +EXPORT_SYMBOL(pi_register_driver); + +void pi_unregister_driver(void *_drv) +{ + struct parport_driver *drv = _drv; + + parport_unregister_driver(drv); + kfree(drv); +} +EXPORT_SYMBOL(pi_unregister_driver); diff --git a/drivers/block/paride/paride.h b/drivers/block/paride/paride.h index 2bddbf455..ddb9e589d 100644 --- a/drivers/block/paride/paride.h +++ b/drivers/block/paride/paride.h @@ -165,6 +165,8 @@ typedef struct pi_protocol PIP; extern int paride_register( PIP * ); extern void paride_unregister ( PIP * ); +void *pi_register_driver(char *); +void pi_unregister_driver(void *); #endif /* __DRIVERS_PARIDE_H__ */ /* end of paride.h */ diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 3b7c9f1be..93362362a 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -221,6 +221,7 @@ static int pcd_busy; /* request being processed ? 
*/ static int pcd_sector; /* address of next requested sector */ static int pcd_count; /* number of blocks still to do */ static char *pcd_buf; /* buffer for request in progress */ +static void *par_drv; /* reference of parport driver */ /* kernel glue structures */ @@ -690,6 +691,12 @@ static int pcd_detect(void) printk("%s: %s version %s, major %d, nice %d\n", name, name, PCD_VERSION, major, nice); + par_drv = pi_register_driver(name); + if (!par_drv) { + pr_err("failed to register %s driver\n", name); + return -1; + } + k = 0; if (pcd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */ cd = pcd; @@ -723,6 +730,7 @@ static int pcd_detect(void) printk("%s: No CD-ROM drive found\n", name); for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) put_disk(cd->disk); + pi_unregister_driver(par_drv); return -1; } @@ -984,6 +992,7 @@ static void __exit pcd_exit(void) } blk_cleanup_queue(pcd_queue); unregister_blkdev(major, name); + pi_unregister_driver(par_drv); } MODULE_LICENSE("GPL"); diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index d48715b28..b9242d782 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -247,6 +247,8 @@ static char *pd_errs[17] = { "ERR", "INDEX", "ECC", "DRQ", "SEEK", "WRERR", "IDNF", "MC", "UNC", "???", "TMO" }; +static void *par_drv; /* reference of parport driver */ + static inline int status_reg(struct pd_unit *disk) { return pi_read_regr(disk->pi, 1, 6); @@ -442,7 +444,7 @@ static char *pd_buf; /* buffer for request in progress */ static enum action do_pd_io_start(void) { - if (pd_req->cmd_type == REQ_TYPE_SPECIAL) { + if (pd_req->cmd_type == REQ_TYPE_DRV_PRIV) { phase = pd_special; return pd_special(); } @@ -725,7 +727,7 @@ static int pd_special_command(struct pd_unit *disk, if (IS_ERR(rq)) return PTR_ERR(rq); - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->special = func; err = blk_execute_rq(disk->gd->queue, disk->gd, rq, 0); @@ -872,6 +874,12 @@ static int 
pd_detect(void) pd_drive_count++; } + par_drv = pi_register_driver(name); + if (!par_drv) { + pr_err("failed to register %s driver\n", name); + return -1; + } + if (pd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */ disk = pd; if (pi_init(disk->pi, 1, -1, -1, -1, -1, -1, pd_scratch, @@ -902,8 +910,10 @@ static int pd_detect(void) found = 1; } } - if (!found) + if (!found) { printk("%s: no valid drive found\n", name); + pi_unregister_driver(par_drv); + } return found; } diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index 9a15fd3c9..7a7d977a7 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -264,6 +264,7 @@ static int pf_cmd; /* current command READ/WRITE */ static struct pf_unit *pf_current;/* unit of current request */ static int pf_mask; /* stopper for pseudo-int */ static char *pf_buf; /* buffer for request in progress */ +static void *par_drv; /* reference of parport driver */ /* kernel glue structures */ @@ -703,6 +704,11 @@ static int pf_detect(void) printk("%s: %s version %s, major %d, cluster %d, nice %d\n", name, name, PF_VERSION, major, cluster, nice); + par_drv = pi_register_driver(name); + if (!par_drv) { + pr_err("failed to register %s driver\n", name); + return -1; + } k = 0; if (pf_drive_count == 0) { if (pi_init(pf->pi, 1, -1, -1, -1, -1, -1, pf_scratch, PI_PF, @@ -735,6 +741,7 @@ static int pf_detect(void) printk("%s: No ATAPI disk detected\n", name); for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) put_disk(pf->disk); + pi_unregister_driver(par_drv); return -1; } diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c index 876d0c3ea..bfbd4c852 100644 --- a/drivers/block/paride/pg.c +++ b/drivers/block/paride/pg.c @@ -227,6 +227,7 @@ static int pg_identify(struct pg *dev, int log); static char pg_scratch[512]; /* scratch block buffer */ static struct class *pg_class; +static void *par_drv; /* reference of parport driver */ /* kernel glue structures */ @@ -481,6 +482,12 
@@ static int pg_detect(void) printk("%s: %s version %s, major %d\n", name, name, PG_VERSION, major); + par_drv = pi_register_driver(name); + if (!par_drv) { + pr_err("failed to register %s driver\n", name); + return -1; + } + k = 0; if (pg_drive_count == 0) { if (pi_init(dev->pi, 1, -1, -1, -1, -1, -1, pg_scratch, @@ -511,6 +518,7 @@ static int pg_detect(void) if (k) return 0; + pi_unregister_driver(par_drv); printk("%s: No ATAPI device detected\n", name); return -1; } diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c index 2596042eb..1740d75e8 100644 --- a/drivers/block/paride/pt.c +++ b/drivers/block/paride/pt.c @@ -232,6 +232,7 @@ static int pt_identify(struct pt_unit *tape); static struct pt_unit pt[PT_UNITS]; static char pt_scratch[512]; /* scratch block buffer */ +static void *par_drv; /* reference of parport driver */ /* kernel glue structures */ @@ -605,6 +606,12 @@ static int pt_detect(void) printk("%s: %s version %s, major %d\n", name, name, PT_VERSION, major); + par_drv = pi_register_driver(name); + if (!par_drv) { + pr_err("failed to register %s driver\n", name); + return -1; + } + specified = 0; for (unit = 0; unit < PT_UNITS; unit++) { struct pt_unit *tape = &pt[unit]; @@ -644,6 +651,7 @@ static int pt_detect(void) if (found) return 0; + pi_unregister_driver(par_drv); printk("%s: No ATAPI tape drive detected\n", name); return -1; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 09e628daf..4c20c2281 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -61,6 +61,7 @@ #include <linux/freezer.h> #include <linux/mutex.h> #include <linux/slab.h> +#include <linux/backing-dev.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_ioctl.h> #include <scsi/scsi.h> diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c deleted file mode 100644 index eabf4a8d0..000000000 --- a/drivers/block/pmem.c +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Persistent Memory Driver - * - * Copyright (c) 2014, Intel Corporation. 
- * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>. - * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - */ - -#include <asm/cacheflush.h> -#include <linux/blkdev.h> -#include <linux/hdreg.h> -#include <linux/init.h> -#include <linux/platform_device.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/slab.h> - -#define PMEM_MINORS 16 - -struct pmem_device { - struct request_queue *pmem_queue; - struct gendisk *pmem_disk; - - /* One contiguous memory region per device */ - phys_addr_t phys_addr; - void *virt_addr; - size_t size; -}; - -static int pmem_major; -static atomic_t pmem_index; - -static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, - unsigned int len, unsigned int off, int rw, - sector_t sector) -{ - void *mem = kmap_atomic(page); - size_t pmem_off = sector << 9; - - if (rw == READ) { - memcpy(mem + off, pmem->virt_addr + pmem_off, len); - flush_dcache_page(page); - } else { - flush_dcache_page(page); - memcpy(pmem->virt_addr + pmem_off, mem + off, len); - } - - kunmap_atomic(mem); -} - -static void pmem_make_request(struct request_queue *q, struct bio *bio) -{ - struct block_device *bdev = bio->bi_bdev; - struct pmem_device *pmem = bdev->bd_disk->private_data; - int rw; - struct bio_vec bvec; - sector_t sector; - struct bvec_iter iter; - int err = 0; - - if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) { - err = -EIO; - goto out; - } - - BUG_ON(bio->bi_rw & REQ_DISCARD); - - rw = bio_data_dir(bio); - sector = bio->bi_iter.bi_sector; - 
bio_for_each_segment(bvec, bio, iter) { - pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset, - rw, sector); - sector += bvec.bv_len >> 9; - } - -out: - bio_endio(bio, err); -} - -static int pmem_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, int rw) -{ - struct pmem_device *pmem = bdev->bd_disk->private_data; - - pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector); - page_endio(page, rw & WRITE, 0); - - return 0; -} - -static long pmem_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, unsigned long *pfn, long size) -{ - struct pmem_device *pmem = bdev->bd_disk->private_data; - size_t offset = sector << 9; - - if (!pmem) - return -ENODEV; - - *kaddr = pmem->virt_addr + offset; - *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT; - - return pmem->size - offset; -} - -static const struct block_device_operations pmem_fops = { - .owner = THIS_MODULE, - .rw_page = pmem_rw_page, - .direct_access = pmem_direct_access, -}; - -static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res) -{ - struct pmem_device *pmem; - struct gendisk *disk; - int idx, err; - - err = -ENOMEM; - pmem = kzalloc(sizeof(*pmem), GFP_KERNEL); - if (!pmem) - goto out; - - pmem->phys_addr = res->start; - pmem->size = resource_size(res); - - err = -EINVAL; - if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) { - dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n", &pmem->phys_addr, pmem->size); - goto out_free_dev; - } - - /* - * Map the memory as non-cachable, as we can't write back the contents - * of the CPU caches in case of a crash. 
- */ - err = -ENOMEM; - pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size); - if (!pmem->virt_addr) - goto out_release_region; - - pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL); - if (!pmem->pmem_queue) - goto out_unmap; - - blk_queue_make_request(pmem->pmem_queue, pmem_make_request); - blk_queue_max_hw_sectors(pmem->pmem_queue, 1024); - blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY); - - disk = alloc_disk(PMEM_MINORS); - if (!disk) - goto out_free_queue; - - idx = atomic_inc_return(&pmem_index) - 1; - - disk->major = pmem_major; - disk->first_minor = PMEM_MINORS * idx; - disk->fops = &pmem_fops; - disk->private_data = pmem; - disk->queue = pmem->pmem_queue; - disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "pmem%d", idx); - disk->driverfs_dev = dev; - set_capacity(disk, pmem->size >> 9); - pmem->pmem_disk = disk; - - add_disk(disk); - - return pmem; - -out_free_queue: - blk_cleanup_queue(pmem->pmem_queue); -out_unmap: - iounmap(pmem->virt_addr); -out_release_region: - release_mem_region(pmem->phys_addr, pmem->size); -out_free_dev: - kfree(pmem); -out: - return ERR_PTR(err); -} - -static void pmem_free(struct pmem_device *pmem) -{ - del_gendisk(pmem->pmem_disk); - put_disk(pmem->pmem_disk); - blk_cleanup_queue(pmem->pmem_queue); - iounmap(pmem->virt_addr); - release_mem_region(pmem->phys_addr, pmem->size); - kfree(pmem); -} - -static int pmem_probe(struct platform_device *pdev) -{ - struct pmem_device *pmem; - struct resource *res; - - if (WARN_ON(pdev->num_resources > 1)) - return -ENXIO; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENXIO; - - pmem = pmem_alloc(&pdev->dev, res); - if (IS_ERR(pmem)) - return PTR_ERR(pmem); - - platform_set_drvdata(pdev, pmem); - - return 0; -} - -static int pmem_remove(struct platform_device *pdev) -{ - struct pmem_device *pmem = platform_get_drvdata(pdev); - - pmem_free(pmem); - return 0; -} - -static struct platform_driver pmem_driver = { - .probe = pmem_probe, 
- .remove = pmem_remove, - .driver = { - .owner = THIS_MODULE, - .name = "pmem", - }, -}; - -static int __init pmem_init(void) -{ - int error; - - pmem_major = register_blkdev(0, "pmem"); - if (pmem_major < 0) - return pmem_major; - - error = platform_driver_register(&pmem_driver); - if (error) - unregister_blkdev(pmem_major, "pmem"); - return error; -} -module_init(pmem_init); - -static void pmem_exit(void) -{ - platform_driver_unregister(&pmem_driver); - unregister_blkdev(pmem_major, "pmem"); -} -module_exit(pmem_exit); - -MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>"); -MODULE_LICENSE("GPL v2"); diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index ef45cfb98..b1612eb16 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -1,5 +1,5 @@ /* - * ps3vram - Use extra PS3 video ram as MTD block device. + * ps3vram - Use extra PS3 video ram as block device. * * Copyright 2009 Sony Corporation * @@ -73,8 +73,8 @@ struct ps3vram_priv { u64 memory_handle; u64 context_handle; - u32 *ctrl; - void *reports; + u32 __iomem *ctrl; + void __iomem *reports; u8 *xdr_buf; u32 *fifo_base; @@ -104,7 +104,7 @@ static char *size = "256M"; module_param(size, charp, 0); MODULE_PARM_DESC(size, "memory size"); -static u32 *ps3vram_get_notifier(void *reports, int notifier) +static u32 __iomem *ps3vram_get_notifier(void __iomem *reports, int notifier) { return reports + DMA_NOTIFIER_OFFSET_BASE + DMA_NOTIFIER_SIZE * notifier; @@ -113,22 +113,22 @@ static u32 *ps3vram_get_notifier(void *reports, int notifier) static void ps3vram_notifier_reset(struct ps3_system_bus_device *dev) { struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); - u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER); + u32 __iomem *notify = ps3vram_get_notifier(priv->reports, NOTIFIER); int i; for (i = 0; i < 4; i++) - notify[i] = 0xffffffff; + iowrite32be(0xffffffff, notify + i); } static int ps3vram_notifier_wait(struct ps3_system_bus_device *dev, unsigned int 
timeout_ms) { struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); - u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER); + u32 __iomem *notify = ps3vram_get_notifier(priv->reports, NOTIFIER); unsigned long timeout; for (timeout = 20; timeout; timeout--) { - if (!notify[3]) + if (!ioread32be(notify + 3)) return 0; udelay(10); } @@ -136,7 +136,7 @@ static int ps3vram_notifier_wait(struct ps3_system_bus_device *dev, timeout = jiffies + msecs_to_jiffies(timeout_ms); do { - if (!notify[3]) + if (!ioread32be(notify + 3)) return 0; msleep(1); } while (time_before(jiffies, timeout)); @@ -148,8 +148,8 @@ static void ps3vram_init_ring(struct ps3_system_bus_device *dev) { struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); - priv->ctrl[CTRL_PUT] = FIFO_BASE + FIFO_OFFSET; - priv->ctrl[CTRL_GET] = FIFO_BASE + FIFO_OFFSET; + iowrite32be(FIFO_BASE + FIFO_OFFSET, priv->ctrl + CTRL_PUT); + iowrite32be(FIFO_BASE + FIFO_OFFSET, priv->ctrl + CTRL_GET); } static int ps3vram_wait_ring(struct ps3_system_bus_device *dev, @@ -159,14 +159,14 @@ static int ps3vram_wait_ring(struct ps3_system_bus_device *dev, unsigned long timeout = jiffies + msecs_to_jiffies(timeout_ms); do { - if (priv->ctrl[CTRL_PUT] == priv->ctrl[CTRL_GET]) + if (ioread32be(priv->ctrl + CTRL_PUT) == ioread32be(priv->ctrl + CTRL_GET)) return 0; msleep(1); } while (time_before(jiffies, timeout)); dev_warn(&dev->core, "FIFO timeout (%08x/%08x/%08x)\n", - priv->ctrl[CTRL_PUT], priv->ctrl[CTRL_GET], - priv->ctrl[CTRL_TOP]); + ioread32be(priv->ctrl + CTRL_PUT), ioread32be(priv->ctrl + CTRL_GET), + ioread32be(priv->ctrl + CTRL_TOP)); return -ETIMEDOUT; } @@ -189,7 +189,7 @@ static void ps3vram_rewind_ring(struct ps3_system_bus_device *dev) ps3vram_out_ring(priv, 0x20000000 | (FIFO_BASE + FIFO_OFFSET)); - priv->ctrl[CTRL_PUT] = FIFO_BASE + FIFO_OFFSET; + iowrite32be(FIFO_BASE + FIFO_OFFSET, priv->ctrl + CTRL_PUT); /* asking the HV for a blit will kick the FIFO */ status = 
lv1_gpu_fb_blit(priv->context_handle, 0, 0, 0, 0); @@ -207,8 +207,8 @@ static void ps3vram_fire_ring(struct ps3_system_bus_device *dev) mutex_lock(&ps3_gpu_mutex); - priv->ctrl[CTRL_PUT] = FIFO_BASE + FIFO_OFFSET + - (priv->fifo_ptr - priv->fifo_base) * sizeof(u32); + iowrite32be(FIFO_BASE + FIFO_OFFSET + (priv->fifo_ptr - priv->fifo_base) + * sizeof(u32), priv->ctrl + CTRL_PUT); /* asking the HV for a blit will kick the FIFO */ status = lv1_gpu_fb_blit(priv->context_handle, 0, 0, 0, 0); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 010ce0b1f..bc67a93aa 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -346,6 +346,7 @@ struct rbd_device { struct rbd_image_header header; unsigned long flags; /* possibly lock protected */ struct rbd_spec *spec; + struct rbd_options *opts; char *header_name; @@ -725,34 +726,36 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) } /* - * mount options + * (Per device) rbd map options */ enum { + Opt_queue_depth, Opt_last_int, /* int args above */ Opt_last_string, /* string args above */ Opt_read_only, Opt_read_write, - /* Boolean args above */ - Opt_last_bool, + Opt_err }; static match_table_t rbd_opts_tokens = { + {Opt_queue_depth, "queue_depth=%d"}, /* int args above */ /* string args above */ {Opt_read_only, "read_only"}, {Opt_read_only, "ro"}, /* Alternate spelling */ {Opt_read_write, "read_write"}, {Opt_read_write, "rw"}, /* Alternate spelling */ - /* Boolean args above */ - {-1, NULL} + {Opt_err, NULL} }; struct rbd_options { + int queue_depth; bool read_only; }; +#define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ #define RBD_READ_ONLY_DEFAULT false static int parse_rbd_opts_token(char *c, void *private) @@ -762,27 +765,27 @@ static int parse_rbd_opts_token(char *c, void *private) int token, intval, ret; token = match_token(c, rbd_opts_tokens, argstr); - if (token < 0) - return -EINVAL; - if (token < Opt_last_int) { ret = match_int(&argstr[0], &intval); if (ret < 0) { - pr_err("bad 
mount option arg (not int) " - "at '%s'\n", c); + pr_err("bad mount option arg (not int) at '%s'\n", c); return ret; } dout("got int token %d val %d\n", token, intval); } else if (token > Opt_last_int && token < Opt_last_string) { - dout("got string token %d val %s\n", token, - argstr[0].from); - } else if (token > Opt_last_string && token < Opt_last_bool) { - dout("got Boolean token %d\n", token); + dout("got string token %d val %s\n", token, argstr[0].from); } else { dout("got token %d\n", token); } switch (token) { + case Opt_queue_depth: + if (intval < 1) { + pr_err("queue_depth out of range\n"); + return -EINVAL; + } + rbd_opts->queue_depth = intval; + break; case Opt_read_only: rbd_opts->read_only = true; break; @@ -790,9 +793,10 @@ static int parse_rbd_opts_token(char *c, void *private) rbd_opts->read_only = false; break; default: - rbd_assert(false); - break; + /* libceph prints "bad option" msg */ + return -EINVAL; } + return 0; } @@ -1564,22 +1568,39 @@ static void rbd_obj_request_end(struct rbd_obj_request *obj_request) /* * Wait for an object request to complete. If interrupted, cancel the * underlying osd request. 
+ * + * @timeout: in jiffies, 0 means "wait forever" */ -static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) +static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request, + unsigned long timeout) { - int ret; + long ret; dout("%s %p\n", __func__, obj_request); - - ret = wait_for_completion_interruptible(&obj_request->completion); - if (ret < 0) { - dout("%s %p interrupted\n", __func__, obj_request); + ret = wait_for_completion_interruptible_timeout( + &obj_request->completion, + ceph_timeout_jiffies(timeout)); + if (ret <= 0) { + if (ret == 0) + ret = -ETIMEDOUT; rbd_obj_request_end(obj_request); - return ret; + } else { + ret = 0; } - dout("%s %p done\n", __func__, obj_request); - return 0; + dout("%s %p ret %d\n", __func__, obj_request, (int)ret); + return ret; +} + +static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) +{ + return __rbd_obj_request_wait(obj_request, 0); +} + +static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request, + unsigned long timeout) +{ + return __rbd_obj_request_wait(obj_request, timeout); } static void rbd_img_request_complete(struct rbd_img_request *img_request) @@ -2389,7 +2410,7 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, } if (opcode == CEPH_OSD_OP_DELETE) - osd_req_op_init(osd_request, num_ops, opcode); + osd_req_op_init(osd_request, num_ops, opcode, 0); else osd_req_op_extent_init(osd_request, num_ops, opcode, offset, length, 0, 0); @@ -2860,7 +2881,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) goto out; stat_request->callback = rbd_img_obj_exists_callback; - osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT); + osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0); osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, false, false); rbd_osd_req_format_read(stat_request); @@ -3134,6 +3155,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper( bool watch) { 
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct ceph_options *opts = osdc->client->options; struct rbd_obj_request *obj_request; int ret; @@ -3160,7 +3182,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper( if (ret) goto out; - ret = rbd_obj_request_wait(obj_request); + ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout); if (ret) goto out; @@ -3762,10 +3784,9 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); rbd_dev->tag_set.ops = &rbd_mq_ops; - rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ; + rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; rbd_dev->tag_set.numa_node = NUMA_NO_NODE; - rbd_dev->tag_set.flags = - BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; rbd_dev->tag_set.nr_hw_queues = 1; rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); @@ -3785,6 +3806,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) /* set io sizes to object size */ segment_size = rbd_obj_bytes(&rbd_dev->header); blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); + blk_queue_max_segments(q, segment_size / SECTOR_SIZE); blk_queue_max_segment_size(q, segment_size); blk_queue_io_min(q, segment_size); blk_queue_io_opt(q, segment_size); @@ -4056,7 +4078,8 @@ static void rbd_spec_free(struct kref *kref) } static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, - struct rbd_spec *spec) + struct rbd_spec *spec, + struct rbd_options *opts) { struct rbd_device *rbd_dev; @@ -4070,8 +4093,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, INIT_LIST_HEAD(&rbd_dev->node); init_rwsem(&rbd_dev->header_rwsem); - rbd_dev->spec = spec; rbd_dev->rbd_client = rbdc; + rbd_dev->spec = spec; + rbd_dev->opts = opts; /* Initialize the layout used for all rbd requests */ @@ -4087,6 +4111,7 @@ static void rbd_dev_destroy(struct rbd_device *rbd_dev) { 
rbd_put_client(rbd_dev->rbd_client); rbd_spec_put(rbd_dev->spec); + kfree(rbd_dev->opts); kfree(rbd_dev); } @@ -4945,6 +4970,7 @@ static int rbd_add_parse_args(const char *buf, goto out_mem; rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; + rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; copts = ceph_parse_options(options, mon_addrs, mon_addrs + mon_addrs_size - 1, @@ -4975,8 +5001,8 @@ out_err: */ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) { + struct ceph_options *opts = rbdc->client->options; u64 newest_epoch; - unsigned long timeout = rbdc->client->options->mount_timeout * HZ; int tries = 0; int ret; @@ -4991,7 +5017,8 @@ again: if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { ceph_monc_request_next_osdmap(&rbdc->client->monc); (void) ceph_monc_wait_osdmap(&rbdc->client->monc, - newest_epoch, timeout); + newest_epoch, + opts->mount_timeout); goto again; } else { /* the osdmap we have is new enough */ @@ -5160,7 +5187,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) rbdc = __rbd_get_client(rbd_dev->rbd_client); ret = -ENOMEM; - parent = rbd_dev_create(rbdc, parent_spec); + parent = rbd_dev_create(rbdc, parent_spec, NULL); if (!parent) goto out_err; @@ -5406,9 +5433,6 @@ static ssize_t do_rbd_add(struct bus_type *bus, rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); if (rc < 0) goto err_out_module; - read_only = rbd_opts->read_only; - kfree(rbd_opts); - rbd_opts = NULL; /* done with this */ rbdc = rbd_get_client(ceph_opts); if (IS_ERR(rbdc)) { @@ -5434,11 +5458,12 @@ static ssize_t do_rbd_add(struct bus_type *bus, goto err_out_client; } - rbd_dev = rbd_dev_create(rbdc, spec); + rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts); if (!rbd_dev) goto err_out_client; rbdc = NULL; /* rbd_dev now owns this */ spec = NULL; /* rbd_dev now owns this */ + rbd_opts = NULL; /* rbd_dev now owns this */ rc = rbd_dev_image_probe(rbd_dev, true); if (rc < 0) @@ -5446,6 +5471,7 @@ static ssize_t do_rbd_add(struct 
bus_type *bus, /* If we are mapping a snapshot it must be marked read-only */ + read_only = rbd_dev->opts->read_only; if (rbd_dev->spec->snap_id != CEPH_NOSNAP) read_only = true; rbd_dev->mapping.read_only = read_only; @@ -5470,6 +5496,7 @@ err_out_client: rbd_put_client(rbdc); err_out_args: rbd_spec_put(spec); + kfree(rbd_opts); err_out_module: module_put(THIS_MODULE); diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 5d552857d..59c91d49b 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -620,7 +620,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx) spin_unlock_irq(&host->lock); DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); - crq->rq->cmd_type = REQ_TYPE_SPECIAL; + crq->rq->cmd_type = REQ_TYPE_DRV_PRIV; crq->rq->special = crq; blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); @@ -661,7 +661,7 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) crq->msg_bucket = (u32) rc; DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); - crq->rq->cmd_type = REQ_TYPE_SPECIAL; + crq->rq->cmd_type = REQ_TYPE_DRV_PRIV; crq->rq->special = crq; blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 5ea2f0bbb..d4d05f064 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -124,7 +124,7 @@ static inline void virtblk_request_done(struct request *req) req->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual); req->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len); req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors); - } else if (req->cmd_type == REQ_TYPE_SPECIAL) { + } else if (req->cmd_type == REQ_TYPE_DRV_PRIV) { req->errors = (error != 0); } @@ -188,7 +188,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, vbr->out_hdr.sector = 0; vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); break; - case REQ_TYPE_SPECIAL: + case 
REQ_TYPE_DRV_PRIV: vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID); vbr->out_hdr.sector = 0; vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); @@ -251,7 +251,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) return PTR_ERR(req); } - req->cmd_type = REQ_TYPE_SPECIAL; + req->cmd_type = REQ_TYPE_DRV_PRIV; err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); blk_put_request(req); diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 713fc9ff1..954c0029f 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -84,6 +84,13 @@ MODULE_PARM_DESC(max_persistent_grants, "Maximum number of grants to map persistently"); /* + * Maximum order of pages to be used for the shared ring between front and + * backend, 4KB page granularity is used. + */ +unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER; +module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO); +MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring"); +/* * The LRU mechanism to clean the lists of persistent grants needs to * be executed periodically. The time interval between consecutive executions * of the purge mechanism is set in ms. 
@@ -362,8 +369,8 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) return; } - if (work_pending(&blkif->persistent_purge_work)) { - pr_alert_ratelimited("Scheduled work from previous purge is still pending, cannot purge list\n"); + if (work_busy(&blkif->persistent_purge_work)) { + pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n"); return; } @@ -729,7 +736,7 @@ static void xen_blkbk_unmap_and_respond(struct pending_req *req) struct grant_page **pages = req->segments; unsigned int invcount; - invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_pages, + invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_segs, req->unmap, req->unmap_pages); work->data = req; @@ -915,7 +922,7 @@ static int xen_blkbk_map_seg(struct pending_req *pending_req) int rc; rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, - pending_req->nr_pages, + pending_req->nr_segs, (pending_req->operation != BLKIF_OP_READ)); return rc; @@ -931,7 +938,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, int indirect_grefs, rc, n, nseg, i; struct blkif_request_segment *segments = NULL; - nseg = pending_req->nr_pages; + nseg = pending_req->nr_segs; indirect_grefs = INDIRECT_PAGES(nseg); BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST); @@ -1251,7 +1258,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, pending_req->id = req->u.rw.id; pending_req->operation = req_operation; pending_req->status = BLKIF_RSP_OKAY; - pending_req->nr_pages = nseg; + pending_req->nr_segs = nseg; if (req->operation != BLKIF_OP_INDIRECT) { preq.dev = req->u.rw.handle; @@ -1372,7 +1379,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, fail_flush: xen_blkbk_unmap(blkif, pending_req->segments, - pending_req->nr_pages); + pending_req->nr_segs); fail_response: /* Haven't submitted any bio's yet. 
*/ make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); @@ -1438,6 +1445,12 @@ static int __init xen_blkif_init(void) if (!xen_domain()) return -ENODEV; + if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) { + pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", + xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER); + xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER; + } + rc = xen_blkif_interface_init(); if (rc) goto failed_init; diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index f620b5d3f..45a044a53 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -44,6 +44,7 @@ #include <xen/interface/io/blkif.h> #include <xen/interface/io/protocols.h> +extern unsigned int xen_blkif_max_ring_order; /* * This is the maximum number of segments that would be allowed in indirect * requests. This value will also be passed to the frontend. @@ -248,7 +249,7 @@ struct backend_info; #define PERSISTENT_GNT_WAS_ACTIVE 1 /* Number of requests that we can fit in a ring */ -#define XEN_BLKIF_REQS 32 +#define XEN_BLKIF_REQS_PER_PAGE 32 struct persistent_gnt { struct page *page; @@ -320,6 +321,7 @@ struct xen_blkif { struct work_struct free_work; /* Thread shutdown wait queue. */ wait_queue_head_t shutdown_wq; + unsigned int nr_ring_pages; }; struct seg_buf { @@ -343,7 +345,7 @@ struct grant_page { struct pending_req { struct xen_blkif *blkif; u64 id; - int nr_pages; + int nr_segs; atomic_t pendcnt; unsigned short operation; int status; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 6ab69ad61..deb3f0017 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -25,6 +25,7 @@ /* Enlarge the array size in order to fully show blkback name. 
*/ #define BLKBACK_NAME_LEN (20) +#define RINGREF_NAME_LEN (20) struct backend_info { struct xenbus_device *dev; @@ -124,8 +125,6 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) static struct xen_blkif *xen_blkif_alloc(domid_t domid) { struct xen_blkif *blkif; - struct pending_req *req, *n; - int i, j; BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST); @@ -151,55 +150,15 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) INIT_LIST_HEAD(&blkif->pending_free); INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); - - for (i = 0; i < XEN_BLKIF_REQS; i++) { - req = kzalloc(sizeof(*req), GFP_KERNEL); - if (!req) - goto fail; - list_add_tail(&req->free_list, - &blkif->pending_free); - for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { - req->segments[j] = kzalloc(sizeof(*req->segments[0]), - GFP_KERNEL); - if (!req->segments[j]) - goto fail; - } - for (j = 0; j < MAX_INDIRECT_PAGES; j++) { - req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]), - GFP_KERNEL); - if (!req->indirect_pages[j]) - goto fail; - } - } spin_lock_init(&blkif->pending_free_lock); init_waitqueue_head(&blkif->pending_free_wq); init_waitqueue_head(&blkif->shutdown_wq); return blkif; - -fail: - list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { - list_del(&req->free_list); - for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { - if (!req->segments[j]) - break; - kfree(req->segments[j]); - } - for (j = 0; j < MAX_INDIRECT_PAGES; j++) { - if (!req->indirect_pages[j]) - break; - kfree(req->indirect_pages[j]); - } - kfree(req); - } - - kmem_cache_free(xen_blkif_cachep, blkif); - - return ERR_PTR(-ENOMEM); } -static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref, - unsigned int evtchn) +static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, + unsigned int nr_grefs, unsigned int evtchn) { int err; @@ -207,7 +166,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref, if (blkif->irq) return 0; - err = 
xenbus_map_ring_valloc(blkif->be->dev, &gref, 1, + err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs, &blkif->blk_ring); if (err < 0) return err; @@ -217,21 +176,21 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref, { struct blkif_sring *sring; sring = (struct blkif_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); + BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_32: { struct blkif_x86_32_sring *sring_x86_32; sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE); + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_64: { struct blkif_x86_64_sring *sring_x86_64; sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE * nr_grefs); break; } default: @@ -312,7 +271,7 @@ static void xen_blkif_free(struct xen_blkif *blkif) i++; } - WARN_ON(i != XEN_BLKIF_REQS); + WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); kmem_cache_free(xen_blkif_cachep, blkif); } @@ -597,6 +556,11 @@ static int xen_blkbk_probe(struct xenbus_device *dev, if (err) goto fail; + err = xenbus_printf(XBT_NIL, dev->nodename, "max-ring-page-order", "%u", + xen_blkif_max_ring_order); + if (err) + pr_warn("%s write out 'max-ring-page-order' failed\n", __func__); + err = xenbus_switch_state(dev, XenbusStateInitWait); if (err) goto fail; @@ -860,22 +824,66 @@ again: static int connect_ring(struct backend_info *be) { struct xenbus_device *dev = be->dev; - unsigned long ring_ref; - unsigned int evtchn; + unsigned int ring_ref[XENBUS_MAX_RING_PAGES]; + unsigned int evtchn, nr_grefs, ring_page_order; unsigned int pers_grants; char protocol[64] = ""; - int err; + struct pending_req *req, *n; + int err, i, 
j; pr_debug("%s %s\n", __func__, dev->otherend); - err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", - &ring_ref, "event-channel", "%u", &evtchn, NULL); - if (err) { - xenbus_dev_fatal(dev, err, - "reading %s/ring-ref and event-channel", + err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u", + &evtchn); + if (err != 1) { + err = -EINVAL; + xenbus_dev_fatal(dev, err, "reading %s/event-channel", dev->otherend); return err; } + pr_info("event-channel %u\n", evtchn); + + err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u", + &ring_page_order); + if (err != 1) { + err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", + "%u", &ring_ref[0]); + if (err != 1) { + err = -EINVAL; + xenbus_dev_fatal(dev, err, "reading %s/ring-ref", + dev->otherend); + return err; + } + nr_grefs = 1; + pr_info("%s:using single page: ring-ref %d\n", dev->otherend, + ring_ref[0]); + } else { + unsigned int i; + + if (ring_page_order > xen_blkif_max_ring_order) { + err = -EINVAL; + xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d", + dev->otherend, ring_page_order, + xen_blkif_max_ring_order); + return err; + } + + nr_grefs = 1 << ring_page_order; + for (i = 0; i < nr_grefs; i++) { + char ring_ref_name[RINGREF_NAME_LEN]; + + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); + err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name, + "%u", &ring_ref[i]); + if (err != 1) { + err = -EINVAL; + xenbus_dev_fatal(dev, err, "reading %s/%s", + dev->otherend, ring_ref_name); + return err; + } + pr_info("ring-ref%u: %u\n", i, ring_ref[i]); + } + } be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", @@ -900,20 +908,55 @@ static int connect_ring(struct backend_info *be) be->blkif->vbd.feature_gnt_persistent = pers_grants; be->blkif->vbd.overflow_max_grants = 0; + be->blkif->nr_ring_pages = nr_grefs; - pr_info("ring-ref %ld, event-channel %d, protocol %d (%s) %s\n", - ring_ref, 
evtchn, be->blkif->blk_protocol, protocol, + pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n", + nr_grefs, evtchn, be->blkif->blk_protocol, protocol, pers_grants ? "persistent grants" : ""); + for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) { + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + goto fail; + list_add_tail(&req->free_list, &be->blkif->pending_free); + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { + req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL); + if (!req->segments[j]) + goto fail; + } + for (j = 0; j < MAX_INDIRECT_PAGES; j++) { + req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]), + GFP_KERNEL); + if (!req->indirect_pages[j]) + goto fail; + } + } + /* Map the shared frame, irq etc. */ - err = xen_blkif_map(be->blkif, ring_ref, evtchn); + err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn); if (err) { - xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u", - ring_ref, evtchn); + xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn); return err; } return 0; + +fail: + list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) { + list_del(&req->free_list); + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { + if (!req->segments[j]) + break; + kfree(req->segments[j]); + } + for (j = 0; j < MAX_INDIRECT_PAGES; j++) { + if (!req->indirect_pages[j]) + break; + kfree(req->indirect_pages[j]); + } + kfree(req); + } + return -ENOMEM; } static const struct xenbus_device_id xen_blkbk_ids[] = { diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2c61cf8c6..7a8a73f1f 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -98,7 +98,21 @@ static unsigned int xen_blkif_max_segments = 32; module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); -#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) +/* + * Maximum order of 
pages to be used for the shared ring between front and + * backend, 4KB page granularity is used. + */ +static unsigned int xen_blkif_max_ring_order; +module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO); +MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring"); + +#define BLK_RING_SIZE(info) __CONST_RING_SIZE(blkif, PAGE_SIZE * (info)->nr_ring_pages) +#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * XENBUS_MAX_RING_PAGES) +/* + * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 + * characters are enough. Define to 20 to keep it consistent with the backend. + */ +#define RINGREF_NAME_LEN (20) /* * We have one of these per vbd, whether ide, scsi or 'other'. They @@ -114,13 +128,14 @@ struct blkfront_info int vdevice; blkif_vdev_t handle; enum blkif_state connected; - int ring_ref; + int ring_ref[XENBUS_MAX_RING_PAGES]; + unsigned int nr_ring_pages; struct blkif_front_ring ring; unsigned int evtchn, irq; struct request_queue *rq; struct work_struct work; struct gnttab_free_callback callback; - struct blk_shadow shadow[BLK_RING_SIZE]; + struct blk_shadow shadow[BLK_MAX_RING_SIZE]; struct list_head grants; struct list_head indirect_pages; unsigned int persistent_gnts_c; @@ -139,8 +154,6 @@ static unsigned int nr_minors; static unsigned long *minors; static DEFINE_SPINLOCK(minor_lock); -#define MAXIMUM_OUTSTANDING_BLOCK_REQS \ - (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE) #define GRANT_INVALID_REF 0 #define PARTS_PER_DISK 16 @@ -166,11 +179,12 @@ static DEFINE_SPINLOCK(minor_lock); ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) static int blkfront_setup_indirect(struct blkfront_info *info); +static int blkfront_gather_backend_features(struct blkfront_info *info); static int get_id_from_freelist(struct blkfront_info *info) { unsigned long free = info->shadow_free; - BUG_ON(free >= BLK_RING_SIZE); + BUG_ON(free >= BLK_RING_SIZE(info)); info->shadow_free = 
info->shadow[free].req.u.rw.id; info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ return free; @@ -983,7 +997,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) } } - for (i = 0; i < BLK_RING_SIZE; i++) { + for (i = 0; i < BLK_RING_SIZE(info); i++) { /* * Clear persistent grants present in requests already * on the shared ring @@ -1033,12 +1047,15 @@ free_shadow: flush_work(&info->work); /* Free resources associated with old device channel. */ - if (info->ring_ref != GRANT_INVALID_REF) { - gnttab_end_foreign_access(info->ring_ref, 0, - (unsigned long)info->ring.sring); - info->ring_ref = GRANT_INVALID_REF; - info->ring.sring = NULL; + for (i = 0; i < info->nr_ring_pages; i++) { + if (info->ring_ref[i] != GRANT_INVALID_REF) { + gnttab_end_foreign_access(info->ring_ref[i], 0, 0); + info->ring_ref[i] = GRANT_INVALID_REF; + } } + free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); + info->ring.sring = NULL; + if (info->irq) unbind_from_irqhandler(info->irq, info); info->evtchn = info->irq = 0; @@ -1058,12 +1075,6 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { - /* - * Copy the data received from the backend into the bvec. - * Since bv_offset can be different than 0, and bv_len different - * than PAGE_SIZE, we have to keep track of the current offset, - * to be sure we are copying the data from the right shared page. - */ for_each_sg(s->sg, sg, nseg, i) { BUG_ON(sg->offset + sg->length > PAGE_SIZE); shared_data = kmap_atomic( @@ -1118,8 +1129,10 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, * Add the used indirect page back to the list of * available pages for indirect grefs. 
*/ - indirect_page = pfn_to_page(s->indirect_grants[i]->pfn); - list_add(&indirect_page->lru, &info->indirect_pages); + if (!info->feature_persistent) { + indirect_page = pfn_to_page(s->indirect_grants[i]->pfn); + list_add(&indirect_page->lru, &info->indirect_pages); + } s->indirect_grants[i]->gref = GRANT_INVALID_REF; list_add_tail(&s->indirect_grants[i]->node, &info->grants); } @@ -1157,7 +1170,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) * never have given to it (we stamp it up to BLK_RING_SIZE - * look in get_id_from_freelist. */ - if (id >= BLK_RING_SIZE) { + if (id >= BLK_RING_SIZE(info)) { WARN(1, "%s: response to %s has incorrect id (%ld)\n", info->gd->disk_name, op_name(bret->operation), id); /* We can't safely get the 'struct request' as @@ -1245,26 +1258,30 @@ static int setup_blkring(struct xenbus_device *dev, struct blkfront_info *info) { struct blkif_sring *sring; - grant_ref_t gref; - int err; + int err, i; + unsigned long ring_size = info->nr_ring_pages * PAGE_SIZE; + grant_ref_t gref[XENBUS_MAX_RING_PAGES]; - info->ring_ref = GRANT_INVALID_REF; + for (i = 0; i < info->nr_ring_pages; i++) + info->ring_ref[i] = GRANT_INVALID_REF; - sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH); + sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, + get_order(ring_size)); if (!sring) { xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); return -ENOMEM; } SHARED_RING_INIT(sring); - FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); + FRONT_RING_INIT(&info->ring, sring, ring_size); - err = xenbus_grant_ring(dev, info->ring.sring, 1, &gref); + err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref); if (err < 0) { - free_page((unsigned long)sring); + free_pages((unsigned long)sring, get_order(ring_size)); info->ring.sring = NULL; goto fail; } - info->ring_ref = gref; + for (i = 0; i < info->nr_ring_pages; i++) + info->ring_ref[i] = gref[i]; err = xenbus_alloc_evtchn(dev, &info->evtchn); if 
(err) @@ -1292,7 +1309,18 @@ static int talk_to_blkback(struct xenbus_device *dev, { const char *message = NULL; struct xenbus_transaction xbt; - int err; + int err, i; + unsigned int max_page_order = 0; + unsigned int ring_page_order = 0; + + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "max-ring-page-order", "%u", &max_page_order); + if (err != 1) + info->nr_ring_pages = 1; + else { + ring_page_order = min(xen_blkif_max_ring_order, max_page_order); + info->nr_ring_pages = 1 << ring_page_order; + } /* Create shared ring, alloc event channel. */ err = setup_blkring(dev, info); @@ -1306,11 +1334,32 @@ again: goto destroy_blkring; } - err = xenbus_printf(xbt, dev->nodename, - "ring-ref", "%u", info->ring_ref); - if (err) { - message = "writing ring-ref"; - goto abort_transaction; + if (info->nr_ring_pages == 1) { + err = xenbus_printf(xbt, dev->nodename, + "ring-ref", "%u", info->ring_ref[0]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } else { + err = xenbus_printf(xbt, dev->nodename, + "ring-page-order", "%u", ring_page_order); + if (err) { + message = "writing ring-page-order"; + goto abort_transaction; + } + + for (i = 0; i < info->nr_ring_pages; i++) { + char ring_ref_name[RINGREF_NAME_LEN]; + + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); + err = xenbus_printf(xbt, dev->nodename, ring_ref_name, + "%u", info->ring_ref[i]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } } err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", info->evtchn); @@ -1338,6 +1387,9 @@ again: goto destroy_blkring; } + for (i = 0; i < BLK_RING_SIZE(info); i++) + info->shadow[i].req.u.rw.id = i+1; + info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; xenbus_switch_state(dev, XenbusStateInitialised); return 0; @@ -1361,7 +1413,7 @@ again: static int blkfront_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) { - int err, vdevice, i; + int err, vdevice; struct 
blkfront_info *info; /* FIXME: Use dynamic device id if this is not set. */ @@ -1422,21 +1474,10 @@ static int blkfront_probe(struct xenbus_device *dev, info->connected = BLKIF_STATE_DISCONNECTED; INIT_WORK(&info->work, blkif_restart_queue); - for (i = 0; i < BLK_RING_SIZE; i++) - info->shadow[i].req.u.rw.id = i+1; - info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; - /* Front end dir is a number, which is used as the id. */ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); dev_set_drvdata(&dev->dev, info); - err = talk_to_blkback(dev, info); - if (err) { - kfree(info); - dev_set_drvdata(&dev->dev, NULL); - return err; - } - return 0; } @@ -1476,12 +1517,12 @@ static int blkif_recover(struct blkfront_info *info) /* Stage 2: Set up free list. */ memset(&info->shadow, 0, sizeof(info->shadow)); - for (i = 0; i < BLK_RING_SIZE; i++) + for (i = 0; i < BLK_RING_SIZE(info); i++) info->shadow[i].req.u.rw.id = i+1; info->shadow_free = info->ring.req_prod_pvt; - info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; + info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; - rc = blkfront_setup_indirect(info); + rc = blkfront_gather_backend_features(info); if (rc) { kfree(copy); return rc; @@ -1491,7 +1532,7 @@ static int blkif_recover(struct blkfront_info *info) blk_queue_max_segments(info->rq, segs); bio_list_init(&bio_list); INIT_LIST_HEAD(&requests); - for (i = 0; i < BLK_RING_SIZE; i++) { + for (i = 0; i < BLK_RING_SIZE(info); i++) { /* Not in use? 
*/ if (!copy[i].request) continue; @@ -1682,22 +1723,15 @@ static void blkfront_setup_discard(struct blkfront_info *info) static int blkfront_setup_indirect(struct blkfront_info *info) { - unsigned int indirect_segments, segs; + unsigned int segs; int err, i; - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-max-indirect-segments", "%u", &indirect_segments, - NULL); - if (err) { - info->max_indirect_segments = 0; + if (info->max_indirect_segments == 0) segs = BLKIF_MAX_SEGMENTS_PER_REQUEST; - } else { - info->max_indirect_segments = min(indirect_segments, - xen_blkif_max_segments); + else segs = info->max_indirect_segments; - } - err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE); + err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE(info)); if (err) goto out_of_memory; @@ -1707,7 +1741,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info) * grants, we need to allocate a set of pages that can be * used for mapping indirect grefs */ - int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE; + int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE(info); BUG_ON(!list_empty(&info->indirect_pages)); for (i = 0; i < num; i++) { @@ -1718,7 +1752,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info) } } - for (i = 0; i < BLK_RING_SIZE; i++) { + for (i = 0; i < BLK_RING_SIZE(info); i++) { info->shadow[i].grants_used = kzalloc( sizeof(info->shadow[i].grants_used[0]) * segs, GFP_NOIO); @@ -1740,7 +1774,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info) return 0; out_of_memory: - for (i = 0; i < BLK_RING_SIZE; i++) { + for (i = 0; i < BLK_RING_SIZE(info); i++) { kfree(info->shadow[i].grants_used); info->shadow[i].grants_used = NULL; kfree(info->shadow[i].sg); @@ -1759,6 +1793,68 @@ out_of_memory: } /* + * Gather all backend feature-* + */ +static int blkfront_gather_backend_features(struct blkfront_info *info) +{ + int err; + int barrier, flush, discard, persistent; + unsigned int 
indirect_segments; + + info->feature_flush = 0; + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-barrier", "%d", &barrier, + NULL); + + /* + * If there's no "feature-barrier" defined, then it means + * we're dealing with a very old backend which writes + * synchronously; nothing to do. + * + * If there are barriers, then we use flush. + */ + if (!err && barrier) + info->feature_flush = REQ_FLUSH | REQ_FUA; + /* + * And if there is "feature-flush-cache" use that above + * barriers. + */ + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-flush-cache", "%d", &flush, + NULL); + + if (!err && flush) + info->feature_flush = REQ_FLUSH; + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-discard", "%d", &discard, + NULL); + + if (!err && discard) + blkfront_setup_discard(info); + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-persistent", "%u", &persistent, + NULL); + if (err) + info->feature_persistent = 0; + else + info->feature_persistent = persistent; + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-max-indirect-segments", "%u", &indirect_segments, + NULL); + if (err) + info->max_indirect_segments = 0; + else + info->max_indirect_segments = min(indirect_segments, + xen_blkif_max_segments); + + return blkfront_setup_indirect(info); +} + +/* * Invoked when the backend is finally 'ready' (and has told produced * the details about the physical device - #sectors, size, etc). 
*/ @@ -1769,7 +1865,6 @@ static void blkfront_connect(struct blkfront_info *info) unsigned int physical_sector_size; unsigned int binfo; int err; - int barrier, flush, discard, persistent; switch (info->connected) { case BLKIF_STATE_CONNECTED: @@ -1826,48 +1921,7 @@ static void blkfront_connect(struct blkfront_info *info) if (err != 1) physical_sector_size = sector_size; - info->feature_flush = 0; - - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-barrier", "%d", &barrier, - NULL); - - /* - * If there's no "feature-barrier" defined, then it means - * we're dealing with a very old backend which writes - * synchronously; nothing to do. - * - * If there are barriers, then we use flush. - */ - if (!err && barrier) - info->feature_flush = REQ_FLUSH | REQ_FUA; - /* - * And if there is "feature-flush-cache" use that above - * barriers. - */ - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-flush-cache", "%d", &flush, - NULL); - - if (!err && flush) - info->feature_flush = REQ_FLUSH; - - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-discard", "%d", &discard, - NULL); - - if (!err && discard) - blkfront_setup_discard(info); - - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-persistent", "%u", &persistent, - NULL); - if (err) - info->feature_persistent = 0; - else - info->feature_persistent = persistent; - - err = blkfront_setup_indirect(info); + err = blkfront_gather_backend_features(info); if (err) { xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", info->xbdev->otherend); @@ -1906,8 +1960,15 @@ static void blkback_changed(struct xenbus_device *dev, dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state); switch (backend_state) { - case XenbusStateInitialising: case XenbusStateInitWait: + if (dev->state != XenbusStateInitialising) + break; + if (talk_to_blkback(dev, info)) { + kfree(info); + dev_set_drvdata(&dev->dev, NULL); + break; + } + case XenbusStateInitialising: case 
XenbusStateInitialised: case XenbusStateReconfiguring: case XenbusStateReconfigured: @@ -2091,6 +2152,12 @@ static int __init xlblk_init(void) if (!xen_domain()) return -ENODEV; + if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) { + pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", + xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER); + xen_blkif_max_ring_order = 0; + } + if (!xen_has_pv_disk_devices()) return -ENODEV; diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 6489c0fd0..386ba3d1a 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -23,12 +23,4 @@ config ZRAM_LZ4_COMPRESS default n help This option enables LZ4 compression algorithm support. Compression - algorithm can be changed using `comp_algorithm' device attribute. - -config ZRAM_DEBUG - bool "Compressed RAM block device debug support" - depends on ZRAM - default n - help - This option adds additional debugging code to the compressed - RAM block device driver. + algorithm can be changed using `comp_algorithm' device attribute.
\ No newline at end of file diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index f1ff39a3d..965d1afb0 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -274,7 +274,7 @@ ssize_t zcomp_available_show(const char *comp, char *buf) int i = 0; while (backends[i]) { - if (sysfs_streq(comp, backends[i]->name)) + if (!strcmp(comp, backends[i]->name)) sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, "[%s] ", backends[i]->name); else @@ -286,6 +286,11 @@ ssize_t zcomp_available_show(const char *comp, char *buf) return sz; } +bool zcomp_available_algorithm(const char *comp) +{ + return find_backend(comp) != NULL; +} + bool zcomp_set_max_streams(struct zcomp *comp, int num_strm) { return comp->set_max_streams(comp, num_strm); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index c59d1fca7..46e2b9f8f 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -51,6 +51,7 @@ struct zcomp { }; ssize_t zcomp_available_show(const char *comp, char *buf); +bool zcomp_available_algorithm(const char *comp); struct zcomp *zcomp_create(const char *comp, int max_strm); void zcomp_destroy(struct zcomp *comp); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6e134f475..763301c78 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -15,10 +15,6 @@ #define KMSG_COMPONENT "zram" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt -#ifdef CONFIG_ZRAM_DEBUG -#define DEBUG -#endif - #include <linux/module.h> #include <linux/kernel.h> #include <linux/bio.h> @@ -32,12 +28,16 @@ #include <linux/string.h> #include <linux/vmalloc.h> #include <linux/err.h> +#include <linux/idr.h> +#include <linux/sysfs.h> #include "zram_drv.h" -/* Globals */ +static DEFINE_IDR(zram_index_idr); +/* idr index must be protected */ +static DEFINE_MUTEX(zram_index_mutex); + static int zram_major; -static struct zram *zram_devices; static const char *default_compressor = "lzo"; /* Module 
params (documentation at end) */ @@ -53,7 +53,7 @@ static inline void deprecated_attr_warn(const char *name) } #define ZRAM_ATTR_RO(name) \ -static ssize_t name##_show(struct device *d, \ +static ssize_t name##_show(struct device *d, \ struct device_attribute *attr, char *b) \ { \ struct zram *zram = dev_to_zram(d); \ @@ -74,33 +74,117 @@ static inline struct zram *dev_to_zram(struct device *dev) return (struct zram *)dev_to_disk(dev)->private_data; } -static ssize_t compact_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) +/* flag operations require table entry bit_spin_lock() being held */ +static int zram_test_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) { - unsigned long nr_migrated; - struct zram *zram = dev_to_zram(dev); - struct zram_meta *meta; + return meta->table[index].value & BIT(flag); +} - down_read(&zram->init_lock); - if (!init_done(zram)) { - up_read(&zram->init_lock); - return -EINVAL; - } +static void zram_set_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) +{ + meta->table[index].value |= BIT(flag); +} - meta = zram->meta; - nr_migrated = zs_compact(meta->mem_pool); - atomic64_add(nr_migrated, &zram->stats.num_migrated); - up_read(&zram->init_lock); +static void zram_clear_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) +{ + meta->table[index].value &= ~BIT(flag); +} - return len; +static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) +{ + return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); } -static ssize_t disksize_show(struct device *dev, - struct device_attribute *attr, char *buf) +static void zram_set_obj_size(struct zram_meta *meta, + u32 index, size_t size) { - struct zram *zram = dev_to_zram(dev); + unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; - return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); + meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; +} + +static inline int 
is_partial_io(struct bio_vec *bvec) +{ + return bvec->bv_len != PAGE_SIZE; +} + +/* + * Check if request is within bounds and aligned on zram logical blocks. + */ +static inline int valid_io_request(struct zram *zram, + sector_t start, unsigned int size) +{ + u64 end, bound; + + /* unaligned request */ + if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) + return 0; + if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) + return 0; + + end = start + (size >> SECTOR_SHIFT); + bound = zram->disksize >> SECTOR_SHIFT; + /* out of range range */ + if (unlikely(start >= bound || end > bound || start > end)) + return 0; + + /* I/O request is valid */ + return 1; +} + +static void update_position(u32 *index, int *offset, struct bio_vec *bvec) +{ + if (*offset + bvec->bv_len >= PAGE_SIZE) + (*index)++; + *offset = (*offset + bvec->bv_len) % PAGE_SIZE; +} + +static inline void update_used_max(struct zram *zram, + const unsigned long pages) +{ + unsigned long old_max, cur_max; + + old_max = atomic_long_read(&zram->stats.max_used_pages); + + do { + cur_max = old_max; + if (pages > cur_max) + old_max = atomic_long_cmpxchg( + &zram->stats.max_used_pages, cur_max, pages); + } while (old_max != cur_max); +} + +static int page_zero_filled(void *ptr) +{ + unsigned int pos; + unsigned long *page; + + page = (unsigned long *)ptr; + + for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { + if (page[pos]) + return 0; + } + + return 1; +} + +static void handle_zero_page(struct bio_vec *bvec) +{ + struct page *page = bvec->bv_page; + void *user_mem; + + user_mem = kmap_atomic(page); + if (is_partial_io(bvec)) + memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); + else + clear_page(user_mem); + kunmap_atomic(user_mem); + + flush_dcache_page(page); } static ssize_t initstate_show(struct device *dev, @@ -116,6 +200,14 @@ static ssize_t initstate_show(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%u\n", val); } +static ssize_t disksize_show(struct device *dev, + struct 
device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); +} + static ssize_t orig_data_size_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -143,19 +235,6 @@ static ssize_t mem_used_total_show(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); } -static ssize_t max_comp_streams_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - int val; - struct zram *zram = dev_to_zram(dev); - - down_read(&zram->init_lock); - val = zram->max_comp_streams; - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%d\n", val); -} - static ssize_t mem_limit_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -225,6 +304,19 @@ static ssize_t mem_used_max_store(struct device *dev, return len; } +static ssize_t max_comp_streams_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = zram->max_comp_streams; + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%d\n", val); +} + static ssize_t max_comp_streams_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -271,6 +363,8 @@ static ssize_t comp_algorithm_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); + size_t sz; + down_write(&zram->init_lock); if (init_done(zram)) { up_write(&zram->init_lock); @@ -278,69 +372,108 @@ static ssize_t comp_algorithm_store(struct device *dev, return -EBUSY; } strlcpy(zram->compressor, buf, sizeof(zram->compressor)); + + /* ignore trailing newline */ + sz = strlen(zram->compressor); + if (sz > 0 && zram->compressor[sz - 1] == '\n') + zram->compressor[sz - 1] = 0x00; + + if (!zcomp_available_algorithm(zram->compressor)) + len = -EINVAL; + up_write(&zram->init_lock); return len; } -/* 
flag operations needs meta->tb_lock */ -static int zram_test_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) +static ssize_t compact_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) { - return meta->table[index].value & BIT(flag); -} + unsigned long nr_migrated; + struct zram *zram = dev_to_zram(dev); + struct zram_meta *meta; -static void zram_set_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) -{ - meta->table[index].value |= BIT(flag); -} + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + return -EINVAL; + } -static void zram_clear_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) -{ - meta->table[index].value &= ~BIT(flag); + meta = zram->meta; + nr_migrated = zs_compact(meta->mem_pool); + atomic64_add(nr_migrated, &zram->stats.num_migrated); + up_read(&zram->init_lock); + + return len; } -static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) +static ssize_t io_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) { - return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); + struct zram *zram = dev_to_zram(dev); + ssize_t ret; + + down_read(&zram->init_lock); + ret = scnprintf(buf, PAGE_SIZE, + "%8llu %8llu %8llu %8llu\n", + (u64)atomic64_read(&zram->stats.failed_reads), + (u64)atomic64_read(&zram->stats.failed_writes), + (u64)atomic64_read(&zram->stats.invalid_io), + (u64)atomic64_read(&zram->stats.notify_free)); + up_read(&zram->init_lock); + + return ret; } -static void zram_set_obj_size(struct zram_meta *meta, - u32 index, size_t size) +static ssize_t mm_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) { - unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; + struct zram *zram = dev_to_zram(dev); + u64 orig_size, mem_used = 0; + long max_used; + ssize_t ret; - meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; + 
down_read(&zram->init_lock); + if (init_done(zram)) + mem_used = zs_get_total_pages(zram->meta->mem_pool); + + orig_size = atomic64_read(&zram->stats.pages_stored); + max_used = atomic_long_read(&zram->stats.max_used_pages); + + ret = scnprintf(buf, PAGE_SIZE, + "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n", + orig_size << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.compr_data_size), + mem_used << PAGE_SHIFT, + zram->limit_pages << PAGE_SHIFT, + max_used << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.zero_pages), + (u64)atomic64_read(&zram->stats.num_migrated)); + up_read(&zram->init_lock); + + return ret; } -static inline int is_partial_io(struct bio_vec *bvec) +static DEVICE_ATTR_RO(io_stat); +static DEVICE_ATTR_RO(mm_stat); +ZRAM_ATTR_RO(num_reads); +ZRAM_ATTR_RO(num_writes); +ZRAM_ATTR_RO(failed_reads); +ZRAM_ATTR_RO(failed_writes); +ZRAM_ATTR_RO(invalid_io); +ZRAM_ATTR_RO(notify_free); +ZRAM_ATTR_RO(zero_pages); +ZRAM_ATTR_RO(compr_data_size); + +static inline bool zram_meta_get(struct zram *zram) { - return bvec->bv_len != PAGE_SIZE; + if (atomic_inc_not_zero(&zram->refcount)) + return true; + return false; } -/* - * Check if request is within bounds and aligned on zram logical blocks. 
- */ -static inline int valid_io_request(struct zram *zram, - sector_t start, unsigned int size) +static inline void zram_meta_put(struct zram *zram) { - u64 end, bound; - - /* unaligned request */ - if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) - return 0; - if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) - return 0; - - end = start + (size >> SECTOR_SHIFT); - bound = zram->disksize >> SECTOR_SHIFT; - /* out of range range */ - if (unlikely(start >= bound || end > bound || start > end)) - return 0; - - /* I/O request is valid */ - return 1; + atomic_dec(&zram->refcount); } static void zram_meta_free(struct zram_meta *meta, u64 disksize) @@ -363,10 +496,9 @@ static void zram_meta_free(struct zram_meta *meta, u64 disksize) kfree(meta); } -static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize) +static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize) { size_t num_pages; - char pool_name[8]; struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL); if (!meta) @@ -379,7 +511,6 @@ static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize) goto out_error; } - snprintf(pool_name, sizeof(pool_name), "zram%d", device_id); meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM); if (!meta->mem_pool) { pr_err("Error creating memory pool\n"); @@ -394,56 +525,6 @@ out_error: return NULL; } -static inline bool zram_meta_get(struct zram *zram) -{ - if (atomic_inc_not_zero(&zram->refcount)) - return true; - return false; -} - -static inline void zram_meta_put(struct zram *zram) -{ - atomic_dec(&zram->refcount); -} - -static void update_position(u32 *index, int *offset, struct bio_vec *bvec) -{ - if (*offset + bvec->bv_len >= PAGE_SIZE) - (*index)++; - *offset = (*offset + bvec->bv_len) % PAGE_SIZE; -} - -static int page_zero_filled(void *ptr) -{ - unsigned int pos; - unsigned long *page; - - page = (unsigned long *)ptr; - - for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { - if (page[pos]) - return 0; 
- } - - return 1; -} - -static void handle_zero_page(struct bio_vec *bvec) -{ - struct page *page = bvec->bv_page; - void *user_mem; - - user_mem = kmap_atomic(page); - if (is_partial_io(bvec)) - memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); - else - clear_page(user_mem); - kunmap_atomic(user_mem); - - flush_dcache_page(page); -} - - /* * To protect concurrent access to the same index entry, * caller should hold this table index entry's bit_spinlock to @@ -561,21 +642,6 @@ out_cleanup: return ret; } -static inline void update_used_max(struct zram *zram, - const unsigned long pages) -{ - unsigned long old_max, cur_max; - - old_max = atomic_long_read(&zram->stats.max_used_pages); - - do { - cur_max = old_max; - if (pages > cur_max) - old_max = atomic_long_cmpxchg( - &zram->stats.max_used_pages, cur_max, pages); - } while (old_max != cur_max); -} - static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, int offset) { @@ -585,8 +651,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, struct page *page; unsigned char *user_mem, *cmem, *src, *uncmem = NULL; struct zram_meta *meta = zram->meta; - struct zcomp_strm *zstrm; - bool locked = false; + struct zcomp_strm *zstrm = NULL; unsigned long alloced_pages; page = bvec->bv_page; @@ -606,7 +671,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } zstrm = zcomp_strm_find(zram->comp); - locked = true; user_mem = kmap_atomic(page); if (is_partial_io(bvec)) { @@ -678,7 +742,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } zcomp_strm_release(zram->comp, zstrm); - locked = false; + zstrm = NULL; zs_unmap_object(meta->mem_pool, handle); /* @@ -696,42 +760,13 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, atomic64_add(clen, &zram->stats.compr_data_size); atomic64_inc(&zram->stats.pages_stored); out: - if (locked) + if (zstrm) zcomp_strm_release(zram->comp, zstrm); if 
(is_partial_io(bvec)) kfree(uncmem); return ret; } -static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, int rw) -{ - unsigned long start_time = jiffies; - int ret; - - generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT, - &zram->disk->part0); - - if (rw == READ) { - atomic64_inc(&zram->stats.num_reads); - ret = zram_bvec_read(zram, bvec, index, offset); - } else { - atomic64_inc(&zram->stats.num_writes); - ret = zram_bvec_write(zram, bvec, index, offset); - } - - generic_end_io_acct(rw, &zram->disk->part0, start_time); - - if (unlikely(ret)) { - if (rw == READ) - atomic64_inc(&zram->stats.failed_reads); - else - atomic64_inc(&zram->stats.failed_writes); - } - - return ret; -} - /* * zram_bio_discard - handler on discard request * @index: physical block index in PAGE_SIZE units @@ -771,151 +806,32 @@ static void zram_bio_discard(struct zram *zram, u32 index, } } -static void zram_reset_device(struct zram *zram) -{ - struct zram_meta *meta; - struct zcomp *comp; - u64 disksize; - - down_write(&zram->init_lock); - - zram->limit_pages = 0; - - if (!init_done(zram)) { - up_write(&zram->init_lock); - return; - } - - meta = zram->meta; - comp = zram->comp; - disksize = zram->disksize; - /* - * Refcount will go down to 0 eventually and r/w handler - * cannot handle further I/O so it will bail out by - * check zram_meta_get. - */ - zram_meta_put(zram); - /* - * We want to free zram_meta in process context to avoid - * deadlock between reclaim path and any other locks. 
- */ - wait_event(zram->io_done, atomic_read(&zram->refcount) == 0); - - /* Reset stats */ - memset(&zram->stats, 0, sizeof(zram->stats)); - zram->disksize = 0; - zram->max_comp_streams = 1; - - set_capacity(zram->disk, 0); - part_stat_set_all(&zram->disk->part0, 0); - - up_write(&zram->init_lock); - /* I/O operation under all of CPU are done so let's free */ - zram_meta_free(meta, disksize); - zcomp_destroy(comp); -} - -static ssize_t disksize_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - u64 disksize; - struct zcomp *comp; - struct zram_meta *meta; - struct zram *zram = dev_to_zram(dev); - int err; - - disksize = memparse(buf, NULL); - if (!disksize) - return -EINVAL; - - disksize = PAGE_ALIGN(disksize); - meta = zram_meta_alloc(zram->disk->first_minor, disksize); - if (!meta) - return -ENOMEM; - - comp = zcomp_create(zram->compressor, zram->max_comp_streams); - if (IS_ERR(comp)) { - pr_info("Cannot initialise %s compressing backend\n", - zram->compressor); - err = PTR_ERR(comp); - goto out_free_meta; - } - - down_write(&zram->init_lock); - if (init_done(zram)) { - pr_info("Cannot change disksize for initialized device\n"); - err = -EBUSY; - goto out_destroy_comp; - } - - init_waitqueue_head(&zram->io_done); - atomic_set(&zram->refcount, 1); - zram->meta = meta; - zram->comp = comp; - zram->disksize = disksize; - set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - up_write(&zram->init_lock); - - /* - * Revalidate disk out of the init_lock to avoid lockdep splat. - * It's okay because disk's capacity is protected by init_lock - * so that revalidate_disk always sees up-to-date capacity. 
- */ - revalidate_disk(zram->disk); - - return len; - -out_destroy_comp: - up_write(&zram->init_lock); - zcomp_destroy(comp); -out_free_meta: - zram_meta_free(meta, disksize); - return err; -} - -static ssize_t reset_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) +static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, + int offset, int rw) { + unsigned long start_time = jiffies; int ret; - unsigned short do_reset; - struct zram *zram; - struct block_device *bdev; - zram = dev_to_zram(dev); - bdev = bdget_disk(zram->disk, 0); - - if (!bdev) - return -ENOMEM; + generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT, + &zram->disk->part0); - mutex_lock(&bdev->bd_mutex); - /* Do not reset an active device! */ - if (bdev->bd_openers) { - ret = -EBUSY; - goto out; + if (rw == READ) { + atomic64_inc(&zram->stats.num_reads); + ret = zram_bvec_read(zram, bvec, index, offset); + } else { + atomic64_inc(&zram->stats.num_writes); + ret = zram_bvec_write(zram, bvec, index, offset); } - ret = kstrtou16(buf, 10, &do_reset); - if (ret) - goto out; + generic_end_io_acct(rw, &zram->disk->part0, start_time); - if (!do_reset) { - ret = -EINVAL; - goto out; + if (unlikely(ret)) { + if (rw == READ) + atomic64_inc(&zram->stats.failed_reads); + else + atomic64_inc(&zram->stats.failed_writes); } - /* Make sure all pending I/O is finished */ - fsync_bdev(bdev); - zram_reset_device(zram); - - mutex_unlock(&bdev->bd_mutex); - revalidate_disk(zram->disk); - bdput(bdev); - - return len; - -out: - mutex_unlock(&bdev->bd_mutex); - bdput(bdev); return ret; } @@ -1055,80 +971,185 @@ out: return err; } -static const struct block_device_operations zram_devops = { - .swap_slot_free_notify = zram_slot_free_notify, - .rw_page = zram_rw_page, - .owner = THIS_MODULE -}; +static void zram_reset_device(struct zram *zram) +{ + struct zram_meta *meta; + struct zcomp *comp; + u64 disksize; -static DEVICE_ATTR_WO(compact); -static 
DEVICE_ATTR_RW(disksize); -static DEVICE_ATTR_RO(initstate); -static DEVICE_ATTR_WO(reset); -static DEVICE_ATTR_RO(orig_data_size); -static DEVICE_ATTR_RO(mem_used_total); -static DEVICE_ATTR_RW(mem_limit); -static DEVICE_ATTR_RW(mem_used_max); -static DEVICE_ATTR_RW(max_comp_streams); -static DEVICE_ATTR_RW(comp_algorithm); + down_write(&zram->init_lock); -static ssize_t io_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) + zram->limit_pages = 0; + + if (!init_done(zram)) { + up_write(&zram->init_lock); + return; + } + + meta = zram->meta; + comp = zram->comp; + disksize = zram->disksize; + /* + * Refcount will go down to 0 eventually and r/w handler + * cannot handle further I/O so it will bail out by + * check zram_meta_get. + */ + zram_meta_put(zram); + /* + * We want to free zram_meta in process context to avoid + * deadlock between reclaim path and any other locks. + */ + wait_event(zram->io_done, atomic_read(&zram->refcount) == 0); + + /* Reset stats */ + memset(&zram->stats, 0, sizeof(zram->stats)); + zram->disksize = 0; + zram->max_comp_streams = 1; + + set_capacity(zram->disk, 0); + part_stat_set_all(&zram->disk->part0, 0); + + up_write(&zram->init_lock); + /* I/O operation under all of CPU are done so let's free */ + zram_meta_free(meta, disksize); + zcomp_destroy(comp); +} + +static ssize_t disksize_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) { + u64 disksize; + struct zcomp *comp; + struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); - ssize_t ret; + int err; - down_read(&zram->init_lock); - ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu %8llu\n", - (u64)atomic64_read(&zram->stats.failed_reads), - (u64)atomic64_read(&zram->stats.failed_writes), - (u64)atomic64_read(&zram->stats.invalid_io), - (u64)atomic64_read(&zram->stats.notify_free)); - up_read(&zram->init_lock); + disksize = memparse(buf, NULL); + if (!disksize) + return -EINVAL; - return ret; + disksize = 
PAGE_ALIGN(disksize); + meta = zram_meta_alloc(zram->disk->disk_name, disksize); + if (!meta) + return -ENOMEM; + + comp = zcomp_create(zram->compressor, zram->max_comp_streams); + if (IS_ERR(comp)) { + pr_info("Cannot initialise %s compressing backend\n", + zram->compressor); + err = PTR_ERR(comp); + goto out_free_meta; + } + + down_write(&zram->init_lock); + if (init_done(zram)) { + pr_info("Cannot change disksize for initialized device\n"); + err = -EBUSY; + goto out_destroy_comp; + } + + init_waitqueue_head(&zram->io_done); + atomic_set(&zram->refcount, 1); + zram->meta = meta; + zram->comp = comp; + zram->disksize = disksize; + set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); + up_write(&zram->init_lock); + + /* + * Revalidate disk out of the init_lock to avoid lockdep splat. + * It's okay because disk's capacity is protected by init_lock + * so that revalidate_disk always sees up-to-date capacity. + */ + revalidate_disk(zram->disk); + + return len; + +out_destroy_comp: + up_write(&zram->init_lock); + zcomp_destroy(comp); +out_free_meta: + zram_meta_free(meta, disksize); + return err; } -static ssize_t mm_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t reset_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) { - struct zram *zram = dev_to_zram(dev); - u64 orig_size, mem_used = 0; - long max_used; - ssize_t ret; + int ret; + unsigned short do_reset; + struct zram *zram; + struct block_device *bdev; - down_read(&zram->init_lock); - if (init_done(zram)) - mem_used = zs_get_total_pages(zram->meta->mem_pool); + ret = kstrtou16(buf, 10, &do_reset); + if (ret) + return ret; - orig_size = atomic64_read(&zram->stats.pages_stored); - max_used = atomic_long_read(&zram->stats.max_used_pages); + if (!do_reset) + return -EINVAL; - ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n", - orig_size << PAGE_SHIFT, - (u64)atomic64_read(&zram->stats.compr_data_size), - 
mem_used << PAGE_SHIFT, - zram->limit_pages << PAGE_SHIFT, - max_used << PAGE_SHIFT, - (u64)atomic64_read(&zram->stats.zero_pages), - (u64)atomic64_read(&zram->stats.num_migrated)); - up_read(&zram->init_lock); + zram = dev_to_zram(dev); + bdev = bdget_disk(zram->disk, 0); + if (!bdev) + return -ENOMEM; + + mutex_lock(&bdev->bd_mutex); + /* Do not reset an active device or claimed device */ + if (bdev->bd_openers || zram->claim) { + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return -EBUSY; + } + + /* From now on, no one can open /dev/zram[0-9] */ + zram->claim = true; + mutex_unlock(&bdev->bd_mutex); + + /* Make sure all the pending I/O are finished */ + fsync_bdev(bdev); + zram_reset_device(zram); + revalidate_disk(zram->disk); + bdput(bdev); + + mutex_lock(&bdev->bd_mutex); + zram->claim = false; + mutex_unlock(&bdev->bd_mutex); + + return len; +} + +static int zram_open(struct block_device *bdev, fmode_t mode) +{ + int ret = 0; + struct zram *zram; + + WARN_ON(!mutex_is_locked(&bdev->bd_mutex)); + + zram = bdev->bd_disk->private_data; + /* zram was claimed to reset so open request fails */ + if (zram->claim) + ret = -EBUSY; return ret; } -static DEVICE_ATTR_RO(io_stat); -static DEVICE_ATTR_RO(mm_stat); -ZRAM_ATTR_RO(num_reads); -ZRAM_ATTR_RO(num_writes); -ZRAM_ATTR_RO(failed_reads); -ZRAM_ATTR_RO(failed_writes); -ZRAM_ATTR_RO(invalid_io); -ZRAM_ATTR_RO(notify_free); -ZRAM_ATTR_RO(zero_pages); -ZRAM_ATTR_RO(compr_data_size); +static const struct block_device_operations zram_devops = { + .open = zram_open, + .swap_slot_free_notify = zram_slot_free_notify, + .rw_page = zram_rw_page, + .owner = THIS_MODULE +}; + +static DEVICE_ATTR_WO(compact); +static DEVICE_ATTR_RW(disksize); +static DEVICE_ATTR_RO(initstate); +static DEVICE_ATTR_WO(reset); +static DEVICE_ATTR_RO(orig_data_size); +static DEVICE_ATTR_RO(mem_used_total); +static DEVICE_ATTR_RW(mem_limit); +static DEVICE_ATTR_RW(mem_used_max); +static DEVICE_ATTR_RW(max_comp_streams); +static 
DEVICE_ATTR_RW(comp_algorithm); static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, @@ -1158,10 +1179,24 @@ static struct attribute_group zram_disk_attr_group = { .attrs = zram_disk_attrs, }; -static int create_device(struct zram *zram, int device_id) +/* + * Allocate and initialize a new zram device. The function returns + * '>= 0' device_id upon success, and a negative value otherwise. + */ +static int zram_add(void) { + struct zram *zram; struct request_queue *queue; - int ret = -ENOMEM; + int ret, device_id; + + zram = kzalloc(sizeof(struct zram), GFP_KERNEL); + if (!zram) + return -ENOMEM; + + ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL); + if (ret < 0) + goto out_free_dev; + device_id = ret; init_rwsem(&zram->init_lock); @@ -1169,12 +1204,13 @@ static int create_device(struct zram *zram, int device_id) if (!queue) { pr_err("Error allocating disk queue for device %d\n", device_id); - goto out; + ret = -ENOMEM; + goto out_free_idr; } blk_queue_make_request(queue, zram_make_request); - /* gendisk structure */ + /* gendisk structure */ zram->disk = alloc_disk(1); if (!zram->disk) { pr_warn("Error allocating disk structure for device %d\n", @@ -1232,90 +1268,177 @@ static int create_device(struct zram *zram, int device_id) strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); zram->meta = NULL; zram->max_comp_streams = 1; - return 0; + + pr_info("Added device: %s\n", zram->disk->disk_name); + return device_id; out_free_disk: del_gendisk(zram->disk); put_disk(zram->disk); out_free_queue: blk_cleanup_queue(queue); -out: +out_free_idr: + idr_remove(&zram_index_idr, device_id); +out_free_dev: + kfree(zram); return ret; } -static void destroy_devices(unsigned int nr) +static int zram_remove(struct zram *zram) +{ + struct block_device *bdev; + + bdev = bdget_disk(zram->disk, 0); + if (!bdev) + return -ENOMEM; + + mutex_lock(&bdev->bd_mutex); + if (bdev->bd_openers || zram->claim) { + mutex_unlock(&bdev->bd_mutex); + 
bdput(bdev); + return -EBUSY; + } + + zram->claim = true; + mutex_unlock(&bdev->bd_mutex); + + /* + * Remove sysfs first, so no one will perform a disksize + * store while we destroy the devices. This also helps during + * hot_remove -- zram_reset_device() is the last holder of + * ->init_lock, no later/concurrent disksize_store() or any + * other sysfs handlers are possible. + */ + sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, + &zram_disk_attr_group); + + /* Make sure all the pending I/O are finished */ + fsync_bdev(bdev); + zram_reset_device(zram); + bdput(bdev); + + pr_info("Removed device: %s\n", zram->disk->disk_name); + + idr_remove(&zram_index_idr, zram->disk->first_minor); + blk_cleanup_queue(zram->disk->queue); + del_gendisk(zram->disk); + put_disk(zram->disk); + kfree(zram); + return 0; +} + +/* zram-control sysfs attributes */ +static ssize_t hot_add_show(struct class *class, + struct class_attribute *attr, + char *buf) +{ + int ret; + + mutex_lock(&zram_index_mutex); + ret = zram_add(); + mutex_unlock(&zram_index_mutex); + + if (ret < 0) + return ret; + return scnprintf(buf, PAGE_SIZE, "%d\n", ret); +} + +static ssize_t hot_remove_store(struct class *class, + struct class_attribute *attr, + const char *buf, + size_t count) { struct zram *zram; - unsigned int i; + int ret, dev_id; - for (i = 0; i < nr; i++) { - zram = &zram_devices[i]; - /* - * Remove sysfs first, so no one will perform a disksize - * store while we destroy the devices - */ - sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, - &zram_disk_attr_group); + /* dev_id is gendisk->first_minor, which is `int' */ + ret = kstrtoint(buf, 10, &dev_id); + if (ret) + return ret; + if (dev_id < 0) + return -EINVAL; - zram_reset_device(zram); + mutex_lock(&zram_index_mutex); - blk_cleanup_queue(zram->disk->queue); - del_gendisk(zram->disk); - put_disk(zram->disk); - } + zram = idr_find(&zram_index_idr, dev_id); + if (zram) + ret = zram_remove(zram); + else + ret = -ENODEV; + + 
mutex_unlock(&zram_index_mutex); + return ret ? ret : count; +} + +static struct class_attribute zram_control_class_attrs[] = { + __ATTR_RO(hot_add), + __ATTR_WO(hot_remove), + __ATTR_NULL, +}; + +static struct class zram_control_class = { + .name = "zram-control", + .owner = THIS_MODULE, + .class_attrs = zram_control_class_attrs, +}; + +static int zram_remove_cb(int id, void *ptr, void *data) +{ + zram_remove(ptr); + return 0; +} - kfree(zram_devices); +static void destroy_devices(void) +{ + class_unregister(&zram_control_class); + idr_for_each(&zram_index_idr, &zram_remove_cb, NULL); + idr_destroy(&zram_index_idr); unregister_blkdev(zram_major, "zram"); - pr_info("Destroyed %u device(s)\n", nr); } static int __init zram_init(void) { - int ret, dev_id; + int ret; - if (num_devices > max_num_devices) { - pr_warn("Invalid value for num_devices: %u\n", - num_devices); - return -EINVAL; + ret = class_register(&zram_control_class); + if (ret) { + pr_warn("Unable to register zram-control class\n"); + return ret; } zram_major = register_blkdev(0, "zram"); if (zram_major <= 0) { pr_warn("Unable to get major number\n"); + class_unregister(&zram_control_class); return -EBUSY; } - /* Allocate the device array and initialize each one */ - zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL); - if (!zram_devices) { - unregister_blkdev(zram_major, "zram"); - return -ENOMEM; - } - - for (dev_id = 0; dev_id < num_devices; dev_id++) { - ret = create_device(&zram_devices[dev_id], dev_id); - if (ret) + while (num_devices != 0) { + mutex_lock(&zram_index_mutex); + ret = zram_add(); + mutex_unlock(&zram_index_mutex); + if (ret < 0) goto out_error; + num_devices--; } - pr_info("Created %u device(s)\n", num_devices); return 0; out_error: - destroy_devices(dev_id); + destroy_devices(); return ret; } static void __exit zram_exit(void) { - destroy_devices(num_devices); + destroy_devices(); } module_init(zram_init); module_exit(zram_exit); module_param(num_devices, uint, 
0); -MODULE_PARM_DESC(num_devices, "Number of zram devices"); +MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>"); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 570c598f4..6dbe2df50 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -20,12 +20,6 @@ #include "zcomp.h" -/* - * Some arbitrary value. This is just to catch - * invalid value for num_devices module parameter. - */ -static const unsigned max_num_devices = 32; - /*-- Configurable parameters */ /* @@ -121,5 +115,9 @@ struct zram { */ u64 disksize; /* bytes */ char compressor[10]; + /* + * zram is claimed so open requests will fail + */ + bool claim; /* Protected by bdev->bd_mutex */ }; #endif |