diff options
author | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2016-09-11 04:34:46 -0300 |
---|---|---|
committer | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2016-09-11 04:34:46 -0300 |
commit | 863981e96738983919de841ec669e157e6bdaeb0 (patch) | |
tree | d6d89a12e7eb8017837c057935a2271290907f76 /drivers/block | |
parent | 8dec7c70575785729a6a9e6719a955e9c545bcab (diff) |
Linux-libre 4.7.1-gnupck-4.7.1-gnu
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/aoe/aoecmd.c | 4 | ||||
-rw-r--r-- | drivers/block/brd.c | 2 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 2 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 28 | ||||
-rw-r--r-- | drivers/block/loop.c | 2 | ||||
-rw-r--r-- | drivers/block/mtip32xx/mtip32xx.c | 12 | ||||
-rw-r--r-- | drivers/block/nbd.c | 6 | ||||
-rw-r--r-- | drivers/block/osdblk.c | 2 | ||||
-rw-r--r-- | drivers/block/ps3disk.c | 2 | ||||
-rw-r--r-- | drivers/block/rbd.c | 305 | ||||
-rw-r--r-- | drivers/block/skd_main.c | 61 | ||||
-rw-r--r-- | drivers/block/virtio_blk.c | 6 | ||||
-rw-r--r-- | drivers/block/xen-blkback/xenbus.c | 2 | ||||
-rw-r--r-- | drivers/block/xen-blkfront.c | 129 | ||||
-rw-r--r-- | drivers/block/zram/zcomp.c | 300 | ||||
-rw-r--r-- | drivers/block/zram/zcomp.h | 14 | ||||
-rw-r--r-- | drivers/block/zram/zram_drv.c | 104 | ||||
-rw-r--r-- | drivers/block/zram/zram_drv.h | 2 |
18 files changed, 353 insertions, 630 deletions
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 437b3a822..ab19adb07 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -861,7 +861,7 @@ rqbiocnt(struct request *r) * discussion. * * We cannot use get_page in the workaround, because it insists on a - * positive page count as a precondition. So we use _count directly. + * positive page count as a precondition. So we use _refcount directly. */ static void bio_pageinc(struct bio *bio) @@ -1750,7 +1750,7 @@ aoecmd_init(void) int ret; /* get_zeroed_page returns page with ref count 1 */ - p = (void *) get_zeroed_page(GFP_KERNEL | __GFP_REPEAT); + p = (void *) get_zeroed_page(GFP_KERNEL); if (!p) return -ENOMEM; empty_page = virt_to_page(p); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 51a071e32..c04bd9bc3 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -381,7 +381,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, #ifdef CONFIG_BLK_DEV_RAM_DAX static long brd_direct_access(struct block_device *bdev, sector_t sector, - void __pmem **kaddr, pfn_t *pfn) + void __pmem **kaddr, pfn_t *pfn, long size) { struct brd_device *brd = bdev->bd_disk->private_data; struct page *page; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index fa209773d..2ba1494b2 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2761,7 +2761,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig q->backing_dev_info.congested_data = device; blk_queue_make_request(q, drbd_make_request); - blk_queue_flush(q, REQ_FLUSH | REQ_FUA); + blk_queue_write_cache(q, true, true); /* Setting the max_hw_sectors to an odd value of 8kibyte here This triggers a max_bio_size message upon first attach or connect */ blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 1fd1dcceb..0bac9c824 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -3633,14 +3633,15 @@ static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device, goto nla_put_failure; if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) || nla_put_u32(skb, T_current_state, device->state.i) || - nla_put_u64(skb, T_ed_uuid, device->ed_uuid) || - nla_put_u64(skb, T_capacity, drbd_get_capacity(device->this_bdev)) || - nla_put_u64(skb, T_send_cnt, device->send_cnt) || - nla_put_u64(skb, T_recv_cnt, device->recv_cnt) || - nla_put_u64(skb, T_read_cnt, device->read_cnt) || - nla_put_u64(skb, T_writ_cnt, device->writ_cnt) || - nla_put_u64(skb, T_al_writ_cnt, device->al_writ_cnt) || - nla_put_u64(skb, T_bm_writ_cnt, device->bm_writ_cnt) || + nla_put_u64_0pad(skb, T_ed_uuid, device->ed_uuid) || + nla_put_u64_0pad(skb, T_capacity, + drbd_get_capacity(device->this_bdev)) || + nla_put_u64_0pad(skb, T_send_cnt, device->send_cnt) || + nla_put_u64_0pad(skb, T_recv_cnt, device->recv_cnt) || + nla_put_u64_0pad(skb, T_read_cnt, device->read_cnt) || + nla_put_u64_0pad(skb, T_writ_cnt, device->writ_cnt) || + nla_put_u64_0pad(skb, T_al_writ_cnt, device->al_writ_cnt) || + nla_put_u64_0pad(skb, T_bm_writ_cnt, device->bm_writ_cnt) || nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&device->ap_bio_cnt)) || nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&device->ap_pending_cnt)) || nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&device->rs_pending_cnt))) @@ -3657,13 +3658,16 @@ static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device, goto nla_put_failure; if (nla_put_u32(skb, T_disk_flags, device->ldev->md.flags) || - nla_put_u64(skb, T_bits_total, drbd_bm_bits(device)) || - nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(device))) + nla_put_u64_0pad(skb, T_bits_total, drbd_bm_bits(device)) || + nla_put_u64_0pad(skb, T_bits_oos, + drbd_bm_total_weight(device))) goto nla_put_failure; if (C_SYNC_SOURCE <= device->state.conn && C_PAUSED_SYNC_T >= device->state.conn) { - if (nla_put_u64(skb, T_bits_rs_total, device->rs_total) || - nla_put_u64(skb, T_bits_rs_failed, device->rs_failed)) + if (nla_put_u64_0pad(skb, T_bits_rs_total, + device->rs_total) || + nla_put_u64_0pad(skb, T_bits_rs_failed, + device->rs_failed)) goto nla_put_failure; } } diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ba9e4a722..7339e65f6 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -961,7 +961,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) - blk_queue_flush(lo->lo_queue, REQ_FLUSH); + blk_queue_write_cache(lo->lo_queue, true, false); loop_update_dio(lo); set_capacity(lo->lo_disk, size); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 25824c169..6053e4659 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3000,14 +3000,14 @@ restart_eh: "Completion workers still active!"); spin_lock(dd->queue->queue_lock); - blk_mq_all_tag_busy_iter(*dd->tags.tags, + blk_mq_tagset_busy_iter(&dd->tags, mtip_queue_cmd, dd); spin_unlock(dd->queue->queue_lock); set_bit(MTIP_PF_ISSUE_CMDS_BIT, &dd->port->flags); if (mtip_device_reset(dd)) - blk_mq_all_tag_busy_iter(*dd->tags.tags, + blk_mq_tagset_busy_iter(&dd->tags, mtip_abort_cmd, dd); clear_bit(MTIP_PF_TO_ACTIVE_BIT, &dd->port->flags); @@ -4023,12 +4023,6 @@ skip_create_disk: blk_queue_io_min(dd->queue, 4096); blk_queue_bounce_limit(dd->queue, dd->pdev->dma_mask); - /* - * write back cache is not supported in the device. FUA depends on - * write back cache support, hence setting flush support to zero. - */ - blk_queue_flush(dd->queue, 0); - /* Signal trim support */ if (dd->trim_supp == true) { set_bit(QUEUE_FLAG_DISCARD, &dd->queue->queue_flags); @@ -4174,7 +4168,7 @@ static int mtip_block_remove(struct driver_data *dd) blk_mq_freeze_queue_start(dd->queue); blk_mq_stop_hw_queues(dd->queue); - blk_mq_all_tag_busy_iter(dd->tags.tags[0], mtip_no_dev_cleanup, dd); + blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd); /* * Delete our gendisk structure. This also removes the device diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 08afbc7a2..6a48ed419 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -693,9 +693,9 @@ static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev) if (nbd->flags & NBD_FLAG_SEND_TRIM) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); if (nbd->flags & NBD_FLAG_SEND_FLUSH) - blk_queue_flush(nbd->disk->queue, REQ_FLUSH); + blk_queue_write_cache(nbd->disk->queue, true, false); else - blk_queue_flush(nbd->disk->queue, 0); + blk_queue_write_cache(nbd->disk->queue, false, false); } static int nbd_dev_dbg_init(struct nbd_device *nbd); @@ -941,7 +941,7 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd) debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize); debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout); debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize); - debugfs_create_file("flags", 0444, dir, &nbd, &nbd_dbg_flags_ops); + debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops); return 0; } diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index 1b709a4e3..c2854a2bf 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -437,7 +437,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev) blk_queue_stack_limits(q, osd_request_queue(osdev->osd)); blk_queue_prep_rq(q, blk_queue_start_tag); - blk_queue_flush(q, REQ_FLUSH); + blk_queue_write_cache(q, true, false); disk->queue = q; diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index c120d70d3..4b7e40583 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -468,7 +468,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) blk_queue_dma_alignment(queue, dev->blk_size-1); blk_queue_logical_block_size(queue, dev->blk_size); - blk_queue_flush(queue, REQ_FLUSH); + blk_queue_write_cache(queue, true, false); blk_queue_max_segments(queue, -1); blk_queue_max_segment_size(queue, dev->bounce_size); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0ede6d7e2..81666a564 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -350,12 +350,12 @@ struct rbd_device { struct rbd_spec *spec; struct rbd_options *opts; - char *header_name; + struct ceph_object_id header_oid; + struct ceph_object_locator header_oloc; struct ceph_file_layout layout; - struct ceph_osd_event *watch_event; - struct rbd_obj_request *watch_request; + struct ceph_osd_linger_request *watch_handle; struct rbd_spec *parent_spec; u64 parent_overlap; @@ -1596,12 +1596,6 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) return __rbd_obj_request_wait(obj_request, 0); } -static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request, - unsigned long timeout) -{ - return __rbd_obj_request_wait(obj_request, timeout); -} - static void rbd_img_request_complete(struct rbd_img_request *img_request) { @@ -1751,12 +1745,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) complete_all(&obj_request->completion); } -static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) -{ - dout("%s: obj %p\n", __func__, obj_request); - obj_request_done_set(obj_request); -} - static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request = NULL; @@ -1828,13 +1816,12 @@ static void rbd_osd_call_callback(struct rbd_obj_request *obj_request) obj_request_done_set(obj_request); } -static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, - struct ceph_msg *msg) +static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) { struct rbd_obj_request *obj_request = osd_req->r_priv; u16 opcode; - dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); + dout("%s: osd_req %p\n", __func__, osd_req); rbd_assert(osd_req == obj_request->osd_req); if (obj_request_img_data_test(obj_request)) { rbd_assert(obj_request->img_request); @@ -1878,10 +1865,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_CALL: rbd_osd_call_callback(obj_request); break; - case CEPH_OSD_OP_NOTIFY_ACK: - case CEPH_OSD_OP_WATCH: - rbd_osd_trivial_callback(obj_request); - break; default: rbd_warn(NULL, "%s: unsupported op %hu", obj_request->object_name, (unsigned short) opcode); @@ -1896,27 +1879,17 @@ static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request = obj_request->img_request; struct ceph_osd_request *osd_req = obj_request->osd_req; - u64 snap_id; - rbd_assert(osd_req != NULL); - - snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP; - ceph_osdc_build_request(osd_req, obj_request->offset, - NULL, snap_id, NULL); + if (img_request) + osd_req->r_snapid = img_request->snap_id; } static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) { - struct rbd_img_request *img_request = obj_request->img_request; struct ceph_osd_request *osd_req = obj_request->osd_req; - struct ceph_snap_context *snapc; - struct timespec mtime = CURRENT_TIME; - rbd_assert(osd_req != NULL); - - snapc = img_request ? img_request->snapc : NULL; - ceph_osdc_build_request(osd_req, obj_request->offset, - snapc, CEPH_NOSNAP, &mtime); + osd_req->r_mtime = CURRENT_TIME; + osd_req->r_data_offset = obj_request->offset; } /* @@ -1954,7 +1927,7 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); if (!osd_req) - return NULL; /* ENOMEM */ + goto fail; if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; @@ -1965,9 +1938,18 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_priv = obj_request; osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", + obj_request->object_name)) + goto fail; + + if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) + goto fail; return osd_req; + +fail: + ceph_osdc_put_request(osd_req); + return NULL; } /* @@ -2003,16 +1985,25 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, false, GFP_NOIO); if (!osd_req) - return NULL; /* ENOMEM */ + goto fail; osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", + obj_request->object_name)) + goto fail; + + if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) + goto fail; return osd_req; + +fail: + ceph_osdc_put_request(osd_req); + return NULL; } @@ -2973,17 +2964,20 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request) { struct rbd_obj_request *obj_request; struct rbd_obj_request *next_obj_request; + int ret = 0; dout("%s: img %p\n", __func__, img_request); - for_each_obj_request_safe(img_request, obj_request, next_obj_request) { - int ret; + rbd_img_request_get(img_request); + for_each_obj_request_safe(img_request, obj_request, next_obj_request) { ret = rbd_img_obj_request_submit(obj_request); if (ret) - return ret; + goto out_put_ireq; } - return 0; +out_put_ireq: + rbd_img_request_put(img_request); + return ret; } static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) @@ -3090,45 +3084,18 @@ out_err: obj_request_done_set(obj_request); } -static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) -{ - struct rbd_obj_request *obj_request; - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; - int ret; - - obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, - OBJ_REQUEST_NODATA); - if (!obj_request) - return -ENOMEM; - - ret = -ENOMEM; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, - obj_request); - if (!obj_request->osd_req) - goto out; - - osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, - notify_id, 0, 0); - rbd_osd_req_format_read(obj_request); +static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev); +static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev); - ret = rbd_obj_request_submit(osdc, obj_request); - if (ret) - goto out; - ret = rbd_obj_request_wait(obj_request); -out: - rbd_obj_request_put(obj_request); - - return ret; -} - -static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) +static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie, + u64 notifier_id, void *data, size_t data_len) { - struct rbd_device *rbd_dev = (struct rbd_device *)data; + struct rbd_device *rbd_dev = arg; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; int ret; - dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, - rbd_dev->header_name, (unsigned long long)notify_id, - (unsigned int)opcode); + dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev, + cookie, notify_id); /* * Until adequate refresh error handling is in place, there is @@ -3140,63 +3107,31 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) if (ret) rbd_warn(rbd_dev, "refresh failed: %d", ret); - ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id); + ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid, + &rbd_dev->header_oloc, notify_id, cookie, + NULL, 0); if (ret) rbd_warn(rbd_dev, "notify_ack ret %d", ret); } -/* - * Send a (un)watch request and wait for the ack. Return a request - * with a ref held on success or error. - */ -static struct rbd_obj_request *rbd_obj_watch_request_helper( - struct rbd_device *rbd_dev, - bool watch) +static void rbd_watch_errcb(void *arg, u64 cookie, int err) { - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; - struct ceph_options *opts = osdc->client->options; - struct rbd_obj_request *obj_request; + struct rbd_device *rbd_dev = arg; int ret; - obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, - OBJ_REQUEST_NODATA); - if (!obj_request) - return ERR_PTR(-ENOMEM); - - obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1, - obj_request); - if (!obj_request->osd_req) { - ret = -ENOMEM; - goto out; - } - - osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, - rbd_dev->watch_event->cookie, 0, watch); - rbd_osd_req_format_write(obj_request); + rbd_warn(rbd_dev, "encountered watch error: %d", err); - if (watch) - ceph_osdc_set_request_linger(osdc, obj_request->osd_req); - - ret = rbd_obj_request_submit(osdc, obj_request); - if (ret) - goto out; + __rbd_dev_header_unwatch_sync(rbd_dev); - ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout); - if (ret) - goto out; - - ret = obj_request->result; + ret = rbd_dev_header_watch_sync(rbd_dev); if (ret) { - if (watch) - rbd_obj_request_end(obj_request); - goto out; + rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); + return; } - return obj_request; - -out: - rbd_obj_request_put(obj_request); - return ERR_PTR(ret); + ret = rbd_dev_refresh(rbd_dev); + if (ret) + rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret); } /* @@ -3205,35 +3140,33 @@ out: static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; - struct rbd_obj_request *obj_request; - int ret; + struct ceph_osd_linger_request *handle; - rbd_assert(!rbd_dev->watch_event); - rbd_assert(!rbd_dev->watch_request); + rbd_assert(!rbd_dev->watch_handle); - ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, - &rbd_dev->watch_event); - if (ret < 0) - return ret; + handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid, + &rbd_dev->header_oloc, rbd_watch_cb, + rbd_watch_errcb, rbd_dev); + if (IS_ERR(handle)) + return PTR_ERR(handle); - obj_request = rbd_obj_watch_request_helper(rbd_dev, true); - if (IS_ERR(obj_request)) { - ceph_osdc_cancel_event(rbd_dev->watch_event); - rbd_dev->watch_event = NULL; - return PTR_ERR(obj_request); - } + rbd_dev->watch_handle = handle; + return 0; +} - /* - * A watch request is set to linger, so the underlying osd - * request won't go away until we unregister it. We retain - * a pointer to the object request during that time (in - * rbd_dev->watch_request), so we'll keep a reference to it. - * We'll drop that reference after we've unregistered it in - * rbd_dev_header_unwatch_sync(). - */ - rbd_dev->watch_request = obj_request; +static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + int ret; - return 0; + if (!rbd_dev->watch_handle) + return; + + ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle); + if (ret) + rbd_warn(rbd_dev, "failed to unwatch: %d", ret); + + rbd_dev->watch_handle = NULL; } /* @@ -3241,24 +3174,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) */ static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) { - struct rbd_obj_request *obj_request; - - rbd_assert(rbd_dev->watch_event); - rbd_assert(rbd_dev->watch_request); - - rbd_obj_request_end(rbd_dev->watch_request); - rbd_obj_request_put(rbd_dev->watch_request); - rbd_dev->watch_request = NULL; - - obj_request = rbd_obj_watch_request_helper(rbd_dev, false); - if (!IS_ERR(obj_request)) - rbd_obj_request_put(obj_request); - else - rbd_warn(rbd_dev, "unable to tear down watch request (%ld)", - PTR_ERR(obj_request)); - - ceph_osdc_cancel_event(rbd_dev->watch_event); - rbd_dev->watch_event = NULL; + __rbd_dev_header_unwatch_sync(rbd_dev); dout("%s flushing notifies\n", __func__); ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); @@ -3591,7 +3507,7 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) if (!ondisk) return -ENOMEM; - ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name, 0, size, ondisk); if (ret < 0) goto out; @@ -4033,6 +3949,8 @@ static void rbd_dev_release(struct device *dev) struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); bool need_put = !!rbd_dev->opts; + ceph_oid_destroy(&rbd_dev->header_oid); + rbd_put_client(rbd_dev->rbd_client); rbd_spec_put(rbd_dev->spec); kfree(rbd_dev->opts); @@ -4063,6 +3981,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, INIT_LIST_HEAD(&rbd_dev->node); init_rwsem(&rbd_dev->header_rwsem); + ceph_oid_init(&rbd_dev->header_oid); + ceph_oloc_init(&rbd_dev->header_oloc); + rbd_dev->dev.bus = &rbd_bus_type; rbd_dev->dev.type = &rbd_device_type; rbd_dev->dev.parent = &rbd_root_dev; @@ -4111,7 +4032,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, __le64 size; } __attribute__ ((packed)) size_buf = { 0 }; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_size", &snapid, sizeof (snapid), &size_buf, sizeof (size_buf)); @@ -4151,7 +4072,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) if (!reply_buf) return -ENOMEM; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_object_prefix", NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); @@ -4186,7 +4107,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, u64 unsup; int ret; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_features", &snapid, sizeof (snapid), &features_buf, sizeof (features_buf)); @@ -4248,7 +4169,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) } snapid = cpu_to_le64(rbd_dev->spec->snap_id); - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_parent", &snapid, sizeof (snapid), reply_buf, size); @@ -4351,7 +4272,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) u64 stripe_count; int ret; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_stripe_unit_count", NULL, 0, (char *)&striping_info_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); @@ -4599,7 +4520,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) if (!reply_buf) return -ENOMEM; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_snapcontext", NULL, 0, reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); @@ -4664,7 +4585,7 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, return ERR_PTR(-ENOMEM); snapid = cpu_to_le64(snap_id); - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, "rbd", "get_snapshot_name", &snapid, sizeof (snapid), reply_buf, size); @@ -4975,13 +4896,13 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) again: ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); if (ret == -ENOENT && tries++ < 1) { - ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap", - &newest_epoch); + ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap", + &newest_epoch); if (ret < 0) return ret; if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { - ceph_monc_request_next_osdmap(&rbdc->client->monc); + ceph_osdc_maybe_request_map(&rbdc->client->osdc); (void) ceph_monc_wait_osdmap(&rbdc->client->monc, newest_epoch, opts->mount_timeout); @@ -5260,35 +5181,26 @@ err_out_unlock: static int rbd_dev_header_name(struct rbd_device *rbd_dev) { struct rbd_spec *spec = rbd_dev->spec; - size_t size; + int ret; /* Record the header object name for this rbd image. */ rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); if (rbd_dev->image_format == 1) - size = strlen(spec->image_name) + sizeof (RBD_SUFFIX); + ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", + spec->image_name, RBD_SUFFIX); else - size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id); - - rbd_dev->header_name = kmalloc(size, GFP_KERNEL); - if (!rbd_dev->header_name) - return -ENOMEM; + ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", + RBD_HEADER_PREFIX, spec->image_id); - if (rbd_dev->image_format == 1) - sprintf(rbd_dev->header_name, "%s%s", - spec->image_name, RBD_SUFFIX); - else - sprintf(rbd_dev->header_name, "%s%s", - RBD_HEADER_PREFIX, spec->image_id); - return 0; + return ret; } static void rbd_dev_image_release(struct rbd_device *rbd_dev) { rbd_dev_unprobe(rbd_dev); - kfree(rbd_dev->header_name); - rbd_dev->header_name = NULL; rbd_dev->image_format = 0; kfree(rbd_dev->spec->image_id); rbd_dev->spec->image_id = NULL; @@ -5327,7 +5239,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) pr_info("image %s/%s does not exist\n", rbd_dev->spec->pool_name, rbd_dev->spec->image_name); - goto out_header_name; + goto err_out_format; } } @@ -5373,7 +5285,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) goto err_out_probe; dout("discovered format %u image, header name is %s\n", - rbd_dev->image_format, rbd_dev->header_name); + rbd_dev->image_format, rbd_dev->header_oid.name); return 0; err_out_probe: @@ -5381,9 +5293,6 @@ err_out_probe: err_out_watch: if (!depth) rbd_dev_header_unwatch_sync(rbd_dev); -out_header_name: - kfree(rbd_dev->header_name); - rbd_dev->header_name = NULL; err_out_format: rbd_dev->image_format = 0; kfree(rbd_dev->spec->image_id); diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 586f9168f..910e06591 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -133,7 +133,6 @@ MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID); #define SKD_TIMER_MINUTES(minutes) ((minutes) * (60)) #define INQ_STD_NBYTES 36 -#define SKD_DISCARD_CDB_LENGTH 24 enum skd_drvr_state { SKD_DRVR_STATE_LOAD, @@ -212,7 +211,6 @@ struct skd_request_context { struct request *req; u8 flush_cmd; - u8 discard_page; u32 timeout_stamp; u8 sg_data_dir; @@ -230,7 +228,6 @@ struct skd_request_context { }; #define SKD_DATA_DIR_HOST_TO_CARD 1 #define SKD_DATA_DIR_CARD_TO_HOST 2 -#define SKD_DATA_DIR_NONE 3 /* especially for DISCARD requests. */ struct skd_special_context { struct skd_request_context req; @@ -540,31 +537,6 @@ skd_prep_zerosize_flush_cdb(struct skd_scsi_request *scsi_req, scsi_req->cdb[9] = 0; } -static void -skd_prep_discard_cdb(struct skd_scsi_request *scsi_req, - struct skd_request_context *skreq, - struct page *page, - u32 lba, u32 count) -{ - char *buf; - unsigned long len; - struct request *req; - - buf = page_address(page); - len = SKD_DISCARD_CDB_LENGTH; - - scsi_req->cdb[0] = UNMAP; - scsi_req->cdb[8] = len; - - put_unaligned_be16(6 + 16, &buf[0]); - put_unaligned_be16(16, &buf[2]); - put_unaligned_be64(lba, &buf[8]); - put_unaligned_be32(count, &buf[16]); - - req = skreq->req; - blk_add_request_payload(req, page, len); -} - static void skd_request_fn_not_online(struct request_queue *q); static void skd_request_fn(struct request_queue *q) @@ -575,7 +547,6 @@ static void skd_request_fn(struct request_queue *q) struct skd_request_context *skreq; struct request *req = NULL; struct skd_scsi_request *scsi_req; - struct page *page; unsigned long io_flags; int error; u32 lba; @@ -669,7 +640,6 @@ static void skd_request_fn(struct request_queue *q) skreq->flush_cmd = 0; skreq->n_sg = 0; skreq->sg_byte_count = 0; - skreq->discard_page = 0; /* * OK to now dequeue request from q. @@ -735,18 +705,7 @@ static void skd_request_fn(struct request_queue *q) else skreq->sg_data_dir = SKD_DATA_DIR_HOST_TO_CARD; - if (io_flags & REQ_DISCARD) { - page = alloc_page(GFP_ATOMIC | __GFP_ZERO); - if (!page) { - pr_err("request_fn:Page allocation failed.\n"); - skd_end_request(skdev, skreq, -ENOMEM); - break; - } - skreq->discard_page = 1; - req->completion_data = page; - skd_prep_discard_cdb(scsi_req, skreq, page, lba, count); - - } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { + if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { skd_prep_zerosize_flush_cdb(scsi_req, skreq); SKD_ASSERT(skreq->flush_cmd == 1); @@ -851,16 +810,6 @@ skip_sg: static void skd_end_request(struct skd_device *skdev, struct skd_request_context *skreq, int error) { - struct request *req = skreq->req; - unsigned int io_flags = req->cmd_flags; - - if ((io_flags & REQ_DISCARD) && - (skreq->discard_page == 1)) { - pr_debug("%s:%s:%d, free the page!", - skdev->name, __func__, __LINE__); - __free_page(req->completion_data); - } - if (unlikely(error)) { struct request *req = skreq->req; char *cmd = (rq_data_dir(req) == READ) ? "read" : "write"; @@ -4412,19 +4361,13 @@ static int skd_cons_disk(struct skd_device *skdev) disk->queue = q; q->queuedata = skdev; - blk_queue_flush(q, REQ_FLUSH | REQ_FUA); + blk_queue_write_cache(q, true, true); blk_queue_max_segments(q, skdev->sgs_per_request); blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS); /* set sysfs ptimal_io_size to 8K */ blk_queue_io_opt(q, 8192); - /* DISCARD Flag initialization. */ - q->limits.discard_granularity = 8192; - q->limits.discard_alignment = 0; - blk_queue_max_discard_sectors(q, UINT_MAX >> 9); - q->limits.discard_zeroes_data = 1; - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 28cff0d23..42758b527 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -493,11 +493,7 @@ static void virtblk_update_cache_mode(struct virtio_device *vdev) u8 writeback = virtblk_get_cache_mode(vdev); struct virtio_blk *vblk = vdev->priv; - if (writeback) - blk_queue_flush(vblk->disk->queue, REQ_FLUSH); - else - blk_queue_flush(vblk->disk->queue, 0); - + blk_queue_write_cache(vblk->disk->queue, writeback, false); revalidate_disk(vblk->disk); } diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 26aa080e2..3355f1cdd 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -477,7 +477,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, vbd->type |= VDISK_REMOVABLE; q = bdev_get_queue(bdev); - if (q && q->flush_flags) + if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags)) vbd->flush_support = true; if (q && blk_queue_secdiscard(q)) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 6405b6557..fcc5b4e0a 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -207,6 +207,9 @@ struct blkfront_info struct blk_mq_tag_set tag_set; struct blkfront_ring_info *rinfo; unsigned int nr_rings; + /* Save uncomplete reqs and bios for migration. */ + struct list_head requests; + struct bio_list bio_list; }; static unsigned int nr_minors; @@ -874,8 +877,12 @@ static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *qd) { unsigned long flags; - struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data; + int qid = hctx->queue_num; + struct blkfront_info *info = hctx->queue->queuedata; + struct blkfront_ring_info *rinfo = NULL; + BUG_ON(info->nr_rings <= qid); + rinfo = &info->rinfo[qid]; blk_mq_start_request(qd->rq); spin_lock_irqsave(&rinfo->ring_lock, flags); if (RING_FULL(&rinfo->ring)) @@ -901,20 +908,9 @@ out_busy: return BLK_MQ_RQ_QUEUE_BUSY; } -static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int index) -{ - struct blkfront_info *info = (struct blkfront_info *)data; - - BUG_ON(info->nr_rings <= index); - hctx->driver_data = &info->rinfo[index]; - return 0; -} - static struct blk_mq_ops blkfront_mq_ops = { .queue_rq = blkif_queue_rq, .map_queue = blk_mq_map_queue, - .init_hctx = blk_mq_init_hctx, }; static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, @@ -950,6 +946,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, return PTR_ERR(rq); } + rq->queuedata = info; queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); if (info->feature_discard) { @@ -998,7 +995,8 @@ static const char *flush_info(unsigned int feature_flush) static void xlvbd_flush(struct blkfront_info *info) { - blk_queue_flush(info->rq, info->feature_flush); + blk_queue_write_cache(info->rq, info->feature_flush & REQ_FLUSH, + info->feature_flush & REQ_FUA); pr_info("blkfront: %s: %s %s %s %s %s\n", info->gd->disk_name, flush_info(info->feature_flush), "persistent grants:", info->feature_persistent ? @@ -2007,69 +2005,22 @@ static int blkif_recover(struct blkfront_info *info) { unsigned int i, r_index; struct request *req, *n; - struct blk_shadow *copy; int rc; struct bio *bio, *cloned_bio; - struct bio_list bio_list, merge_bio; unsigned int segs, offset; int pending, size; struct split_bio *split_bio; - struct list_head requests; blkfront_gather_backend_features(info); segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; blk_queue_max_segments(info->rq, segs); - bio_list_init(&bio_list); - INIT_LIST_HEAD(&requests); for (r_index = 0; r_index < info->nr_rings; r_index++) { - struct blkfront_ring_info *rinfo; - - rinfo = &info->rinfo[r_index]; - /* Stage 1: Make a safe copy of the shadow state. */ - copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow), - GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); - if (!copy) - return -ENOMEM; - - /* Stage 2: Set up free list. */ - memset(&rinfo->shadow, 0, sizeof(rinfo->shadow)); - for (i = 0; i < BLK_RING_SIZE(info); i++) - rinfo->shadow[i].req.u.rw.id = i+1; - rinfo->shadow_free = rinfo->ring.req_prod_pvt; - rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + struct blkfront_ring_info *rinfo = &info->rinfo[r_index]; rc = blkfront_setup_indirect(rinfo); - if (rc) { - kfree(copy); + if (rc) return rc; - } - - for (i = 0; i < BLK_RING_SIZE(info); i++) { - /* Not in use? */ - if (!copy[i].request) - continue; - - /* - * Get the bios in the request so we can re-queue them. - */ - if (copy[i].request->cmd_flags & - (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { - /* - * Flush operations don't contain bios, so - * we need to requeue the whole request - */ - list_add(©[i].request->queuelist, &requests); - continue; - } - merge_bio.head = copy[i].request->bio; - merge_bio.tail = copy[i].request->biotail; - bio_list_merge(&bio_list, &merge_bio); - copy[i].request->bio = NULL; - blk_end_request_all(copy[i].request, 0); - } - - kfree(copy); } xenbus_switch_state(info->xbdev, XenbusStateConnected); @@ -2084,7 +2035,7 @@ static int blkif_recover(struct blkfront_info *info) kick_pending_request_queues(rinfo); } - list_for_each_entry_safe(req, n, &requests, queuelist) { + list_for_each_entry_safe(req, n, &info->requests, queuelist) { /* Requeue pending requests (flush or discard) */ list_del_init(&req->queuelist); BUG_ON(req->nr_phys_segments > segs); @@ -2092,7 +2043,7 @@ static int blkif_recover(struct blkfront_info *info) } blk_mq_kick_requeue_list(info->rq); - while ((bio = bio_list_pop(&bio_list)) != NULL) { + while ((bio = bio_list_pop(&info->bio_list)) != NULL) { /* Traverse the list of pending bios and re-queue them */ if (bio_segments(bio) > segs) { /* @@ -2138,9 +2089,42 @@ static int blkfront_resume(struct xenbus_device *dev) { struct blkfront_info *info = dev_get_drvdata(&dev->dev); int err = 0; + unsigned int i, j; dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename); + bio_list_init(&info->bio_list); + INIT_LIST_HEAD(&info->requests); + for (i = 0; i < info->nr_rings; i++) { + struct blkfront_ring_info *rinfo = &info->rinfo[i]; + struct bio_list merge_bio; + struct blk_shadow *shadow = rinfo->shadow; + + for (j = 0; j < BLK_RING_SIZE(info); j++) { + /* Not in use? */ + if (!shadow[j].request) + continue; + + /* + * Get the bios in the request so we can re-queue them. + */ + if (shadow[j].request->cmd_flags & + (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + /* + * Flush operations don't contain bios, so + * we need to requeue the whole request + */ + list_add(&shadow[j].request->queuelist, &info->requests); + continue; + } + merge_bio.head = shadow[j].request->bio; + merge_bio.tail = shadow[j].request->biotail; + bio_list_merge(&info->bio_list, &merge_bio); + shadow[j].request->bio = NULL; + blk_mq_end_request(shadow[j].request, 0); + } + } + blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); err = negotiate_mq(info); @@ -2148,6 +2132,8 @@ static int blkfront_resume(struct xenbus_device *dev) return err; err = talk_to_blkback(dev, info); + if (!err) + blk_mq_update_nr_hw_queues(&info->tag_set, info->nr_rings); /* * We have to wait for the backend to switch to @@ -2484,10 +2470,23 @@ static void blkback_changed(struct xenbus_device *dev, break; case XenbusStateConnected: - if (dev->state != XenbusStateInitialised) { + /* + * talk_to_blkback sets state to XenbusStateInitialised + * and blkfront_connect sets it to XenbusStateConnected + * (if connection went OK). + * + * If the backend (or toolstack) decides to poke at backend + * state (and re-trigger the watch by setting the state repeatedly + * to XenbusStateConnected (4)) we need to deal with this. + * This is allowed as this is used to communicate to the guest + * that the size of disk has changed! + */ + if ((dev->state != XenbusStateInitialised) && + (dev->state != XenbusStateConnected)) { if (talk_to_blkback(dev, info)) break; } + blkfront_connect(info); break; diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 3ef42e563..b51a816d7 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/wait.h> #include <linux/sched.h> +#include <linux/cpu.h> #include "zcomp.h" #include "zcomp_lzo.h" @@ -20,29 +21,6 @@ #include "zcomp_lz4.h" #endif -/* - * single zcomp_strm backend - */ -struct zcomp_strm_single { - struct mutex strm_lock; - struct zcomp_strm *zstrm; -}; - -/* - * multi zcomp_strm backend - */ -struct zcomp_strm_multi { - /* protect strm list */ - spinlock_t strm_lock; - /* max possible number of zstrm streams */ - int max_strm; - /* number of available zstrm streams */ - int avail_strm; - /* list of available strms */ - struct list_head idle_strm; - wait_queue_head_t strm_wait; -}; - static struct zcomp_backend *backends[] = { &zcomp_lzo, #ifdef CONFIG_ZRAM_LZ4_COMPRESS @@ -93,188 +71,6 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags) return zstrm; } -/* - * get idle zcomp_strm or wait until other process release - * (zcomp_strm_release()) one for us - */ -static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp) -{ - struct zcomp_strm_multi *zs = comp->stream; - struct zcomp_strm *zstrm; - - while (1) { - spin_lock(&zs->strm_lock); - if (!list_empty(&zs->idle_strm)) { - zstrm = list_entry(zs->idle_strm.next, - struct zcomp_strm, list); - list_del(&zstrm->list); - spin_unlock(&zs->strm_lock); - return zstrm; - } - /* zstrm streams limit reached, wait for idle stream */ - if (zs->avail_strm >= zs->max_strm) { - spin_unlock(&zs->strm_lock); - wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); - continue; - } - /* allocate new zstrm stream */ - zs->avail_strm++; - spin_unlock(&zs->strm_lock); - /* - * This function can be called in swapout/fs write path - * so we can't use GFP_FS|IO. And it assumes we already - * have at least one stream in zram initialization so we - * don't do best effort to allocate more stream in here. - * A default stream will work well without further multiple - * streams. That's why we use NORETRY | NOWARN. - */ - zstrm = zcomp_strm_alloc(comp, GFP_NOIO | __GFP_NORETRY | - __GFP_NOWARN); - if (!zstrm) { - spin_lock(&zs->strm_lock); - zs->avail_strm--; - spin_unlock(&zs->strm_lock); - wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); - continue; - } - break; - } - return zstrm; -} - -/* add stream back to idle list and wake up waiter or free the stream */ -static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm) -{ - struct zcomp_strm_multi *zs = comp->stream; - - spin_lock(&zs->strm_lock); - if (zs->avail_strm <= zs->max_strm) { - list_add(&zstrm->list, &zs->idle_strm); - spin_unlock(&zs->strm_lock); - wake_up(&zs->strm_wait); - return; - } - - zs->avail_strm--; - spin_unlock(&zs->strm_lock); - zcomp_strm_free(comp, zstrm); -} - -/* change max_strm limit */ -static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) -{ - struct zcomp_strm_multi *zs = comp->stream; - struct zcomp_strm *zstrm; - - spin_lock(&zs->strm_lock); - zs->max_strm = num_strm; - /* - * if user has lowered the limit and there are idle streams, - * immediately free as much streams (and memory) as we can. - */ - while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) { - zstrm = list_entry(zs->idle_strm.next, - struct zcomp_strm, list); - list_del(&zstrm->list); - zcomp_strm_free(comp, zstrm); - zs->avail_strm--; - } - spin_unlock(&zs->strm_lock); - return true; -} - -static void zcomp_strm_multi_destroy(struct zcomp *comp) -{ - struct zcomp_strm_multi *zs = comp->stream; - struct zcomp_strm *zstrm; - - while (!list_empty(&zs->idle_strm)) { - zstrm = list_entry(zs->idle_strm.next, - struct zcomp_strm, list); - list_del(&zstrm->list); - zcomp_strm_free(comp, zstrm); - } - kfree(zs); -} - -static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm) -{ - struct zcomp_strm *zstrm; - struct zcomp_strm_multi *zs; - - comp->destroy = zcomp_strm_multi_destroy; - comp->strm_find = zcomp_strm_multi_find; - comp->strm_release = zcomp_strm_multi_release; - comp->set_max_streams = zcomp_strm_multi_set_max_streams; - zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL); - if (!zs) - return -ENOMEM; - - comp->stream = zs; - spin_lock_init(&zs->strm_lock); - INIT_LIST_HEAD(&zs->idle_strm); - init_waitqueue_head(&zs->strm_wait); - zs->max_strm = max_strm; - zs->avail_strm = 1; - - zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); - if (!zstrm) { - kfree(zs); - return -ENOMEM; - } - list_add(&zstrm->list, &zs->idle_strm); - return 0; -} - -static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp) -{ - struct zcomp_strm_single *zs = comp->stream; - mutex_lock(&zs->strm_lock); - return zs->zstrm; -} - -static void zcomp_strm_single_release(struct zcomp *comp, - struct zcomp_strm *zstrm) -{ - struct zcomp_strm_single *zs = comp->stream; - mutex_unlock(&zs->strm_lock); -} - -static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) -{ - /* zcomp_strm_single support only max_comp_streams == 1 */ - return false; -} - -static void zcomp_strm_single_destroy(struct zcomp *comp) -{ - struct zcomp_strm_single *zs = comp->stream; - zcomp_strm_free(comp, zs->zstrm); - kfree(zs); -} - -static int zcomp_strm_single_create(struct zcomp *comp) -{ - struct zcomp_strm_single *zs; - - comp->destroy = zcomp_strm_single_destroy; - comp->strm_find = zcomp_strm_single_find; - comp->strm_release = zcomp_strm_single_release; - comp->set_max_streams = zcomp_strm_single_set_max_streams; - zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL); - if (!zs) - return -ENOMEM; - - comp->stream = zs; - mutex_init(&zs->strm_lock); - zs->zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); - if (!zs->zstrm) { - kfree(zs); - return -ENOMEM; - } - return 0; -} - /* show available compressors */ ssize_t zcomp_available_show(const char *comp, char *buf) { @@ -299,19 +95,14 @@ bool zcomp_available_algorithm(const char *comp) return find_backend(comp) != NULL; } -bool zcomp_set_max_streams(struct zcomp *comp, int num_strm) -{ - return comp->set_max_streams(comp, num_strm); -} - struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) { - return comp->strm_find(comp); + return *get_cpu_ptr(comp->stream); } void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) { - comp->strm_release(comp, zstrm); + put_cpu_ptr(comp->stream); } int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, @@ -327,9 +118,83 @@ int zcomp_decompress(struct zcomp *comp, const unsigned char *src, return comp->backend->decompress(src, src_len, dst); } +static int __zcomp_cpu_notifier(struct zcomp *comp, + unsigned long action, unsigned long cpu) +{ + struct zcomp_strm *zstrm; + + switch (action) { + case CPU_UP_PREPARE: + if (WARN_ON(*per_cpu_ptr(comp->stream, cpu))) + break; + zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); + if (IS_ERR_OR_NULL(zstrm)) { + pr_err("Can't allocate a compression stream\n"); + return NOTIFY_BAD; + } + *per_cpu_ptr(comp->stream, cpu) = zstrm; + break; + case CPU_DEAD: + case CPU_UP_CANCELED: + zstrm = *per_cpu_ptr(comp->stream, cpu); + if (!IS_ERR_OR_NULL(zstrm)) + zcomp_strm_free(comp, zstrm); + *per_cpu_ptr(comp->stream, cpu) = NULL; + break; + default: + break; + } + return NOTIFY_OK; +} + +static int zcomp_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *pcpu) +{ + unsigned long cpu = (unsigned long)pcpu; + struct zcomp *comp = container_of(nb, typeof(*comp), notifier); + + return __zcomp_cpu_notifier(comp, action, cpu); +} + +static int zcomp_init(struct zcomp *comp) +{ + unsigned long cpu; + int ret; + + comp->notifier.notifier_call = zcomp_cpu_notifier; + + comp->stream = alloc_percpu(struct zcomp_strm *); + if (!comp->stream) + return -ENOMEM; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) { + ret = __zcomp_cpu_notifier(comp, CPU_UP_PREPARE, cpu); + if (ret == NOTIFY_BAD) + goto cleanup; + } + __register_cpu_notifier(&comp->notifier); + cpu_notifier_register_done(); + return 0; + +cleanup: + for_each_online_cpu(cpu) + __zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu); + cpu_notifier_register_done(); + return -ENOMEM; +} + void zcomp_destroy(struct zcomp *comp) { - comp->destroy(comp); + unsigned long cpu; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) + __zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu); + __unregister_cpu_notifier(&comp->notifier); + cpu_notifier_register_done(); + + free_percpu(comp->stream); kfree(comp); } @@ -339,9 +204,9 @@ void zcomp_destroy(struct zcomp *comp) * backend pointer or ERR_PTR if things went bad. ERR_PTR(-EINVAL) * if requested algorithm is not supported, ERR_PTR(-ENOMEM) in * case of allocation error, or any other error potentially - * returned by functions zcomp_strm_{multi,single}_create. + * returned by zcomp_init(). */ -struct zcomp *zcomp_create(const char *compress, int max_strm) +struct zcomp *zcomp_create(const char *compress) { struct zcomp *comp; struct zcomp_backend *backend; @@ -356,10 +221,7 @@ struct zcomp *zcomp_create(const char *compress, int max_strm) return ERR_PTR(-ENOMEM); comp->backend = backend; - if (max_strm > 1) - error = zcomp_strm_multi_create(comp, max_strm); - else - error = zcomp_strm_single_create(comp); + error = zcomp_init(comp); if (error) { kfree(comp); return ERR_PTR(error); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index b7d2a4bca..ffd88cb74 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -10,8 +10,6 @@ #ifndef _ZCOMP_H_ #define _ZCOMP_H_ -#include <linux/mutex.h> - struct zcomp_strm { /* compression/decompression buffer */ void *buffer; @@ -21,8 +19,6 @@ struct zcomp_strm { * working memory) */ void *private; - /* used in multi stream backend, protected by backend strm_lock */ - struct list_head list; }; /* static compression backend */ @@ -41,19 +37,15 @@ struct zcomp_backend { /* dynamic per-device compression frontend */ struct zcomp { - void *stream; + struct zcomp_strm * __percpu *stream; struct zcomp_backend *backend; - - struct zcomp_strm *(*strm_find)(struct zcomp *comp); - void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); - bool (*set_max_streams)(struct zcomp *comp, int num_strm); - void (*destroy)(struct zcomp *comp); + struct notifier_block notifier; }; ssize_t zcomp_available_show(const char *comp, char *buf); bool zcomp_available_algorithm(const char *comp); -struct zcomp *zcomp_create(const char *comp, int max_strm); +struct zcomp *zcomp_create(const char *comp); void zcomp_destroy(struct zcomp *comp); struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 370c2f760..8fcad8b76 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -304,46 +304,25 @@ static ssize_t mem_used_max_store(struct device *dev, return len; } +/* + * We switched to per-cpu streams and this attr is not needed anymore. + * However, we will keep it around for some time, because: + * a) we may revert per-cpu streams in the future + * b) it's visible to user space and we need to follow our 2 years + * retirement rule; but we already have a number of 'soon to be + * altered' attrs, so max_comp_streams need to wait for the next + * layoff cycle. + */ static ssize_t max_comp_streams_show(struct device *dev, struct device_attribute *attr, char *buf) { - int val; - struct zram *zram = dev_to_zram(dev); - - down_read(&zram->init_lock); - val = zram->max_comp_streams; - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus()); } static ssize_t max_comp_streams_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - int num; - struct zram *zram = dev_to_zram(dev); - int ret; - - ret = kstrtoint(buf, 0, &num); - if (ret < 0) - return ret; - if (num < 1) - return -EINVAL; - - down_write(&zram->init_lock); - if (init_done(zram)) { - if (!zcomp_set_max_streams(zram->comp, num)) { - pr_info("Cannot change max compression streams\n"); - ret = -EINVAL; - goto out; - } - } - - zram->max_comp_streams = num; - ret = len; -out: - up_write(&zram->init_lock); - return ret; + return len; } static ssize_t comp_algorithm_show(struct device *dev, @@ -456,8 +435,26 @@ static ssize_t mm_stat_show(struct device *dev, return ret; } +static ssize_t debug_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int version = 1; + struct zram *zram = dev_to_zram(dev); + ssize_t ret; + + down_read(&zram->init_lock); + ret = scnprintf(buf, PAGE_SIZE, + "version: %d\n%8llu\n", + version, + (u64)atomic64_read(&zram->stats.writestall)); + up_read(&zram->init_lock); + + return ret; +} + static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); +static DEVICE_ATTR_RO(debug_stat); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); ZRAM_ATTR_RO(failed_reads); @@ -514,7 +511,7 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize) goto out_error; } - meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM); + meta->mem_pool = zs_create_pool(pool_name); if (!meta->mem_pool) { pr_err("Error creating memory pool\n"); goto out_error; @@ -650,7 +647,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, { int ret = 0; size_t clen; - unsigned long handle; + unsigned long handle = 0; struct page *page; unsigned char *user_mem, *cmem, *src, *uncmem = NULL; struct zram_meta *meta = zram->meta; @@ -673,9 +670,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } - zstrm = zcomp_strm_find(zram->comp); +compress_again: user_mem = kmap_atomic(page); - if (is_partial_io(bvec)) { memcpy(uncmem + offset, user_mem + bvec->bv_offset, bvec->bv_len); @@ -699,6 +695,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } + zstrm = zcomp_strm_find(zram->comp); ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen); if (!is_partial_io(bvec)) { kunmap_atomic(user_mem); @@ -710,6 +707,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, pr_err("Compression failed! err=%d\n", ret); goto out; } + src = zstrm->buffer; if (unlikely(clen > max_zpage_size)) { clen = PAGE_SIZE; @@ -717,8 +715,35 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, src = uncmem; } - handle = zs_malloc(meta->mem_pool, clen); + /* + * handle allocation has 2 paths: + * a) fast path is executed with preemption disabled (for + * per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear, + * since we can't sleep; + * b) slow path enables preemption and attempts to allocate + * the page with __GFP_DIRECT_RECLAIM bit set. we have to + * put per-cpu compression stream and, thus, to re-do + * the compression once handle is allocated. + * + * if we have a 'non-null' handle here then we are coming + * from the slow path and handle has already been allocated. + */ + if (!handle) + handle = zs_malloc(meta->mem_pool, clen, + __GFP_KSWAPD_RECLAIM | + __GFP_NOWARN | + __GFP_HIGHMEM); if (!handle) { + zcomp_strm_release(zram->comp, zstrm); + zstrm = NULL; + + atomic64_inc(&zram->stats.writestall); + + handle = zs_malloc(meta->mem_pool, clen, + GFP_NOIO | __GFP_HIGHMEM); + if (handle) + goto compress_again; + pr_err("Error allocating memory for compressed page: %u, size=%zu\n", index, clen); ret = -ENOMEM; @@ -1009,7 +1034,6 @@ static void zram_reset_device(struct zram *zram) /* Reset stats */ memset(&zram->stats, 0, sizeof(zram->stats)); zram->disksize = 0; - zram->max_comp_streams = 1; set_capacity(zram->disk, 0); part_stat_set_all(&zram->disk->part0, 0); @@ -1038,7 +1062,7 @@ static ssize_t disksize_store(struct device *dev, if (!meta) return -ENOMEM; - comp = zcomp_create(zram->compressor, zram->max_comp_streams); + comp = zcomp_create(zram->compressor); if (IS_ERR(comp)) { pr_err("Cannot initialise %s compressing backend\n", zram->compressor); @@ -1177,6 +1201,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_comp_algorithm.attr, &dev_attr_io_stat.attr, &dev_attr_mm_stat.attr, + &dev_attr_debug_stat.attr, NULL, }; @@ -1273,7 +1298,6 @@ static int zram_add(void) } strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); zram->meta = NULL; - zram->max_comp_streams = 1; pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 8e9233968..3f5bf66a2 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -85,6 +85,7 @@ struct zram_stats { atomic64_t zero_pages; /* no. of zero filled pages */ atomic64_t pages_stored; /* no. of pages currently stored */ atomic_long_t max_used_pages; /* no. of maximum pages stored */ + atomic64_t writestall; /* no. of write slow paths */ }; struct zram_meta { @@ -102,7 +103,6 @@ struct zram { * the number of pages zram can consume for storing compressed data */ unsigned long limit_pages; - int max_comp_streams; struct zram_stats stats; atomic_t refcount; /* refcount for zram_meta */ |