Diffstat (limited to 'drivers/block/drbd')
24 files changed, 27117 insertions, 0 deletions
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig
new file mode 100644
index 000000000..7845bd6ee
--- /dev/null
+++ b/drivers/block/drbd/Kconfig
@@ -0,0 +1,73 @@
+#
+# DRBD device driver configuration
+#
+
+comment "DRBD disabled because PROC_FS or INET not selected"
+	depends on PROC_FS='n' || INET='n'
+
+config BLK_DEV_DRBD
+	tristate "DRBD Distributed Replicated Block Device support"
+	depends on PROC_FS && INET
+	select LRU_CACHE
+	select LIBCRC32C
+	default n
+	help
+
+	  NOTE: In order to authenticate connections you have to select
+	  CRYPTO_HMAC and a hash function as well.
+
+	  DRBD is a shared-nothing, synchronously replicated block device. It
+	  is designed to serve as a building block for high availability
+	  clusters and in this context, is a "drop-in" replacement for shared
+	  storage. Simplistically, you could see it as a network RAID 1.
+
+	  Each minor device has a role, which can be 'primary' or 'secondary'.
+	  On the node with the primary device the application is supposed to
+	  run and to access the device (/dev/drbdX). Every write is sent to
+	  the local 'lower level block device' and, across the network, to the
+	  node with the device in 'secondary' state. The secondary device
+	  simply writes the data to its lower level block device.
+
+	  DRBD can also be used in dual-Primary mode (device writable on both
+	  nodes), which means it can exhibit shared disk semantics in a
+	  shared-nothing cluster. Needless to say, on top of dual-Primary
+	  DRBD utilizing a cluster file system is necessary to maintain for
+	  cache coherency.
+
+	  For automatic failover you need a cluster manager (e.g. heartbeat).
+	  See also: http://www.drbd.org/, http://www.linux-ha.org
+
+	  If unsure, say N.
+
+config DRBD_FAULT_INJECTION
+	bool "DRBD fault injection"
+	depends on BLK_DEV_DRBD
+	help
+
+	 Say Y here if you want to simulate IO errors, in order to test DRBD's
+	 behavior.
+
+	 The actual simulation of IO errors is done by writing 3 values to
+	 /sys/module/drbd/parameters/
+
+	 enable_faults: bitmask of...
+	 1	meta data write
+	 2	          read
+	 4	resync data write
+	 8	            read
+	 16	data write
+	 32	data read
+	 64	read ahead
+	 128	kmalloc of bitmap
+	 256	allocation of peer_requests
+	 512	insert data corruption on receiving side
+
+	 fault_devs: bitmask of minor numbers
+	 fault_rate: frequency in percent
+
+	 Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
+		echo 16 > /sys/module/drbd/parameters/enable_faults
+		echo 1 > /sys/module/drbd/parameters/fault_devs
+		echo 5 > /sys/module/drbd/parameters/fault_rate
+
+	 If unsure, say N.
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
new file mode 100644
index 000000000..4464e353c
--- /dev/null
+++ b/drivers/block/drbd/Makefile
@@ -0,0 +1,8 @@
+drbd-y := drbd_bitmap.o drbd_proc.o
+drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
+drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
+drbd-y += drbd_interval.o drbd_state.o
+drbd-y += drbd_nla.o
+drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o
+
+obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
new file mode 100644
index 000000000..1318e3217
--- /dev/null
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -0,0 +1,1219 @@
+/*
+   drbd_actlog.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include <linux/slab.h> +#include <linux/crc32c.h> +#include <linux/drbd.h> +#include <linux/drbd_limits.h> +#include <linux/dynamic_debug.h> +#include "drbd_int.h" + + +enum al_transaction_types { + AL_TR_UPDATE = 0, + AL_TR_INITIALIZED = 0xffff +}; +/* all fields on disc in big endian */ +struct __packed al_transaction_on_disk { + /* don't we all like magic */ + __be32 magic; + + /* to identify the most recent transaction block + * in the on disk ring buffer */ + __be32 tr_number; + + /* checksum on the full 4k block, with this field set to 0. */ + __be32 crc32c; + + /* type of transaction, special transaction types like: + * purge-all, set-all-idle, set-all-active, ... to-be-defined + * see also enum al_transaction_types */ + __be16 transaction_type; + + /* we currently allow only a few thousand extents, + * so 16bit will be enough for the slot number. */ + + /* how many updates in this transaction */ + __be16 n_updates; + + /* maximum slot number, "al-extents" in drbd.conf speak. + * Having this in each transaction should make reconfiguration + * of that parameter easier. */ + __be16 context_size; + + /* slot number the context starts with */ + __be16 context_start_slot_nr; + + /* Some reserved bytes. Expected usage is a 64bit counter of + * sectors-written since device creation, and other data generation tag + * supporting usage */ + __be32 __reserved[4]; + + /* --- 36 byte used --- */ + + /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes + * in one transaction, then use the remaining byte in the 4k block for + * context information. "Flexible" number of updates per transaction + * does not help, as we have to account for the case when all update + * slots are used anyways, so it would only complicate code without + * additional benefit. 
+ */ + __be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION]; + + /* but the extent number is 32bit, which at an extent size of 4 MiB + * allows to cover device sizes of up to 2**54 Byte (16 PiB) */ + __be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION]; + + /* --- 420 bytes used (36 + 64*6) --- */ + + /* 4096 - 420 = 3676 = 919 * 4 */ + __be32 context[AL_CONTEXT_PER_TRANSACTION]; +}; + +void *drbd_md_get_buffer(struct drbd_device *device, const char *intent) +{ + int r; + + wait_event(device->misc_wait, + (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 || + device->state.disk <= D_FAILED); + + if (r) + return NULL; + + device->md_io.current_use = intent; + device->md_io.start_jif = jiffies; + device->md_io.submit_jif = device->md_io.start_jif - 1; + return page_address(device->md_io.page); +} + +void drbd_md_put_buffer(struct drbd_device *device) +{ + if (atomic_dec_and_test(&device->md_io.in_use)) + wake_up(&device->misc_wait); +} + +void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_backing_dev *bdev, + unsigned int *done) +{ + long dt; + + rcu_read_lock(); + dt = rcu_dereference(bdev->disk_conf)->disk_timeout; + rcu_read_unlock(); + dt = dt * HZ / 10; + if (dt == 0) + dt = MAX_SCHEDULE_TIMEOUT; + + dt = wait_event_timeout(device->misc_wait, + *done || test_bit(FORCE_DETACH, &device->flags), dt); + if (dt == 0) { + drbd_err(device, "meta-data IO operation timed out\n"); + drbd_chk_io_error(device, 1, DRBD_FORCE_DETACH); + } +} + +static int _drbd_md_sync_page_io(struct drbd_device *device, + struct drbd_backing_dev *bdev, + sector_t sector, int rw) +{ + struct bio *bio; + /* we do all our meta data IO in aligned 4k blocks. */ + const int size = 4096; + int err; + + device->md_io.done = 0; + device->md_io.error = -ENODEV; + + if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags)) + rw |= REQ_FUA | REQ_FLUSH; + rw |= REQ_SYNC | REQ_NOIDLE; + + bio = bio_alloc_drbd(GFP_NOIO); + bio->bi_bdev = bdev->md_bdev; + bio->bi_iter.bi_sector = sector; + err = -EIO; + if (bio_add_page(bio, device->md_io.page, size, 0) != size) + goto out; + bio->bi_private = device; + bio->bi_end_io = drbd_md_endio; + bio->bi_rw = rw; + + if (!(rw & WRITE) && device->state.disk == D_DISKLESS && device->ldev == NULL) + /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */ + ; + else if (!get_ldev_if_state(device, D_ATTACHING)) { + /* Corresponding put_ldev in drbd_md_endio() */ + drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); + err = -ENODEV; + goto out; + } + + bio_get(bio); /* one bio_put() is in the completion handler */ + atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */ + device->md_io.submit_jif = jiffies; + if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) + bio_endio(bio, -EIO); + else + submit_bio(rw, bio); + wait_until_done_or_force_detached(device, bdev, &device->md_io.done); + if (bio_flagged(bio, BIO_UPTODATE)) + err = device->md_io.error; + + out: + bio_put(bio); + return err; +} + +int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev, + sector_t sector, int rw) +{ + int err; + D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1); + + BUG_ON(!bdev->md_bdev); + + dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", + current->comm, current->pid, __func__, + (unsigned long long)sector, (rw & WRITE) ? 
"WRITE" : "READ", + (void*)_RET_IP_ ); + + if (sector < drbd_md_first_sector(bdev) || + sector + 7 > drbd_md_last_sector(bdev)) + drbd_alert(device, "%s [%d]:%s(,%llus,%s) out of range md access!\n", + current->comm, current->pid, __func__, + (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); + + err = _drbd_md_sync_page_io(device, bdev, sector, rw); + if (err) { + drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", + (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); + } + return err; +} + +static struct bm_extent *find_active_resync_extent(struct drbd_device *device, unsigned int enr) +{ + struct lc_element *tmp; + tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT); + if (unlikely(tmp != NULL)) { + struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); + if (test_bit(BME_NO_WRITES, &bm_ext->flags)) + return bm_ext; + } + return NULL; +} + +static struct lc_element *_al_get(struct drbd_device *device, unsigned int enr, bool nonblock) +{ + struct lc_element *al_ext; + struct bm_extent *bm_ext; + int wake; + + spin_lock_irq(&device->al_lock); + bm_ext = find_active_resync_extent(device, enr); + if (bm_ext) { + wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); + spin_unlock_irq(&device->al_lock); + if (wake) + wake_up(&device->al_wait); + return NULL; + } + if (nonblock) + al_ext = lc_try_get(device->act_log, enr); + else + al_ext = lc_get(device->act_log, enr); + spin_unlock_irq(&device->al_lock); + return al_ext; +} + +bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i) +{ + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + + D_ASSERT(device, (unsigned)(last - first) <= 1); + D_ASSERT(device, atomic_read(&device->local_cnt) > 0); + + /* FIXME figure out a fast path for bios crossing AL extent boundaries */ + if (first != last) + return false; + + return _al_get(device, first, true); +} + +bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i) +{ + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + unsigned enr; + bool need_transaction = false; + + D_ASSERT(device, first <= last); + D_ASSERT(device, atomic_read(&device->local_cnt) > 0); + + for (enr = first; enr <= last; enr++) { + struct lc_element *al_ext; + wait_event(device->al_wait, + (al_ext = _al_get(device, enr, false)) != NULL); + if (al_ext->lc_number != enr) + need_transaction = true; + } + return need_transaction; +} + +static int al_write_transaction(struct drbd_device *device); + +void drbd_al_begin_io_commit(struct drbd_device *device) +{ + bool locked = false; + + /* Serialize multiple transactions. + * This uses test_and_set_bit, memory barrier is implicit. + */ + wait_event(device->al_wait, + device->act_log->pending_changes == 0 || + (locked = lc_try_lock_for_transaction(device->act_log))); + + if (locked) { + /* Double check: it may have been committed by someone else, + * while we have been waiting for the lock. 
*/ + if (device->act_log->pending_changes) { + bool write_al_updates; + + rcu_read_lock(); + write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; + rcu_read_unlock(); + + if (write_al_updates) + al_write_transaction(device); + spin_lock_irq(&device->al_lock); + /* FIXME + if (err) + we need an "lc_cancel" here; + */ + lc_committed(device->act_log); + spin_unlock_irq(&device->al_lock); + } + lc_unlock(device->act_log); + wake_up(&device->al_wait); + } +} + +/* + * @delegate: delegate activity log I/O to the worker thread + */ +void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i) +{ + if (drbd_al_begin_io_prepare(device, i)) + drbd_al_begin_io_commit(device); +} + +int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i) +{ + struct lru_cache *al = device->act_log; + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + unsigned nr_al_extents; + unsigned available_update_slots; + unsigned enr; + + D_ASSERT(device, first <= last); + + nr_al_extents = 1 + last - first; /* worst case: all touched extends are cold. */ + available_update_slots = min(al->nr_elements - al->used, + al->max_pending_changes - al->pending_changes); + + /* We want all necessary updates for a given request within the same transaction + * We could first check how many updates are *actually* needed, + * and use that instead of the worst-case nr_al_extents */ + if (available_update_slots < nr_al_extents) { + /* Too many activity log extents are currently "hot". + * + * If we have accumulated pending changes already, + * we made progress. + * + * If we cannot get even a single pending change through, + * stop the fast path until we made some progress, + * or requests to "cold" extents could be starved. */ + if (!al->pending_changes) + __set_bit(__LC_STARVING, &device->act_log->flags); + return -ENOBUFS; + } + + /* Is resync active in this area? */ + for (enr = first; enr <= last; enr++) { + struct lc_element *tmp; + tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT); + if (unlikely(tmp != NULL)) { + struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); + if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { + if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags)) + return -EBUSY; + return -EWOULDBLOCK; + } + } + } + + /* Checkout the refcounts. + * Given that we checked for available elements and update slots above, + * this has to be successful. */ + for (enr = first; enr <= last; enr++) { + struct lc_element *al_ext; + al_ext = lc_get_cumulative(device->act_log, enr); + if (!al_ext) + drbd_info(device, "LOGIC BUG for enr=%u\n", enr); + } + return 0; +} + +void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i) +{ + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = i->size == 0 ? 
first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + unsigned enr; + struct lc_element *extent; + unsigned long flags; + + D_ASSERT(device, first <= last); + spin_lock_irqsave(&device->al_lock, flags); + + for (enr = first; enr <= last; enr++) { + extent = lc_find(device->act_log, enr); + if (!extent) { + drbd_err(device, "al_complete_io() called on inactive extent %u\n", enr); + continue; + } + lc_put(device->act_log, extent); + } + spin_unlock_irqrestore(&device->al_lock, flags); + wake_up(&device->al_wait); +} + +#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) +/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT + * are still coupled, or assume too much about their relation. + * Code below will not work if this is violated. + * Will be cleaned up with some followup patch. + */ +# error FIXME +#endif + +static unsigned int al_extent_to_bm_page(unsigned int al_enr) +{ + return al_enr >> + /* bit to page */ + ((PAGE_SHIFT + 3) - + /* al extent number to bit */ + (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); +} + +static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) +{ + const unsigned int stripes = device->ldev->md.al_stripes; + const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k; + + /* transaction number, modulo on-disk ring buffer wrap around */ + unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k); + + /* ... to aligned 4k on disk block */ + t = ((t % stripes) * stripe_size_4kB) + t/stripes; + + /* ... to 512 byte sector in activity log */ + t *= 8; + + /* ... plus offset to the on disk position */ + return device->ldev->md.md_offset + device->ldev->md.al_offset + t; +} + +int al_write_transaction(struct drbd_device *device) +{ + struct al_transaction_on_disk *buffer; + struct lc_element *e; + sector_t sector; + int i, mx; + unsigned extent_nr; + unsigned crc = 0; + int err = 0; + + if (!get_ldev(device)) { + drbd_err(device, "disk is %s, cannot start al transaction\n", + drbd_disk_str(device->state.disk)); + return -EIO; + } + + /* The bitmap write may have failed, causing a state change. */ + if (device->state.disk < D_INCONSISTENT) { + drbd_err(device, + "disk is %s, cannot write al transaction\n", + drbd_disk_str(device->state.disk)); + put_ldev(device); + return -EIO; + } + + /* protects md_io_buffer, al_tr_cycle, ... */ + buffer = drbd_md_get_buffer(device, __func__); + if (!buffer) { + drbd_err(device, "disk failed while waiting for md_io buffer\n"); + put_ldev(device); + return -ENODEV; + } + + memset(buffer, 0, sizeof(*buffer)); + buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); + buffer->tr_number = cpu_to_be32(device->al_tr_number); + + i = 0; + + /* Even though no one can start to change this list + * once we set the LC_LOCKED -- from drbd_al_begin_io(), + * lc_try_lock_for_transaction() --, someone may still + * be in the process of changing it. 
*/ + spin_lock_irq(&device->al_lock); + list_for_each_entry(e, &device->act_log->to_be_changed, list) { + if (i == AL_UPDATES_PER_TRANSACTION) { + i++; + break; + } + buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); + buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); + if (e->lc_number != LC_FREE) + drbd_bm_mark_for_writeout(device, + al_extent_to_bm_page(e->lc_number)); + i++; + } + spin_unlock_irq(&device->al_lock); + BUG_ON(i > AL_UPDATES_PER_TRANSACTION); + + buffer->n_updates = cpu_to_be16(i); + for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { + buffer->update_slot_nr[i] = cpu_to_be16(-1); + buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); + } + + buffer->context_size = cpu_to_be16(device->act_log->nr_elements); + buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle); + + mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, + device->act_log->nr_elements - device->al_tr_cycle); + for (i = 0; i < mx; i++) { + unsigned idx = device->al_tr_cycle + i; + extent_nr = lc_element_by_index(device->act_log, idx)->lc_number; + buffer->context[i] = cpu_to_be32(extent_nr); + } + for (; i < AL_CONTEXT_PER_TRANSACTION; i++) + buffer->context[i] = cpu_to_be32(LC_FREE); + + device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; + if (device->al_tr_cycle >= device->act_log->nr_elements) + device->al_tr_cycle = 0; + + sector = al_tr_number_to_on_disk_sector(device); + + crc = crc32c(0, buffer, 4096); + buffer->crc32c = cpu_to_be32(crc); + + if (drbd_bm_write_hinted(device)) + err = -EIO; + else { + bool write_al_updates; + rcu_read_lock(); + write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; + rcu_read_unlock(); + if (write_al_updates) { + if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { + err = -EIO; + drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); + } else { + device->al_tr_number++; + device->al_writ_cnt++; + } + } + } + + drbd_md_put_buffer(device); + put_ldev(device); + + return err; +} + +static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext) +{ + int rv; + + spin_lock_irq(&device->al_lock); + rv = (al_ext->refcnt == 0); + if (likely(rv)) + lc_del(device->act_log, al_ext); + spin_unlock_irq(&device->al_lock); + + return rv; +} + +/** + * drbd_al_shrink() - Removes all active extents form the activity log + * @device: DRBD device. + * + * Removes all active extents form the activity log, waiting until + * the reference count of each entry dropped to 0 first, of course. 
+ * + * You need to lock device->act_log with lc_try_lock() / lc_unlock() + */ +void drbd_al_shrink(struct drbd_device *device) +{ + struct lc_element *al_ext; + int i; + + D_ASSERT(device, test_bit(__LC_LOCKED, &device->act_log->flags)); + + for (i = 0; i < device->act_log->nr_elements; i++) { + al_ext = lc_element_by_index(device->act_log, i); + if (al_ext->lc_number == LC_FREE) + continue; + wait_event(device->al_wait, _try_lc_del(device, al_ext)); + } + + wake_up(&device->al_wait); +} + +int drbd_initialize_al(struct drbd_device *device, void *buffer) +{ + struct al_transaction_on_disk *al = buffer; + struct drbd_md *md = &device->ldev->md; + sector_t al_base = md->md_offset + md->al_offset; + int al_size_4k = md->al_stripes * md->al_stripe_size_4k; + int i; + + memset(al, 0, 4096); + al->magic = cpu_to_be32(DRBD_AL_MAGIC); + al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED); + al->crc32c = cpu_to_be32(crc32c(0, al, 4096)); + + for (i = 0; i < al_size_4k; i++) { + int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE); + if (err) + return err; + } + return 0; +} + +static const char *drbd_change_sync_fname[] = { + [RECORD_RS_FAILED] = "drbd_rs_failed_io", + [SET_IN_SYNC] = "drbd_set_in_sync", + [SET_OUT_OF_SYNC] = "drbd_set_out_of_sync" +}; + +/* ATTENTION. The AL's extents are 4MB each, while the extents in the + * resync LRU-cache are 16MB each. + * The caller of this function has to hold an get_ldev() reference. + * + * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success), + * potentially pulling in (and recounting the corresponding bits) + * this resync extent into the resync extent lru cache. + * + * Returns whether all bits have been cleared for this resync extent, + * precisely: (rs_left <= rs_failed) + * + * TODO will be obsoleted once we have a caching lru of the on disk bitmap + */ +static bool update_rs_extent(struct drbd_device *device, + unsigned int enr, int count, + enum update_sync_bits_mode mode) +{ + struct lc_element *e; + + D_ASSERT(device, atomic_read(&device->local_cnt)); + + /* When setting out-of-sync bits, + * we don't need it cached (lc_find). + * But if it is present in the cache, + * we should update the cached bit count. + * Otherwise, that extent should be in the resync extent lru cache + * already -- or we want to pull it in if necessary -- (lc_get), + * then update and check rs_left and rs_failed. */ + if (mode == SET_OUT_OF_SYNC) + e = lc_find(device->resync, enr); + else + e = lc_get(device->resync, enr); + if (e) { + struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); + if (ext->lce.lc_number == enr) { + if (mode == SET_IN_SYNC) + ext->rs_left -= count; + else if (mode == SET_OUT_OF_SYNC) + ext->rs_left += count; + else + ext->rs_failed += count; + if (ext->rs_left < ext->rs_failed) { + drbd_warn(device, "BAD! enr=%u rs_left=%d " + "rs_failed=%d count=%d cstate=%s\n", + ext->lce.lc_number, ext->rs_left, + ext->rs_failed, count, + drbd_conn_str(device->state.conn)); + + /* We don't expect to be able to clear more bits + * than have been set when we originally counted + * the set bits to cache that value in ext->rs_left. + * Whatever the reason (disconnect during resync, + * delayed local completion of an application write), + * try to fix it up by recounting here. */ + ext->rs_left = drbd_bm_e_weight(device, enr); + } + } else { + /* Normally this element should be in the cache, + * since drbd_rs_begin_io() pulled it already in. 
+ * + * But maybe an application write finished, and we set + * something outside the resync lru_cache in sync. + */ + int rs_left = drbd_bm_e_weight(device, enr); + if (ext->flags != 0) { + drbd_warn(device, "changing resync lce: %d[%u;%02lx]" + " -> %d[%u;00]\n", + ext->lce.lc_number, ext->rs_left, + ext->flags, enr, rs_left); + ext->flags = 0; + } + if (ext->rs_failed) { + drbd_warn(device, "Kicking resync_lru element enr=%u " + "out with rs_failed=%d\n", + ext->lce.lc_number, ext->rs_failed); + } + ext->rs_left = rs_left; + ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0; + /* we don't keep a persistent log of the resync lru, + * we can commit any change right away. */ + lc_committed(device->resync); + } + if (mode != SET_OUT_OF_SYNC) + lc_put(device->resync, &ext->lce); + /* no race, we are within the al_lock! */ + + if (ext->rs_left <= ext->rs_failed) { + ext->rs_failed = 0; + return true; + } + } else if (mode != SET_OUT_OF_SYNC) { + /* be quiet if lc_find() did not find it. */ + drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n", + device->resync_locked, + device->resync->nr_elements, + device->resync->flags); + } + return false; +} + +void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go) +{ + unsigned long now = jiffies; + unsigned long last = device->rs_mark_time[device->rs_last_mark]; + int next = (device->rs_last_mark + 1) % DRBD_SYNC_MARKS; + if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) { + if (device->rs_mark_left[device->rs_last_mark] != still_to_go && + device->state.conn != C_PAUSED_SYNC_T && + device->state.conn != C_PAUSED_SYNC_S) { + device->rs_mark_time[next] = now; + device->rs_mark_left[next] = still_to_go; + device->rs_last_mark = next; + } + } +} + +/* It is called lazy update, so don't do write-out too often. */ +static bool lazy_bitmap_update_due(struct drbd_device *device) +{ + return time_after(jiffies, device->rs_last_bcast + 2*HZ); +} + +static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done) +{ + if (rs_done) + set_bit(RS_DONE, &device->flags); + /* and also set RS_PROGRESS below */ + else if (!lazy_bitmap_update_due(device)) + return; + + drbd_device_post_work(device, RS_PROGRESS); +} + +static int update_sync_bits(struct drbd_device *device, + unsigned long sbnr, unsigned long ebnr, + enum update_sync_bits_mode mode) +{ + /* + * We keep a count of set bits per resync-extent in the ->rs_left + * caching member, so we need to loop and work within the resync extent + * alignment. Typically this loop will execute exactly once. + */ + unsigned long flags; + unsigned long count = 0; + unsigned int cleared = 0; + while (sbnr <= ebnr) { + /* set temporary boundary bit number to last bit number within + * the resync extent of the current start bit number, + * but cap at provided end bit number */ + unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK); + unsigned long c; + + if (mode == RECORD_RS_FAILED) + /* Only called from drbd_rs_failed_io(), bits + * supposedly still set. Recount, maybe some + * of the bits have been successfully cleared + * by application IO meanwhile. 
+ */ + c = drbd_bm_count_bits(device, sbnr, tbnr); + else if (mode == SET_IN_SYNC) + c = drbd_bm_clear_bits(device, sbnr, tbnr); + else /* if (mode == SET_OUT_OF_SYNC) */ + c = drbd_bm_set_bits(device, sbnr, tbnr); + + if (c) { + spin_lock_irqsave(&device->al_lock, flags); + cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode); + spin_unlock_irqrestore(&device->al_lock, flags); + count += c; + } + sbnr = tbnr + 1; + } + if (count) { + if (mode == SET_IN_SYNC) { + unsigned long still_to_go = drbd_bm_total_weight(device); + bool rs_is_done = (still_to_go <= device->rs_failed); + drbd_advance_rs_marks(device, still_to_go); + if (cleared || rs_is_done) + maybe_schedule_on_disk_bitmap_update(device, rs_is_done); + } else if (mode == RECORD_RS_FAILED) + device->rs_failed += count; + wake_up(&device->al_wait); + } + return count; +} + +/* clear the bit corresponding to the piece of storage in question: + * size byte of data starting from sector. Only clear a bits of the affected + * one ore more _aligned_ BM_BLOCK_SIZE blocks. + * + * called by worker on C_SYNC_TARGET and receiver on SyncSource. + * + */ +int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, + enum update_sync_bits_mode mode) +{ + /* Is called from worker and receiver context _only_ */ + unsigned long sbnr, ebnr, lbnr; + unsigned long count = 0; + sector_t esector, nr_sectors; + + /* This would be an empty REQ_FLUSH, be silent. */ + if ((mode == SET_OUT_OF_SYNC) && size == 0) + return 0; + + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { + drbd_err(device, "%s: sector=%llus size=%d nonsense!\n", + drbd_change_sync_fname[mode], + (unsigned long long)sector, size); + return 0; + } + + if (!get_ldev(device)) + return 0; /* no disk, no metadata, no bitmap to manipulate bits in */ + + nr_sectors = drbd_get_capacity(device->this_bdev); + esector = sector + (size >> 9) - 1; + + if (!expect(sector < nr_sectors)) + goto out; + if (!expect(esector < nr_sectors)) + esector = nr_sectors - 1; + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + if (mode == SET_IN_SYNC) { + /* Round up start sector, round down end sector. We make sure + * we only clear full, aligned, BM_BLOCK_SIZE blocks. */ + if (unlikely(esector < BM_SECT_PER_BIT-1)) + goto out; + if (unlikely(esector == (nr_sectors-1))) + ebnr = lbnr; + else + ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); + sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); + } else { + /* We set it out of sync, or record resync failure. + * Should not round anything here. */ + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + } + + count = update_sync_bits(device, sbnr, ebnr, mode); +out: + put_ldev(device); + return count; +} + +static +struct bm_extent *_bme_get(struct drbd_device *device, unsigned int enr) +{ + struct lc_element *e; + struct bm_extent *bm_ext; + int wakeup = 0; + unsigned long rs_flags; + + spin_lock_irq(&device->al_lock); + if (device->resync_locked > device->resync->nr_elements/2) { + spin_unlock_irq(&device->al_lock); + return NULL; + } + e = lc_get(device->resync, enr); + bm_ext = e ? 
lc_entry(e, struct bm_extent, lce) : NULL; + if (bm_ext) { + if (bm_ext->lce.lc_number != enr) { + bm_ext->rs_left = drbd_bm_e_weight(device, enr); + bm_ext->rs_failed = 0; + lc_committed(device->resync); + wakeup = 1; + } + if (bm_ext->lce.refcnt == 1) + device->resync_locked++; + set_bit(BME_NO_WRITES, &bm_ext->flags); + } + rs_flags = device->resync->flags; + spin_unlock_irq(&device->al_lock); + if (wakeup) + wake_up(&device->al_wait); + + if (!bm_ext) { + if (rs_flags & LC_STARVING) + drbd_warn(device, "Have to wait for element" + " (resync LRU too small?)\n"); + BUG_ON(rs_flags & LC_LOCKED); + } + + return bm_ext; +} + +static int _is_in_al(struct drbd_device *device, unsigned int enr) +{ + int rv; + + spin_lock_irq(&device->al_lock); + rv = lc_is_used(device->act_log, enr); + spin_unlock_irq(&device->al_lock); + + return rv; +} + +/** + * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED + * @device: DRBD device. + * @sector: The sector number. + * + * This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted. + */ +int drbd_rs_begin_io(struct drbd_device *device, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + struct bm_extent *bm_ext; + int i, sig; + bool sa; + +retry: + sig = wait_event_interruptible(device->al_wait, + (bm_ext = _bme_get(device, enr))); + if (sig) + return -EINTR; + + if (test_bit(BME_LOCKED, &bm_ext->flags)) + return 0; + + /* step aside only while we are above c-min-rate; unless disabled. */ + sa = drbd_rs_c_min_rate_throttle(device); + + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { + sig = wait_event_interruptible(device->al_wait, + !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) || + (sa && test_bit(BME_PRIORITY, &bm_ext->flags))); + + if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) { + spin_lock_irq(&device->al_lock); + if (lc_put(device->resync, &bm_ext->lce) == 0) { + bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */ + device->resync_locked--; + wake_up(&device->al_wait); + } + spin_unlock_irq(&device->al_lock); + if (sig) + return -EINTR; + if (schedule_timeout_interruptible(HZ/10)) + return -EINTR; + goto retry; + } + } + set_bit(BME_LOCKED, &bm_ext->flags); + return 0; +} + +/** + * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep + * @device: DRBD device. + * @sector: The sector number. + * + * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then + * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN + * if there is still application IO going on in this area. + */ +int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT; + struct lc_element *e; + struct bm_extent *bm_ext; + int i; + bool throttle = drbd_rs_should_slow_down(device, sector, true); + + /* If we need to throttle, a half-locked (only marked BME_NO_WRITES, + * not yet BME_LOCKED) extent needs to be kicked out explicitly if we + * need to throttle. There is at most one such half-locked extent, + * which is remembered in resync_wenr. */ + + if (throttle && device->resync_wenr != enr) + return -EAGAIN; + + spin_lock_irq(&device->al_lock); + if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) { + /* in case you have very heavy scattered io, it may + * stall the syncer undefined if we give up the ref count + * when we try again and requeue. 
+ * + * if we don't give up the refcount, but the next time + * we are scheduled this extent has been "synced" by new + * application writes, we'd miss the lc_put on the + * extent we keep the refcount on. + * so we remembered which extent we had to try again, and + * if the next requested one is something else, we do + * the lc_put here... + * we also have to wake_up + */ + e = lc_find(device->resync, device->resync_wenr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (bm_ext) { + D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); + clear_bit(BME_NO_WRITES, &bm_ext->flags); + device->resync_wenr = LC_FREE; + if (lc_put(device->resync, &bm_ext->lce) == 0) { + bm_ext->flags = 0; + device->resync_locked--; + } + wake_up(&device->al_wait); + } else { + drbd_alert(device, "LOGIC BUG\n"); + } + } + /* TRY. */ + e = lc_try_get(device->resync, enr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (bm_ext) { + if (test_bit(BME_LOCKED, &bm_ext->flags)) + goto proceed; + if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) { + device->resync_locked++; + } else { + /* we did set the BME_NO_WRITES, + * but then could not set BME_LOCKED, + * so we tried again. + * drop the extra reference. */ + bm_ext->lce.refcnt--; + D_ASSERT(device, bm_ext->lce.refcnt > 0); + } + goto check_al; + } else { + /* do we rather want to try later? */ + if (device->resync_locked > device->resync->nr_elements-3) + goto try_again; + /* Do or do not. There is no try. -- Yoda */ + e = lc_get(device->resync, enr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (!bm_ext) { + const unsigned long rs_flags = device->resync->flags; + if (rs_flags & LC_STARVING) + drbd_warn(device, "Have to wait for element" + " (resync LRU too small?)\n"); + BUG_ON(rs_flags & LC_LOCKED); + goto try_again; + } + if (bm_ext->lce.lc_number != enr) { + bm_ext->rs_left = drbd_bm_e_weight(device, enr); + bm_ext->rs_failed = 0; + lc_committed(device->resync); + wake_up(&device->al_wait); + D_ASSERT(device, test_bit(BME_LOCKED, &bm_ext->flags) == 0); + } + set_bit(BME_NO_WRITES, &bm_ext->flags); + D_ASSERT(device, bm_ext->lce.refcnt == 1); + device->resync_locked++; + goto check_al; + } +check_al: + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { + if (lc_is_used(device->act_log, al_enr+i)) + goto try_again; + } + set_bit(BME_LOCKED, &bm_ext->flags); +proceed: + device->resync_wenr = LC_FREE; + spin_unlock_irq(&device->al_lock); + return 0; + +try_again: + if (bm_ext) { + if (throttle) { + D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); + clear_bit(BME_NO_WRITES, &bm_ext->flags); + device->resync_wenr = LC_FREE; + if (lc_put(device->resync, &bm_ext->lce) == 0) { + bm_ext->flags = 0; + device->resync_locked--; + } + wake_up(&device->al_wait); + } else + device->resync_wenr = enr; + } + spin_unlock_irq(&device->al_lock); + return -EAGAIN; +} + +void drbd_rs_complete_io(struct drbd_device *device, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + struct lc_element *e; + struct bm_extent *bm_ext; + unsigned long flags; + + spin_lock_irqsave(&device->al_lock, flags); + e = lc_find(device->resync, enr); + bm_ext = e ? 
lc_entry(e, struct bm_extent, lce) : NULL; + if (!bm_ext) { + spin_unlock_irqrestore(&device->al_lock, flags); + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "drbd_rs_complete_io() called, but extent not found\n"); + return; + } + + if (bm_ext->lce.refcnt == 0) { + spin_unlock_irqrestore(&device->al_lock, flags); + drbd_err(device, "drbd_rs_complete_io(,%llu [=%u]) called, " + "but refcnt is 0!?\n", + (unsigned long long)sector, enr); + return; + } + + if (lc_put(device->resync, &bm_ext->lce) == 0) { + bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */ + device->resync_locked--; + wake_up(&device->al_wait); + } + + spin_unlock_irqrestore(&device->al_lock, flags); +} + +/** + * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED) + * @device: DRBD device. + */ +void drbd_rs_cancel_all(struct drbd_device *device) +{ + spin_lock_irq(&device->al_lock); + + if (get_ldev_if_state(device, D_FAILED)) { /* Makes sure ->resync is there. */ + lc_reset(device->resync); + put_ldev(device); + } + device->resync_locked = 0; + device->resync_wenr = LC_FREE; + spin_unlock_irq(&device->al_lock); + wake_up(&device->al_wait); +} + +/** + * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU + * @device: DRBD device. + * + * Returns 0 upon success, -EAGAIN if at least one reference count was + * not zero. + */ +int drbd_rs_del_all(struct drbd_device *device) +{ + struct lc_element *e; + struct bm_extent *bm_ext; + int i; + + spin_lock_irq(&device->al_lock); + + if (get_ldev_if_state(device, D_FAILED)) { + /* ok, ->resync is there. */ + for (i = 0; i < device->resync->nr_elements; i++) { + e = lc_element_by_index(device->resync, i); + bm_ext = lc_entry(e, struct bm_extent, lce); + if (bm_ext->lce.lc_number == LC_FREE) + continue; + if (bm_ext->lce.lc_number == device->resync_wenr) { + drbd_info(device, "dropping %u in drbd_rs_del_all, apparently" + " got 'synced' by application io\n", + device->resync_wenr); + D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); + clear_bit(BME_NO_WRITES, &bm_ext->flags); + device->resync_wenr = LC_FREE; + lc_put(device->resync, &bm_ext->lce); + } + if (bm_ext->lce.refcnt != 0) { + drbd_info(device, "Retrying drbd_rs_del_all() later. " + "refcnt=%d\n", bm_ext->lce.refcnt); + put_ldev(device); + spin_unlock_irq(&device->al_lock); + return -EAGAIN; + } + D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(device, !test_bit(BME_NO_WRITES, &bm_ext->flags)); + lc_del(device->resync, &bm_ext->lce); + } + D_ASSERT(device, device->resync->used == 0); + put_ldev(device); + } + spin_unlock_irq(&device->al_lock); + wake_up(&device->al_wait); + + return 0; +} diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c new file mode 100644 index 000000000..434c77dcc --- /dev/null +++ b/drivers/block/drbd/drbd_bitmap.c @@ -0,0 +1,1648 @@ +/* + drbd_bitmap.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2004-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/bitops.h> +#include <linux/vmalloc.h> +#include <linux/string.h> +#include <linux/drbd.h> +#include <linux/slab.h> +#include <asm/kmap_types.h> + +#include "drbd_int.h" + + +/* OPAQUE outside this file! + * interface defined in drbd_int.h + + * convention: + * function name drbd_bm_... => used elsewhere, "public". + * function name bm_... => internal to implementation, "private". + */ + + +/* + * LIMITATIONS: + * We want to support >= peta byte of backend storage, while for now still using + * a granularity of one bit per 4KiB of storage. + * 1 << 50 bytes backend storage (1 PiB) + * 1 << (50 - 12) bits needed + * 38 --> we need u64 to index and count bits + * 1 << (38 - 3) bitmap bytes needed + * 35 --> we still need u64 to index and count bytes + * (that's 32 GiB of bitmap for 1 PiB storage) + * 1 << (35 - 2) 32bit longs needed + * 33 --> we'd even need u64 to index and count 32bit long words. + * 1 << (35 - 3) 64bit longs needed + * 32 --> we could get away with a 32bit unsigned int to index and count + * 64bit long words, but I rather stay with unsigned long for now. + * We probably should neither count nor point to bytes or long words + * directly, but either by bitnumber, or by page index and offset. + * 1 << (35 - 12) + * 22 --> we need that much 4KiB pages of bitmap. + * 1 << (22 + 3) --> on a 64bit arch, + * we need 32 MiB to store the array of page pointers. + * + * Because I'm lazy, and because the resulting patch was too large, too ugly + * and still incomplete, on 32bit we still "only" support 16 TiB (minus some), + * (1 << 32) bits * 4k storage. + * + + * bitmap storage and IO: + * Bitmap is stored little endian on disk, and is kept little endian in + * core memory. Currently we still hold the full bitmap in core as long + * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage + * seems excessive. + * + * We plan to reduce the amount of in-core bitmap pages by paging them in + * and out against their on-disk location as necessary, but need to make + * sure we don't cause too much meta data IO, and must not deadlock in + * tight memory situations. This needs some more work. + */ + +/* + * NOTE + * Access to the *bm_pages is protected by bm_lock. + * It is safe to read the other members within the lock. + * + * drbd_bm_set_bits is called from bio_endio callbacks, + * We may be called with irq already disabled, + * so we need spin_lock_irqsave(). + * And we need the kmap_atomic. + */ +struct drbd_bitmap { + struct page **bm_pages; + spinlock_t bm_lock; + + /* see LIMITATIONS: above */ + + unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? 
*/ + unsigned long bm_bits; + size_t bm_words; + size_t bm_number_of_pages; + sector_t bm_dev_capacity; + struct mutex bm_change; /* serializes resize operations */ + + wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */ + + enum bm_flag bm_flags; + + /* debugging aid, in case we are still racy somewhere */ + char *bm_why; + struct task_struct *bm_task; +}; + +#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__) +static void __bm_print_lock_info(struct drbd_device *device, const char *func) +{ + struct drbd_bitmap *b = device->bitmap; + if (!__ratelimit(&drbd_ratelimit_state)) + return; + drbd_err(device, "FIXME %s[%d] in %s, bitmap locked for '%s' by %s[%d]\n", + current->comm, task_pid_nr(current), + func, b->bm_why ?: "?", + b->bm_task->comm, task_pid_nr(b->bm_task)); +} + +void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags) +{ + struct drbd_bitmap *b = device->bitmap; + int trylock_failed; + + if (!b) { + drbd_err(device, "FIXME no bitmap in drbd_bm_lock!?\n"); + return; + } + + trylock_failed = !mutex_trylock(&b->bm_change); + + if (trylock_failed) { + drbd_warn(device, "%s[%d] going to '%s' but bitmap already locked for '%s' by %s[%d]\n", + current->comm, task_pid_nr(current), + why, b->bm_why ?: "?", + b->bm_task->comm, task_pid_nr(b->bm_task)); + mutex_lock(&b->bm_change); + } + if (BM_LOCKED_MASK & b->bm_flags) + drbd_err(device, "FIXME bitmap already locked in bm_lock\n"); + b->bm_flags |= flags & BM_LOCKED_MASK; + + b->bm_why = why; + b->bm_task = current; +} + +void drbd_bm_unlock(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + if (!b) { + drbd_err(device, "FIXME no bitmap in drbd_bm_unlock!?\n"); + return; + } + + if (!(BM_LOCKED_MASK & device->bitmap->bm_flags)) + drbd_err(device, "FIXME bitmap not locked in bm_unlock\n"); + + b->bm_flags &= ~BM_LOCKED_MASK; + b->bm_why = NULL; + b->bm_task = NULL; + mutex_unlock(&b->bm_change); +} + +/* we store some "meta" info about our pages in page->private */ +/* at a granularity of 4k storage per bitmap bit: + * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks + * 1<<38 bits, + * 1<<23 4k bitmap pages. + * Use 24 bits as page index, covers 2 peta byte storage + * at a granularity of 4k per bit. + * Used to report the failed page idx on io error from the endio handlers. + */ +#define BM_PAGE_IDX_MASK ((1UL<<24)-1) +/* this page is currently read in, or written back */ +#define BM_PAGE_IO_LOCK 31 +/* if there has been an IO error for this page */ +#define BM_PAGE_IO_ERROR 30 +/* this is to be able to intelligently skip disk IO, + * set if bits have been set since last IO. */ +#define BM_PAGE_NEED_WRITEOUT 29 +/* to mark for lazy writeout once syncer cleared all clearable bits, + * we if bits have been cleared since last IO. */ +#define BM_PAGE_LAZY_WRITEOUT 28 +/* pages marked with this "HINT" will be considered for writeout + * on activity log transactions */ +#define BM_PAGE_HINT_WRITEOUT 27 + +/* store_page_idx uses non-atomic assignment. It is only used directly after + * allocating the page. All other bm_set_page_* and bm_clear_page_* need to + * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap + * changes) may happen from various contexts, and wait_on_bit/wake_up_bit + * requires it all to be atomic as well. 
*/ +static void bm_store_page_idx(struct page *page, unsigned long idx) +{ + BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK)); + set_page_private(page, idx); +} + +static unsigned long bm_page_to_idx(struct page *page) +{ + return page_private(page) & BM_PAGE_IDX_MASK; +} + +/* As is very unlikely that the same page is under IO from more than one + * context, we can get away with a bit per page and one wait queue per bitmap. + */ +static void bm_page_lock_io(struct drbd_device *device, int page_nr) +{ + struct drbd_bitmap *b = device->bitmap; + void *addr = &page_private(b->bm_pages[page_nr]); + wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr)); +} + +static void bm_page_unlock_io(struct drbd_device *device, int page_nr) +{ + struct drbd_bitmap *b = device->bitmap; + void *addr = &page_private(b->bm_pages[page_nr]); + clear_bit_unlock(BM_PAGE_IO_LOCK, addr); + wake_up(&device->bitmap->bm_io_wait); +} + +/* set _before_ submit_io, so it may be reset due to being changed + * while this page is in flight... will get submitted later again */ +static void bm_set_page_unchanged(struct page *page) +{ + /* use cmpxchg? */ + clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); + clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page)); +} + +static void bm_set_page_need_writeout(struct page *page) +{ + set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); +} + +/** + * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout + * @device: DRBD device. + * @page_nr: the bitmap page to mark with the "hint" flag + * + * From within an activity log transaction, we mark a few pages with these + * hints, then call drbd_bm_write_hinted(), which will only write out changed + * pages which are flagged with this mark. + */ +void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr) +{ + struct page *page; + if (page_nr >= device->bitmap->bm_number_of_pages) { + drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n", + page_nr, (int)device->bitmap->bm_number_of_pages); + return; + } + page = device->bitmap->bm_pages[page_nr]; + set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)); +} + +static int bm_test_page_unchanged(struct page *page) +{ + volatile const unsigned long *addr = &page_private(page); + return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0; +} + +static void bm_set_page_io_err(struct page *page) +{ + set_bit(BM_PAGE_IO_ERROR, &page_private(page)); +} + +static void bm_clear_page_io_err(struct page *page) +{ + clear_bit(BM_PAGE_IO_ERROR, &page_private(page)); +} + +static void bm_set_page_lazy_writeout(struct page *page) +{ + set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page)); +} + +static int bm_test_page_lazy_writeout(struct page *page) +{ + return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page)); +} + +/* on a 32bit box, this would allow for exactly (2<<38) bits. 
*/ +static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr) +{ + /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */ + unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3); + BUG_ON(page_nr >= b->bm_number_of_pages); + return page_nr; +} + +static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr) +{ + /* page_nr = (bitnr/8) >> PAGE_SHIFT; */ + unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3); + BUG_ON(page_nr >= b->bm_number_of_pages); + return page_nr; +} + +static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx) +{ + struct page *page = b->bm_pages[idx]; + return (unsigned long *) kmap_atomic(page); +} + +static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx) +{ + return __bm_map_pidx(b, idx); +} + +static void __bm_unmap(unsigned long *p_addr) +{ + kunmap_atomic(p_addr); +}; + +static void bm_unmap(unsigned long *p_addr) +{ + return __bm_unmap(p_addr); +} + +/* long word offset of _bitmap_ sector */ +#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) +/* word offset from start of bitmap to word number _in_page_ + * modulo longs per page +#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)) + hm, well, Philipp thinks gcc might not optimize the % into & (... - 1) + so do it explicitly: + */ +#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1)) + +/* Long words per page */ +#define LWPP (PAGE_SIZE/sizeof(long)) + +/* + * actually most functions herein should take a struct drbd_bitmap*, not a + * struct drbd_device*, but for the debug macros I like to have the device around + * to be able to report device specific. + */ + + +static void bm_free_pages(struct page **pages, unsigned long number) +{ + unsigned long i; + if (!pages) + return; + + for (i = 0; i < number; i++) { + if (!pages[i]) { + pr_alert("bm_free_pages tried to free a NULL pointer; i=%lu n=%lu\n", + i, number); + continue; + } + __free_page(pages[i]); + pages[i] = NULL; + } +} + +static void bm_vk_free(void *ptr, int v) +{ + if (v) + vfree(ptr); + else + kfree(ptr); +} + +/* + * "have" and "want" are NUMBER OF PAGES. + */ +static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) +{ + struct page **old_pages = b->bm_pages; + struct page **new_pages, *page; + unsigned int i, bytes, vmalloced = 0; + unsigned long have = b->bm_number_of_pages; + + BUG_ON(have == 0 && old_pages != NULL); + BUG_ON(have != 0 && old_pages == NULL); + + if (have == want) + return old_pages; + + /* Trying kmalloc first, falling back to vmalloc. + * GFP_NOIO, as this is called while drbd IO is "suspended", + * and during resize or attach on diskless Primary, + * we must not block on IO to ourselves. + * Context is receiver thread or dmsetup. */ + bytes = sizeof(struct page *)*want; + new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN); + if (!new_pages) { + new_pages = __vmalloc(bytes, + GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); + if (!new_pages) + return NULL; + vmalloced = 1; + } + + if (want >= have) { + for (i = 0; i < have; i++) + new_pages[i] = old_pages[i]; + for (; i < want; i++) { + page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); + if (!page) { + bm_free_pages(new_pages + have, i - have); + bm_vk_free(new_pages, vmalloced); + return NULL; + } + /* we want to know which page it is + * from the endio handlers */ + bm_store_page_idx(page, i); + new_pages[i] = page; + } + } else { + for (i = 0; i < want; i++) + new_pages[i] = old_pages[i]; + /* NOT HERE, we are outside the spinlock! 
+ bm_free_pages(old_pages + want, have - want); + */ + } + + if (vmalloced) + b->bm_flags |= BM_P_VMALLOCED; + else + b->bm_flags &= ~BM_P_VMALLOCED; + + return new_pages; +} + +/* + * called on driver init only. TODO call when a device is created. + * allocates the drbd_bitmap, and stores it in device->bitmap. + */ +int drbd_bm_init(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + WARN_ON(b != NULL); + b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL); + if (!b) + return -ENOMEM; + spin_lock_init(&b->bm_lock); + mutex_init(&b->bm_change); + init_waitqueue_head(&b->bm_io_wait); + + device->bitmap = b; + + return 0; +} + +sector_t drbd_bm_capacity(struct drbd_device *device) +{ + if (!expect(device->bitmap)) + return 0; + return device->bitmap->bm_dev_capacity; +} + +/* called on driver unload. TODO: call when a device is destroyed. + */ +void drbd_bm_cleanup(struct drbd_device *device) +{ + if (!expect(device->bitmap)) + return; + bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages); + bm_vk_free(device->bitmap->bm_pages, (BM_P_VMALLOCED & device->bitmap->bm_flags)); + kfree(device->bitmap); + device->bitmap = NULL; +} + +/* + * since (b->bm_bits % BITS_PER_LONG) != 0, + * this masks out the remaining bits. + * Returns the number of bits cleared. + */ +#define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3)) +#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1) +#define BITS_PER_LONG_MASK (BITS_PER_LONG - 1) +static int bm_clear_surplus(struct drbd_bitmap *b) +{ + unsigned long mask; + unsigned long *p_addr, *bm; + int tmp; + int cleared = 0; + + /* number of bits modulo bits per page */ + tmp = (b->bm_bits & BITS_PER_PAGE_MASK); + /* mask the used bits of the word containing the last bit */ + mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1; + /* bitmap is always stored little endian, + * on disk and in core memory alike */ + mask = cpu_to_lel(mask); + + p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1); + bm = p_addr + (tmp/BITS_PER_LONG); + if (mask) { + /* If mask != 0, we are not exactly aligned, so bm now points + * to the long containing the last bit. + * If mask == 0, bm already points to the word immediately + * after the last (long word aligned) bit. */ + cleared = hweight_long(*bm & ~mask); + *bm &= mask; + bm++; + } + + if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) { + /* on a 32bit arch, we may need to zero out + * a padding long to align with a 64bit remote */ + cleared += hweight_long(*bm); + *bm = 0; + } + bm_unmap(p_addr); + return cleared; +} + +static void bm_set_surplus(struct drbd_bitmap *b) +{ + unsigned long mask; + unsigned long *p_addr, *bm; + int tmp; + + /* number of bits modulo bits per page */ + tmp = (b->bm_bits & BITS_PER_PAGE_MASK); + /* mask the used bits of the word containing the last bit */ + mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1; + /* bitmap is always stored little endian, + * on disk and in core memory alike */ + mask = cpu_to_lel(mask); + + p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1); + bm = p_addr + (tmp/BITS_PER_LONG); + if (mask) { + /* If mask != 0, we are not exactly aligned, so bm now points + * to the long containing the last bit. + * If mask == 0, bm already points to the word immediately + * after the last (long word aligned) bit. 
*/ + *bm |= ~mask; + bm++; + } + + if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) { + /* on a 32bit arch, we may need to zero out + * a padding long to align with a 64bit remote */ + *bm = ~0UL; + } + bm_unmap(p_addr); +} + +/* you better not modify the bitmap while this is running, + * or its results will be stale */ +static unsigned long bm_count_bits(struct drbd_bitmap *b) +{ + unsigned long *p_addr; + unsigned long bits = 0; + unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1; + int idx, i, last_word; + + /* all but last page */ + for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) { + p_addr = __bm_map_pidx(b, idx); + for (i = 0; i < LWPP; i++) + bits += hweight_long(p_addr[i]); + __bm_unmap(p_addr); + cond_resched(); + } + /* last (or only) page */ + last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL; + p_addr = __bm_map_pidx(b, idx); + for (i = 0; i < last_word; i++) + bits += hweight_long(p_addr[i]); + p_addr[last_word] &= cpu_to_lel(mask); + bits += hweight_long(p_addr[last_word]); + /* 32bit arch, may have an unused padding long */ + if (BITS_PER_LONG == 32 && (last_word & 1) == 0) + p_addr[last_word+1] = 0; + __bm_unmap(p_addr); + return bits; +} + +/* offset and len in long words.*/ +static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) +{ + unsigned long *p_addr, *bm; + unsigned int idx; + size_t do_now, end; + + end = offset + len; + + if (end > b->bm_words) { + pr_alert("bm_memset end > bm_words\n"); + return; + } + + while (offset < end) { + do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset; + idx = bm_word_to_page_idx(b, offset); + p_addr = bm_map_pidx(b, idx); + bm = p_addr + MLPP(offset); + if (bm+do_now > p_addr + LWPP) { + pr_alert("BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n", + p_addr, bm, (int)do_now); + } else + memset(bm, c, do_now * sizeof(long)); + bm_unmap(p_addr); + bm_set_page_need_writeout(b->bm_pages[idx]); + offset += do_now; + } +} + +/* For the layout, see comment above drbd_md_set_sector_offsets(). */ +static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev) +{ + u64 bitmap_sectors; + if (ldev->md.al_offset == 8) + bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset; + else + bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset; + return bitmap_sectors << (9 + 3); +} + +/* + * make sure the bitmap has enough room for the attached storage, + * if necessary, resize. + * called whenever we may have changed the device size. + * returns -ENOMEM if we could not allocate enough memory, 0 on success. + * In case this is actually a resize, we copy the old bitmap into the new one. + * Otherwise, the bitmap is initialized to all bits set. 
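drbd_md_on_disk_bits() turns a sector count into a bit count with a single shift: a 512-byte sector holds 512 * 8 = 4096 bits, hence the << (9 + 3). A small sketch of that conversion; the sector count and the 4 KiB-per-bitmap-bit granularity used in the comment are illustrative assumptions:

#include <stdio.h>
#include <stdint.h>

/* bits that fit into a given number of 512-byte bitmap sectors */
static uint64_t bitmap_sectors_to_bits(uint64_t bitmap_sectors)
{
	return bitmap_sectors << (9 + 3);   /* * 512 bytes * 8 bits */
}

int main(void)
{
	/* e.g. 32768 bitmap sectors (16 MiB of bitmap) hold 134217728 bits,
	 * which at 4 KiB of data per bit covers 512 GiB of backing storage */
	uint64_t bits = bitmap_sectors_to_bits(32768);
	printf("%llu bits\n", (unsigned long long)bits);
	return 0;
}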
+ */ +int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bits) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long bits, words, owords, obits; + unsigned long want, have, onpages; /* number of pages */ + struct page **npages, **opages = NULL; + int err = 0, growing; + int opages_vmalloced; + + if (!expect(b)) + return -ENOMEM; + + drbd_bm_lock(device, "resize", BM_LOCKED_MASK); + + drbd_info(device, "drbd_bm_resize called with capacity == %llu\n", + (unsigned long long)capacity); + + if (capacity == b->bm_dev_capacity) + goto out; + + opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags); + + if (capacity == 0) { + spin_lock_irq(&b->bm_lock); + opages = b->bm_pages; + onpages = b->bm_number_of_pages; + owords = b->bm_words; + b->bm_pages = NULL; + b->bm_number_of_pages = + b->bm_set = + b->bm_bits = + b->bm_words = + b->bm_dev_capacity = 0; + spin_unlock_irq(&b->bm_lock); + bm_free_pages(opages, onpages); + bm_vk_free(opages, opages_vmalloced); + goto out; + } + bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT)); + + /* if we would use + words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL; + a 32bit host could present the wrong number of words + to a 64bit host. + */ + words = ALIGN(bits, 64) >> LN2_BPL; + + if (get_ldev(device)) { + u64 bits_on_disk = drbd_md_on_disk_bits(device->ldev); + put_ldev(device); + if (bits > bits_on_disk) { + drbd_info(device, "bits = %lu\n", bits); + drbd_info(device, "bits_on_disk = %llu\n", bits_on_disk); + err = -ENOSPC; + goto out; + } + } + + want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT; + have = b->bm_number_of_pages; + if (want == have) { + D_ASSERT(device, b->bm_pages != NULL); + npages = b->bm_pages; + } else { + if (drbd_insert_fault(device, DRBD_FAULT_BM_ALLOC)) + npages = NULL; + else + npages = bm_realloc_pages(b, want); + } + + if (!npages) { + err = -ENOMEM; + goto out; + } + + spin_lock_irq(&b->bm_lock); + opages = b->bm_pages; + owords = b->bm_words; + obits = b->bm_bits; + + growing = bits > obits; + if (opages && growing && set_new_bits) + bm_set_surplus(b); + + b->bm_pages = npages; + b->bm_number_of_pages = want; + b->bm_bits = bits; + b->bm_words = words; + b->bm_dev_capacity = capacity; + + if (growing) { + if (set_new_bits) { + bm_memset(b, owords, 0xff, words-owords); + b->bm_set += bits - obits; + } else + bm_memset(b, owords, 0x00, words-owords); + + } + + if (want < have) { + /* implicit: (opages != NULL) && (opages != npages) */ + bm_free_pages(opages + want, have - want); + } + + (void)bm_clear_surplus(b); + + spin_unlock_irq(&b->bm_lock); + if (opages != npages) + bm_vk_free(opages, opages_vmalloced); + if (!growing) + b->bm_set = bm_count_bits(b); + drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want); + + out: + drbd_bm_unlock(device); + return err; +} + +/* inherently racy: + * if not protected by other means, return value may be out of date when + * leaving this function... + * we still need to lock it, since it is important that this returns + * bm_set == 0 precisely. + * + * maybe bm_set should be atomic_t ? 
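The sizing chain in drbd_bm_resize() goes capacity (sectors) -> bits -> words -> pages, and aligns the word count to 64 bits rather than BITS_PER_LONG so a 32-bit and a 64-bit peer agree on the word count. A user-space sketch of that chain for one capacity, assuming the usual constants (8 sectors of data per bitmap bit, 4 KiB pages, 64-bit host):

#include <stdio.h>
#include <stdint.h>

#define SECT_PER_BIT 8UL                 /* assumed: 4 KiB of data per bitmap bit */
#define PAGE_SIZE    4096UL              /* assumed */
#define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

int main(void)
{
	uint64_t capacity = 2097152;     /* 1 GiB expressed in 512-byte sectors */
	uint64_t bits  = ALIGN_UP(capacity, SECT_PER_BIT) / SECT_PER_BIT;
	uint64_t words = ALIGN_UP(bits, 64) / 64;        /* 64-bit words, peer-compatible */
	uint64_t pages = ALIGN_UP(words * 8, PAGE_SIZE) / PAGE_SIZE;

	/* 1 GiB -> 262144 bits -> 4096 words -> 8 bitmap pages */
	printf("bits=%llu words=%llu pages=%llu\n",
	       (unsigned long long)bits, (unsigned long long)words,
	       (unsigned long long)pages);
	return 0;
}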
+ */ +unsigned long _drbd_bm_total_weight(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long s; + unsigned long flags; + + if (!expect(b)) + return 0; + if (!expect(b->bm_pages)) + return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + s = b->bm_set; + spin_unlock_irqrestore(&b->bm_lock, flags); + + return s; +} + +unsigned long drbd_bm_total_weight(struct drbd_device *device) +{ + unsigned long s; + /* if I don't have a disk, I don't know about out-of-sync status */ + if (!get_ldev_if_state(device, D_NEGOTIATING)) + return 0; + s = _drbd_bm_total_weight(device); + put_ldev(device); + return s; +} + +size_t drbd_bm_words(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + if (!expect(b)) + return 0; + if (!expect(b->bm_pages)) + return 0; + + return b->bm_words; +} + +unsigned long drbd_bm_bits(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + if (!expect(b)) + return 0; + + return b->bm_bits; +} + +/* merge number words from buffer into the bitmap starting at offset. + * buffer[i] is expected to be little endian unsigned long. + * bitmap must be locked by drbd_bm_lock. + * currently only used from receive_bitmap. + */ +void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number, + unsigned long *buffer) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr, *bm; + unsigned long word, bits; + unsigned int idx; + size_t end, do_now; + + end = offset + number; + + if (!expect(b)) + return; + if (!expect(b->bm_pages)) + return; + if (number == 0) + return; + WARN_ON(offset >= b->bm_words); + WARN_ON(end > b->bm_words); + + spin_lock_irq(&b->bm_lock); + while (offset < end) { + do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; + idx = bm_word_to_page_idx(b, offset); + p_addr = bm_map_pidx(b, idx); + bm = p_addr + MLPP(offset); + offset += do_now; + while (do_now--) { + bits = hweight_long(*bm); + word = *bm | *buffer++; + *bm++ = word; + b->bm_set += hweight_long(word) - bits; + } + bm_unmap(p_addr); + bm_set_page_need_writeout(b->bm_pages[idx]); + } + /* with 32bit <-> 64bit cross-platform connect + * this is only correct for current usage, + * where we _know_ that we are 64 bit aligned, + * and know that this function is used in this way, too... + */ + if (end == b->bm_words) + b->bm_set -= bm_clear_surplus(b); + spin_unlock_irq(&b->bm_lock); +} + +/* copy number words from the bitmap starting at offset into the buffer. + * buffer[i] will be little endian unsigned long. 
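When drbd_bm_merge_lel() ORs a received word into the local bitmap, it keeps bm_set exact by comparing population counts before and after the merge, so bits already set locally are not counted twice. A tiny user-space model of that accounting, using the compiler builtin __builtin_popcountl in place of hweight_long():

#include <stdio.h>

/* OR the peer's word into ours, return how many bits became newly set */
static int merge_word(unsigned long *local, unsigned long peer)
{
	int before = __builtin_popcountl(*local);
	*local |= peer;
	return __builtin_popcountl(*local) - before;
}

int main(void)
{
	unsigned long local = 0xf0f0UL;
	int newly_set = merge_word(&local, 0xff00UL);
	/* 0xf0f0 | 0xff00 = 0xfff0: 4 new bits, the overlap is not re-counted */
	printf("newly set: %d, word now 0x%lx\n", newly_set, local);
	return 0;
}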
+ */ +void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number, + unsigned long *buffer) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr, *bm; + size_t end, do_now; + + end = offset + number; + + if (!expect(b)) + return; + if (!expect(b->bm_pages)) + return; + + spin_lock_irq(&b->bm_lock); + if ((offset >= b->bm_words) || + (end > b->bm_words) || + (number <= 0)) + drbd_err(device, "offset=%lu number=%lu bm_words=%lu\n", + (unsigned long) offset, + (unsigned long) number, + (unsigned long) b->bm_words); + else { + while (offset < end) { + do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; + p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset)); + bm = p_addr + MLPP(offset); + offset += do_now; + while (do_now--) + *buffer++ = *bm++; + bm_unmap(p_addr); + } + } + spin_unlock_irq(&b->bm_lock); +} + +/* set all bits in the bitmap */ +void drbd_bm_set_all(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + if (!expect(b)) + return; + if (!expect(b->bm_pages)) + return; + + spin_lock_irq(&b->bm_lock); + bm_memset(b, 0, 0xff, b->bm_words); + (void)bm_clear_surplus(b); + b->bm_set = b->bm_bits; + spin_unlock_irq(&b->bm_lock); +} + +/* clear all bits in the bitmap */ +void drbd_bm_clear_all(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + if (!expect(b)) + return; + if (!expect(b->bm_pages)) + return; + + spin_lock_irq(&b->bm_lock); + bm_memset(b, 0, 0, b->bm_words); + b->bm_set = 0; + spin_unlock_irq(&b->bm_lock); +} + +static void drbd_bm_aio_ctx_destroy(struct kref *kref) +{ + struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref); + unsigned long flags; + + spin_lock_irqsave(&ctx->device->resource->req_lock, flags); + list_del(&ctx->list); + spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags); + put_ldev(ctx->device); + kfree(ctx); +} + +/* bv_page may be a copy, or may be the original */ +static void drbd_bm_endio(struct bio *bio, int error) +{ + struct drbd_bm_aio_ctx *ctx = bio->bi_private; + struct drbd_device *device = ctx->device; + struct drbd_bitmap *b = device->bitmap; + unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page); + int uptodate = bio_flagged(bio, BIO_UPTODATE); + + + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! + * do we want to WARN() on this? */ + if (!error && !uptodate) + error = -EIO; + + if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 && + !bm_test_page_unchanged(b->bm_pages[idx])) + drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx); + + if (error) { + /* ctx error will hold the completed-last non-zero error code, + * in case error codes differ. */ + ctx->error = error; + bm_set_page_io_err(b->bm_pages[idx]); + /* Not identical to on disk version of it. + * Is BM_PAGE_IO_ERROR enough? 
*/ + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "IO ERROR %d on bitmap page idx %u\n", + error, idx); + } else { + bm_clear_page_io_err(b->bm_pages[idx]); + dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx); + } + + bm_page_unlock_io(device, idx); + + if (ctx->flags & BM_AIO_COPY_PAGES) + mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool); + + bio_put(bio); + + if (atomic_dec_and_test(&ctx->in_flight)) { + ctx->done = 1; + wake_up(&device->misc_wait); + kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); + } +} + +static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local) +{ + struct bio *bio = bio_alloc_drbd(GFP_NOIO); + struct drbd_device *device = ctx->device; + struct drbd_bitmap *b = device->bitmap; + struct page *page; + unsigned int len; + unsigned int rw = (ctx->flags & BM_AIO_READ) ? READ : WRITE; + + sector_t on_disk_sector = + device->ldev->md.md_offset + device->ldev->md.bm_offset; + on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9); + + /* this might happen with very small + * flexible external meta data device, + * or with PAGE_SIZE > 4k */ + len = min_t(unsigned int, PAGE_SIZE, + (drbd_md_last_sector(device->ldev) - on_disk_sector + 1)<<9); + + /* serialize IO on this page */ + bm_page_lock_io(device, page_nr); + /* before memcpy and submit, + * so it can be redirtied any time */ + bm_set_page_unchanged(b->bm_pages[page_nr]); + + if (ctx->flags & BM_AIO_COPY_PAGES) { + page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); + copy_highpage(page, b->bm_pages[page_nr]); + bm_store_page_idx(page, page_nr); + } else + page = b->bm_pages[page_nr]; + bio->bi_bdev = device->ldev->md_bdev; + bio->bi_iter.bi_sector = on_disk_sector; + /* bio_add_page of a single page to an empty bio will always succeed, + * according to api. Do we want to assert that? */ + bio_add_page(bio, page, len, 0); + bio->bi_private = ctx; + bio->bi_end_io = drbd_bm_endio; + + if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { + bio->bi_rw |= rw; + bio_endio(bio, -EIO); + } else { + submit_bio(rw, bio); + /* this should not count as user activity and cause the + * resync to throttle -- see drbd_rs_should_slow_down(). */ + atomic_add(len >> 9, &device->rs_sect_ev); + } +} + +/* + * bm_rw: read/write the whole bitmap from/to its on disk location. + */ +static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local) +{ + struct drbd_bm_aio_ctx *ctx; + struct drbd_bitmap *b = device->bitmap; + int num_pages, i, count = 0; + unsigned long now; + char ppb[10]; + int err = 0; + + /* + * We are protected against bitmap disappearing/resizing by holding an + * ldev reference (caller must have called get_ldev()). + * For read/write, we are protected against changes to the bitmap by + * the bitmap lock (see drbd_bitmap_io). + * For lazy writeout, we don't care for ongoing changes to the bitmap, + * as we submit copies of pages anyways. 
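Each bitmap page has a fixed position in the metadata area: the bitmap start (md_offset + bm_offset) plus the page index times the sectors per page, PAGE_SHIFT - 9 (8 sectors for 4 KiB pages). A sketch of that placement; the offsets used below are made-up example values, not the real metadata layout:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12                    /* assumed: 4 KiB pages */

/* sector of bitmap page page_nr, given where the metadata and the bitmap
 * start (md_offset and bm_offset here are purely illustrative numbers) */
static uint64_t bm_page_sector(uint64_t md_offset, int64_t bm_offset,
			       unsigned int page_nr)
{
	return md_offset + bm_offset +
	       ((uint64_t)page_nr << (PAGE_SHIFT - 9));
}

int main(void)
{
	/* page 0 starts at the bitmap offset, page 1 follows 8 sectors later */
	printf("page 0 -> sector %llu\n",
	       (unsigned long long)bm_page_sector(1000, 8, 0));
	printf("page 1 -> sector %llu\n",
	       (unsigned long long)bm_page_sector(1000, 8, 1));
	return 0;
}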
+ */ + + ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO); + if (!ctx) + return -ENOMEM; + + *ctx = (struct drbd_bm_aio_ctx) { + .device = device, + .start_jif = jiffies, + .in_flight = ATOMIC_INIT(1), + .done = 0, + .flags = flags, + .error = 0, + .kref = { ATOMIC_INIT(2) }, + }; + + if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in drbd_bm_aio_ctx_destroy() */ + drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n"); + kfree(ctx); + return -ENODEV; + } + /* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from + drbd_adm_attach(), after device->ldev was assigned. */ + + if (0 == (ctx->flags & ~BM_AIO_READ)) + WARN_ON(!(BM_LOCKED_MASK & b->bm_flags)); + + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&ctx->list, &device->pending_bitmap_io); + spin_unlock_irq(&device->resource->req_lock); + + num_pages = b->bm_number_of_pages; + + now = jiffies; + + /* let the layers below us try to merge these bios... */ + for (i = 0; i < num_pages; i++) { + /* ignore completely unchanged pages */ + if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) + break; + if (!(flags & BM_AIO_READ)) { + if ((flags & BM_AIO_WRITE_HINTED) && + !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT, + &page_private(b->bm_pages[i]))) + continue; + + if (!(flags & BM_AIO_WRITE_ALL_PAGES) && + bm_test_page_unchanged(b->bm_pages[i])) { + dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i); + continue; + } + /* during lazy writeout, + * ignore those pages not marked for lazy writeout. */ + if (lazy_writeout_upper_idx && + !bm_test_page_lazy_writeout(b->bm_pages[i])) { + dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i); + continue; + } + } + atomic_inc(&ctx->in_flight); + bm_page_io_async(ctx, i); + ++count; + cond_resched(); + } + + /* + * We initialize ctx->in_flight to one to make sure drbd_bm_endio + * will not set ctx->done early, and decrement / test it here. If there + * are still some bios in flight, we need to wait for them here. + * If all IO is done already (or nothing had been submitted), there is + * no need to wait. Still, we need to put the kref associated with the + * "in_flight reached zero, all done" event. + */ + if (!atomic_dec_and_test(&ctx->in_flight)) + wait_until_done_or_force_detached(device, device->ldev, &ctx->done); + else + kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); + + /* summary for global bitmap IO */ + if (flags == 0) + drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n", + (flags & BM_AIO_READ) ? "READ" : "WRITE", + count, jiffies - now); + + if (ctx->error) { + drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n"); + drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); + err = -EIO; /* ctx->error ? */ + } + + if (atomic_read(&ctx->in_flight)) + err = -EIO; /* Disk timeout/force-detach during IO... */ + + now = jiffies; + if (flags & BM_AIO_READ) { + b->bm_set = bm_count_bits(b); + drbd_info(device, "recounting of set bits took additional %lu jiffies\n", + jiffies - now); + } + now = b->bm_set; + + if ((flags & ~BM_AIO_READ) == 0) + drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", + ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); + + kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); + return err; +} + +/** + * drbd_bm_read() - Read the whole bitmap from its on disk location. + * @device: DRBD device. 
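bm_rw() relies on the common "bias the counter by one" trick: in_flight starts at 1 so completions racing with the submit loop can never see it reach zero early; only the submitter's final atomic_dec_and_test() (or the last endio after it) declares the batch done. A stripped-down kernel-style sketch of the pattern, independent of DRBD's structures; submit_one() is a hypothetical asynchronous worker whose completion path calls one_done():

#include <linux/atomic.h>
#include <linux/completion.h>

struct batch {
	atomic_t in_flight;              /* starts at 1: the submitter's bias */
	struct completion done;
};

static void one_done(struct batch *b)    /* called from each completion */
{
	if (atomic_dec_and_test(&b->in_flight))
		complete(&b->done);      /* last one out reports the batch */
}

static void submit_batch(struct batch *b, int n)
{
	int i;

	atomic_set(&b->in_flight, 1);    /* bias, so we cannot hit 0 early */
	init_completion(&b->done);

	for (i = 0; i < n; i++) {
		atomic_inc(&b->in_flight);
		/* submit_one(b, i);  -- async work, calls one_done() later */
	}

	/* drop the bias; wait only if work is still outstanding */
	if (!atomic_dec_and_test(&b->in_flight))
		wait_for_completion(&b->done);
}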
+ */ +int drbd_bm_read(struct drbd_device *device) __must_hold(local) +{ + return bm_rw(device, BM_AIO_READ, 0); +} + +/** + * drbd_bm_write() - Write the whole bitmap to its on disk location. + * @device: DRBD device. + * + * Will only write pages that have changed since last IO. + */ +int drbd_bm_write(struct drbd_device *device) __must_hold(local) +{ + return bm_rw(device, 0, 0); +} + +/** + * drbd_bm_write_all() - Write the whole bitmap to its on disk location. + * @device: DRBD device. + * + * Will write all pages. + */ +int drbd_bm_write_all(struct drbd_device *device) __must_hold(local) +{ + return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0); +} + +/** + * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed. + * @device: DRBD device. + * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages + */ +int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local) +{ + return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx); +} + +/** + * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location. + * @device: DRBD device. + * + * Will only write pages that have changed since last IO. + * In contrast to drbd_bm_write(), this will copy the bitmap pages + * to temporary writeout pages. It is intended to trigger a full write-out + * while still allowing the bitmap to change, for example if a resync or online + * verify is aborted due to a failed peer disk, while local IO continues, or + * pending resync acks are still being processed. + */ +int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local) +{ + return bm_rw(device, BM_AIO_COPY_PAGES, 0); +} + +/** + * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed. + * @device: DRBD device. + */ +int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local) +{ + return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0); +} + +/* NOTE + * find_first_bit returns int, we return unsigned long. + * For this to work on 32bit arch with bitnumbers > (1<<32), + * we'd need to return u64, and get a whole lot of other places + * fixed where we still use unsigned long. + * + * this returns a bit number, NOT a sector! 
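All of the wrappers above are thin flag selections around bm_rw() and carry __must_hold(local): the caller has to hold a reference on the backing device. A hedged usage sketch of that bracket only; real callers normally go through drbd_bitmap_io(), which additionally takes the bitmap lock as described in bm_rw()'s comment:

/* sketch only: how a caller brackets a bitmap write-out with get_ldev() */
static int flush_bitmap_example(struct drbd_device *device)
{
	int err = -ENODEV;

	if (get_ldev(device)) {                 /* pin the backing device */
		err = drbd_bm_write(device);    /* only pages changed since last IO */
		put_ldev(device);
	}
	return err;
}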
+ */ +static unsigned long __bm_find_next(struct drbd_device *device, unsigned long bm_fo, + const int find_zero_bit) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr; + unsigned long bit_offset; + unsigned i; + + + if (bm_fo > b->bm_bits) { + drbd_err(device, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits); + bm_fo = DRBD_END_OF_BITMAP; + } else { + while (bm_fo < b->bm_bits) { + /* bit offset of the first bit in the page */ + bit_offset = bm_fo & ~BITS_PER_PAGE_MASK; + p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo)); + + if (find_zero_bit) + i = find_next_zero_bit_le(p_addr, + PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK); + else + i = find_next_bit_le(p_addr, + PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK); + + __bm_unmap(p_addr); + if (i < PAGE_SIZE*8) { + bm_fo = bit_offset + i; + if (bm_fo >= b->bm_bits) + break; + goto found; + } + bm_fo = bit_offset + PAGE_SIZE*8; + } + bm_fo = DRBD_END_OF_BITMAP; + } + found: + return bm_fo; +} + +static unsigned long bm_find_next(struct drbd_device *device, + unsigned long bm_fo, const int find_zero_bit) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long i = DRBD_END_OF_BITMAP; + + if (!expect(b)) + return i; + if (!expect(b->bm_pages)) + return i; + + spin_lock_irq(&b->bm_lock); + if (BM_DONT_TEST & b->bm_flags) + bm_print_lock_info(device); + + i = __bm_find_next(device, bm_fo, find_zero_bit); + + spin_unlock_irq(&b->bm_lock); + return i; +} + +unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo) +{ + return bm_find_next(device, bm_fo, 0); +} + +#if 0 +/* not yet needed for anything. */ +unsigned long drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo) +{ + return bm_find_next(device, bm_fo, 1); +} +#endif + +/* does not spin_lock_irqsave. + * you must take drbd_bm_lock() first */ +unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo) +{ + /* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */ + return __bm_find_next(device, bm_fo, 0); +} + +unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo) +{ + /* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */ + return __bm_find_next(device, bm_fo, 1); +} + +/* returns number of bits actually changed. + * for val != 0, we change 0 -> 1, return code positive + * for val == 0, we change 1 -> 0, return code negative + * wants bitnr, not sector. + * expected to be called for only a few bits (e - s about BITS_PER_LONG). + * Must hold bitmap lock already. */ +static int __bm_change_bits_to(struct drbd_device *device, const unsigned long s, + unsigned long e, int val) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr = NULL; + unsigned long bitnr; + unsigned int last_page_nr = -1U; + int c = 0; + int changed_total = 0; + + if (e >= b->bm_bits) { + drbd_err(device, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n", + s, e, b->bm_bits); + e = b->bm_bits ? 
b->bm_bits -1 : 0; + } + for (bitnr = s; bitnr <= e; bitnr++) { + unsigned int page_nr = bm_bit_to_page_idx(b, bitnr); + if (page_nr != last_page_nr) { + if (p_addr) + __bm_unmap(p_addr); + if (c < 0) + bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]); + else if (c > 0) + bm_set_page_need_writeout(b->bm_pages[last_page_nr]); + changed_total += c; + c = 0; + p_addr = __bm_map_pidx(b, page_nr); + last_page_nr = page_nr; + } + if (val) + c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr)); + else + c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr)); + } + if (p_addr) + __bm_unmap(p_addr); + if (c < 0) + bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]); + else if (c > 0) + bm_set_page_need_writeout(b->bm_pages[last_page_nr]); + changed_total += c; + b->bm_set += changed_total; + return changed_total; +} + +/* returns number of bits actually changed. + * for val != 0, we change 0 -> 1, return code positive + * for val == 0, we change 1 -> 0, return code negative + * wants bitnr, not sector */ +static int bm_change_bits_to(struct drbd_device *device, const unsigned long s, + const unsigned long e, int val) +{ + unsigned long flags; + struct drbd_bitmap *b = device->bitmap; + int c = 0; + + if (!expect(b)) + return 1; + if (!expect(b->bm_pages)) + return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags) + bm_print_lock_info(device); + + c = __bm_change_bits_to(device, s, e, val); + + spin_unlock_irqrestore(&b->bm_lock, flags); + return c; +} + +/* returns number of bits changed 0 -> 1 */ +int drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e) +{ + return bm_change_bits_to(device, s, e, 1); +} + +/* returns number of bits changed 1 -> 0 */ +int drbd_bm_clear_bits(struct drbd_device *device, const unsigned long s, const unsigned long e) +{ + return -bm_change_bits_to(device, s, e, 0); +} + +/* sets all bits in full words, + * from first_word up to, but not including, last_word */ +static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, + int page_nr, int first_word, int last_word) +{ + int i; + int bits; + int changed = 0; + unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); + for (i = first_word; i < last_word; i++) { + bits = hweight_long(paddr[i]); + paddr[i] = ~0UL; + changed += BITS_PER_LONG - bits; + } + kunmap_atomic(paddr); + if (changed) { + /* We only need lazy writeout, the information is still in the + * remote bitmap as well, and is reconstructed during the next + * bitmap exchange, if lost locally due to a crash. */ + bm_set_page_lazy_writeout(b->bm_pages[page_nr]); + b->bm_set += changed; + } +} + +/* Same thing as drbd_bm_set_bits, + * but more efficient for a large bit range. + * You must first drbd_bm_lock(). + * Can be called to set the whole bitmap in one go. + * Sets bits from s to e _inclusive_. */ +void _drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e) +{ + /* First set_bit from the first bit (s) + * up to the next long boundary (sl), + * then assign full words up to the last long boundary (el), + * then set_bit up to and including the last bit (e). + * + * Do not use memset, because we must account for changes, + * so we need to loop over the words with hweight() anyways. 
+ */ + struct drbd_bitmap *b = device->bitmap; + unsigned long sl = ALIGN(s,BITS_PER_LONG); + unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1); + int first_page; + int last_page; + int page_nr; + int first_word; + int last_word; + + if (e - s <= 3*BITS_PER_LONG) { + /* don't bother; el and sl may even be wrong. */ + spin_lock_irq(&b->bm_lock); + __bm_change_bits_to(device, s, e, 1); + spin_unlock_irq(&b->bm_lock); + return; + } + + /* difference is large enough that we can trust sl and el */ + + spin_lock_irq(&b->bm_lock); + + /* bits filling the current long */ + if (sl) + __bm_change_bits_to(device, s, sl-1, 1); + + first_page = sl >> (3 + PAGE_SHIFT); + last_page = el >> (3 + PAGE_SHIFT); + + /* MLPP: modulo longs per page */ + /* LWPP: long words per page */ + first_word = MLPP(sl >> LN2_BPL); + last_word = LWPP; + + /* first and full pages, unless first page == last page */ + for (page_nr = first_page; page_nr < last_page; page_nr++) { + bm_set_full_words_within_one_page(device->bitmap, page_nr, first_word, last_word); + spin_unlock_irq(&b->bm_lock); + cond_resched(); + first_word = 0; + spin_lock_irq(&b->bm_lock); + } + /* last page (respectively only page, for first page == last page) */ + last_word = MLPP(el >> LN2_BPL); + + /* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1. (or multiples). + * ==> e = 32767, el = 32768, last_page = 2, + * and now last_word = 0. + * We do not want to touch last_page in this case, + * as we did not allocate it, it is not present in bitmap->bm_pages. + */ + if (last_word) + bm_set_full_words_within_one_page(device->bitmap, last_page, first_word, last_word); + + /* possibly trailing bits. + * example: (e & 63) == 63, el will be e+1. + * if that even was the very last bit, + * it would trigger an assert in __bm_change_bits_to() + */ + if (el <= e) + __bm_change_bits_to(device, el, e, 1); + spin_unlock_irq(&b->bm_lock); +} + +/* returns bit state + * wants bitnr, NOT sector. + * inherently racy... area needs to be locked by means of {al,rs}_lru + * 1 ... bit set + * 0 ... bit not set + * -1 ... first out of bounds access, stop testing for bits! + */ +int drbd_bm_test_bit(struct drbd_device *device, const unsigned long bitnr) +{ + unsigned long flags; + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr; + int i; + + if (!expect(b)) + return 0; + if (!expect(b->bm_pages)) + return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + if (BM_DONT_TEST & b->bm_flags) + bm_print_lock_info(device); + if (bitnr < b->bm_bits) { + p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr)); + i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0; + bm_unmap(p_addr); + } else if (bitnr == b->bm_bits) { + i = -1; + } else { /* (bitnr > b->bm_bits) */ + drbd_err(device, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits); + i = 0; + } + + spin_unlock_irqrestore(&b->bm_lock, flags); + return i; +} + +/* returns number of bits set in the range [s, e] */ +int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const unsigned long e) +{ + unsigned long flags; + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr = NULL; + unsigned long bitnr; + unsigned int page_nr = -1U; + int c = 0; + + /* If this is called without a bitmap, that is a bug. 
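The fast path of _drbd_bm_set_bits() splits [s, e] into a leading partial word up to sl, full long words up to el, and a trailing partial word; sl and el are the enclosing long-word boundaries. A user-space check of that split for one concrete range (64-bit longs assumed):

#include <stdio.h>

#define BITS_PER_LONG 64UL               /* assumed */

int main(void)
{
	unsigned long s = 70, e = 300;   /* bits 70..300 inclusive */
	unsigned long sl = (s + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
	unsigned long el = (e + 1) & ~(BITS_PER_LONG - 1);

	/* bits 70..127 are set individually, the words covering 128..255
	 * are assigned whole, bits 256..300 are set individually again */
	printf("head: bits %lu..%lu\n", s, sl - 1);
	printf("full words: bits %lu..%lu\n", sl, el - 1);
	printf("tail: bits %lu..%lu\n", el, e);
	return 0;
}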
But just to be + * robust in case we screwed up elsewhere, in that case pretend there + * was one dirty bit in the requested area, so we won't try to do a + * local read there (no bitmap probably implies no disk) */ + if (!expect(b)) + return 1; + if (!expect(b->bm_pages)) + return 1; + + spin_lock_irqsave(&b->bm_lock, flags); + if (BM_DONT_TEST & b->bm_flags) + bm_print_lock_info(device); + for (bitnr = s; bitnr <= e; bitnr++) { + unsigned int idx = bm_bit_to_page_idx(b, bitnr); + if (page_nr != idx) { + page_nr = idx; + if (p_addr) + bm_unmap(p_addr); + p_addr = bm_map_pidx(b, idx); + } + if (expect(bitnr < b->bm_bits)) + c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); + else + drbd_err(device, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); + } + if (p_addr) + bm_unmap(p_addr); + spin_unlock_irqrestore(&b->bm_lock, flags); + return c; +} + + +/* inherently racy... + * return value may be already out-of-date when this function returns. + * but the general usage is that this is only use during a cstate when bits are + * only cleared, not set, and typically only care for the case when the return + * value is zero, or we already "locked" this "bitmap extent" by other means. + * + * enr is bm-extent number, since we chose to name one sector (512 bytes) + * worth of the bitmap a "bitmap extent". + * + * TODO + * I think since we use it like a reference count, we should use the real + * reference count of some bitmap extent element from some lru instead... + * + */ +int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr) +{ + struct drbd_bitmap *b = device->bitmap; + int count, s, e; + unsigned long flags; + unsigned long *p_addr, *bm; + + if (!expect(b)) + return 0; + if (!expect(b->bm_pages)) + return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + if (BM_DONT_TEST & b->bm_flags) + bm_print_lock_info(device); + + s = S2W(enr); + e = min((size_t)S2W(enr+1), b->bm_words); + count = 0; + if (s < b->bm_words) { + int n = e-s; + p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s)); + bm = p_addr + MLPP(s); + while (n--) + count += hweight_long(*bm++); + bm_unmap(p_addr); + } else { + drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s); + } + spin_unlock_irqrestore(&b->bm_lock, flags); + return count; +} diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c new file mode 100644 index 000000000..a6ee3d750 --- /dev/null +++ b/drivers/block/drbd/drbd_debugfs.c @@ -0,0 +1,958 @@ +#define pr_fmt(fmt) "drbd debugfs: " fmt +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/stat.h> +#include <linux/jiffies.h> +#include <linux/list.h> + +#include "drbd_int.h" +#include "drbd_req.h" +#include "drbd_debugfs.h" + + +/********************************************************************** + * Whenever you change the file format, remember to bump the version. 
* + **********************************************************************/ + +static struct dentry *drbd_debugfs_root; +static struct dentry *drbd_debugfs_version; +static struct dentry *drbd_debugfs_resources; +static struct dentry *drbd_debugfs_minors; + +static void seq_print_age_or_dash(struct seq_file *m, bool valid, unsigned long dt) +{ + if (valid) + seq_printf(m, "\t%d", jiffies_to_msecs(dt)); + else + seq_printf(m, "\t-"); +} + +static void __seq_print_rq_state_bit(struct seq_file *m, + bool is_set, char *sep, const char *set_name, const char *unset_name) +{ + if (is_set && set_name) { + seq_putc(m, *sep); + seq_puts(m, set_name); + *sep = '|'; + } else if (!is_set && unset_name) { + seq_putc(m, *sep); + seq_puts(m, unset_name); + *sep = '|'; + } +} + +static void seq_print_rq_state_bit(struct seq_file *m, + bool is_set, char *sep, const char *set_name) +{ + __seq_print_rq_state_bit(m, is_set, sep, set_name, NULL); +} + +/* pretty print enum drbd_req_state_bits req->rq_state */ +static void seq_print_request_state(struct seq_file *m, struct drbd_request *req) +{ + unsigned int s = req->rq_state; + char sep = ' '; + seq_printf(m, "\t0x%08x", s); + seq_printf(m, "\tmaster: %s", req->master_bio ? "pending" : "completed"); + + /* RQ_WRITE ignored, already reported */ + seq_puts(m, "\tlocal:"); + seq_print_rq_state_bit(m, s & RQ_IN_ACT_LOG, &sep, "in-AL"); + seq_print_rq_state_bit(m, s & RQ_POSTPONED, &sep, "postponed"); + seq_print_rq_state_bit(m, s & RQ_COMPLETION_SUSP, &sep, "suspended"); + sep = ' '; + seq_print_rq_state_bit(m, s & RQ_LOCAL_PENDING, &sep, "pending"); + seq_print_rq_state_bit(m, s & RQ_LOCAL_COMPLETED, &sep, "completed"); + seq_print_rq_state_bit(m, s & RQ_LOCAL_ABORTED, &sep, "aborted"); + seq_print_rq_state_bit(m, s & RQ_LOCAL_OK, &sep, "ok"); + if (sep == ' ') + seq_puts(m, " -"); + + /* for_each_connection ... */ + seq_printf(m, "\tnet:"); + sep = ' '; + seq_print_rq_state_bit(m, s & RQ_NET_PENDING, &sep, "pending"); + seq_print_rq_state_bit(m, s & RQ_NET_QUEUED, &sep, "queued"); + seq_print_rq_state_bit(m, s & RQ_NET_SENT, &sep, "sent"); + seq_print_rq_state_bit(m, s & RQ_NET_DONE, &sep, "done"); + seq_print_rq_state_bit(m, s & RQ_NET_SIS, &sep, "sis"); + seq_print_rq_state_bit(m, s & RQ_NET_OK, &sep, "ok"); + if (sep == ' ') + seq_puts(m, " -"); + + seq_printf(m, " :"); + sep = ' '; + seq_print_rq_state_bit(m, s & RQ_EXP_RECEIVE_ACK, &sep, "B"); + seq_print_rq_state_bit(m, s & RQ_EXP_WRITE_ACK, &sep, "C"); + seq_print_rq_state_bit(m, s & RQ_EXP_BARR_ACK, &sep, "barr"); + if (sep == ' ') + seq_puts(m, " -"); + seq_printf(m, "\n"); +} + +static void seq_print_one_request(struct seq_file *m, struct drbd_request *req, unsigned long now) +{ + /* change anything here, fixup header below! */ + unsigned int s = req->rq_state; + +#define RQ_HDR_1 "epoch\tsector\tsize\trw" + seq_printf(m, "0x%x\t%llu\t%u\t%s", + req->epoch, + (unsigned long long)req->i.sector, req->i.size >> 9, + (s & RQ_WRITE) ? 
"W" : "R"); + +#define RQ_HDR_2 "\tstart\tin AL\tsubmit" + seq_printf(m, "\t%d", jiffies_to_msecs(now - req->start_jif)); + seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif); + seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif); + +#define RQ_HDR_3 "\tsent\tacked\tdone" + seq_print_age_or_dash(m, s & RQ_NET_SENT, now - req->pre_send_jif); + seq_print_age_or_dash(m, (s & RQ_NET_SENT) && !(s & RQ_NET_PENDING), now - req->acked_jif); + seq_print_age_or_dash(m, s & RQ_NET_DONE, now - req->net_done_jif); + +#define RQ_HDR_4 "\tstate\n" + seq_print_request_state(m, req); +} +#define RQ_HDR RQ_HDR_1 RQ_HDR_2 RQ_HDR_3 RQ_HDR_4 + +static void seq_print_minor_vnr_req(struct seq_file *m, struct drbd_request *req, unsigned long now) +{ + seq_printf(m, "%u\t%u\t", req->device->minor, req->device->vnr); + seq_print_one_request(m, req, now); +} + +static void seq_print_resource_pending_meta_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now) +{ + struct drbd_device *device; + unsigned int i; + + seq_puts(m, "minor\tvnr\tstart\tsubmit\tintent\n"); + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, i) { + struct drbd_md_io tmp; + /* In theory this is racy, + * in the sense that there could have been a + * drbd_md_put_buffer(); drbd_md_get_buffer(); + * between accessing these members here. */ + tmp = device->md_io; + if (atomic_read(&tmp.in_use)) { + seq_printf(m, "%u\t%u\t%d\t", + device->minor, device->vnr, + jiffies_to_msecs(now - tmp.start_jif)); + if (time_before(tmp.submit_jif, tmp.start_jif)) + seq_puts(m, "-\t"); + else + seq_printf(m, "%d\t", jiffies_to_msecs(now - tmp.submit_jif)); + seq_printf(m, "%s\n", tmp.current_use); + } + } + rcu_read_unlock(); +} + +static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *resource, unsigned long now) +{ + struct drbd_device *device; + unsigned int i; + + seq_puts(m, "minor\tvnr\tage\t#waiting\n"); + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, i) { + unsigned long jif; + struct drbd_request *req; + int n = atomic_read(&device->ap_actlog_cnt); + if (n) { + spin_lock_irq(&device->resource->req_lock); + req = list_first_entry_or_null(&device->pending_master_completion[1], + struct drbd_request, req_pending_master_completion); + /* if the oldest request does not wait for the activity log + * it is not interesting for us here */ + if (req && !(req->rq_state & RQ_IN_ACT_LOG)) + jif = req->start_jif; + else + req = NULL; + spin_unlock_irq(&device->resource->req_lock); + } + if (n) { + seq_printf(m, "%u\t%u\t", device->minor, device->vnr); + if (req) + seq_printf(m, "%u\t", jiffies_to_msecs(now - jif)); + else + seq_puts(m, "-\t"); + seq_printf(m, "%u\n", n); + } + } + rcu_read_unlock(); +} + +static void seq_print_device_bitmap_io(struct seq_file *m, struct drbd_device *device, unsigned long now) +{ + struct drbd_bm_aio_ctx *ctx; + unsigned long start_jif; + unsigned int in_flight; + unsigned int flags; + spin_lock_irq(&device->resource->req_lock); + ctx = list_first_entry_or_null(&device->pending_bitmap_io, struct drbd_bm_aio_ctx, list); + if (ctx && ctx->done) + ctx = NULL; + if (ctx) { + start_jif = ctx->start_jif; + in_flight = atomic_read(&ctx->in_flight); + flags = ctx->flags; + } + spin_unlock_irq(&device->resource->req_lock); + if (ctx) { + seq_printf(m, "%u\t%u\t%c\t%u\t%u\n", + device->minor, device->vnr, + (flags & BM_AIO_READ) ? 
'R' : 'W', + jiffies_to_msecs(now - start_jif), + in_flight); + } +} + +static void seq_print_resource_pending_bitmap_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now) +{ + struct drbd_device *device; + unsigned int i; + + seq_puts(m, "minor\tvnr\trw\tage\t#in-flight\n"); + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, i) { + seq_print_device_bitmap_io(m, device, now); + } + rcu_read_unlock(); +} + +/* pretty print enum peer_req->flags */ +static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_request *peer_req) +{ + unsigned long f = peer_req->flags; + char sep = ' '; + + __seq_print_rq_state_bit(m, f & EE_SUBMITTED, &sep, "submitted", "preparing"); + __seq_print_rq_state_bit(m, f & EE_APPLICATION, &sep, "application", "internal"); + seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL"); + seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C"); + seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync"); + + if (f & EE_IS_TRIM) { + seq_putc(m, sep); + sep = '|'; + if (f & EE_IS_TRIM_USE_ZEROOUT) + seq_puts(m, "zero-out"); + else + seq_puts(m, "trim"); + } + seq_putc(m, '\n'); +} + +static void seq_print_peer_request(struct seq_file *m, + struct drbd_device *device, struct list_head *lh, + unsigned long now) +{ + bool reported_preparing = false; + struct drbd_peer_request *peer_req; + list_for_each_entry(peer_req, lh, w.list) { + if (reported_preparing && !(peer_req->flags & EE_SUBMITTED)) + continue; + + if (device) + seq_printf(m, "%u\t%u\t", device->minor, device->vnr); + + seq_printf(m, "%llu\t%u\t%c\t%u\t", + (unsigned long long)peer_req->i.sector, peer_req->i.size >> 9, + (peer_req->flags & EE_WRITE) ? 'W' : 'R', + jiffies_to_msecs(now - peer_req->submit_jif)); + seq_print_peer_request_flags(m, peer_req); + if (peer_req->flags & EE_SUBMITTED) + break; + else + reported_preparing = true; + } +} + +static void seq_print_device_peer_requests(struct seq_file *m, + struct drbd_device *device, unsigned long now) +{ + seq_puts(m, "minor\tvnr\tsector\tsize\trw\tage\tflags\n"); + spin_lock_irq(&device->resource->req_lock); + seq_print_peer_request(m, device, &device->active_ee, now); + seq_print_peer_request(m, device, &device->read_ee, now); + seq_print_peer_request(m, device, &device->sync_ee, now); + spin_unlock_irq(&device->resource->req_lock); + if (test_bit(FLUSH_PENDING, &device->flags)) { + seq_printf(m, "%u\t%u\t-\t-\tF\t%u\tflush\n", + device->minor, device->vnr, + jiffies_to_msecs(now - device->flush_jif)); + } +} + +static void seq_print_resource_pending_peer_requests(struct seq_file *m, + struct drbd_resource *resource, unsigned long now) +{ + struct drbd_device *device; + unsigned int i; + + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, i) { + seq_print_device_peer_requests(m, device, now); + } + rcu_read_unlock(); +} + +static void seq_print_resource_transfer_log_summary(struct seq_file *m, + struct drbd_resource *resource, + struct drbd_connection *connection, + unsigned long now) +{ + struct drbd_request *req; + unsigned int count = 0; + unsigned int show_state = 0; + + seq_puts(m, "n\tdevice\tvnr\t" RQ_HDR); + spin_lock_irq(&resource->req_lock); + list_for_each_entry(req, &connection->transfer_log, tl_requests) { + unsigned int tmp = 0; + unsigned int s; + ++count; + + /* don't disable irq "forever" */ + if (!(count & 0x1ff)) { + struct drbd_request *req_next; + kref_get(&req->kref); + spin_unlock_irq(&resource->req_lock); + cond_resched(); + 
spin_lock_irq(&resource->req_lock); + req_next = list_next_entry(req, tl_requests); + if (kref_put(&req->kref, drbd_req_destroy)) + req = req_next; + if (&req->tl_requests == &connection->transfer_log) + break; + } + + s = req->rq_state; + + /* This is meant to summarize timing issues, to be able to tell + * local disk problems from network problems. + * Skip requests, if we have shown an even older request with + * similar aspects already. */ + if (req->master_bio == NULL) + tmp |= 1; + if ((s & RQ_LOCAL_MASK) && (s & RQ_LOCAL_PENDING)) + tmp |= 2; + if (s & RQ_NET_MASK) { + if (!(s & RQ_NET_SENT)) + tmp |= 4; + if (s & RQ_NET_PENDING) + tmp |= 8; + if (!(s & RQ_NET_DONE)) + tmp |= 16; + } + if ((tmp & show_state) == tmp) + continue; + show_state |= tmp; + seq_printf(m, "%u\t", count); + seq_print_minor_vnr_req(m, req, now); + if (show_state == 0x1f) + break; + } + spin_unlock_irq(&resource->req_lock); +} + +/* TODO: transfer_log and friends should be moved to resource */ +static int in_flight_summary_show(struct seq_file *m, void *pos) +{ + struct drbd_resource *resource = m->private; + struct drbd_connection *connection; + unsigned long jif = jiffies; + + connection = first_connection(resource); + /* This does not happen, actually. + * But be robust and prepare for future code changes. */ + if (!connection || !kref_get_unless_zero(&connection->kref)) + return -ESTALE; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + seq_puts(m, "oldest bitmap IO\n"); + seq_print_resource_pending_bitmap_io(m, resource, jif); + seq_putc(m, '\n'); + + seq_puts(m, "meta data IO\n"); + seq_print_resource_pending_meta_io(m, resource, jif); + seq_putc(m, '\n'); + + seq_puts(m, "socket buffer stats\n"); + /* for each connection ... once we have more than one */ + rcu_read_lock(); + if (connection->data.socket) { + /* open coded SIOCINQ, the "relevant" part */ + struct tcp_sock *tp = tcp_sk(connection->data.socket->sk); + int answ = tp->rcv_nxt - tp->copied_seq; + seq_printf(m, "unread receive buffer: %u Byte\n", answ); + /* open coded SIOCOUTQ, the "relevant" part */ + answ = tp->write_seq - tp->snd_una; + seq_printf(m, "unacked send buffer: %u Byte\n", answ); + } + rcu_read_unlock(); + seq_putc(m, '\n'); + + seq_puts(m, "oldest peer requests\n"); + seq_print_resource_pending_peer_requests(m, resource, jif); + seq_putc(m, '\n'); + + seq_puts(m, "application requests waiting for activity log\n"); + seq_print_waiting_for_AL(m, resource, jif); + seq_putc(m, '\n'); + + seq_puts(m, "oldest application requests\n"); + seq_print_resource_transfer_log_summary(m, resource, connection, jif); + seq_putc(m, '\n'); + + jif = jiffies - jif; + if (jif) + seq_printf(m, "generated in %d ms\n", jiffies_to_msecs(jif)); + kref_put(&connection->kref, drbd_destroy_connection); + return 0; +} + +/* simple_positive(file->f_path.dentry) respectively debugfs_positive(), + * but neither is "reachable" from here. + * So we have our own inline version of it above. :-( */ +static inline int debugfs_positive(struct dentry *dentry) +{ + return d_really_is_positive(dentry) && !d_unhashed(dentry); +} + +/* make sure at *open* time that the respective object won't go away. */ +static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *), + void *data, struct kref *kref, + void (*release)(struct kref *)) +{ + struct dentry *parent; + int ret = -ESTALE; + + /* Are we still linked, + * or has debugfs_remove() already been called? 
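drbd_single_open() pins the object behind a debugfs file for as long as the file is open: kref_get_unless_zero() at open time (under the parent directory's lock in the real code), and the matching kref_put() in each *_release() handler. A minimal sketch of the same idea for a hypothetical refcounted object "struct foo"; the parent-directory locking is left out here:

#include <linux/fs.h>
#include <linux/kref.h>
#include <linux/seq_file.h>

struct foo { struct kref kref; };
static void foo_free(struct kref *kref) { /* free the object here */ }
static int foo_show(struct seq_file *m, void *v) { return 0; }

static int foo_open(struct inode *inode, struct file *file)
{
	struct foo *foo = inode->i_private;
	int ret;

	if (!kref_get_unless_zero(&foo->kref))   /* object already dying? */
		return -ESTALE;
	ret = single_open(file, foo_show, foo);
	if (ret)
		kref_put(&foo->kref, foo_free);   /* undo the pin on failure */
	return ret;
}

static int foo_fops_release(struct inode *inode, struct file *file)
{
	struct foo *foo = inode->i_private;

	kref_put(&foo->kref, foo_free);           /* drop the open-time pin */
	return single_release(inode, file);
}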
*/ + parent = file->f_path.dentry->d_parent; + /* not sure if this can happen: */ + if (!parent || d_really_is_negative(parent)) + goto out; + /* serialize with d_delete() */ + mutex_lock(&d_inode(parent)->i_mutex); + /* Make sure the object is still alive */ + if (debugfs_positive(file->f_path.dentry) + && kref_get_unless_zero(kref)) + ret = 0; + mutex_unlock(&d_inode(parent)->i_mutex); + if (!ret) { + ret = single_open(file, show, data); + if (ret) + kref_put(kref, release); + } +out: + return ret; +} + +static int in_flight_summary_open(struct inode *inode, struct file *file) +{ + struct drbd_resource *resource = inode->i_private; + return drbd_single_open(file, in_flight_summary_show, resource, + &resource->kref, drbd_destroy_resource); +} + +static int in_flight_summary_release(struct inode *inode, struct file *file) +{ + struct drbd_resource *resource = inode->i_private; + kref_put(&resource->kref, drbd_destroy_resource); + return single_release(inode, file); +} + +static const struct file_operations in_flight_summary_fops = { + .owner = THIS_MODULE, + .open = in_flight_summary_open, + .read = seq_read, + .llseek = seq_lseek, + .release = in_flight_summary_release, +}; + +void drbd_debugfs_resource_add(struct drbd_resource *resource) +{ + struct dentry *dentry; + if (!drbd_debugfs_resources) + return; + + dentry = debugfs_create_dir(resource->name, drbd_debugfs_resources); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + resource->debugfs_res = dentry; + + dentry = debugfs_create_dir("volumes", resource->debugfs_res); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + resource->debugfs_res_volumes = dentry; + + dentry = debugfs_create_dir("connections", resource->debugfs_res); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + resource->debugfs_res_connections = dentry; + + dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP, + resource->debugfs_res, resource, + &in_flight_summary_fops); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + resource->debugfs_res_in_flight_summary = dentry; + return; + +fail: + drbd_debugfs_resource_cleanup(resource); + drbd_err(resource, "failed to create debugfs dentry\n"); +} + +static void drbd_debugfs_remove(struct dentry **dp) +{ + debugfs_remove(*dp); + *dp = NULL; +} + +void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) +{ + /* it is ok to call debugfs_remove(NULL) */ + drbd_debugfs_remove(&resource->debugfs_res_in_flight_summary); + drbd_debugfs_remove(&resource->debugfs_res_connections); + drbd_debugfs_remove(&resource->debugfs_res_volumes); + drbd_debugfs_remove(&resource->debugfs_res); +} + +static void seq_print_one_timing_detail(struct seq_file *m, + const struct drbd_thread_timing_details *tdp, + unsigned long now) +{ + struct drbd_thread_timing_details td; + /* No locking... + * use temporary assignment to get at consistent data. */ + do { + td = *tdp; + } while (td.cb_nr != tdp->cb_nr); + if (!td.cb_addr) + return; + seq_printf(m, "%u\t%d\t%s:%u\t%ps\n", + td.cb_nr, + jiffies_to_msecs(now - td.start_jif), + td.caller_fn, td.line, + td.cb_addr); +} + +static void seq_print_timing_details(struct seq_file *m, + const char *title, + unsigned int cb_nr, struct drbd_thread_timing_details *tdp, unsigned long now) +{ + unsigned int start_idx; + unsigned int i; + + seq_printf(m, "%s\n", title); + /* If not much is going on, this will result in natural ordering. + * If it is very busy, we will possibly skip events, or even see wrap + * arounds, which could only be avoided with locking. 
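With a free-running event counter cb_nr and a ring of DRBD_THREAD_DETAILS_HIST slots, slot cb_nr % HIST is the next one to be overwritten, i.e. the oldest entry; printing from there to the end of the array and then wrapping around to the start, as the loop below does, yields oldest-first order. A small user-space model of that dump order (ring size 8 is just an example):

#include <stdio.h>

#define HIST 8                           /* stands in for DRBD_THREAD_DETAILS_HIST */

int main(void)
{
	int ring[HIST];
	unsigned int cb_nr, i, start;

	/* record 11 events into the 8-slot ring; the first 3 get overwritten */
	for (cb_nr = 0; cb_nr < 11; cb_nr++)
		ring[cb_nr % HIST] = cb_nr;

	start = cb_nr % HIST;            /* oldest surviving entry lives here */
	for (i = start; i < HIST; i++)
		printf("%d ", ring[i]);
	for (i = 0; i < start; i++)
		printf("%d ", ring[i]);
	printf("\n");                    /* prints: 3 4 5 6 7 8 9 10 */
	return 0;
}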
+ */ + start_idx = cb_nr % DRBD_THREAD_DETAILS_HIST; + for (i = start_idx; i < DRBD_THREAD_DETAILS_HIST; i++) + seq_print_one_timing_detail(m, tdp+i, now); + for (i = 0; i < start_idx; i++) + seq_print_one_timing_detail(m, tdp+i, now); +} + +static int callback_history_show(struct seq_file *m, void *ignored) +{ + struct drbd_connection *connection = m->private; + unsigned long jif = jiffies; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + seq_puts(m, "n\tage\tcallsite\tfn\n"); + seq_print_timing_details(m, "worker", connection->w_cb_nr, connection->w_timing_details, jif); + seq_print_timing_details(m, "receiver", connection->r_cb_nr, connection->r_timing_details, jif); + return 0; +} + +static int callback_history_open(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + return drbd_single_open(file, callback_history_show, connection, + &connection->kref, drbd_destroy_connection); +} + +static int callback_history_release(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + kref_put(&connection->kref, drbd_destroy_connection); + return single_release(inode, file); +} + +static const struct file_operations connection_callback_history_fops = { + .owner = THIS_MODULE, + .open = callback_history_open, + .read = seq_read, + .llseek = seq_lseek, + .release = callback_history_release, +}; + +static int connection_oldest_requests_show(struct seq_file *m, void *ignored) +{ + struct drbd_connection *connection = m->private; + unsigned long now = jiffies; + struct drbd_request *r1, *r2; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + spin_lock_irq(&connection->resource->req_lock); + r1 = connection->req_next; + if (r1) + seq_print_minor_vnr_req(m, r1, now); + r2 = connection->req_ack_pending; + if (r2 && r2 != r1) { + r1 = r2; + seq_print_minor_vnr_req(m, r1, now); + } + r2 = connection->req_not_net_done; + if (r2 && r2 != r1) + seq_print_minor_vnr_req(m, r2, now); + spin_unlock_irq(&connection->resource->req_lock); + return 0; +} + +static int connection_oldest_requests_open(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + return drbd_single_open(file, connection_oldest_requests_show, connection, + &connection->kref, drbd_destroy_connection); +} + +static int connection_oldest_requests_release(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + kref_put(&connection->kref, drbd_destroy_connection); + return single_release(inode, file); +} + +static const struct file_operations connection_oldest_requests_fops = { + .owner = THIS_MODULE, + .open = connection_oldest_requests_open, + .read = seq_read, + .llseek = seq_lseek, + .release = connection_oldest_requests_release, +}; + +void drbd_debugfs_connection_add(struct drbd_connection *connection) +{ + struct dentry *conns_dir = connection->resource->debugfs_res_connections; + struct dentry *dentry; + if (!conns_dir) + return; + + /* Once we enable mutliple peers, + * these connections will have descriptive names. + * For now, it is just the one connection to the (only) "peer". 
*/ + dentry = debugfs_create_dir("peer", conns_dir); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + connection->debugfs_conn = dentry; + + dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP, + connection->debugfs_conn, connection, + &connection_callback_history_fops); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + connection->debugfs_conn_callback_history = dentry; + + dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP, + connection->debugfs_conn, connection, + &connection_oldest_requests_fops); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + connection->debugfs_conn_oldest_requests = dentry; + return; + +fail: + drbd_debugfs_connection_cleanup(connection); + drbd_err(connection, "failed to create debugfs dentry\n"); +} + +void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) +{ + drbd_debugfs_remove(&connection->debugfs_conn_callback_history); + drbd_debugfs_remove(&connection->debugfs_conn_oldest_requests); + drbd_debugfs_remove(&connection->debugfs_conn); +} + +static void resync_dump_detail(struct seq_file *m, struct lc_element *e) +{ + struct bm_extent *bme = lc_entry(e, struct bm_extent, lce); + + seq_printf(m, "%5d %s %s %s", bme->rs_left, + test_bit(BME_NO_WRITES, &bme->flags) ? "NO_WRITES" : "---------", + test_bit(BME_LOCKED, &bme->flags) ? "LOCKED" : "------", + test_bit(BME_PRIORITY, &bme->flags) ? "PRIORITY" : "--------" + ); +} + +static int device_resync_extents_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + if (get_ldev_if_state(device, D_FAILED)) { + lc_seq_printf_stats(m, device->resync); + lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail); + put_ldev(device); + } + return 0; +} + +static int device_act_log_extents_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + if (get_ldev_if_state(device, D_FAILED)) { + lc_seq_printf_stats(m, device->act_log); + lc_seq_dump_details(m, device->act_log, "", NULL); + put_ldev(device); + } + return 0; +} + +static int device_oldest_requests_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + struct drbd_resource *resource = device->resource; + unsigned long now = jiffies; + struct drbd_request *r1, *r2; + int i; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + seq_puts(m, RQ_HDR); + spin_lock_irq(&resource->req_lock); + /* WRITE, then READ */ + for (i = 1; i >= 0; --i) { + r1 = list_first_entry_or_null(&device->pending_master_completion[i], + struct drbd_request, req_pending_master_completion); + r2 = list_first_entry_or_null(&device->pending_completion[i], + struct drbd_request, req_pending_local); + if (r1) + seq_print_one_request(m, r1, now); + if (r2 && r2 != r1) + seq_print_one_request(m, r2, now); + } + spin_unlock_irq(&resource->req_lock); + return 0; +} + +static int device_data_gen_id_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + struct drbd_md *md; + enum drbd_uuid_index idx; + + if (!get_ldev_if_state(device, D_FAILED)) + return -ENODEV; + + md = &device->ldev->md; + spin_lock_irq(&md->uuid_lock); + for (idx = UI_CURRENT; idx <= UI_HISTORY_END; idx++) { + seq_printf(m, "0x%016llX\n", md->uuid[idx]); + } + spin_unlock_irq(&md->uuid_lock); + 
put_ldev(device); + return 0; +} + +#define drbd_debugfs_device_attr(name) \ +static int device_ ## name ## _open(struct inode *inode, struct file *file) \ +{ \ + struct drbd_device *device = inode->i_private; \ + return drbd_single_open(file, device_ ## name ## _show, device, \ + &device->kref, drbd_destroy_device); \ +} \ +static int device_ ## name ## _release(struct inode *inode, struct file *file) \ +{ \ + struct drbd_device *device = inode->i_private; \ + kref_put(&device->kref, drbd_destroy_device); \ + return single_release(inode, file); \ +} \ +static const struct file_operations device_ ## name ## _fops = { \ + .owner = THIS_MODULE, \ + .open = device_ ## name ## _open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = device_ ## name ## _release, \ +}; + +drbd_debugfs_device_attr(oldest_requests) +drbd_debugfs_device_attr(act_log_extents) +drbd_debugfs_device_attr(resync_extents) +drbd_debugfs_device_attr(data_gen_id) + +void drbd_debugfs_device_add(struct drbd_device *device) +{ + struct dentry *vols_dir = device->resource->debugfs_res_volumes; + char minor_buf[8]; /* MINORMASK, MINORBITS == 20; */ + char vnr_buf[8]; /* volume number vnr is even 16 bit only; */ + char *slink_name = NULL; + + struct dentry *dentry; + if (!vols_dir || !drbd_debugfs_minors) + return; + + snprintf(vnr_buf, sizeof(vnr_buf), "%u", device->vnr); + dentry = debugfs_create_dir(vnr_buf, vols_dir); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + device->debugfs_vol = dentry; + + snprintf(minor_buf, sizeof(minor_buf), "%u", device->minor); + slink_name = kasprintf(GFP_KERNEL, "../resources/%s/volumes/%u", + device->resource->name, device->vnr); + if (!slink_name) + goto fail; + dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name); + kfree(slink_name); + slink_name = NULL; + if (IS_ERR_OR_NULL(dentry)) + goto fail; + device->debugfs_minor = dentry; + +#define DCF(name) do { \ + dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP, \ + device->debugfs_vol, device, \ + &device_ ## name ## _fops); \ + if (IS_ERR_OR_NULL(dentry)) \ + goto fail; \ + device->debugfs_vol_ ## name = dentry; \ + } while (0) + + DCF(oldest_requests); + DCF(act_log_extents); + DCF(resync_extents); + DCF(data_gen_id); +#undef DCF + return; + +fail: + drbd_debugfs_device_cleanup(device); + drbd_err(device, "failed to create debugfs entries\n"); +} + +void drbd_debugfs_device_cleanup(struct drbd_device *device) +{ + drbd_debugfs_remove(&device->debugfs_minor); + drbd_debugfs_remove(&device->debugfs_vol_oldest_requests); + drbd_debugfs_remove(&device->debugfs_vol_act_log_extents); + drbd_debugfs_remove(&device->debugfs_vol_resync_extents); + drbd_debugfs_remove(&device->debugfs_vol_data_gen_id); + drbd_debugfs_remove(&device->debugfs_vol); +} + +void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) +{ + struct dentry *conn_dir = peer_device->connection->debugfs_conn; + struct dentry *dentry; + char vnr_buf[8]; + + if (!conn_dir) + return; + + snprintf(vnr_buf, sizeof(vnr_buf), "%u", peer_device->device->vnr); + dentry = debugfs_create_dir(vnr_buf, conn_dir); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + peer_device->debugfs_peer_dev = dentry; + return; + +fail: + drbd_debugfs_peer_device_cleanup(peer_device); + drbd_err(peer_device, "failed to create debugfs entries\n"); +} + +void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) +{ + drbd_debugfs_remove(&peer_device->debugfs_peer_dev); +} + +static int drbd_version_show(struct seq_file *m, void *ignored) +{ + 
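For reference, hand-expanding drbd_debugfs_device_attr(oldest_requests) from the macro above yields exactly the open/release/file_operations triple below; DCF(oldest_requests) then only wires device_oldest_requests_fops to a debugfs file of the same name under the volume directory:

static int device_oldest_requests_open(struct inode *inode, struct file *file)
{
	struct drbd_device *device = inode->i_private;
	return drbd_single_open(file, device_oldest_requests_show, device,
				&device->kref, drbd_destroy_device);
}
static int device_oldest_requests_release(struct inode *inode, struct file *file)
{
	struct drbd_device *device = inode->i_private;
	kref_put(&device->kref, drbd_destroy_device);
	return single_release(inode, file);
}
static const struct file_operations device_oldest_requests_fops = {
	.owner   = THIS_MODULE,
	.open    = device_oldest_requests_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = device_oldest_requests_release,
};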
seq_printf(m, "# %s\n", drbd_buildtag()); + seq_printf(m, "VERSION=%s\n", REL_VERSION); + seq_printf(m, "API_VERSION=%u\n", API_VERSION); + seq_printf(m, "PRO_VERSION_MIN=%u\n", PRO_VERSION_MIN); + seq_printf(m, "PRO_VERSION_MAX=%u\n", PRO_VERSION_MAX); + return 0; +} + +static int drbd_version_open(struct inode *inode, struct file *file) +{ + return single_open(file, drbd_version_show, NULL); +} + +static struct file_operations drbd_version_fops = { + .owner = THIS_MODULE, + .open = drbd_version_open, + .llseek = seq_lseek, + .read = seq_read, + .release = single_release, +}; + +/* not __exit, may be indirectly called + * from the module-load-failure path as well. */ +void drbd_debugfs_cleanup(void) +{ + drbd_debugfs_remove(&drbd_debugfs_resources); + drbd_debugfs_remove(&drbd_debugfs_minors); + drbd_debugfs_remove(&drbd_debugfs_version); + drbd_debugfs_remove(&drbd_debugfs_root); +} + +int __init drbd_debugfs_init(void) +{ + struct dentry *dentry; + + dentry = debugfs_create_dir("drbd", NULL); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + drbd_debugfs_root = dentry; + + dentry = debugfs_create_file("version", 0444, drbd_debugfs_root, NULL, &drbd_version_fops); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + drbd_debugfs_version = dentry; + + dentry = debugfs_create_dir("resources", drbd_debugfs_root); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + drbd_debugfs_resources = dentry; + + dentry = debugfs_create_dir("minors", drbd_debugfs_root); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + drbd_debugfs_minors = dentry; + return 0; + +fail: + drbd_debugfs_cleanup(); + if (dentry) + return PTR_ERR(dentry); + else + return -EINVAL; +} diff --git a/drivers/block/drbd/drbd_debugfs.h b/drivers/block/drbd/drbd_debugfs.h new file mode 100644 index 000000000..8bee21340 --- /dev/null +++ b/drivers/block/drbd/drbd_debugfs.h @@ -0,0 +1,39 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/debugfs.h> + +#include "drbd_int.h" + +#ifdef CONFIG_DEBUG_FS +int __init drbd_debugfs_init(void); +void drbd_debugfs_cleanup(void); + +void drbd_debugfs_resource_add(struct drbd_resource *resource); +void drbd_debugfs_resource_cleanup(struct drbd_resource *resource); + +void drbd_debugfs_connection_add(struct drbd_connection *connection); +void drbd_debugfs_connection_cleanup(struct drbd_connection *connection); + +void drbd_debugfs_device_add(struct drbd_device *device); +void drbd_debugfs_device_cleanup(struct drbd_device *device); + +void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device); +void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device); +#else + +static inline int __init drbd_debugfs_init(void) { return -ENODEV; } +static inline void drbd_debugfs_cleanup(void) { } + +static inline void drbd_debugfs_resource_add(struct drbd_resource *resource) { } +static inline void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) { } + +static inline void drbd_debugfs_connection_add(struct drbd_connection *connection) { } +static inline void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) { } + +static inline void drbd_debugfs_device_add(struct drbd_device *device) { } +static inline void drbd_debugfs_device_cleanup(struct drbd_device *device) { } + +static inline void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) { } +static inline void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) { } + +#endif diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h new file mode 100644 
index 000000000..b905e9888 --- /dev/null +++ b/drivers/block/drbd/drbd_int.h @@ -0,0 +1,2328 @@ +/* + drbd_int.h + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#ifndef _DRBD_INT_H +#define _DRBD_INT_H + +#include <linux/compiler.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/sched.h> +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/crypto.h> +#include <linux/ratelimit.h> +#include <linux/tcp.h> +#include <linux/mutex.h> +#include <linux/major.h> +#include <linux/blkdev.h> +#include <linux/genhd.h> +#include <linux/idr.h> +#include <net/tcp.h> +#include <linux/lru_cache.h> +#include <linux/prefetch.h> +#include <linux/drbd_genl_api.h> +#include <linux/drbd.h> +#include "drbd_strings.h" +#include "drbd_state.h" +#include "drbd_protocol.h" + +#ifdef __CHECKER__ +# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) +# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read"))) +# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write"))) +# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call"))) +#else +# define __protected_by(x) +# define __protected_read_by(x) +# define __protected_write_by(x) +# define __must_hold(x) +#endif + +/* module parameter, defined in drbd_main.c */ +extern unsigned int minor_count; +extern bool disable_sendpage; +extern bool allow_oos; +void tl_abort_disk_io(struct drbd_device *device); + +#ifdef CONFIG_DRBD_FAULT_INJECTION +extern int enable_faults; +extern int fault_rate; +extern int fault_devs; +#endif + +extern char usermode_helper[]; + + +/* I don't remember why XCPU ... + * This is used to wake the asender, + * and to interrupt sending the sending task + * on disconnect. + */ +#define DRBD_SIG SIGXCPU + +/* This is used to stop/restart our threads. + * Cannot use SIGTERM nor SIGKILL, since these + * are sent out by init on runlevel changes + * I choose SIGHUP for now. + */ +#define DRBD_SIGKILL SIGHUP + +#define ID_IN_SYNC (4711ULL) +#define ID_OUT_OF_SYNC (4712ULL) +#define ID_SYNCER (-1ULL) + +#define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL) + +struct drbd_device; +struct drbd_connection; + +#define __drbd_printk_device(level, device, fmt, args...) \ + dev_printk(level, disk_to_dev((device)->vdisk), fmt, ## args) +#define __drbd_printk_peer_device(level, peer_device, fmt, args...) \ + dev_printk(level, disk_to_dev((peer_device)->device->vdisk), fmt, ## args) +#define __drbd_printk_resource(level, resource, fmt, args...) 
\ + printk(level "drbd %s: " fmt, (resource)->name, ## args) +#define __drbd_printk_connection(level, connection, fmt, args...) \ + printk(level "drbd %s: " fmt, (connection)->resource->name, ## args) + +void drbd_printk_with_wrong_object_type(void); + +#define __drbd_printk_if_same_type(obj, type, func, level, fmt, args...) \ + (__builtin_types_compatible_p(typeof(obj), type) || \ + __builtin_types_compatible_p(typeof(obj), const type)), \ + func(level, (const type)(obj), fmt, ## args) + +#define drbd_printk(level, obj, fmt, args...) \ + __builtin_choose_expr( \ + __drbd_printk_if_same_type(obj, struct drbd_device *, \ + __drbd_printk_device, level, fmt, ## args), \ + __builtin_choose_expr( \ + __drbd_printk_if_same_type(obj, struct drbd_resource *, \ + __drbd_printk_resource, level, fmt, ## args), \ + __builtin_choose_expr( \ + __drbd_printk_if_same_type(obj, struct drbd_connection *, \ + __drbd_printk_connection, level, fmt, ## args), \ + __builtin_choose_expr( \ + __drbd_printk_if_same_type(obj, struct drbd_peer_device *, \ + __drbd_printk_peer_device, level, fmt, ## args), \ + drbd_printk_with_wrong_object_type())))) + +#define drbd_dbg(obj, fmt, args...) \ + drbd_printk(KERN_DEBUG, obj, fmt, ## args) +#define drbd_alert(obj, fmt, args...) \ + drbd_printk(KERN_ALERT, obj, fmt, ## args) +#define drbd_err(obj, fmt, args...) \ + drbd_printk(KERN_ERR, obj, fmt, ## args) +#define drbd_warn(obj, fmt, args...) \ + drbd_printk(KERN_WARNING, obj, fmt, ## args) +#define drbd_info(obj, fmt, args...) \ + drbd_printk(KERN_INFO, obj, fmt, ## args) +#define drbd_emerg(obj, fmt, args...) \ + drbd_printk(KERN_EMERG, obj, fmt, ## args) + +#define dynamic_drbd_dbg(device, fmt, args...) \ + dynamic_dev_dbg(disk_to_dev(device->vdisk), fmt, ## args) + +#define D_ASSERT(device, exp) do { \ + if (!(exp)) \ + drbd_err(device, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__); \ + } while (0) + +/** + * expect - Make an assertion + * + * Unlike the assert macro, this macro returns a boolean result. + */ +#define expect(exp) ({ \ + bool _bool = (exp); \ + if (!_bool) \ + drbd_err(device, "ASSERTION %s FAILED in %s\n", \ + #exp, __func__); \ + _bool; \ + }) + +/* Defines to control fault insertion */ +enum { + DRBD_FAULT_MD_WR = 0, /* meta data write */ + DRBD_FAULT_MD_RD = 1, /* read */ + DRBD_FAULT_RS_WR = 2, /* resync */ + DRBD_FAULT_RS_RD = 3, + DRBD_FAULT_DT_WR = 4, /* data */ + DRBD_FAULT_DT_RD = 5, + DRBD_FAULT_DT_RA = 6, /* data read ahead */ + DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */ + DRBD_FAULT_AL_EE = 8, /* alloc ee */ + DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */ + + DRBD_FAULT_MAX, +}; + +extern unsigned int +_drbd_insert_fault(struct drbd_device *device, unsigned int type); + +static inline int +drbd_insert_fault(struct drbd_device *device, unsigned int type) { +#ifdef CONFIG_DRBD_FAULT_INJECTION + return fault_rate && + (enable_faults & (1<<type)) && + _drbd_insert_fault(device, type); +#else + return 0; +#endif +} + +/* integer division, round _UP_ to the next integer */ +#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 
1 : 0)) +/* usual integer division */ +#define div_floor(A, B) ((A)/(B)) + +extern struct ratelimit_state drbd_ratelimit_state; +extern struct idr drbd_devices; /* RCU, updates: genl_lock() */ +extern struct list_head drbd_resources; /* RCU, updates: genl_lock() */ + +extern const char *cmdname(enum drbd_packet cmd); + +/* for sending/receiving the bitmap, + * possibly in some encoding scheme */ +struct bm_xfer_ctx { + /* "const" + * stores total bits and long words + * of the bitmap, so we don't need to + * call the accessor functions over and again. */ + unsigned long bm_bits; + unsigned long bm_words; + /* during xfer, current position within the bitmap */ + unsigned long bit_offset; + unsigned long word_offset; + + /* statistics; index: (h->command == P_BITMAP) */ + unsigned packets[2]; + unsigned bytes[2]; +}; + +extern void INFO_bm_xfer_stats(struct drbd_device *device, + const char *direction, struct bm_xfer_ctx *c); + +static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c) +{ + /* word_offset counts "native long words" (32 or 64 bit), + * aligned at 64 bit. + * Encoded packet may end at an unaligned bit offset. + * In case a fallback clear text packet is transmitted in + * between, we adjust this offset back to the last 64bit + * aligned "native long word", which makes coding and decoding + * the plain text bitmap much more convenient. */ +#if BITS_PER_LONG == 64 + c->word_offset = c->bit_offset >> 6; +#elif BITS_PER_LONG == 32 + c->word_offset = c->bit_offset >> 5; + c->word_offset &= ~(1UL); +#else +# error "unsupported BITS_PER_LONG" +#endif +} + +extern unsigned int drbd_header_size(struct drbd_connection *connection); + +/**********************************************************************/ +enum drbd_thread_state { + NONE, + RUNNING, + EXITING, + RESTARTING +}; + +struct drbd_thread { + spinlock_t t_lock; + struct task_struct *task; + struct completion stop; + enum drbd_thread_state t_state; + int (*function) (struct drbd_thread *); + struct drbd_resource *resource; + struct drbd_connection *connection; + int reset_cpu_mask; + const char *name; +}; + +static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) +{ + /* THINK testing the t_state seems to be uncritical in all cases + * (but thread_{start,stop}), so we can read it *without* the lock. + * --lge */ + + smp_rmb(); + return thi->t_state; +} + +struct drbd_work { + struct list_head list; + int (*cb)(struct drbd_work *, int cancel); +}; + +struct drbd_device_work { + struct drbd_work w; + struct drbd_device *device; +}; + +#include "drbd_interval.h" + +extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *); + +struct drbd_request { + struct drbd_work w; + struct drbd_device *device; + + /* if local IO is not allowed, will be NULL. + * if local IO _is_ allowed, holds the locally submitted bio clone, + * or, after local IO completion, the ERR_PTR(error). + * see drbd_request_endio(). */ + struct bio *private_bio; + + struct drbd_interval i; + + /* epoch: used to check on "completion" whether this req was in + * the current epoch, and we therefore have to close it, + * causing a p_barrier packet to be send, starting a new epoch. + * + * This corresponds to "barrier" in struct p_barrier[_ack], + * and to "barrier_nr" in struct drbd_epoch (and various + * comments/function parameters/local variable names). 
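+	 *
+	 * Rough, illustrative example: while connection->current_tle_nr is 7,
+	 * each new write request records epoch == 7; once that epoch is
+	 * closed, a P_BARRIER with barrier_nr 7 is sent, and the matching
+	 * P_BARRIER_ACK lets tl_release() retire those requests from the
+	 * transfer log.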
+	 */
+	unsigned int epoch;
+
+	struct list_head tl_requests; /* ring list in the transfer log */
+	struct bio *master_bio;       /* master bio pointer */
+
+	/* see struct drbd_device */
+	struct list_head req_pending_master_completion;
+	struct list_head req_pending_local;
+
+	/* for generic IO accounting */
+	unsigned long start_jif;
+
+	/* for DRBD internal statistics */
+
+	/* Minimal set of time stamps to determine if we wait for activity log
+	 * transactions, local disk or peer.  32 bit "jiffies" are good enough,
+	 * we don't expect a DRBD request to be stalled for several months.
+	 */
+
+	/* before actual request processing */
+	unsigned long in_actlog_jif;
+
+	/* local disk */
+	unsigned long pre_submit_jif;
+
+	/* per connection */
+	unsigned long pre_send_jif;
+	unsigned long acked_jif;
+	unsigned long net_done_jif;
+
+	/* Possibly even more detail to track each phase:
+	 *  master_completion_jif
+	 *      how long did it take to complete the master bio
+	 *      (application visible latency)
+	 *  allocated_jif
+	 *      how long the master bio was blocked until we finally allocated
+	 *      a tracking struct
+	 *  in_actlog_jif
+	 *      how long did we wait for activity log transactions
+	 *
+	 *  net_queued_jif
+	 *      when did we finally queue it for sending
+	 *  pre_send_jif
+	 *      when did we start sending it
+	 *  post_send_jif
+	 *      how long did we block in the network stack trying to send it
+	 *  acked_jif
+	 *      when did we receive (or fake, in protocol A) a remote ACK
+	 *  net_done_jif
+	 *      when did we receive final acknowledgement (P_BARRIER_ACK),
+	 *      or decide, e.g. on connection loss, that we no longer expect
+	 *      anything from this peer for this request.
+	 *
+	 *  pre_submit_jif
+	 *  post_sub_jif
+	 *      when did we start submitting to the lower level device,
+	 *      and how long did we block in that submit function
+	 *  local_completion_jif
+	 *      how long did it take the lower level device to complete this request
+	 */
+
+
+	/* once it hits 0, we may complete the master_bio */
+	atomic_t completion_ref;
+	/* once it hits 0, we may destroy this drbd_request object */
+	struct kref kref;
+
+	unsigned rq_state; /* see comments above _req_mod() */
+};
+
+struct drbd_epoch {
+	struct drbd_connection *connection;
+	struct list_head list;
+	unsigned int barrier_nr;
+	atomic_t epoch_size; /* increased on every request added. */
+	atomic_t active;     /* increased on every req. added, and dec on every finished. */
+	unsigned long flags;
+};
+
+/* Prototype declaration of functions defined in drbd_receiver.c */
+int drbdd_init(struct drbd_thread *);
+int drbd_asender(struct drbd_thread *);
+
+/* drbd_epoch flag bits */
+enum {
+	DE_HAVE_BARRIER_NUMBER,
+};
+
+enum epoch_event {
+	EV_PUT,
+	EV_GOT_BARRIER_NR,
+	EV_BECAME_LAST,
+	EV_CLEANUP = 32, /* used as flag */
+};
+
+struct digest_info {
+	int digest_size;
+	void *digest;
+};
+
+struct drbd_peer_request {
+	struct drbd_work w;
+	struct drbd_peer_device *peer_device;
+	struct drbd_epoch *epoch; /* for writes */
+	struct page *pages;
+	atomic_t pending_bios;
+	struct drbd_interval i;
+	/* see comments on ee flag bits below */
+	unsigned long flags;
+	unsigned long submit_jif;
+	union {
+		u64 block_id;
+		struct digest_info *digest;
+	};
+};
+
+/* ee flag bits.
+ * While corresponding bios are in flight, the only modification will be
+ * set_bit WAS_ERROR, which has to be atomic.
+ * If no bios are in flight yet, or all have been completed,
+ * non-atomic modification to ee->flags is ok.
+ */
+enum {
+	__EE_CALL_AL_COMPLETE_IO,
+	__EE_MAY_SET_IN_SYNC,
+
+	/* is this a TRIM aka REQ_DISCARD?
*/ + __EE_IS_TRIM, + /* our lower level cannot handle trim, + * and we want to fall back to zeroout instead */ + __EE_IS_TRIM_USE_ZEROOUT, + + /* In case a barrier failed, + * we need to resubmit without the barrier flag. */ + __EE_RESUBMITTED, + + /* we may have several bios per peer request. + * if any of those fail, we set this flag atomically + * from the endio callback */ + __EE_WAS_ERROR, + + /* This ee has a pointer to a digest instead of a block id */ + __EE_HAS_DIGEST, + + /* Conflicting local requests need to be restarted after this request */ + __EE_RESTART_REQUESTS, + + /* The peer wants a write ACK for this (wire proto C) */ + __EE_SEND_WRITE_ACK, + + /* Is set when net_conf had two_primaries set while creating this peer_req */ + __EE_IN_INTERVAL_TREE, + + /* for debugfs: */ + /* has this been submitted, or does it still wait for something else? */ + __EE_SUBMITTED, + + /* this is/was a write request */ + __EE_WRITE, + + /* this originates from application on peer + * (not some resync or verify or other DRBD internal request) */ + __EE_APPLICATION, +}; +#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) +#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) +#define EE_IS_TRIM (1<<__EE_IS_TRIM) +#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT) +#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) +#define EE_WAS_ERROR (1<<__EE_WAS_ERROR) +#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) +#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS) +#define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK) +#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE) +#define EE_SUBMITTED (1<<__EE_SUBMITTED) +#define EE_WRITE (1<<__EE_WRITE) +#define EE_APPLICATION (1<<__EE_APPLICATION) + +/* flag bits per device */ +enum { + UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ + MD_DIRTY, /* current uuids and flags not yet on disk */ + USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ + CL_ST_CHG_SUCCESS, + CL_ST_CHG_FAIL, + CRASHED_PRIMARY, /* This node was a crashed primary. + * Gets cleared when the state.conn + * goes into C_CONNECTED state. */ + CONSIDER_RESYNC, + + MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ + + SUSPEND_IO, /* suspend application io */ + BITMAP_IO, /* suspend application io; + once no more io in flight, start bitmap io */ + BITMAP_IO_QUEUED, /* Started bitmap IO */ + WAS_IO_ERROR, /* Local disk failed, returned IO error */ + WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */ + FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */ + RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ + RESIZE_PENDING, /* Size change detected locally, waiting for the response from + * the peer, if it changed there as well. */ + NEW_CUR_UUID, /* Create new current UUID when thawing IO */ + AL_SUSPENDED, /* Activity logging is currently suspended. */ + AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ + B_RS_H_DONE, /* Before resync handler done (already executed) */ + DISCARD_MY_DATA, /* discard_my_data flag per volume */ + READ_BALANCE_RR, + + FLUSH_PENDING, /* if set, device->flush_jif is when we submitted that flush + * from drbd_flush_after_epoch() */ + + /* cleared only after backing device related structures have been destroyed. */ + GOING_DISKLESS, /* Disk is being detached, because of io-error, or admin request. 
*/ + + /* to be used in drbd_device_post_work() */ + GO_DISKLESS, /* tell worker to schedule cleanup before detach */ + DESTROY_DISK, /* tell worker to close backing devices and destroy related structures. */ + MD_SYNC, /* tell worker to call drbd_md_sync() */ + RS_START, /* tell worker to start resync/OV */ + RS_PROGRESS, /* tell worker that resync made significant progress */ + RS_DONE, /* tell worker that resync is done */ +}; + +struct drbd_bitmap; /* opaque for drbd_device */ + +/* definition of bits in bm_flags to be used in drbd_bm_lock + * and drbd_bitmap_io and friends. */ +enum bm_flag { + /* do we need to kfree, or vfree bm_pages? */ + BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */ + + /* currently locked for bulk operation */ + BM_LOCKED_MASK = 0xf, + + /* in detail, that is: */ + BM_DONT_CLEAR = 0x1, + BM_DONT_SET = 0x2, + BM_DONT_TEST = 0x4, + + /* so we can mark it locked for bulk operation, + * and still allow all non-bulk operations */ + BM_IS_LOCKED = 0x8, + + /* (test bit, count bit) allowed (common case) */ + BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED, + + /* testing bits, as well as setting new bits allowed, but clearing bits + * would be unexpected. Used during bitmap receive. Setting new bits + * requires sending of "out-of-sync" information, though. */ + BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED, + + /* for drbd_bm_write_copy_pages, everything is allowed, + * only concurrent bulk operations are locked out. */ + BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED, +}; + +struct drbd_work_queue { + struct list_head q; + spinlock_t q_lock; /* to protect the list. */ + wait_queue_head_t q_wait; +}; + +struct drbd_socket { + struct mutex mutex; + struct socket *socket; + /* this way we get our + * send/receive buffers off the stack */ + void *sbuf; + void *rbuf; +}; + +struct drbd_md { + u64 md_offset; /* sector offset to 'super' block */ + + u64 la_size_sect; /* last agreed size, unit sectors */ + spinlock_t uuid_lock; + u64 uuid[UI_SIZE]; + u64 device_uuid; + u32 flags; + u32 md_size_sect; + + s32 al_offset; /* signed relative sector offset to activity log */ + s32 bm_offset; /* signed relative sector offset to bitmap */ + + /* cached value of bdev->disk_conf->meta_dev_idx (see below) */ + s32 meta_dev_idx; + + /* see al_tr_number_to_on_disk_sector() */ + u32 al_stripes; + u32 al_stripe_size_4k; + u32 al_size_4k; /* cached product of the above */ +}; + +struct drbd_backing_dev { + struct block_device *backing_bdev; + struct block_device *md_bdev; + struct drbd_md md; + struct disk_conf *disk_conf; /* RCU, for updates: resource->conf_update */ + sector_t known_size; /* last known size of that backing device */ +}; + +struct drbd_md_io { + struct page *page; + unsigned long start_jif; /* last call to drbd_md_get_buffer */ + unsigned long submit_jif; /* last _drbd_md_sync_page_io() submit */ + const char *current_use; + atomic_t in_use; + unsigned int done; + int error; +}; + +struct bm_io_work { + struct drbd_work w; + char *why; + enum bm_flag flags; + int (*io_fn)(struct drbd_device *device); + void (*done)(struct drbd_device *device, int rv); +}; + +enum write_ordering_e { + WO_none, + WO_drain_io, + WO_bdev_flush, +}; + +struct fifo_buffer { + unsigned int head_index; + unsigned int size; + int total; /* sum of all values */ + int values[0]; +}; +extern struct fifo_buffer *fifo_alloc(int fifo_size); + +/* flag bits per connection */ +enum { + NET_CONGESTED, /* The data socket is congested */ + RESOLVE_CONFLICTS, /* Set 
on one node, cleared on the peer! */ + SEND_PING, /* whether asender should send a ping asap */ + SIGNAL_ASENDER, /* whether asender wants to be interrupted */ + GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */ + CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */ + CONN_WD_ST_CHG_OKAY, + CONN_WD_ST_CHG_FAIL, + CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ + CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */ + STATE_SENT, /* Do not change state/UUIDs while this is set */ + CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) + * pending, from drbd worker context. + * If set, bdi_write_congested() returns true, + * so shrink_page_list() would not recurse into, + * and potentially deadlock on, this drbd worker. + */ + DISCONNECT_SENT, + + DEVICE_WORK_PENDING, /* tell worker that some device has pending work */ +}; + +struct drbd_resource { + char *name; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_res; + struct dentry *debugfs_res_volumes; + struct dentry *debugfs_res_connections; + struct dentry *debugfs_res_in_flight_summary; +#endif + struct kref kref; + struct idr devices; /* volume number to device mapping */ + struct list_head connections; + struct list_head resources; + struct res_opts res_opts; + struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */ + struct mutex adm_mutex; /* mutex to serialize administrative requests */ + spinlock_t req_lock; + + unsigned susp:1; /* IO suspended by user */ + unsigned susp_nod:1; /* IO suspended because no data */ + unsigned susp_fen:1; /* IO suspended because fence peer handler runs */ + + enum write_ordering_e write_ordering; + + cpumask_var_t cpu_mask; +}; + +struct drbd_thread_timing_details +{ + unsigned long start_jif; + void *cb_addr; + const char *caller_fn; + unsigned int line; + unsigned int cb_nr; +}; + +struct drbd_connection { + struct list_head connections; + struct drbd_resource *resource; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_conn; + struct dentry *debugfs_conn_callback_history; + struct dentry *debugfs_conn_oldest_requests; +#endif + struct kref kref; + struct idr peer_devices; /* volume number to peer device mapping */ + enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */ + struct mutex cstate_mutex; /* Protects graceful disconnects */ + unsigned int connect_cnt; /* Inc each time a connection is established */ + + unsigned long flags; + struct net_conf *net_conf; /* content protected by rcu */ + wait_queue_head_t ping_wait; /* Woken upon reception of a ping, and a state change */ + + struct sockaddr_storage my_addr; + int my_addr_len; + struct sockaddr_storage peer_addr; + int peer_addr_len; + + struct drbd_socket data; /* data/barrier/cstate/parameter packets */ + struct drbd_socket meta; /* ping/ack (metadata) packets */ + int agreed_pro_version; /* actually used protocol version */ + u32 agreed_features; + unsigned long last_received; /* in jiffies, either socket */ + unsigned int ko_count; + + struct list_head transfer_log; /* all requests not yet fully processed */ + + struct crypto_hash *cram_hmac_tfm; + struct crypto_hash *integrity_tfm; /* checksums we compute, updates protected by connection->data->mutex */ + struct crypto_hash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */ + struct crypto_hash *csums_tfm; + struct crypto_hash *verify_tfm; + void *int_dig_in; + void *int_dig_vv; + + /* receiver side */ + struct 
drbd_epoch *current_epoch; + spinlock_t epoch_lock; + unsigned int epochs; + atomic_t current_tle_nr; /* transfer log epoch number */ + unsigned current_tle_writes; /* writes seen within this tl epoch */ + + unsigned long last_reconnect_jif; + struct drbd_thread receiver; + struct drbd_thread worker; + struct drbd_thread asender; + + /* cached pointers, + * so we can look up the oldest pending requests more quickly. + * protected by resource->req_lock */ + struct drbd_request *req_next; /* DRBD 9: todo.req_next */ + struct drbd_request *req_ack_pending; + struct drbd_request *req_not_net_done; + + /* sender side */ + struct drbd_work_queue sender_work; + +#define DRBD_THREAD_DETAILS_HIST 16 + unsigned int w_cb_nr; /* keeps counting up */ + unsigned int r_cb_nr; /* keeps counting up */ + struct drbd_thread_timing_details w_timing_details[DRBD_THREAD_DETAILS_HIST]; + struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST]; + + struct { + /* whether this sender thread + * has processed a single write yet. */ + bool seen_any_write_yet; + + /* Which barrier number to send with the next P_BARRIER */ + int current_epoch_nr; + + /* how many write requests have been sent + * with req->epoch == current_epoch_nr. + * If none, no P_BARRIER will be sent. */ + unsigned current_epoch_writes; + } send; +}; + +void __update_timing_details( + struct drbd_thread_timing_details *tdp, + unsigned int *cb_nr, + void *cb, + const char *fn, const unsigned int line); + +#define update_worker_timing_details(c, cb) \ + __update_timing_details(c->w_timing_details, &c->w_cb_nr, cb, __func__ , __LINE__ ) +#define update_receiver_timing_details(c, cb) \ + __update_timing_details(c->r_timing_details, &c->r_cb_nr, cb, __func__ , __LINE__ ) + +struct submit_worker { + struct workqueue_struct *wq; + struct work_struct worker; + + /* protected by ..->resource->req_lock */ + struct list_head writes; +}; + +struct drbd_peer_device { + struct list_head peer_devices; + struct drbd_device *device; + struct drbd_connection *connection; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_peer_dev; +#endif +}; + +struct drbd_device { + struct drbd_resource *resource; + struct list_head peer_devices; + struct list_head pending_bitmap_io; + + unsigned long flush_jif; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_minor; + struct dentry *debugfs_vol; + struct dentry *debugfs_vol_oldest_requests; + struct dentry *debugfs_vol_act_log_extents; + struct dentry *debugfs_vol_resync_extents; + struct dentry *debugfs_vol_data_gen_id; +#endif + + unsigned int vnr; /* volume number within the connection */ + unsigned int minor; /* device minor number */ + + struct kref kref; + + /* things that are stored as / read from meta data on disk */ + unsigned long flags; + + /* configured by drbdsetup */ + struct drbd_backing_dev *ldev __protected_by(local); + + sector_t p_size; /* partner's disk size */ + struct request_queue *rq_queue; + struct block_device *this_bdev; + struct gendisk *vdisk; + + unsigned long last_reattach_jif; + struct drbd_work resync_work; + struct drbd_work unplug_work; + struct timer_list resync_timer; + struct timer_list md_sync_timer; + struct timer_list start_resync_timer; + struct timer_list request_timer; + + /* Used after attach while negotiating new disk state. */ + union drbd_state new_state_tmp; + + union drbd_dev_state state; + wait_queue_head_t misc_wait; + wait_queue_head_t state_wait; /* upon each state change. 
*/ + unsigned int send_cnt; + unsigned int recv_cnt; + unsigned int read_cnt; + unsigned int writ_cnt; + unsigned int al_writ_cnt; + unsigned int bm_writ_cnt; + atomic_t ap_bio_cnt; /* Requests we need to complete */ + atomic_t ap_actlog_cnt; /* Requests waiting for activity log */ + atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ + atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ + atomic_t unacked_cnt; /* Need to send replies for */ + atomic_t local_cnt; /* Waiting for local completion */ + + /* Interval tree of pending local requests */ + struct rb_root read_requests; + struct rb_root write_requests; + + /* for statistics and timeouts */ + /* [0] read, [1] write */ + struct list_head pending_master_completion[2]; + struct list_head pending_completion[2]; + + /* use checksums for *this* resync */ + bool use_csums; + /* blocks to resync in this run [unit BM_BLOCK_SIZE] */ + unsigned long rs_total; + /* number of resync blocks that failed in this run */ + unsigned long rs_failed; + /* Syncer's start time [unit jiffies] */ + unsigned long rs_start; + /* cumulated time in PausedSyncX state [unit jiffies] */ + unsigned long rs_paused; + /* skipped because csum was equal [unit BM_BLOCK_SIZE] */ + unsigned long rs_same_csum; +#define DRBD_SYNC_MARKS 8 +#define DRBD_SYNC_MARK_STEP (3*HZ) + /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */ + unsigned long rs_mark_left[DRBD_SYNC_MARKS]; + /* marks's time [unit jiffies] */ + unsigned long rs_mark_time[DRBD_SYNC_MARKS]; + /* current index into rs_mark_{left,time} */ + int rs_last_mark; + unsigned long rs_last_bcast; /* [unit jiffies] */ + + /* where does the admin want us to start? (sector) */ + sector_t ov_start_sector; + sector_t ov_stop_sector; + /* where are we now? (sector) */ + sector_t ov_position; + /* Start sector of out of sync range (to merge printk reporting). */ + sector_t ov_last_oos_start; + /* size of out-of-sync range in sectors. */ + sector_t ov_last_oos_size; + unsigned long ov_left; /* in bits */ + + struct drbd_bitmap *bitmap; + unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */ + + /* Used to track operations of resync... */ + struct lru_cache *resync; + /* Number of locked elements in resync LRU */ + unsigned int resync_locked; + /* resync extent number waiting for application requests */ + unsigned int resync_wenr; + + int open_cnt; + u64 *p_uuid; + + struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */ + struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */ + struct list_head done_ee; /* need to send P_WRITE_ACK */ + struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */ + struct list_head net_ee; /* zero-copy network send in progress */ + + int next_barrier_nr; + struct list_head resync_reads; + atomic_t pp_in_use; /* allocated from page pool */ + atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ + wait_queue_head_t ee_wait; + struct drbd_md_io md_io; + spinlock_t al_lock; + wait_queue_head_t al_wait; + struct lru_cache *act_log; /* activity log */ + unsigned int al_tr_number; + int al_tr_cycle; + wait_queue_head_t seq_wait; + atomic_t packet_seq; + unsigned int peer_seq; + spinlock_t peer_seq_lock; + unsigned long comm_bm_set; /* communicated number of set bits. 
 */
+	struct bm_io_work bm_io_work;
+	u64 ed_uuid; /* UUID of the exposed data */
+	struct mutex own_state_mutex;
+	struct mutex *state_mutex; /* either own_state_mutex or first_peer_device(device)->connection->cstate_mutex */
+	char congestion_reason;  /* Why we were congested... */
+	atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
+	atomic_t rs_sect_ev; /* for submitted resync data rate, both */
+	int rs_last_sect_ev; /* counter to compare with */
+	int rs_last_events;  /* counter of read or write "events" (unit sectors)
+			      * on the lower level device when we last looked. */
+	int c_sync_rate; /* current resync rate after syncer throttle magic */
+	struct fifo_buffer *rs_plan_s; /* correction values of resync planner (RCU, resource->conf_update) */
+	int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
+	atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
+	unsigned int peer_max_bio_size;
+	unsigned int local_max_bio_size;
+
+	/* any requests that would block in drbd_make_request()
+	 * are deferred to this single-threaded work queue */
+	struct submit_worker submit;
+};
+
+struct drbd_bm_aio_ctx {
+	struct drbd_device *device;
+	struct list_head list; /* on device->pending_bitmap_io */
+	unsigned long start_jif;
+	atomic_t in_flight;
+	unsigned int done;
+	unsigned flags;
+#define BM_AIO_COPY_PAGES	1
+#define BM_AIO_WRITE_HINTED	2
+#define BM_AIO_WRITE_ALL_PAGES	4
+#define BM_AIO_READ		8
+	int error;
+	struct kref kref;
+};
+
+struct drbd_config_context {
+	/* assigned from drbd_genlmsghdr */
+	unsigned int minor;
+	/* assigned from request attributes, if present */
+	unsigned int volume;
+#define VOLUME_UNSPECIFIED	(-1U)
+	/* pointer into the request skb,
+	 * limited lifetime! */
+	char *resource_name;
+	struct nlattr *my_addr;
+	struct nlattr *peer_addr;
+
+	/* reply buffer */
+	struct sk_buff *reply_skb;
+	/* pointer into reply buffer */
+	struct drbd_genlmsghdr *reply_dh;
+	/* resolved from attributes, if possible */
+	struct drbd_device *device;
+	struct drbd_resource *resource;
+	struct drbd_connection *connection;
+};
+
+static inline struct drbd_device *minor_to_device(unsigned int minor)
+{
+	return (struct drbd_device *)idr_find(&drbd_devices, minor);
+}
+
+static inline struct drbd_peer_device *first_peer_device(struct drbd_device *device)
+{
+	return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices);
+}
+
+#define for_each_resource(resource, _resources) \
+	list_for_each_entry(resource, _resources, resources)
+
+#define for_each_resource_rcu(resource, _resources) \
+	list_for_each_entry_rcu(resource, _resources, resources)
+
+#define for_each_resource_safe(resource, tmp, _resources) \
+	list_for_each_entry_safe(resource, tmp, _resources, resources)
+
+#define for_each_connection(connection, resource) \
+	list_for_each_entry(connection, &resource->connections, connections)
+
+#define for_each_connection_rcu(connection, resource) \
+	list_for_each_entry_rcu(connection, &resource->connections, connections)
+
+#define for_each_connection_safe(connection, tmp, resource) \
+	list_for_each_entry_safe(connection, tmp, &resource->connections, connections)
+
+#define for_each_peer_device(peer_device, device) \
+	list_for_each_entry(peer_device, &device->peer_devices, peer_devices)
+
+#define for_each_peer_device_rcu(peer_device, device) \
+	list_for_each_entry_rcu(peer_device, &device->peer_devices, peer_devices)
+
+#define for_each_peer_device_safe(peer_device, tmp, device) \ +
list_for_each_entry_safe(peer_device, tmp, &device->peer_devices, peer_devices) + +static inline unsigned int device_to_minor(struct drbd_device *device) +{ + return device->minor; +} + +/* + * function declarations + *************************/ + +/* drbd_main.c */ + +enum dds_flags { + DDSF_FORCED = 1, + DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */ +}; + +extern void drbd_init_set_defaults(struct drbd_device *device); +extern int drbd_thread_start(struct drbd_thread *thi); +extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait); +#ifdef CONFIG_SMP +extern void drbd_thread_current_set_cpu(struct drbd_thread *thi); +#else +#define drbd_thread_current_set_cpu(A) ({}) +#endif +extern void tl_release(struct drbd_connection *, unsigned int barrier_nr, + unsigned int set_size); +extern void tl_clear(struct drbd_connection *); +extern void drbd_free_sock(struct drbd_connection *connection); +extern int drbd_send(struct drbd_connection *connection, struct socket *sock, + void *buf, size_t size, unsigned msg_flags); +extern int drbd_send_all(struct drbd_connection *, struct socket *, void *, size_t, + unsigned); + +extern int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd); +extern int drbd_send_protocol(struct drbd_connection *connection); +extern int drbd_send_uuids(struct drbd_peer_device *); +extern int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *); +extern void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *); +extern int drbd_send_sizes(struct drbd_peer_device *, int trigger_reply, enum dds_flags flags); +extern int drbd_send_state(struct drbd_peer_device *, union drbd_state s); +extern int drbd_send_current_state(struct drbd_peer_device *); +extern int drbd_send_sync_param(struct drbd_peer_device *); +extern void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr, + u32 set_size); +extern int drbd_send_ack(struct drbd_peer_device *, enum drbd_packet, + struct drbd_peer_request *); +extern void drbd_send_ack_rp(struct drbd_peer_device *, enum drbd_packet, + struct p_block_req *rp); +extern void drbd_send_ack_dp(struct drbd_peer_device *, enum drbd_packet, + struct p_data *dp, int data_size); +extern int drbd_send_ack_ex(struct drbd_peer_device *, enum drbd_packet, + sector_t sector, int blksize, u64 block_id); +extern int drbd_send_out_of_sync(struct drbd_peer_device *, struct drbd_request *); +extern int drbd_send_block(struct drbd_peer_device *, enum drbd_packet, + struct drbd_peer_request *); +extern int drbd_send_dblock(struct drbd_peer_device *, struct drbd_request *req); +extern int drbd_send_drequest(struct drbd_peer_device *, int cmd, + sector_t sector, int size, u64 block_id); +extern int drbd_send_drequest_csum(struct drbd_peer_device *, sector_t sector, + int size, void *digest, int digest_size, + enum drbd_packet cmd); +extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int size); + +extern int drbd_send_bitmap(struct drbd_device *device); +extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); +extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); +extern void drbd_free_ldev(struct drbd_backing_dev *ldev); +extern void drbd_device_cleanup(struct drbd_device *device); +void drbd_print_uuids(struct drbd_device *device, const char *text); + +extern void conn_md_sync(struct drbd_connection *connection); +extern void drbd_md_write(struct drbd_device *device, void *buffer); 
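+/* Illustrative, hypothetical use of the iteration macros and drbd_printk()
+ * helpers declared above (kept as a comment, this is not a real caller);
+ * "resource" is assumed to be a valid struct drbd_resource, and the RCU
+ * read lock protects the connection list:
+ *
+ *	struct drbd_connection *connection;
+ *	struct drbd_peer_device *peer_device;
+ *	int vnr;
+ *
+ *	rcu_read_lock();
+ *	for_each_connection_rcu(connection, resource)
+ *		idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ *			drbd_info(peer_device, "volume %d known to peer\n", vnr);
+ *	rcu_read_unlock();
+ */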
+extern void drbd_md_sync(struct drbd_device *device);
+extern int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev);
+extern void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local);
+extern void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local);
+extern void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local);
+extern void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void drbd_md_set_flag(struct drbd_device *device, int flags) __must_hold(local);
+extern void drbd_md_clear_flag(struct drbd_device *device, int flags) __must_hold(local);
+extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
+extern void drbd_md_mark_dirty(struct drbd_device *device);
+extern void drbd_queue_bitmap_io(struct drbd_device *device,
+				 int (*io_fn)(struct drbd_device *),
+				 void (*done)(struct drbd_device *, int),
+				 char *why, enum bm_flag flags);
+extern int drbd_bitmap_io(struct drbd_device *device,
+		int (*io_fn)(struct drbd_device *),
+		char *why, enum bm_flag flags);
+extern int drbd_bitmap_io_from_worker(struct drbd_device *device,
+		int (*io_fn)(struct drbd_device *),
+		char *why, enum bm_flag flags);
+extern int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local);
+extern int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local);
+
+/* Meta data layout
+ *
+ * We currently have two possible layouts.
+ * Offsets in (512 byte) sectors.
+ * external:
+ *   |----------- md_size_sect ------------------|
+ *   [ 4k superblock ][ activity log ][  Bitmap  ]
+ *   | al_offset == 8 |
+ *   | bm_offset = al_offset + X      |
+ *  ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ * Variants:
+ * old, indexed fixed size meta data:
+ *
+ * internal:
+ *            |----------- md_size_sect ------------------|
+ * [data.....][  Bitmap  ][ activity log ][ 4k superblock ][padding*]
+ *                        | al_offset < 0 |
+ *            | bm_offset = al_offset - Y |
+ *  ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ *  [padding*] are zero or up to 7 unused 512 Byte sectors to the
+ *  end of the device, so that the [4k superblock] will be 4k aligned.
+ *
+ *  The activity log consists of 4k transaction blocks,
+ *  which are written in a ring-buffer, or striped ring-buffer like fashion.
+ *  The size used to be fixed at 32kB,
+ *  but is about to become configurable.
+ */
+
+/* Our old fixed size meta data layout
+ * allows up to about 3.8TB, so if you want more,
+ * you need to use the "flexible" meta data format. */
+#define MD_128MB_SECT (128LLU << 11)  /* 128 MB, unit sectors */
+#define MD_4kB_SECT	 8
+#define MD_32kB_SECT	64
+
+/* One activity log extent represents 4M of storage */
+#define AL_EXTENT_SHIFT 22
+#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
+
+/* We could make these currently hardcoded constants configurable
+ * variables at create-md time (or even re-configurable at runtime?).
+ * Which will require some more changes to the DRBD "super block"
+ * and attach code.
+ *
+ * updates per transaction:
+ *	This many changes to the active set can be logged with one transaction.
+ *	This number is arbitrary.
+ * context per transaction:
+ *	This many context extent numbers are logged with each transaction.
+ * This number is resulting from the transaction block size (4k), the layout + * of the transaction header, and the number of updates per transaction. + * See drbd_actlog.c:struct al_transaction_on_disk + * */ +#define AL_UPDATES_PER_TRANSACTION 64 // arbitrary +#define AL_CONTEXT_PER_TRANSACTION 919 // (4096 - 36 - 6*64)/4 + +#if BITS_PER_LONG == 32 +#define LN2_BPL 5 +#define cpu_to_lel(A) cpu_to_le32(A) +#define lel_to_cpu(A) le32_to_cpu(A) +#elif BITS_PER_LONG == 64 +#define LN2_BPL 6 +#define cpu_to_lel(A) cpu_to_le64(A) +#define lel_to_cpu(A) le64_to_cpu(A) +#else +#error "LN2 of BITS_PER_LONG unknown!" +#endif + +/* resync bitmap */ +/* 16MB sized 'bitmap extent' to track syncer usage */ +struct bm_extent { + int rs_left; /* number of bits set (out of sync) in this extent. */ + int rs_failed; /* number of failed resync requests in this extent. */ + unsigned long flags; + struct lc_element lce; +}; + +#define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */ +#define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */ +#define BME_PRIORITY 2 /* finish resync IO on this extent ASAP! App IO waiting! */ + +/* drbd_bitmap.c */ +/* + * We need to store one bit for a block. + * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap. + * Bit 0 ==> local node thinks this block is binary identical on both nodes + * Bit 1 ==> local node thinks this block needs to be synced. + */ + +#define SLEEP_TIME (HZ/10) + +/* We do bitmap IO in units of 4k blocks. + * We also still have a hardcoded 4k per bit relation. */ +#define BM_BLOCK_SHIFT 12 /* 4k per bit */ +#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT) +/* mostly arbitrarily set the represented size of one bitmap extent, + * aka resync extent, to 16 MiB (which is also 512 Byte worth of bitmap + * at 4k per bit resolution) */ +#define BM_EXT_SHIFT 24 /* 16 MiB per resync extent */ +#define BM_EXT_SIZE (1<<BM_EXT_SHIFT) + +#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12) +#error "HAVE YOU FIXED drbdmeta AS WELL??" +#endif + +/* thus many _storage_ sectors are described by one bit */ +#define BM_SECT_TO_BIT(x) ((x)>>(BM_BLOCK_SHIFT-9)) +#define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SHIFT-9)) +#define BM_SECT_PER_BIT BM_BIT_TO_SECT(1) + +/* bit to represented kilo byte conversion */ +#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10)) + +/* in which _bitmap_ extent (resp. sector) the bit for a certain + * _storage_ sector is located in */ +#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9)) +#define BM_BIT_TO_EXT(x) ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT)) + +/* first storage sector a bitmap extent corresponds to */ +#define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9)) +/* how much _storage_ sectors we have per bitmap extent */ +#define BM_SECT_PER_EXT BM_EXT_TO_SECT(1) +/* how many bits are covered by one bitmap extent (resync extent) */ +#define BM_BITS_PER_EXT (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT)) + +#define BM_BLOCKS_PER_BM_EXT_MASK (BM_BITS_PER_EXT - 1) + + +/* in one sector of the bitmap, we have this many activity_log extents. */ +#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) + +/* the extent in "PER_EXTENT" below is an activity log extent + * we need that many (long words/bytes) to store the bitmap + * of one AL_EXTENT_SIZE chunk of storage. + * we can store the bitmap for that many AL_EXTENTS within + * one sector of the _on_disk_ bitmap: + * bit 0 bit 37 bit 38 bit (512*8)-1 + * ...|........|........|.. // ..|........| + * sect. 
0 `296 `304 ^(512*8*8)-1 + * +#define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG ) +#define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128 +#define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4 + */ + +#define DRBD_MAX_SECTORS_32 (0xffffffffLU) +/* we have a certain meta data variant that has a fixed on-disk size of 128 + * MiB, of which 4k are our "superblock", and 32k are the fixed size activity + * log, leaving this many sectors for the bitmap. + */ + +#define DRBD_MAX_SECTORS_FIXED_BM \ + ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9))) +#if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32 +#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 +#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32 +#else +#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM +/* 16 TB in units of sectors */ +#if BITS_PER_LONG == 32 +/* adjust by one page worth of bitmap, + * so we won't wrap around in drbd_bm_find_next_bit. + * you should use 64bit OS for that much storage, anyways. */ +#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff) +#else +/* we allow up to 1 PiB now on 64bit architecture with "flexible" meta data */ +#define DRBD_MAX_SECTORS_FLEX (1UL << 51) +/* corresponds to (1UL << 38) bits right now. */ +#endif +#endif + +/* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE, + * so for typical PAGE_CACHE_SIZE of 4k, that is (1<<20) Byte. + * Since we may live in a mixed-platform cluster, + * we limit us to a platform agnostic constant here for now. + * A followup commit may allow even bigger BIO sizes, + * once we thought that through. */ +#define DRBD_MAX_BIO_SIZE (1U << 20) +#if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE +#error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE +#endif +#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ + +#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */ +#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ + +/* For now, don't allow more than one activity log extent worth of data + * to be discarded in one go. 
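+ * (With AL_EXTENT_SHIFT == 22 that is 4 MiB, so DRBD_MAX_DISCARD_SECTORS
+ * works out to 8192 sectors of 512 bytes.)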
We may need to rework drbd_al_begin_io() + * to allow for even larger discard ranges */ +#define DRBD_MAX_DISCARD_SIZE AL_EXTENT_SIZE +#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9) + +extern int drbd_bm_init(struct drbd_device *device); +extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits); +extern void drbd_bm_cleanup(struct drbd_device *device); +extern void drbd_bm_set_all(struct drbd_device *device); +extern void drbd_bm_clear_all(struct drbd_device *device); +/* set/clear/test only a few bits at a time */ +extern int drbd_bm_set_bits( + struct drbd_device *device, unsigned long s, unsigned long e); +extern int drbd_bm_clear_bits( + struct drbd_device *device, unsigned long s, unsigned long e); +extern int drbd_bm_count_bits( + struct drbd_device *device, const unsigned long s, const unsigned long e); +/* bm_set_bits variant for use while holding drbd_bm_lock, + * may process the whole bitmap in one go */ +extern void _drbd_bm_set_bits(struct drbd_device *device, + const unsigned long s, const unsigned long e); +extern int drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr); +extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr); +extern int drbd_bm_read(struct drbd_device *device) __must_hold(local); +extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr); +extern int drbd_bm_write(struct drbd_device *device) __must_hold(local); +extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local); +extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local); +extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local); +extern int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local); +extern size_t drbd_bm_words(struct drbd_device *device); +extern unsigned long drbd_bm_bits(struct drbd_device *device); +extern sector_t drbd_bm_capacity(struct drbd_device *device); + +#define DRBD_END_OF_BITMAP (~(unsigned long)0) +extern unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo); +/* bm_find_next variants for use while you hold drbd_bm_lock() */ +extern unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo); +extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo); +extern unsigned long _drbd_bm_total_weight(struct drbd_device *device); +extern unsigned long drbd_bm_total_weight(struct drbd_device *device); +/* for receive_bitmap */ +extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, + size_t number, unsigned long *buffer); +/* for _drbd_send_bitmap */ +extern void drbd_bm_get_lel(struct drbd_device *device, size_t offset, + size_t number, unsigned long *buffer); + +extern void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags); +extern void drbd_bm_unlock(struct drbd_device *device); +/* drbd_main.c */ + +extern struct kmem_cache *drbd_request_cache; +extern struct kmem_cache *drbd_ee_cache; /* peer requests */ +extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ +extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ +extern mempool_t *drbd_request_mempool; +extern mempool_t *drbd_ee_mempool; + +/* drbd's page pool, used to buffer data received from the peer, + * or data requested by the peer. + * + * This does not have an emergency reserve. + * + * When allocating from this pool, it first takes pages from the pool. 
+ * Only if the pool is depleted will try to allocate from the system. + * + * The assumption is that pages taken from this pool will be processed, + * and given back, "quickly", and then can be recycled, so we can avoid + * frequent calls to alloc_page(), and still will be able to make progress even + * under memory pressure. + */ +extern struct page *drbd_pp_pool; +extern spinlock_t drbd_pp_lock; +extern int drbd_pp_vacant; +extern wait_queue_head_t drbd_pp_wait; + +/* We also need a standard (emergency-reserve backed) page pool + * for meta data IO (activity log, bitmap). + * We can keep it global, as long as it is used as "N pages at a time". + * 128 should be plenty, currently we probably can get away with as few as 1. + */ +#define DRBD_MIN_POOL_PAGES 128 +extern mempool_t *drbd_md_io_page_pool; + +/* We also need to make sure we get a bio + * when we need it for housekeeping purposes */ +extern struct bio_set *drbd_md_io_bio_set; +/* to allocate from that set */ +extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); + +extern rwlock_t global_state_lock; + +extern int conn_lowest_minor(struct drbd_connection *connection); +extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor); +extern void drbd_destroy_device(struct kref *kref); +extern void drbd_delete_device(struct drbd_device *device); + +extern struct drbd_resource *drbd_create_resource(const char *name); +extern void drbd_free_resource(struct drbd_resource *resource); + +extern int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts); +extern struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts); +extern void drbd_destroy_connection(struct kref *kref); +extern struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len, + void *peer_addr, int peer_addr_len); +extern struct drbd_resource *drbd_find_resource(const char *name); +extern void drbd_destroy_resource(struct kref *kref); +extern void conn_free_crypto(struct drbd_connection *connection); + +extern int proc_details; + +/* drbd_req */ +extern void do_submit(struct work_struct *ws); +extern void __drbd_make_request(struct drbd_device *, struct bio *, unsigned long); +extern void drbd_make_request(struct request_queue *q, struct bio *bio); +extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req); +extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); +extern int is_valid_ar_handle(struct drbd_request *, sector_t); + + +/* drbd_nl.c */ +extern void drbd_suspend_io(struct drbd_device *device); +extern void drbd_resume_io(struct drbd_device *device); +extern char *ppsize(char *buf, unsigned long long size); +extern sector_t drbd_new_dev_size(struct drbd_device *, struct drbd_backing_dev *, sector_t, int); +enum determine_dev_size { + DS_ERROR_SHRINK = -3, + DS_ERROR_SPACE_MD = -2, + DS_ERROR = -1, + DS_UNCHANGED = 0, + DS_SHRUNK = 1, + DS_GREW = 2, + DS_GREW_FROM_ZERO = 3, +}; +extern enum determine_dev_size +drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local); +extern void resync_after_online_grow(struct drbd_device *); +extern void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev); +extern enum drbd_state_rv drbd_set_role(struct drbd_device *device, + enum drbd_role new_role, + int force); +extern bool conn_try_outdate_peer(struct drbd_connection *connection); +extern void 
conn_try_outdate_peer_async(struct drbd_connection *connection); +extern int drbd_khelper(struct drbd_device *device, char *cmd); + +/* drbd_worker.c */ +/* bi_end_io handlers */ +extern void drbd_md_endio(struct bio *bio, int error); +extern void drbd_peer_request_endio(struct bio *bio, int error); +extern void drbd_request_endio(struct bio *bio, int error); +extern int drbd_worker(struct drbd_thread *thi); +enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor); +void drbd_resync_after_changed(struct drbd_device *device); +extern void drbd_start_resync(struct drbd_device *device, enum drbd_conns side); +extern void resume_next_sg(struct drbd_device *device); +extern void suspend_other_sg(struct drbd_device *device); +extern int drbd_resync_finished(struct drbd_device *device); +/* maybe rather drbd_main.c ? */ +extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent); +extern void drbd_md_put_buffer(struct drbd_device *device); +extern int drbd_md_sync_page_io(struct drbd_device *device, + struct drbd_backing_dev *bdev, sector_t sector, int rw); +extern void drbd_ov_out_of_sync_found(struct drbd_device *, sector_t, int); +extern void wait_until_done_or_force_detached(struct drbd_device *device, + struct drbd_backing_dev *bdev, unsigned int *done); +extern void drbd_rs_controller_reset(struct drbd_device *device); + +static inline void ov_out_of_sync_print(struct drbd_device *device) +{ + if (device->ov_last_oos_size) { + drbd_err(device, "Out of sync: start=%llu, size=%lu (sectors)\n", + (unsigned long long)device->ov_last_oos_start, + (unsigned long)device->ov_last_oos_size); + } + device->ov_last_oos_size = 0; +} + + +extern void drbd_csum_bio(struct crypto_hash *, struct bio *, void *); +extern void drbd_csum_ee(struct crypto_hash *, struct drbd_peer_request *, void *); +/* worker callbacks */ +extern int w_e_end_data_req(struct drbd_work *, int); +extern int w_e_end_rsdata_req(struct drbd_work *, int); +extern int w_e_end_csum_rs_req(struct drbd_work *, int); +extern int w_e_end_ov_reply(struct drbd_work *, int); +extern int w_e_end_ov_req(struct drbd_work *, int); +extern int w_ov_finished(struct drbd_work *, int); +extern int w_resync_timer(struct drbd_work *, int); +extern int w_send_write_hint(struct drbd_work *, int); +extern int w_send_dblock(struct drbd_work *, int); +extern int w_send_read_req(struct drbd_work *, int); +extern int w_e_reissue(struct drbd_work *, int); +extern int w_restart_disk_io(struct drbd_work *, int); +extern int w_send_out_of_sync(struct drbd_work *, int); +extern int w_start_resync(struct drbd_work *, int); + +extern void resync_timer_fn(unsigned long data); +extern void start_resync_timer_fn(unsigned long data); + +extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); + +/* drbd_receiver.c */ +extern int drbd_receiver(struct drbd_thread *thi); +extern int drbd_asender(struct drbd_thread *thi); +extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); +extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, + bool throttle_if_app_is_waiting); +extern int drbd_submit_peer_request(struct drbd_device *, + struct drbd_peer_request *, const unsigned, + const int); +extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *); +extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64, + sector_t, unsigned int, + bool, + gfp_t) __must_hold(local); +extern void __drbd_free_peer_req(struct drbd_device *, struct 
drbd_peer_request *, + int); +#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0) +#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1) +extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool); +extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); +extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); +extern int drbd_connected(struct drbd_peer_device *); + +static inline void drbd_tcp_cork(struct socket *sock) +{ + int val = 1; + (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, + (char*)&val, sizeof(val)); +} + +static inline void drbd_tcp_uncork(struct socket *sock) +{ + int val = 0; + (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, + (char*)&val, sizeof(val)); +} + +static inline void drbd_tcp_nodelay(struct socket *sock) +{ + int val = 1; + (void) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char*)&val, sizeof(val)); +} + +static inline void drbd_tcp_quickack(struct socket *sock) +{ + int val = 2; + (void) kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, + (char*)&val, sizeof(val)); +} + +/* sets the number of 512 byte sectors of our virtual device */ +static inline void drbd_set_my_capacity(struct drbd_device *device, + sector_t size) +{ + /* set_capacity(device->this_bdev->bd_disk, size); */ + set_capacity(device->vdisk, size); + device->this_bdev->bd_inode->i_size = (loff_t)size << 9; +} + +/* + * used to submit our private bio + */ +static inline void drbd_generic_make_request(struct drbd_device *device, + int fault_type, struct bio *bio) +{ + __release(local); + if (!bio->bi_bdev) { + drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n"); + bio_endio(bio, -ENODEV); + return; + } + + if (drbd_insert_fault(device, fault_type)) + bio_endio(bio, -EIO); + else + generic_make_request(bio); +} + +void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev, + enum write_ordering_e wo); + +/* drbd_proc.c */ +extern struct proc_dir_entry *drbd_proc; +extern const struct file_operations drbd_proc_fops; +extern const char *drbd_conn_str(enum drbd_conns s); +extern const char *drbd_role_str(enum drbd_role s); + +/* drbd_actlog.c */ +extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i); +extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i); +extern void drbd_al_begin_io_commit(struct drbd_device *device); +extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i); +extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i); +extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i); +extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector); +extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector); +extern int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector); +extern void drbd_rs_cancel_all(struct drbd_device *device); +extern int drbd_rs_del_all(struct drbd_device *device); +extern void drbd_rs_failed_io(struct drbd_device *device, + sector_t sector, int size); +extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go); + +enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC }; +extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, + enum update_sync_bits_mode mode); +#define drbd_set_in_sync(device, sector, size) \ + 
__drbd_change_sync(device, sector, size, SET_IN_SYNC) +#define drbd_set_out_of_sync(device, sector, size) \ + __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC) +#define drbd_rs_failed_io(device, sector, size) \ + __drbd_change_sync(device, sector, size, RECORD_RS_FAILED) +extern void drbd_al_shrink(struct drbd_device *device); +extern int drbd_initialize_al(struct drbd_device *, void *); + +/* drbd_nl.c */ +/* state info broadcast */ +struct sib_info { + enum drbd_state_info_bcast_reason sib_reason; + union { + struct { + char *helper_name; + unsigned helper_exit_code; + }; + struct { + union drbd_state os; + union drbd_state ns; + }; + }; +}; +void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib); + +/* + * inline helper functions + *************************/ + +/* see also page_chain_add and friends in drbd_receiver.c */ +static inline struct page *page_chain_next(struct page *page) +{ + return (struct page *)page_private(page); +} +#define page_chain_for_each(page) \ + for (; page && ({ prefetch(page_chain_next(page)); 1; }); \ + page = page_chain_next(page)) +#define page_chain_for_each_safe(page, n) \ + for (; page && ({ n = page_chain_next(page); 1; }); page = n) + + +static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req) +{ + struct page *page = peer_req->pages; + page_chain_for_each(page) { + if (page_count(page) > 1) + return 1; + } + return 0; +} + +static inline enum drbd_state_rv +_drbd_set_state(struct drbd_device *device, union drbd_state ns, + enum chg_state_flags flags, struct completion *done) +{ + enum drbd_state_rv rv; + + read_lock(&global_state_lock); + rv = __drbd_set_state(device, ns, flags, done); + read_unlock(&global_state_lock); + + return rv; +} + +static inline union drbd_state drbd_read_state(struct drbd_device *device) +{ + struct drbd_resource *resource = device->resource; + union drbd_state rv; + + rv.i = device->state.i; + rv.susp = resource->susp; + rv.susp_nod = resource->susp_nod; + rv.susp_fen = resource->susp_fen; + + return rv; +} + +enum drbd_force_detach_flags { + DRBD_READ_ERROR, + DRBD_WRITE_ERROR, + DRBD_META_IO_ERROR, + DRBD_FORCE_DETACH, +}; + +#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) +static inline void __drbd_chk_io_error_(struct drbd_device *device, + enum drbd_force_detach_flags df, + const char *where) +{ + enum drbd_io_error_p ep; + + rcu_read_lock(); + ep = rcu_dereference(device->ldev->disk_conf)->on_io_error; + rcu_read_unlock(); + switch (ep) { + case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */ + if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Local IO failed in %s.\n", where); + if (device->state.disk > D_INCONSISTENT) + _drbd_set_state(_NS(device, disk, D_INCONSISTENT), CS_HARD, NULL); + break; + } + /* NOTE fall through for DRBD_META_IO_ERROR or DRBD_FORCE_DETACH */ + case EP_DETACH: + case EP_CALL_HELPER: + /* Remember whether we saw a READ or WRITE error. + * + * Recovery of the affected area for WRITE failure is covered + * by the activity log. + * READ errors may fall outside that area though. Certain READ + * errors can be "healed" by writing good data to the affected + * blocks, which triggers block re-allocation in lower layers. + * + * If we can not write the bitmap after a READ error, + * we may need to trigger a full sync (see w_go_diskless()). 
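+ *
+ * (Editorial usage sketch, not lifted verbatim from the driver: IO
+ *  completion paths report errors through the wrapper defined below, e.g.
+ *
+ *	if (error)
+ *		drbd_chk_io_error(device, error, DRBD_WRITE_ERROR);
+ *
+ *  where "error" is the value handed to the bi_end_io callback; the
+ *  wrapper takes req_lock and dispatches into this switch.)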
+ * + * Force-detach is not really an IO error, but rather a + * desperate measure to try to deal with a completely + * unresponsive lower level IO stack. + * Still it should be treated as a WRITE error. + * + * Meta IO error is always WRITE error: + * we read meta data only once during attach, + * which will fail in case of errors. + */ + set_bit(WAS_IO_ERROR, &device->flags); + if (df == DRBD_READ_ERROR) + set_bit(WAS_READ_ERROR, &device->flags); + if (df == DRBD_FORCE_DETACH) + set_bit(FORCE_DETACH, &device->flags); + if (device->state.disk > D_FAILED) { + _drbd_set_state(_NS(device, disk, D_FAILED), CS_HARD, NULL); + drbd_err(device, + "Local IO failed in %s. Detaching...\n", where); + } + break; + } +} + +/** + * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers + * @device: DRBD device. + * @error: Error code passed to the IO completion callback + * @forcedetach: Force detach. I.e. the error happened while accessing the meta data + * + * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED) + */ +#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__) +static inline void drbd_chk_io_error_(struct drbd_device *device, + int error, enum drbd_force_detach_flags forcedetach, const char *where) +{ + if (error) { + unsigned long flags; + spin_lock_irqsave(&device->resource->req_lock, flags); + __drbd_chk_io_error_(device, forcedetach, where); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + } +} + + +/** + * drbd_md_first_sector() - Returns the first sector number of the meta data area + * @bdev: Meta data block device. + * + * BTW, for internal meta data, this happens to be the maximum capacity + * we could agree upon with our peer node. + */ +static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) +{ + switch (bdev->md.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return bdev->md.md_offset + bdev->md.bm_offset; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return bdev->md.md_offset; + } +} + +/** + * drbd_md_last_sector() - Return the last sector number of the meta data area + * @bdev: Meta data block device. + */ +static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) +{ + switch (bdev->md.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return bdev->md.md_offset + MD_4kB_SECT -1; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return bdev->md.md_offset + bdev->md.md_size_sect -1; + } +} + +/* Returns the number of 512 byte sectors of the device */ +static inline sector_t drbd_get_capacity(struct block_device *bdev) +{ + /* return bdev ? get_capacity(bdev->bd_disk) : 0; */ + return bdev ? i_size_read(bdev->bd_inode) >> 9 : 0; +} + +/** + * drbd_get_max_capacity() - Returns the capacity we announce to out peer + * @bdev: Meta data block device. + * + * returns the capacity we announce to out peer. we clip ourselves at the + * various MAX_SECTORS, because if we don't, current implementation will + * oops sooner or later + */ +static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) +{ + sector_t s; + + switch (bdev->md.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + s = drbd_get_capacity(bdev->backing_bdev) + ? 
min_t(sector_t, DRBD_MAX_SECTORS_FLEX, + drbd_md_first_sector(bdev)) + : 0; + break; + case DRBD_MD_INDEX_FLEX_EXT: + s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX, + drbd_get_capacity(bdev->backing_bdev)); + /* clip at maximum size the meta device can support */ + s = min_t(sector_t, s, + BM_EXT_TO_SECT(bdev->md.md_size_sect + - bdev->md.bm_offset)); + break; + default: + s = min_t(sector_t, DRBD_MAX_SECTORS, + drbd_get_capacity(bdev->backing_bdev)); + } + return s; +} + +/** + * drbd_md_ss() - Return the sector number of our meta data super block + * @bdev: Meta data block device. + */ +static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev) +{ + const int meta_dev_idx = bdev->md.meta_dev_idx; + + if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT) + return 0; + + /* Since drbd08, internal meta data is always "flexible". + * position: last 4k aligned block of 4k size */ + if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL || + meta_dev_idx == DRBD_MD_INDEX_FLEX_INT) + return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8; + + /* external, some index; this is the old fixed size layout */ + return MD_128MB_SECT * bdev->md.meta_dev_idx; +} + +static inline void +drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&q->q_lock, flags); + list_add_tail(&w->list, &q->q); + spin_unlock_irqrestore(&q->q_lock, flags); + wake_up(&q->q_wait); +} + +static inline void +drbd_queue_work_if_unqueued(struct drbd_work_queue *q, struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&q->q_lock, flags); + if (list_empty_careful(&w->list)) + list_add_tail(&w->list, &q->q); + spin_unlock_irqrestore(&q->q_lock, flags); + wake_up(&q->q_wait); +} + +static inline void +drbd_device_post_work(struct drbd_device *device, int work_bit) +{ + if (!test_and_set_bit(work_bit, &device->flags)) { + struct drbd_connection *connection = + first_peer_device(device)->connection; + struct drbd_work_queue *q = &connection->sender_work; + if (!test_and_set_bit(DEVICE_WORK_PENDING, &connection->flags)) + wake_up(&q->q_wait); + } +} + +extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue); + +static inline void wake_asender(struct drbd_connection *connection) +{ + if (test_bit(SIGNAL_ASENDER, &connection->flags)) + force_sig(DRBD_SIG, connection->asender.task); +} + +static inline void request_ping(struct drbd_connection *connection) +{ + set_bit(SEND_PING, &connection->flags); + wake_asender(connection); +} + +extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *); +extern void *drbd_prepare_command(struct drbd_peer_device *, struct drbd_socket *); +extern int conn_send_command(struct drbd_connection *, struct drbd_socket *, + enum drbd_packet, unsigned int, void *, + unsigned int); +extern int drbd_send_command(struct drbd_peer_device *, struct drbd_socket *, + enum drbd_packet, unsigned int, void *, + unsigned int); + +extern int drbd_send_ping(struct drbd_connection *connection); +extern int drbd_send_ping_ack(struct drbd_connection *connection); +extern int drbd_send_state_req(struct drbd_peer_device *, union drbd_state, union drbd_state); +extern int conn_send_state_req(struct drbd_connection *, union drbd_state, union drbd_state); + +static inline void drbd_thread_stop(struct drbd_thread *thi) +{ + _drbd_thread_stop(thi, false, true); +} + +static inline void drbd_thread_stop_nowait(struct drbd_thread *thi) +{ + _drbd_thread_stop(thi, false, false); +} + +static inline void drbd_thread_restart_nowait(struct drbd_thread 
*thi) +{ + _drbd_thread_stop(thi, true, false); +} + +/* counts how many answer packets packets we expect from our peer, + * for either explicit application requests, + * or implicit barrier packets as necessary. + * increased: + * w_send_barrier + * _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ); + * it is much easier and equally valid to count what we queue for the + * worker, even before it actually was queued or send. + * (drbd_make_request_common; recovery path on read io-error) + * decreased: + * got_BarrierAck (respective tl_clear, tl_clear_barrier) + * _req_mod(req, DATA_RECEIVED) + * [from receive_DataReply] + * _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED) + * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)] + * for some reason it is NOT decreased in got_NegAck, + * but in the resulting cleanup code from report_params. + * we should try to remember the reason for that... + * _req_mod(req, SEND_FAILED or SEND_CANCELED) + * _req_mod(req, CONNECTION_LOST_WHILE_PENDING) + * [from tl_clear_barrier] + */ +static inline void inc_ap_pending(struct drbd_device *device) +{ + atomic_inc(&device->ap_pending_cnt); +} + +#define ERR_IF_CNT_IS_NEGATIVE(which, func, line) \ + if (atomic_read(&device->which) < 0) \ + drbd_err(device, "in %s:%d: " #which " = %d < 0 !\n", \ + func, line, \ + atomic_read(&device->which)) + +#define dec_ap_pending(device) _dec_ap_pending(device, __func__, __LINE__) +static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line) +{ + if (atomic_dec_and_test(&device->ap_pending_cnt)) + wake_up(&device->misc_wait); + ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line); +} + +/* counts how many resync-related answers we still expect from the peer + * increase decrease + * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY) + * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK with ID_SYNCER) + * (or P_NEG_ACK with ID_SYNCER) + */ +static inline void inc_rs_pending(struct drbd_device *device) +{ + atomic_inc(&device->rs_pending_cnt); +} + +#define dec_rs_pending(device) _dec_rs_pending(device, __func__, __LINE__) +static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line) +{ + atomic_dec(&device->rs_pending_cnt); + ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line); +} + +/* counts how many answers we still need to send to the peer. 
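+ * (Editor's illustration of the accounting pattern, not driver code:
+ *
+ *	inc_unacked(device);	- an answer is owed once the peer request arrives
+ *	...			- process it, send the answer
+ *	dec_unacked(device);	- paid back once the answer went out
+ *
+ *  the concrete call sites are listed below.)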
+ * increased on + * receive_Data unless protocol A; + * we need to send a P_RECV_ACK (proto B) + * or P_WRITE_ACK (proto C) + * receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK + * receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA + * receive_Barrier_* we need to send a P_BARRIER_ACK + */ +static inline void inc_unacked(struct drbd_device *device) +{ + atomic_inc(&device->unacked_cnt); +} + +#define dec_unacked(device) _dec_unacked(device, __func__, __LINE__) +static inline void _dec_unacked(struct drbd_device *device, const char *func, int line) +{ + atomic_dec(&device->unacked_cnt); + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); +} + +#define sub_unacked(device, n) _sub_unacked(device, n, __func__, __LINE__) +static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line) +{ + atomic_sub(n, &device->unacked_cnt); + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); +} + +static inline bool is_sync_state(enum drbd_conns connection_state) +{ + return + (connection_state == C_SYNC_SOURCE + || connection_state == C_SYNC_TARGET + || connection_state == C_PAUSED_SYNC_S + || connection_state == C_PAUSED_SYNC_T); +} + +/** + * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev + * @_device: DRBD device. + * @_min_state: Minimum device state required for success. + * + * You have to call put_ldev() when finished working with device->ldev. + */ +#define get_ldev_if_state(_device, _min_state) \ + (_get_ldev_if_state((_device), (_min_state)) ? \ + ({ __acquire(x); true; }) : false) +#define get_ldev(_device) get_ldev_if_state(_device, D_INCONSISTENT) + +static inline void put_ldev(struct drbd_device *device) +{ + enum drbd_disk_state disk_state = device->state.disk; + /* We must check the state *before* the atomic_dec becomes visible, + * or we have a theoretical race where someone hitting zero, + * while state still D_FAILED, will then see D_DISKLESS in the + * condition below and calling into destroy, where he must not, yet. */ + int i = atomic_dec_return(&device->local_cnt); + + /* This may be called from some endio handler, + * so we must not sleep here. */ + + __release(local); + D_ASSERT(device, i >= 0); + if (i == 0) { + if (disk_state == D_DISKLESS) + /* even internal references gone, safe to destroy */ + drbd_device_post_work(device, DESTROY_DISK); + if (disk_state == D_FAILED) + /* all application IO references gone. */ + if (!test_and_set_bit(GOING_DISKLESS, &device->flags)) + drbd_device_post_work(device, GO_DISKLESS); + wake_up(&device->misc_wait); + } +} + +#ifndef __CHECKER__ +static inline int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins) +{ + int io_allowed; + + /* never get a reference while D_DISKLESS */ + if (device->state.disk == D_DISKLESS) + return 0; + + atomic_inc(&device->local_cnt); + io_allowed = (device->state.disk >= mins); + if (!io_allowed) + put_ldev(device); + return io_allowed; +} +#else +extern int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins); +#endif + +/* this throttles on-the-fly application requests + * according to max_buffers settings; + * maybe re-implement using semaphores? */ +static inline int drbd_get_max_buffers(struct drbd_device *device) +{ + struct net_conf *nc; + int mxb; + + rcu_read_lock(); + nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + mxb = nc ? 
nc->max_buffers : 1000000; /* arbitrary limit on open requests */ + rcu_read_unlock(); + + return mxb; +} + +static inline int drbd_state_is_stable(struct drbd_device *device) +{ + union drbd_dev_state s = device->state; + + /* DO NOT add a default clause, we want the compiler to warn us + * for any newly introduced state we may have forgotten to add here */ + + switch ((enum drbd_conns)s.conn) { + /* new io only accepted when there is no connection, ... */ + case C_STANDALONE: + case C_WF_CONNECTION: + /* ... or there is a well established connection. */ + case C_CONNECTED: + case C_SYNC_SOURCE: + case C_SYNC_TARGET: + case C_VERIFY_S: + case C_VERIFY_T: + case C_PAUSED_SYNC_S: + case C_PAUSED_SYNC_T: + case C_AHEAD: + case C_BEHIND: + /* transitional states, IO allowed */ + case C_DISCONNECTING: + case C_UNCONNECTED: + case C_TIMEOUT: + case C_BROKEN_PIPE: + case C_NETWORK_FAILURE: + case C_PROTOCOL_ERROR: + case C_TEAR_DOWN: + case C_WF_REPORT_PARAMS: + case C_STARTING_SYNC_S: + case C_STARTING_SYNC_T: + break; + + /* Allow IO in BM exchange states with new protocols */ + case C_WF_BITMAP_S: + if (first_peer_device(device)->connection->agreed_pro_version < 96) + return 0; + break; + + /* no new io accepted in these states */ + case C_WF_BITMAP_T: + case C_WF_SYNC_UUID: + case C_MASK: + /* not "stable" */ + return 0; + } + + switch ((enum drbd_disk_state)s.disk) { + case D_DISKLESS: + case D_INCONSISTENT: + case D_OUTDATED: + case D_CONSISTENT: + case D_UP_TO_DATE: + case D_FAILED: + /* disk state is stable as well. */ + break; + + /* no new io accepted during transitional states */ + case D_ATTACHING: + case D_NEGOTIATING: + case D_UNKNOWN: + case D_MASK: + /* not "stable" */ + return 0; + } + + return 1; +} + +static inline int drbd_suspended(struct drbd_device *device) +{ + struct drbd_resource *resource = device->resource; + + return resource->susp || resource->susp_fen || resource->susp_nod; +} + +static inline bool may_inc_ap_bio(struct drbd_device *device) +{ + int mxb = drbd_get_max_buffers(device); + + if (drbd_suspended(device)) + return false; + if (test_bit(SUSPEND_IO, &device->flags)) + return false; + + /* to avoid potential deadlock or bitmap corruption, + * in various places, we only allow new application io + * to start during "stable" states. */ + + /* no new io accepted when attaching or detaching the disk */ + if (!drbd_state_is_stable(device)) + return false; + + /* since some older kernels don't have atomic_add_unless, + * and we are within the spinlock anyways, we have this workaround. */ + if (atomic_read(&device->ap_bio_cnt) > mxb) + return false; + if (test_bit(BITMAP_IO, &device->flags)) + return false; + return true; +} + +static inline bool inc_ap_bio_cond(struct drbd_device *device) +{ + bool rv = false; + + spin_lock_irq(&device->resource->req_lock); + rv = may_inc_ap_bio(device); + if (rv) + atomic_inc(&device->ap_bio_cnt); + spin_unlock_irq(&device->resource->req_lock); + + return rv; +} + +static inline void inc_ap_bio(struct drbd_device *device) +{ + /* we wait here + * as long as the device is suspended + * until the bitmap is no longer on the fly during connection + * handshake as long as we would exceed the max_buffer limit. + * + * to avoid races with the reconnect code, + * we need to atomic_inc within the spinlock. 
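+ *
+ * (Editor's note, illustrative only: a submit path brackets each
+ *  application bio roughly as
+ *
+ *	inc_ap_bio(device);	- may block until IO is allowed again
+ *	...			- hand the request to the worker / network
+ *	dec_ap_bio(device);	- on completion; wakes waiters, kicks bitmap IO
+ *
+ *  compare __drbd_make_request() and the request completion code in
+ *  drbd_req.c.)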
*/ + + wait_event(device->misc_wait, inc_ap_bio_cond(device)); +} + +static inline void dec_ap_bio(struct drbd_device *device) +{ + int mxb = drbd_get_max_buffers(device); + int ap_bio = atomic_dec_return(&device->ap_bio_cnt); + + D_ASSERT(device, ap_bio >= 0); + + if (ap_bio == 0 && test_bit(BITMAP_IO, &device->flags)) { + if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags)) + drbd_queue_work(&first_peer_device(device)-> + connection->sender_work, + &device->bm_io_work.w); + } + + /* this currently does wake_up for every dec_ap_bio! + * maybe rather introduce some type of hysteresis? + * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */ + if (ap_bio < mxb) + wake_up(&device->misc_wait); +} + +static inline bool verify_can_do_stop_sector(struct drbd_device *device) +{ + return first_peer_device(device)->connection->agreed_pro_version >= 97 && + first_peer_device(device)->connection->agreed_pro_version != 100; +} + +static inline int drbd_set_ed_uuid(struct drbd_device *device, u64 val) +{ + int changed = device->ed_uuid != val; + device->ed_uuid = val; + return changed; +} + +static inline int drbd_queue_order_type(struct drbd_device *device) +{ + /* sorry, we currently have no working implementation + * of distributed TCQ stuff */ +#ifndef QUEUE_ORDERED_NONE +#define QUEUE_ORDERED_NONE 0 +#endif + return QUEUE_ORDERED_NONE; +} + +static inline struct drbd_connection *first_connection(struct drbd_resource *resource) +{ + return list_first_entry_or_null(&resource->connections, + struct drbd_connection, connections); +} + +#endif diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c new file mode 100644 index 000000000..51b25ad85 --- /dev/null +++ b/drivers/block/drbd/drbd_interval.c @@ -0,0 +1,179 @@ +#include <asm/bug.h> +#include <linux/rbtree_augmented.h> +#include "drbd_interval.h" + +/** + * interval_end - return end of @node + */ +static inline +sector_t interval_end(struct rb_node *node) +{ + struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb); + return this->end; +} + +/** + * compute_subtree_last - compute end of @node + * + * The end of an interval is the highest (start + (size >> 9)) value of this + * node and of its children. Called for @node and its parents whenever the end + * may have changed. 
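+ *
+ * (Editorial example: a node starting at sector 1024 with size 4096 bytes
+ *  ends at 1024 + (4096 >> 9) = 1032; if the left subtree's stored end is
+ *  1040 and the right one's is 1030, this node's ->end becomes 1040.)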
+ */ +static inline sector_t +compute_subtree_last(struct drbd_interval *node) +{ + sector_t max = node->sector + (node->size >> 9); + + if (node->rb.rb_left) { + sector_t left = interval_end(node->rb.rb_left); + if (left > max) + max = left; + } + if (node->rb.rb_right) { + sector_t right = interval_end(node->rb.rb_right); + if (right > max) + max = right; + } + return max; +} + +RB_DECLARE_CALLBACKS(static, augment_callbacks, struct drbd_interval, rb, + sector_t, end, compute_subtree_last); + +/** + * drbd_insert_interval - insert a new interval into a tree + */ +bool +drbd_insert_interval(struct rb_root *root, struct drbd_interval *this) +{ + struct rb_node **new = &root->rb_node, *parent = NULL; + sector_t this_end = this->sector + (this->size >> 9); + + BUG_ON(!IS_ALIGNED(this->size, 512)); + + while (*new) { + struct drbd_interval *here = + rb_entry(*new, struct drbd_interval, rb); + + parent = *new; + if (here->end < this_end) + here->end = this_end; + if (this->sector < here->sector) + new = &(*new)->rb_left; + else if (this->sector > here->sector) + new = &(*new)->rb_right; + else if (this < here) + new = &(*new)->rb_left; + else if (this > here) + new = &(*new)->rb_right; + else + return false; + } + + this->end = this_end; + rb_link_node(&this->rb, parent, new); + rb_insert_augmented(&this->rb, root, &augment_callbacks); + return true; +} + +/** + * drbd_contains_interval - check if a tree contains a given interval + * @sector: start sector of @interval + * @interval: may not be a valid pointer + * + * Returns if the tree contains the node @interval with start sector @start. + * Does not dereference @interval until @interval is known to be a valid object + * in @tree. Returns %false if @interval is in the tree but with a different + * sector number. + */ +bool +drbd_contains_interval(struct rb_root *root, sector_t sector, + struct drbd_interval *interval) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct drbd_interval *here = + rb_entry(node, struct drbd_interval, rb); + + if (sector < here->sector) + node = node->rb_left; + else if (sector > here->sector) + node = node->rb_right; + else if (interval < here) + node = node->rb_left; + else if (interval > here) + node = node->rb_right; + else + return true; + } + return false; +} + +/** + * drbd_remove_interval - remove an interval from a tree + */ +void +drbd_remove_interval(struct rb_root *root, struct drbd_interval *this) +{ + rb_erase_augmented(&this->rb, root, &augment_callbacks); +} + +/** + * drbd_find_overlap - search for an interval overlapping with [sector, sector + size) + * @sector: start sector + * @size: size, aligned to 512 bytes + * + * Returns an interval overlapping with [sector, sector + size), or NULL if + * there is none. When there is more than one overlapping interval in the + * tree, the interval with the lowest start sector is returned, and all other + * overlapping intervals will be on the right side of the tree, reachable with + * rb_next(). 
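+ *
+ * (Editorial usage sketch, assuming a populated tree such as the device's
+ *  write_requests tree:
+ *
+ *	struct drbd_interval *i;
+ *	drbd_for_each_overlap(i, &device->write_requests, sector, size)
+ *		handle_conflict(i);	- hypothetical handler
+ *
+ *  drbd_for_each_overlap() is the convenience macro from drbd_interval.h,
+ *  built on drbd_find_overlap()/drbd_next_overlap().)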
+ */ +struct drbd_interval * +drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size) +{ + struct rb_node *node = root->rb_node; + struct drbd_interval *overlap = NULL; + sector_t end = sector + (size >> 9); + + BUG_ON(!IS_ALIGNED(size, 512)); + + while (node) { + struct drbd_interval *here = + rb_entry(node, struct drbd_interval, rb); + + if (node->rb_left && + sector < interval_end(node->rb_left)) { + /* Overlap if any must be on left side */ + node = node->rb_left; + } else if (here->sector < end && + sector < here->sector + (here->size >> 9)) { + overlap = here; + break; + } else if (sector >= here->sector) { + /* Overlap if any must be on right side */ + node = node->rb_right; + } else + break; + } + return overlap; +} + +struct drbd_interval * +drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size) +{ + sector_t end = sector + (size >> 9); + struct rb_node *node; + + for (;;) { + node = rb_next(&i->rb); + if (!node) + return NULL; + i = rb_entry(node, struct drbd_interval, rb); + if (i->sector >= end) + return NULL; + if (sector < i->sector + (i->size >> 9)) + return i; + } +} diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h new file mode 100644 index 000000000..f210543f0 --- /dev/null +++ b/drivers/block/drbd/drbd_interval.h @@ -0,0 +1,42 @@ +#ifndef __DRBD_INTERVAL_H +#define __DRBD_INTERVAL_H + +#include <linux/types.h> +#include <linux/rbtree.h> + +struct drbd_interval { + struct rb_node rb; + sector_t sector; /* start sector of the interval */ + unsigned int size; /* size in bytes */ + sector_t end; /* highest interval end in subtree */ + int local:1 /* local or remote request? */; + int waiting:1; /* someone is waiting for this to complete */ + int completed:1; /* this has been completed already; + * ignore for conflict detection */ +}; + +static inline void drbd_clear_interval(struct drbd_interval *i) +{ + RB_CLEAR_NODE(&i->rb); +} + +static inline bool drbd_interval_empty(struct drbd_interval *i) +{ + return RB_EMPTY_NODE(&i->rb); +} + +extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *); +extern bool drbd_contains_interval(struct rb_root *, sector_t, + struct drbd_interval *); +extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *); +extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t, + unsigned int); +extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t, + unsigned int); + +#define drbd_for_each_overlap(i, root, sector, size) \ + for (i = drbd_find_overlap(root, sector, size); \ + i; \ + i = drbd_next_overlap(i, sector, size)) + +#endif /* __DRBD_INTERVAL_H */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c new file mode 100644 index 000000000..81fde9ef7 --- /dev/null +++ b/drivers/block/drbd/drbd_main.c @@ -0,0 +1,3844 @@ +/* + drbd.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev + from Logicworks, Inc. for making SDP replication support possible. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/jiffies.h> +#include <linux/drbd.h> +#include <asm/uaccess.h> +#include <asm/types.h> +#include <net/sock.h> +#include <linux/ctype.h> +#include <linux/mutex.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/memcontrol.h> +#include <linux/mm_inline.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/reboot.h> +#include <linux/notifier.h> +#include <linux/kthread.h> +#include <linux/workqueue.h> +#define __KERNEL_SYSCALLS__ +#include <linux/unistd.h> +#include <linux/vmalloc.h> + +#include <linux/drbd_limits.h> +#include "drbd_int.h" +#include "drbd_protocol.h" +#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */ +#include "drbd_vli.h" +#include "drbd_debugfs.h" + +static DEFINE_MUTEX(drbd_main_mutex); +static int drbd_open(struct block_device *bdev, fmode_t mode); +static void drbd_release(struct gendisk *gd, fmode_t mode); +static void md_sync_timer_fn(unsigned long data); +static int w_bitmap_io(struct drbd_work *w, int unused); + +MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " + "Lars Ellenberg <lars@linbit.com>"); +MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); +MODULE_VERSION(REL_VERSION); +MODULE_LICENSE("GPL"); +MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices (" + __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")"); +MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); + +#include <linux/moduleparam.h> +/* allow_open_on_secondary */ +MODULE_PARM_DESC(allow_oos, "DONT USE!"); +/* thanks to these macros, if compiled into the kernel (not-module), + * this becomes the boot parameter drbd.minor_count */ +module_param(minor_count, uint, 0444); +module_param(disable_sendpage, bool, 0644); +module_param(allow_oos, bool, 0); +module_param(proc_details, int, 0644); + +#ifdef CONFIG_DRBD_FAULT_INJECTION +int enable_faults; +int fault_rate; +static int fault_count; +int fault_devs; +/* bitmap of enabled faults */ +module_param(enable_faults, int, 0664); +/* fault rate % value - applies to all enabled faults */ +module_param(fault_rate, int, 0664); +/* count of faults inserted */ +module_param(fault_count, int, 0664); +/* bitmap of devices to insert faults on */ +module_param(fault_devs, int, 0644); +#endif + +/* module parameter, defined */ +unsigned int minor_count = DRBD_MINOR_COUNT_DEF; +bool disable_sendpage; +bool allow_oos; +int proc_details; /* Detail level in proc drbd*/ + +/* Module parameter for setting the user mode helper program + * to run. 
Default is /sbin/drbdadm */ +char usermode_helper[80] = "/sbin/drbdadm"; + +module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644); + +/* in 2.6.x, our device mapping and config info contains our virtual gendisks + * as member "struct gendisk *vdisk;" + */ +struct idr drbd_devices; +struct list_head drbd_resources; + +struct kmem_cache *drbd_request_cache; +struct kmem_cache *drbd_ee_cache; /* peer requests */ +struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ +struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ +mempool_t *drbd_request_mempool; +mempool_t *drbd_ee_mempool; +mempool_t *drbd_md_io_page_pool; +struct bio_set *drbd_md_io_bio_set; + +/* I do not use a standard mempool, because: + 1) I want to hand out the pre-allocated objects first. + 2) I want to be able to interrupt sleeping allocation with a signal. + Note: This is a single linked list, the next pointer is the private + member of struct page. + */ +struct page *drbd_pp_pool; +spinlock_t drbd_pp_lock; +int drbd_pp_vacant; +wait_queue_head_t drbd_pp_wait; + +DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5); + +static const struct block_device_operations drbd_ops = { + .owner = THIS_MODULE, + .open = drbd_open, + .release = drbd_release, +}; + +struct bio *bio_alloc_drbd(gfp_t gfp_mask) +{ + struct bio *bio; + + if (!drbd_md_io_bio_set) + return bio_alloc(gfp_mask, 1); + + bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); + if (!bio) + return NULL; + return bio; +} + +#ifdef __CHECKER__ +/* When checking with sparse, and this is an inline function, sparse will + give tons of false positives. When this is a real functions sparse works. + */ +int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins) +{ + int io_allowed; + + atomic_inc(&device->local_cnt); + io_allowed = (device->state.disk >= mins); + if (!io_allowed) { + if (atomic_dec_and_test(&device->local_cnt)) + wake_up(&device->misc_wait); + } + return io_allowed; +} + +#endif + +/** + * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch + * @connection: DRBD connection. + * @barrier_nr: Expected identifier of the DRBD write barrier packet. + * @set_size: Expected number of requests before that barrier. + * + * In case the passed barrier_nr or set_size does not match the oldest + * epoch of not yet barrier-acked requests, this function will cause a + * termination of the connection. + */ +void tl_release(struct drbd_connection *connection, unsigned int barrier_nr, + unsigned int set_size) +{ + struct drbd_request *r; + struct drbd_request *req = NULL; + int expect_epoch = 0; + int expect_size = 0; + + spin_lock_irq(&connection->resource->req_lock); + + /* find oldest not yet barrier-acked write request, + * count writes in its epoch. */ + list_for_each_entry(r, &connection->transfer_log, tl_requests) { + const unsigned s = r->rq_state; + if (!req) { + if (!(s & RQ_WRITE)) + continue; + if (!(s & RQ_NET_MASK)) + continue; + if (s & RQ_NET_DONE) + continue; + req = r; + expect_epoch = req->epoch; + expect_size ++; + } else { + if (r->epoch != expect_epoch) + break; + if (!(s & RQ_WRITE)) + continue; + /* if (s & RQ_DONE): not expected */ + /* if (!(s & RQ_NET_MASK)): not expected */ + expect_size++; + } + } + + /* first some paranoia code */ + if (req == NULL) { + drbd_err(connection, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", + barrier_nr); + goto bail; + } + if (expect_epoch != barrier_nr) { + drbd_err(connection, "BAD! 
BarrierAck #%u received, expected #%u!\n", + barrier_nr, expect_epoch); + goto bail; + } + + if (expect_size != set_size) { + drbd_err(connection, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n", + barrier_nr, set_size, expect_size); + goto bail; + } + + /* Clean up list of requests processed during current epoch. */ + /* this extra list walk restart is paranoia, + * to catch requests being barrier-acked "unexpectedly". + * It usually should find the same req again, or some READ preceding it. */ + list_for_each_entry(req, &connection->transfer_log, tl_requests) + if (req->epoch == expect_epoch) + break; + list_for_each_entry_safe_from(req, r, &connection->transfer_log, tl_requests) { + if (req->epoch != expect_epoch) + break; + _req_mod(req, BARRIER_ACKED); + } + spin_unlock_irq(&connection->resource->req_lock); + + return; + +bail: + spin_unlock_irq(&connection->resource->req_lock); + conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD); +} + + +/** + * _tl_restart() - Walks the transfer log, and applies an action to all requests + * @connection: DRBD connection to operate on. + * @what: The action/event to perform with all request objects + * + * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO, + * RESTART_FROZEN_DISK_IO. + */ +/* must hold resource->req_lock */ +void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what) +{ + struct drbd_request *req, *r; + + list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests) + _req_mod(req, what); +} + +void tl_restart(struct drbd_connection *connection, enum drbd_req_event what) +{ + spin_lock_irq(&connection->resource->req_lock); + _tl_restart(connection, what); + spin_unlock_irq(&connection->resource->req_lock); +} + +/** + * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL + * @device: DRBD device. + * + * This is called after the connection to the peer was lost. The storage covered + * by the requests on the transfer gets marked as our of sync. Called from the + * receiver thread and the worker thread. + */ +void tl_clear(struct drbd_connection *connection) +{ + tl_restart(connection, CONNECTION_LOST_WHILE_PENDING); +} + +/** + * tl_abort_disk_io() - Abort disk I/O for all requests for a certain device in the TL + * @device: DRBD device. + */ +void tl_abort_disk_io(struct drbd_device *device) +{ + struct drbd_connection *connection = first_peer_device(device)->connection; + struct drbd_request *req, *r; + + spin_lock_irq(&connection->resource->req_lock); + list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests) { + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + if (req->device != device) + continue; + _req_mod(req, ABORT_DISK_IO); + } + spin_unlock_irq(&connection->resource->req_lock); +} + +static int drbd_thread_setup(void *arg) +{ + struct drbd_thread *thi = (struct drbd_thread *) arg; + struct drbd_resource *resource = thi->resource; + unsigned long flags; + int retval; + + snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s", + thi->name[0], + resource->name); + +restart: + retval = thi->function(thi); + + spin_lock_irqsave(&thi->t_lock, flags); + + /* if the receiver has been "EXITING", the last thing it did + * was set the conn state to "StandAlone", + * if now a re-connect request comes in, conn state goes C_UNCONNECTED, + * and receiver thread will be "started". + * drbd_thread_start needs to set "RESTARTING" in that case. 
+ * t_state check and assignment needs to be within the same spinlock, + * so either thread_start sees EXITING, and can remap to RESTARTING, + * or thread_start see NONE, and can proceed as normal. + */ + + if (thi->t_state == RESTARTING) { + drbd_info(resource, "Restarting %s thread\n", thi->name); + thi->t_state = RUNNING; + spin_unlock_irqrestore(&thi->t_lock, flags); + goto restart; + } + + thi->task = NULL; + thi->t_state = NONE; + smp_mb(); + complete_all(&thi->stop); + spin_unlock_irqrestore(&thi->t_lock, flags); + + drbd_info(resource, "Terminating %s\n", current->comm); + + /* Release mod reference taken when thread was started */ + + if (thi->connection) + kref_put(&thi->connection->kref, drbd_destroy_connection); + kref_put(&resource->kref, drbd_destroy_resource); + module_put(THIS_MODULE); + return retval; +} + +static void drbd_thread_init(struct drbd_resource *resource, struct drbd_thread *thi, + int (*func) (struct drbd_thread *), const char *name) +{ + spin_lock_init(&thi->t_lock); + thi->task = NULL; + thi->t_state = NONE; + thi->function = func; + thi->resource = resource; + thi->connection = NULL; + thi->name = name; +} + +int drbd_thread_start(struct drbd_thread *thi) +{ + struct drbd_resource *resource = thi->resource; + struct task_struct *nt; + unsigned long flags; + + /* is used from state engine doing drbd_thread_stop_nowait, + * while holding the req lock irqsave */ + spin_lock_irqsave(&thi->t_lock, flags); + + switch (thi->t_state) { + case NONE: + drbd_info(resource, "Starting %s thread (from %s [%d])\n", + thi->name, current->comm, current->pid); + + /* Get ref on module for thread - this is released when thread exits */ + if (!try_module_get(THIS_MODULE)) { + drbd_err(resource, "Failed to get module reference in drbd_thread_start\n"); + spin_unlock_irqrestore(&thi->t_lock, flags); + return false; + } + + kref_get(&resource->kref); + if (thi->connection) + kref_get(&thi->connection->kref); + + init_completion(&thi->stop); + thi->reset_cpu_mask = 1; + thi->t_state = RUNNING; + spin_unlock_irqrestore(&thi->t_lock, flags); + flush_signals(current); /* otherw. may get -ERESTARTNOINTR */ + + nt = kthread_create(drbd_thread_setup, (void *) thi, + "drbd_%c_%s", thi->name[0], thi->resource->name); + + if (IS_ERR(nt)) { + drbd_err(resource, "Couldn't start thread\n"); + + if (thi->connection) + kref_put(&thi->connection->kref, drbd_destroy_connection); + kref_put(&resource->kref, drbd_destroy_resource); + module_put(THIS_MODULE); + return false; + } + spin_lock_irqsave(&thi->t_lock, flags); + thi->task = nt; + thi->t_state = RUNNING; + spin_unlock_irqrestore(&thi->t_lock, flags); + wake_up_process(nt); + break; + case EXITING: + thi->t_state = RESTARTING; + drbd_info(resource, "Restarting %s thread (from %s [%d])\n", + thi->name, current->comm, current->pid); + /* fall through */ + case RUNNING: + case RESTARTING: + default: + spin_unlock_irqrestore(&thi->t_lock, flags); + break; + } + + return true; +} + + +void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) +{ + unsigned long flags; + + enum drbd_thread_state ns = restart ? 
RESTARTING : EXITING; + + /* may be called from state engine, holding the req lock irqsave */ + spin_lock_irqsave(&thi->t_lock, flags); + + if (thi->t_state == NONE) { + spin_unlock_irqrestore(&thi->t_lock, flags); + if (restart) + drbd_thread_start(thi); + return; + } + + if (thi->t_state != ns) { + if (thi->task == NULL) { + spin_unlock_irqrestore(&thi->t_lock, flags); + return; + } + + thi->t_state = ns; + smp_mb(); + init_completion(&thi->stop); + if (thi->task != current) + force_sig(DRBD_SIGKILL, thi->task); + } + + spin_unlock_irqrestore(&thi->t_lock, flags); + + if (wait) + wait_for_completion(&thi->stop); +} + +int conn_lowest_minor(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr = 0, minor = -1; + + rcu_read_lock(); + peer_device = idr_get_next(&connection->peer_devices, &vnr); + if (peer_device) + minor = device_to_minor(peer_device->device); + rcu_read_unlock(); + + return minor; +} + +#ifdef CONFIG_SMP +/** + * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs + * + * Forces all threads of a resource onto the same CPU. This is beneficial for + * DRBD's performance. May be overwritten by user's configuration. + */ +static void drbd_calc_cpu_mask(cpumask_var_t *cpu_mask) +{ + unsigned int *resources_per_cpu, min_index = ~0; + + resources_per_cpu = kzalloc(nr_cpu_ids * sizeof(*resources_per_cpu), GFP_KERNEL); + if (resources_per_cpu) { + struct drbd_resource *resource; + unsigned int cpu, min = ~0; + + rcu_read_lock(); + for_each_resource_rcu(resource, &drbd_resources) { + for_each_cpu(cpu, resource->cpu_mask) + resources_per_cpu[cpu]++; + } + rcu_read_unlock(); + for_each_online_cpu(cpu) { + if (resources_per_cpu[cpu] < min) { + min = resources_per_cpu[cpu]; + min_index = cpu; + } + } + kfree(resources_per_cpu); + } + if (min_index == ~0) { + cpumask_setall(*cpu_mask); + return; + } + cpumask_set_cpu(min_index, *cpu_mask); +} + +/** + * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread + * @device: DRBD device. + * @thi: drbd_thread object + * + * call in the "main loop" of _all_ threads, no need for any mutex, current won't die + * prematurely. + */ +void drbd_thread_current_set_cpu(struct drbd_thread *thi) +{ + struct drbd_resource *resource = thi->resource; + struct task_struct *p = current; + + if (!thi->reset_cpu_mask) + return; + thi->reset_cpu_mask = 0; + set_cpus_allowed_ptr(p, resource->cpu_mask); +} +#else +#define drbd_calc_cpu_mask(A) ({}) +#endif + +/** + * drbd_header_size - size of a packet header + * + * The header size is a multiple of 8, so any payload following the header is + * word aligned on 64-bit architectures. (The bitmap send and receive code + * relies on this.) 
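+ *
+ * (Editor's note, derived from the prepare_header*() helpers below: for
+ *  agreed_pro_version >= 100 this is the 16-byte p_header100 (4-byte magic,
+ *  2-byte volume, 2-byte command, 4-byte length plus padding); older peers
+ *  get the 8-byte p_header80/95 layout. Both are multiples of 8, as the
+ *  BUILD_BUG_ONs below assert.)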
+ */ +unsigned int drbd_header_size(struct drbd_connection *connection) +{ + if (connection->agreed_pro_version >= 100) { + BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8)); + return sizeof(struct p_header100); + } else { + BUILD_BUG_ON(sizeof(struct p_header80) != + sizeof(struct p_header95)); + BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8)); + return sizeof(struct p_header80); + } +} + +static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size) +{ + h->magic = cpu_to_be32(DRBD_MAGIC); + h->command = cpu_to_be16(cmd); + h->length = cpu_to_be16(size); + return sizeof(struct p_header80); +} + +static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size) +{ + h->magic = cpu_to_be16(DRBD_MAGIC_BIG); + h->command = cpu_to_be16(cmd); + h->length = cpu_to_be32(size); + return sizeof(struct p_header95); +} + +static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd, + int size, int vnr) +{ + h->magic = cpu_to_be32(DRBD_MAGIC_100); + h->volume = cpu_to_be16(vnr); + h->command = cpu_to_be16(cmd); + h->length = cpu_to_be32(size); + h->pad = 0; + return sizeof(struct p_header100); +} + +static unsigned int prepare_header(struct drbd_connection *connection, int vnr, + void *buffer, enum drbd_packet cmd, int size) +{ + if (connection->agreed_pro_version >= 100) + return prepare_header100(buffer, cmd, size, vnr); + else if (connection->agreed_pro_version >= 95 && + size > DRBD_MAX_SIZE_H80_PACKET) + return prepare_header95(buffer, cmd, size); + else + return prepare_header80(buffer, cmd, size); +} + +static void *__conn_prepare_command(struct drbd_connection *connection, + struct drbd_socket *sock) +{ + if (!sock->socket) + return NULL; + return sock->sbuf + drbd_header_size(connection); +} + +void *conn_prepare_command(struct drbd_connection *connection, struct drbd_socket *sock) +{ + void *p; + + mutex_lock(&sock->mutex); + p = __conn_prepare_command(connection, sock); + if (!p) + mutex_unlock(&sock->mutex); + + return p; +} + +void *drbd_prepare_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock) +{ + return conn_prepare_command(peer_device->connection, sock); +} + +static int __send_command(struct drbd_connection *connection, int vnr, + struct drbd_socket *sock, enum drbd_packet cmd, + unsigned int header_size, void *data, + unsigned int size) +{ + int msg_flags; + int err; + + /* + * Called with @data == NULL and the size of the data blocks in @size + * for commands that send data blocks. For those commands, omit the + * MSG_MORE flag: this will increase the likelihood that data blocks + * which are page aligned on the sender will end up page aligned on the + * receiver. + */ + msg_flags = data ? MSG_MORE : 0; + + header_size += prepare_header(connection, vnr, sock->sbuf, cmd, + header_size + size); + err = drbd_send_all(connection, sock->socket, sock->sbuf, header_size, + msg_flags); + if (data && !err) + err = drbd_send_all(connection, sock->socket, data, size, 0); + /* DRBD protocol "pings" are latency critical. 
+ * This is supposed to trigger tcp_push_pending_frames() */ + if (!err && (cmd == P_PING || cmd == P_PING_ACK)) + drbd_tcp_nodelay(sock->socket); + + return err; +} + +static int __conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock, + enum drbd_packet cmd, unsigned int header_size, + void *data, unsigned int size) +{ + return __send_command(connection, 0, sock, cmd, header_size, data, size); +} + +int conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock, + enum drbd_packet cmd, unsigned int header_size, + void *data, unsigned int size) +{ + int err; + + err = __conn_send_command(connection, sock, cmd, header_size, data, size); + mutex_unlock(&sock->mutex); + return err; +} + +int drbd_send_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock, + enum drbd_packet cmd, unsigned int header_size, + void *data, unsigned int size) +{ + int err; + + err = __send_command(peer_device->connection, peer_device->device->vnr, + sock, cmd, header_size, data, size); + mutex_unlock(&sock->mutex); + return err; +} + +int drbd_send_ping(struct drbd_connection *connection) +{ + struct drbd_socket *sock; + + sock = &connection->meta; + if (!conn_prepare_command(connection, sock)) + return -EIO; + return conn_send_command(connection, sock, P_PING, 0, NULL, 0); +} + +int drbd_send_ping_ack(struct drbd_connection *connection) +{ + struct drbd_socket *sock; + + sock = &connection->meta; + if (!conn_prepare_command(connection, sock)) + return -EIO; + return conn_send_command(connection, sock, P_PING_ACK, 0, NULL, 0); +} + +int drbd_send_sync_param(struct drbd_peer_device *peer_device) +{ + struct drbd_socket *sock; + struct p_rs_param_95 *p; + int size; + const int apv = peer_device->connection->agreed_pro_version; + enum drbd_packet cmd; + struct net_conf *nc; + struct disk_conf *dc; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + + rcu_read_lock(); + nc = rcu_dereference(peer_device->connection->net_conf); + + size = apv <= 87 ? sizeof(struct p_rs_param) + : apv == 88 ? sizeof(struct p_rs_param) + + strlen(nc->verify_alg) + 1 + : apv <= 94 ? sizeof(struct p_rs_param_89) + : /* apv >= 95 */ sizeof(struct p_rs_param_95); + + cmd = apv >= 89 ? 
P_SYNC_PARAM89 : P_SYNC_PARAM; + + /* initialize verify_alg and csums_alg */ + memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + + if (get_ldev(peer_device->device)) { + dc = rcu_dereference(peer_device->device->ldev->disk_conf); + p->resync_rate = cpu_to_be32(dc->resync_rate); + p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead); + p->c_delay_target = cpu_to_be32(dc->c_delay_target); + p->c_fill_target = cpu_to_be32(dc->c_fill_target); + p->c_max_rate = cpu_to_be32(dc->c_max_rate); + put_ldev(peer_device->device); + } else { + p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF); + p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF); + p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF); + p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF); + p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF); + } + + if (apv >= 88) + strcpy(p->verify_alg, nc->verify_alg); + if (apv >= 89) + strcpy(p->csums_alg, nc->csums_alg); + rcu_read_unlock(); + + return drbd_send_command(peer_device, sock, cmd, size, NULL, 0); +} + +int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd) +{ + struct drbd_socket *sock; + struct p_protocol *p; + struct net_conf *nc; + int size, cf; + + sock = &connection->data; + p = __conn_prepare_command(connection, sock); + if (!p) + return -EIO; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + + if (nc->tentative && connection->agreed_pro_version < 92) { + rcu_read_unlock(); + mutex_unlock(&sock->mutex); + drbd_err(connection, "--dry-run is not supported by peer"); + return -EOPNOTSUPP; + } + + size = sizeof(*p); + if (connection->agreed_pro_version >= 87) + size += strlen(nc->integrity_alg) + 1; + + p->protocol = cpu_to_be32(nc->wire_protocol); + p->after_sb_0p = cpu_to_be32(nc->after_sb_0p); + p->after_sb_1p = cpu_to_be32(nc->after_sb_1p); + p->after_sb_2p = cpu_to_be32(nc->after_sb_2p); + p->two_primaries = cpu_to_be32(nc->two_primaries); + cf = 0; + if (nc->discard_my_data) + cf |= CF_DISCARD_MY_DATA; + if (nc->tentative) + cf |= CF_DRY_RUN; + p->conn_flags = cpu_to_be32(cf); + + if (connection->agreed_pro_version >= 87) + strcpy(p->integrity_alg, nc->integrity_alg); + rcu_read_unlock(); + + return __conn_send_command(connection, sock, cmd, size, NULL, 0); +} + +int drbd_send_protocol(struct drbd_connection *connection) +{ + int err; + + mutex_lock(&connection->data.mutex); + err = __drbd_send_protocol(connection, P_PROTOCOL); + mutex_unlock(&connection->data.mutex); + + return err; +} + +static int _drbd_send_uuids(struct drbd_peer_device *peer_device, u64 uuid_flags) +{ + struct drbd_device *device = peer_device->device; + struct drbd_socket *sock; + struct p_uuids *p; + int i; + + if (!get_ldev_if_state(device, D_NEGOTIATING)) + return 0; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) { + put_ldev(device); + return -EIO; + } + spin_lock_irq(&device->ldev->md.uuid_lock); + for (i = UI_CURRENT; i < UI_SIZE; i++) + p->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]); + spin_unlock_irq(&device->ldev->md.uuid_lock); + + device->comm_bm_set = drbd_bm_total_weight(device); + p->uuid[UI_SIZE] = cpu_to_be64(device->comm_bm_set); + rcu_read_lock(); + uuid_flags |= rcu_dereference(peer_device->connection->net_conf)->discard_my_data ? 1 : 0; + rcu_read_unlock(); + uuid_flags |= test_bit(CRASHED_PRIMARY, &device->flags) ? 2 : 0; + uuid_flags |= device->new_state_tmp.disk == D_INCONSISTENT ? 
4 : 0; + p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); + + put_ldev(device); + return drbd_send_command(peer_device, sock, P_UUIDS, sizeof(*p), NULL, 0); +} + +int drbd_send_uuids(struct drbd_peer_device *peer_device) +{ + return _drbd_send_uuids(peer_device, 0); +} + +int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *peer_device) +{ + return _drbd_send_uuids(peer_device, 8); +} + +void drbd_print_uuids(struct drbd_device *device, const char *text) +{ + if (get_ldev_if_state(device, D_NEGOTIATING)) { + u64 *uuid = device->ldev->md.uuid; + drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX\n", + text, + (unsigned long long)uuid[UI_CURRENT], + (unsigned long long)uuid[UI_BITMAP], + (unsigned long long)uuid[UI_HISTORY_START], + (unsigned long long)uuid[UI_HISTORY_END]); + put_ldev(device); + } else { + drbd_info(device, "%s effective data uuid: %016llX\n", + text, + (unsigned long long)device->ed_uuid); + } +} + +void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + struct drbd_socket *sock; + struct p_rs_uuid *p; + u64 uuid; + + D_ASSERT(device, device->state.disk == D_UP_TO_DATE); + + uuid = device->ldev->md.uuid[UI_BITMAP]; + if (uuid && uuid != UUID_JUST_CREATED) + uuid = uuid + UUID_NEW_BM_OFFSET; + else + get_random_bytes(&uuid, sizeof(u64)); + drbd_uuid_set(device, UI_BITMAP, uuid); + drbd_print_uuids(device, "updated sync UUID"); + drbd_md_sync(device); + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (p) { + p->uuid = cpu_to_be64(uuid); + drbd_send_command(peer_device, sock, P_SYNC_UUID, sizeof(*p), NULL, 0); + } +} + +int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags) +{ + struct drbd_device *device = peer_device->device; + struct drbd_socket *sock; + struct p_sizes *p; + sector_t d_size, u_size; + int q_order_type; + unsigned int max_bio_size; + + if (get_ldev_if_state(device, D_NEGOTIATING)) { + D_ASSERT(device, device->ldev->backing_bdev); + d_size = drbd_get_max_capacity(device->ldev); + rcu_read_lock(); + u_size = rcu_dereference(device->ldev->disk_conf)->disk_size; + rcu_read_unlock(); + q_order_type = drbd_queue_order_type(device); + max_bio_size = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9; + max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); + put_ldev(device); + } else { + d_size = 0; + u_size = 0; + q_order_type = QUEUE_ORDERED_NONE; + max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ + } + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + + if (peer_device->connection->agreed_pro_version <= 94) + max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + else if (peer_device->connection->agreed_pro_version < 100) + max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95); + + p->d_size = cpu_to_be64(d_size); + p->u_size = cpu_to_be64(u_size); + p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(device->this_bdev)); + p->max_bio_size = cpu_to_be32(max_bio_size); + p->queue_order_type = cpu_to_be16(q_order_type); + p->dds_flags = cpu_to_be16(flags); + return drbd_send_command(peer_device, sock, P_SIZES, sizeof(*p), NULL, 0); +} + +/** + * drbd_send_current_state() - Sends the drbd state to the peer + * @peer_device: DRBD peer device. 
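
The UI_FLAGS slot filled in _drbd_send_uuids() above is a plain bitmask: 1 for discard-my-data, 2 for a crashed primary, 4 for an inconsistent disk, and 8 when drbd_send_uuids_skip_initial_sync() asks to skip the initial sync, all taken directly from the code above. A minimal user-space sketch of composing and testing such a flags word (macro names are illustrative, not part of the driver):

#include <stdint.h>
#include <stdio.h>

/* illustrative names for the literals used by the sender above */
#define UUID_FLAG_DISCARD_MY_DATA	1u
#define UUID_FLAG_CRASHED_PRIMARY	2u
#define UUID_FLAG_INCONSISTENT		4u
#define UUID_FLAG_SKIP_INITIAL_SYNC	8u

int main(void)
{
	uint64_t uuid_flags = 0;

	/* the sender ORs in one bit per condition */
	uuid_flags |= UUID_FLAG_CRASHED_PRIMARY;
	uuid_flags |= UUID_FLAG_INCONSISTENT;

	/* the receiver tests individual bits */
	if (uuid_flags & UUID_FLAG_CRASHED_PRIMARY)
		printf("peer was a crashed primary\n");
	if (!(uuid_flags & UUID_FLAG_SKIP_INITIAL_SYNC))
		printf("initial sync is not being skipped\n");
	return 0;
}
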
+ */ +int drbd_send_current_state(struct drbd_peer_device *peer_device) +{ + struct drbd_socket *sock; + struct p_state *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->state = cpu_to_be32(peer_device->device->state.i); /* Within the send mutex */ + return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0); +} + +/** + * drbd_send_state() - After a state change, sends the new state to the peer + * @peer_device: DRBD peer device. + * @state: the state to send, not necessarily the current state. + * + * Each state change queues an "after_state_ch" work, which will eventually + * send the resulting new state to the peer. If more state changes happen + * between queuing and processing of the after_state_ch work, we still + * want to send each intermediary state in the order it occurred. + */ +int drbd_send_state(struct drbd_peer_device *peer_device, union drbd_state state) +{ + struct drbd_socket *sock; + struct p_state *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->state = cpu_to_be32(state.i); /* Within the send mutex */ + return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0); +} + +int drbd_send_state_req(struct drbd_peer_device *peer_device, union drbd_state mask, union drbd_state val) +{ + struct drbd_socket *sock; + struct p_req_state *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->mask = cpu_to_be32(mask.i); + p->val = cpu_to_be32(val.i); + return drbd_send_command(peer_device, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0); +} + +int conn_send_state_req(struct drbd_connection *connection, union drbd_state mask, union drbd_state val) +{ + enum drbd_packet cmd; + struct drbd_socket *sock; + struct p_req_state *p; + + cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ; + sock = &connection->data; + p = conn_prepare_command(connection, sock); + if (!p) + return -EIO; + p->mask = cpu_to_be32(mask.i); + p->val = cpu_to_be32(val.i); + return conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0); +} + +void drbd_send_sr_reply(struct drbd_peer_device *peer_device, enum drbd_state_rv retcode) +{ + struct drbd_socket *sock; + struct p_req_state_reply *p; + + sock = &peer_device->connection->meta; + p = drbd_prepare_command(peer_device, sock); + if (p) { + p->retcode = cpu_to_be32(retcode); + drbd_send_command(peer_device, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0); + } +} + +void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode) +{ + struct drbd_socket *sock; + struct p_req_state_reply *p; + enum drbd_packet cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY; + + sock = &connection->meta; + p = conn_prepare_command(connection, sock); + if (p) { + p->retcode = cpu_to_be32(retcode); + conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0); + } +} + +static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) +{ + BUG_ON(code & ~0xf); + p->encoding = (p->encoding & ~0xf) | code; +} + +static void dcbp_set_start(struct p_compressed_bm *p, int set) +{ + p->encoding = (p->encoding & ~0x80) | (set ? 
0x80 : 0); +} + +static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n) +{ + BUG_ON(n & ~0x7); + p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4); +} + +static int fill_bitmap_rle_bits(struct drbd_device *device, + struct p_compressed_bm *p, + unsigned int size, + struct bm_xfer_ctx *c) +{ + struct bitstream bs; + unsigned long plain_bits; + unsigned long tmp; + unsigned long rl; + unsigned len; + unsigned toggle; + int bits, use_rle; + + /* may we use this feature? */ + rcu_read_lock(); + use_rle = rcu_dereference(first_peer_device(device)->connection->net_conf)->use_rle; + rcu_read_unlock(); + if (!use_rle || first_peer_device(device)->connection->agreed_pro_version < 90) + return 0; + + if (c->bit_offset >= c->bm_bits) + return 0; /* nothing to do. */ + + /* use at most thus many bytes */ + bitstream_init(&bs, p->code, size, 0); + memset(p->code, 0, size); + /* plain bits covered in this code string */ + plain_bits = 0; + + /* p->encoding & 0x80 stores whether the first run length is set. + * bit offset is implicit. + * start with toggle == 2 to be able to tell the first iteration */ + toggle = 2; + + /* see how much plain bits we can stuff into one packet + * using RLE and VLI. */ + do { + tmp = (toggle == 0) ? _drbd_bm_find_next_zero(device, c->bit_offset) + : _drbd_bm_find_next(device, c->bit_offset); + if (tmp == -1UL) + tmp = c->bm_bits; + rl = tmp - c->bit_offset; + + if (toggle == 2) { /* first iteration */ + if (rl == 0) { + /* the first checked bit was set, + * store start value, */ + dcbp_set_start(p, 1); + /* but skip encoding of zero run length */ + toggle = !toggle; + continue; + } + dcbp_set_start(p, 0); + } + + /* paranoia: catch zero runlength. + * can only happen if bitmap is modified while we scan it. */ + if (rl == 0) { + drbd_err(device, "unexpected zero runlength while encoding bitmap " + "t:%u bo:%lu\n", toggle, c->bit_offset); + return -1; + } + + bits = vli_encode_bits(&bs, rl); + if (bits == -ENOBUFS) /* buffer full */ + break; + if (bits <= 0) { + drbd_err(device, "error while encoding bitmap: %d\n", bits); + return 0; + } + + toggle = !toggle; + plain_bits += rl; + c->bit_offset = tmp; + } while (c->bit_offset < c->bm_bits); + + len = bs.cur.b - p->code + !!bs.cur.bit; + + if (plain_bits < (len << 3)) { + /* incompressible with this method. + * we need to rewind both word and bit position. */ + c->bit_offset -= plain_bits; + bm_xfer_ctx_bit_to_word_offset(c); + c->bit_offset = c->word_offset * BITS_PER_LONG; + return 0; + } + + /* RLE + VLI was able to compress it just fine. + * update c->word_offset. */ + bm_xfer_ctx_bit_to_word_offset(c); + + /* store pad_bits */ + dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); + + return len; +} + +/** + * send_bitmap_rle_or_plain + * + * Return 0 when done, 1 when another iteration is needed, and a negative error + * code upon failure. 
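
fill_bitmap_rle_bits() above compresses the dirty bitmap by emitting alternating run lengths of clear and set bits; the polarity of the first run is recorded in a header bit (dcbp_set_start) and the run lengths are then VLI-encoded. A stand-alone sketch of just the run-length part, on a 64-bit toy bitmap with the VLI coding omitted (names hypothetical):

#include <stdint.h>
#include <stdio.h>

/* Emit alternating run lengths for a 64-bit toy bitmap.  As in the
 * code above, the first run counts clear bits; a zero-length first
 * run means "the bitmap starts with set bits" (the real code records
 * that in a header bit instead of encoding the zero). */
static void rle_runs(uint64_t bm)
{
	unsigned int bit = 0;
	unsigned int want = 0;		/* 0: counting clear bits, 1: set bits */

	while (bit < 64) {
		unsigned int run = 0;

		while (bit < 64 && ((bm >> bit) & 1u) == want) {
			run++;
			bit++;
		}
		printf("%u ", run);	/* the driver VLI-encodes this value */
		want ^= 1u;
	}
	printf("\n");
}

int main(void)
{
	rle_runs(0x00000000000000F0ULL);	/* prints: 4 4 56 */
	return 0;
}
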
+ */ +static int +send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c) +{ + struct drbd_socket *sock = &first_peer_device(device)->connection->data; + unsigned int header_size = drbd_header_size(first_peer_device(device)->connection); + struct p_compressed_bm *p = sock->sbuf + header_size; + int len, err; + + len = fill_bitmap_rle_bits(device, p, + DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c); + if (len < 0) + return -EIO; + + if (len) { + dcbp_set_code(p, RLE_VLI_Bits); + err = __send_command(first_peer_device(device)->connection, device->vnr, sock, + P_COMPRESSED_BITMAP, sizeof(*p) + len, + NULL, 0); + c->packets[0]++; + c->bytes[0] += header_size + sizeof(*p) + len; + + if (c->bit_offset >= c->bm_bits) + len = 0; /* DONE */ + } else { + /* was not compressible. + * send a buffer full of plain text bits instead. */ + unsigned int data_size; + unsigned long num_words; + unsigned long *p = sock->sbuf + header_size; + + data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; + num_words = min_t(size_t, data_size / sizeof(*p), + c->bm_words - c->word_offset); + len = num_words * sizeof(*p); + if (len) + drbd_bm_get_lel(device, c->word_offset, num_words, p); + err = __send_command(first_peer_device(device)->connection, device->vnr, sock, P_BITMAP, len, NULL, 0); + c->word_offset += num_words; + c->bit_offset = c->word_offset * BITS_PER_LONG; + + c->packets[1]++; + c->bytes[1] += header_size + len; + + if (c->bit_offset > c->bm_bits) + c->bit_offset = c->bm_bits; + } + if (!err) { + if (len == 0) { + INFO_bm_xfer_stats(device, "send", c); + return 0; + } else + return 1; + } + return -EIO; +} + +/* See the comment at receive_bitmap() */ +static int _drbd_send_bitmap(struct drbd_device *device) +{ + struct bm_xfer_ctx c; + int err; + + if (!expect(device->bitmap)) + return false; + + if (get_ldev(device)) { + if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC)) { + drbd_info(device, "Writing the whole bitmap, MDF_FullSync was set.\n"); + drbd_bm_set_all(device); + if (drbd_bm_write(device)) { + /* write_bm did fail! Leave full sync flag set in Meta P_DATA + * but otherwise process as per normal - need to tell other + * side that a full resync is required! */ + drbd_err(device, "Failed to write bitmap to disk!\n"); + } else { + drbd_md_clear_flag(device, MDF_FULL_SYNC); + drbd_md_sync(device); + } + } + put_ldev(device); + } + + c = (struct bm_xfer_ctx) { + .bm_bits = drbd_bm_bits(device), + .bm_words = drbd_bm_words(device), + }; + + do { + err = send_bitmap_rle_or_plain(device, &c); + } while (err > 0); + + return err == 0; +} + +int drbd_send_bitmap(struct drbd_device *device) +{ + struct drbd_socket *sock = &first_peer_device(device)->connection->data; + int err = -1; + + mutex_lock(&sock->mutex); + if (sock->socket) + err = !_drbd_send_bitmap(device); + mutex_unlock(&sock->mutex); + return err; +} + +void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr, u32 set_size) +{ + struct drbd_socket *sock; + struct p_barrier_ack *p; + + if (connection->cstate < C_WF_REPORT_PARAMS) + return; + + sock = &connection->meta; + p = conn_prepare_command(connection, sock); + if (!p) + return; + p->barrier = barrier_nr; + p->set_size = cpu_to_be32(set_size); + conn_send_command(connection, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0); +} + +/** + * _drbd_send_ack() - Sends an ack packet + * @device: DRBD device. + * @cmd: Packet command code. 
+ * @sector: sector, needs to be in big endian byte order + * @blksize: size in byte, needs to be in big endian byte order + * @block_id: Id, big endian byte order + */ +static int _drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + u64 sector, u32 blksize, u64 block_id) +{ + struct drbd_socket *sock; + struct p_block_ack *p; + + if (peer_device->device->state.conn < C_CONNECTED) + return -EIO; + + sock = &peer_device->connection->meta; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->sector = sector; + p->block_id = block_id; + p->blksize = blksize; + p->seq_num = cpu_to_be32(atomic_inc_return(&peer_device->device->packet_seq)); + return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0); +} + +/* dp->sector and dp->block_id already/still in network byte order, + * data_size is payload size according to dp->head, + * and may need to be corrected for digest size. */ +void drbd_send_ack_dp(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + struct p_data *dp, int data_size) +{ + if (peer_device->connection->peer_integrity_tfm) + data_size -= crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); + _drbd_send_ack(peer_device, cmd, dp->sector, cpu_to_be32(data_size), + dp->block_id); +} + +void drbd_send_ack_rp(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + struct p_block_req *rp) +{ + _drbd_send_ack(peer_device, cmd, rp->sector, rp->blksize, rp->block_id); +} + +/** + * drbd_send_ack() - Sends an ack packet + * @device: DRBD device + * @cmd: packet command code + * @peer_req: peer request + */ +int drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + struct drbd_peer_request *peer_req) +{ + return _drbd_send_ack(peer_device, cmd, + cpu_to_be64(peer_req->i.sector), + cpu_to_be32(peer_req->i.size), + peer_req->block_id); +} + +/* This function misuses the block_id field to signal if the blocks + * are is sync or not. */ +int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + sector_t sector, int blksize, u64 block_id) +{ + return _drbd_send_ack(peer_device, cmd, + cpu_to_be64(sector), + cpu_to_be32(blksize), + cpu_to_be64(block_id)); +} + +int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd, + sector_t sector, int size, u64 block_id) +{ + struct drbd_socket *sock; + struct p_block_req *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->sector = cpu_to_be64(sector); + p->block_id = block_id; + p->blksize = cpu_to_be32(size); + return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0); +} + +int drbd_send_drequest_csum(struct drbd_peer_device *peer_device, sector_t sector, int size, + void *digest, int digest_size, enum drbd_packet cmd) +{ + struct drbd_socket *sock; + struct p_block_req *p; + + /* FIXME: Put the digest into the preallocated socket buffer. 
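
The packet structs above are filled with cpu_to_be32()/cpu_to_be64() because every field on the wire is big endian; _drbd_send_ack() even expects its arguments pre-converted, as its kernel-doc states. A user-space sketch of the same convention using the glibc <endian.h> helpers (the struct below is illustrative, not the driver's p_block_ack):

#include <endian.h>	/* htobe32()/htobe64(), glibc/musl */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_block_ack {		/* illustrative wire layout */
	uint64_t sector;	/* big endian on the wire */
	uint64_t block_id;	/* opaque cookie, echoed back verbatim */
	uint32_t blksize;	/* big endian on the wire */
	uint32_t seq_num;	/* big endian on the wire */
} __attribute__((packed));

int main(void)
{
	struct toy_block_ack p;

	memset(&p, 0, sizeof(p));
	p.sector  = htobe64(123456ULL);	/* convert once, at the wire boundary */
	p.blksize = htobe32(4096);
	p.seq_num = htobe32(1);
	/* block_id is sent back unchanged, so no conversion is needed */

	printf("first byte of sector on the wire: 0x%02x\n",
	       ((unsigned char *)&p.sector)[0]);
	return 0;
}
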
*/ + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->sector = cpu_to_be64(sector); + p->block_id = ID_SYNCER /* unused */; + p->blksize = cpu_to_be32(size); + return drbd_send_command(peer_device, sock, cmd, sizeof(*p), digest, digest_size); +} + +int drbd_send_ov_request(struct drbd_peer_device *peer_device, sector_t sector, int size) +{ + struct drbd_socket *sock; + struct p_block_req *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->sector = cpu_to_be64(sector); + p->block_id = ID_SYNCER /* unused */; + p->blksize = cpu_to_be32(size); + return drbd_send_command(peer_device, sock, P_OV_REQUEST, sizeof(*p), NULL, 0); +} + +/* called on sndtimeo + * returns false if we should retry, + * true if we think connection is dead + */ +static int we_should_drop_the_connection(struct drbd_connection *connection, struct socket *sock) +{ + int drop_it; + /* long elapsed = (long)(jiffies - device->last_received); */ + + drop_it = connection->meta.socket == sock + || !connection->asender.task + || get_t_state(&connection->asender) != RUNNING + || connection->cstate < C_WF_REPORT_PARAMS; + + if (drop_it) + return true; + + drop_it = !--connection->ko_count; + if (!drop_it) { + drbd_err(connection, "[%s/%d] sock_sendmsg time expired, ko = %u\n", + current->comm, current->pid, connection->ko_count); + request_ping(connection); + } + + return drop_it; /* && (device->state == R_PRIMARY) */; +} + +static void drbd_update_congested(struct drbd_connection *connection) +{ + struct sock *sk = connection->data.socket->sk; + if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) + set_bit(NET_CONGESTED, &connection->flags); +} + +/* The idea of sendpage seems to be to put some kind of reference + * to the page into the skb, and to hand it over to the NIC. In + * this process get_page() gets called. + * + * As soon as the page was really sent over the network put_page() + * gets called by some part of the network layer. [ NIC driver? ] + * + * [ get_page() / put_page() increment/decrement the count. If count + * reaches 0 the page will be freed. ] + * + * This works nicely with pages from FSs. + * But this means that in protocol A we might signal IO completion too early! + * + * In order not to corrupt data during a resync we must make sure + * that we do not reuse our own buffer pages (EEs) to early, therefore + * we have the net_ee list. + * + * XFS seems to have problems, still, it submits pages with page_count == 0! + * As a workaround, we disable sendpage on pages + * with page_count == 0 or PageSlab. + */ +static int _drbd_no_send_page(struct drbd_peer_device *peer_device, struct page *page, + int offset, size_t size, unsigned msg_flags) +{ + struct socket *socket; + void *addr; + int err; + + socket = peer_device->connection->data.socket; + addr = kmap(page) + offset; + err = drbd_send_all(peer_device->connection, socket, addr, size, msg_flags); + kunmap(page); + if (!err) + peer_device->device->send_cnt += size >> 9; + return err; +} + +static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *page, + int offset, size_t size, unsigned msg_flags) +{ + struct socket *socket = peer_device->connection->data.socket; + mm_segment_t oldfs = get_fs(); + int len = size; + int err = -EIO; + + /* e.g. XFS meta- & log-data is in slab pages, which have a + * page_count of 0 and/or have PageSlab() set. 
+ * we cannot use send_page for those, as that does get_page(); + * put_page(); and would cause either a VM_BUG directly, or + * __page_cache_release a page that would actually still be referenced + * by someone, leading to some obscure delayed Oops somewhere else. */ + if (disable_sendpage || (page_count(page) < 1) || PageSlab(page)) + return _drbd_no_send_page(peer_device, page, offset, size, msg_flags); + + msg_flags |= MSG_NOSIGNAL; + drbd_update_congested(peer_device->connection); + set_fs(KERNEL_DS); + do { + int sent; + + sent = socket->ops->sendpage(socket, page, offset, len, msg_flags); + if (sent <= 0) { + if (sent == -EAGAIN) { + if (we_should_drop_the_connection(peer_device->connection, socket)) + break; + continue; + } + drbd_warn(peer_device->device, "%s: size=%d len=%d sent=%d\n", + __func__, (int)size, len, sent); + if (sent < 0) + err = sent; + break; + } + len -= sent; + offset += sent; + } while (len > 0 /* THINK && device->cstate >= C_CONNECTED*/); + set_fs(oldfs); + clear_bit(NET_CONGESTED, &peer_device->connection->flags); + + if (len == 0) { + err = 0; + peer_device->device->send_cnt += size >> 9; + } + return err; +} + +static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio) +{ + struct bio_vec bvec; + struct bvec_iter iter; + + /* hint all but last page with MSG_MORE */ + bio_for_each_segment(bvec, bio, iter) { + int err; + + err = _drbd_no_send_page(peer_device, bvec.bv_page, + bvec.bv_offset, bvec.bv_len, + bio_iter_last(bvec, iter) + ? 0 : MSG_MORE); + if (err) + return err; + } + return 0; +} + +static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *bio) +{ + struct bio_vec bvec; + struct bvec_iter iter; + + /* hint all but last page with MSG_MORE */ + bio_for_each_segment(bvec, bio, iter) { + int err; + + err = _drbd_send_page(peer_device, bvec.bv_page, + bvec.bv_offset, bvec.bv_len, + bio_iter_last(bvec, iter) ? 0 : MSG_MORE); + if (err) + return err; + } + return 0; +} + +static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device, + struct drbd_peer_request *peer_req) +{ + struct page *page = peer_req->pages; + unsigned len = peer_req->i.size; + int err; + + /* hint all but last page with MSG_MORE */ + page_chain_for_each(page) { + unsigned l = min_t(unsigned, len, PAGE_SIZE); + + err = _drbd_send_page(peer_device, page, 0, l, + page_chain_next(page) ? MSG_MORE : 0); + if (err) + return err; + len -= l; + } + return 0; +} + +static u32 bio_flags_to_wire(struct drbd_connection *connection, unsigned long bi_rw) +{ + if (connection->agreed_pro_version >= 95) + return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | + (bi_rw & REQ_FUA ? DP_FUA : 0) | + (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) | + (bi_rw & REQ_DISCARD ? DP_DISCARD : 0); + else + return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0; +} + +/* Used to send write or TRIM aka REQ_DISCARD requests + * R_PRIMARY -> Peer (P_DATA, P_TRIM) + */ +int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_device *device = peer_device->device; + struct drbd_socket *sock; + struct p_data *p; + unsigned int dp_flags = 0; + int digest_size; + int err; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + digest_size = peer_device->connection->integrity_tfm ? 
+ crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0; + + if (!p) + return -EIO; + p->sector = cpu_to_be64(req->i.sector); + p->block_id = (unsigned long)req; + p->seq_num = cpu_to_be32(atomic_inc_return(&device->packet_seq)); + dp_flags = bio_flags_to_wire(peer_device->connection, req->master_bio->bi_rw); + if (device->state.conn >= C_SYNC_SOURCE && + device->state.conn <= C_PAUSED_SYNC_T) + dp_flags |= DP_MAY_SET_IN_SYNC; + if (peer_device->connection->agreed_pro_version >= 100) { + if (req->rq_state & RQ_EXP_RECEIVE_ACK) + dp_flags |= DP_SEND_RECEIVE_ACK; + /* During resync, request an explicit write ack, + * even in protocol != C */ + if (req->rq_state & RQ_EXP_WRITE_ACK + || (dp_flags & DP_MAY_SET_IN_SYNC)) + dp_flags |= DP_SEND_WRITE_ACK; + } + p->dp_flags = cpu_to_be32(dp_flags); + + if (dp_flags & DP_DISCARD) { + struct p_trim *t = (struct p_trim*)p; + t->size = cpu_to_be32(req->i.size); + err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0); + goto out; + } + + /* our digest is still only over the payload. + * TRIM does not carry any payload. */ + if (digest_size) + drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1); + err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + digest_size, NULL, req->i.size); + if (!err) { + /* For protocol A, we have to memcpy the payload into + * socket buffers, as we may complete right away + * as soon as we handed it over to tcp, at which point the data + * pages may become invalid. + * + * For data-integrity enabled, we copy it as well, so we can be + * sure that even if the bio pages may still be modified, it + * won't change the data on the wire, thus if the digest checks + * out ok after sending on this side, but does not fit on the + * receiving side, we sure have detected corruption elsewhere. + */ + if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || digest_size) + err = _drbd_send_bio(peer_device, req->master_bio); + else + err = _drbd_send_zc_bio(peer_device, req->master_bio); + + /* double check digest, sometimes buffers have been modified in flight. */ + if (digest_size > 0 && digest_size <= 64) { + /* 64 byte, 512 bit, is the largest digest size + * currently supported in kernel crypto. */ + unsigned char digest[64]; + drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest); + if (memcmp(p + 1, digest, digest_size)) { + drbd_warn(device, + "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n", + (unsigned long long)req->i.sector, req->i.size); + } + } /* else if (digest_size > 64) { + ... Be noisy about digest too large ... + } */ + } +out: + mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ + + return err; +} + +/* answer packet, used to send data back for read requests: + * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY) + * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY) + */ +int drbd_send_block(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + struct drbd_peer_request *peer_req) +{ + struct drbd_device *device = peer_device->device; + struct drbd_socket *sock; + struct p_data *p; + int err; + int digest_size; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + + digest_size = peer_device->connection->integrity_tfm ? 
+ crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0; + + if (!p) + return -EIO; + p->sector = cpu_to_be64(peer_req->i.sector); + p->block_id = peer_req->block_id; + p->seq_num = 0; /* unused */ + p->dp_flags = 0; + if (digest_size) + drbd_csum_ee(peer_device->connection->integrity_tfm, peer_req, p + 1); + err = __send_command(peer_device->connection, device->vnr, sock, cmd, sizeof(*p) + digest_size, NULL, peer_req->i.size); + if (!err) + err = _drbd_send_zc_ee(peer_device, peer_req); + mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ + + return err; +} + +int drbd_send_out_of_sync(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_socket *sock; + struct p_block_desc *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->sector = cpu_to_be64(req->i.sector); + p->blksize = cpu_to_be32(req->i.size); + return drbd_send_command(peer_device, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0); +} + +/* + drbd_send distinguishes two cases: + + Packets sent via the data socket "sock" + and packets sent via the meta data socket "msock" + + sock msock + -----------------+-------------------------+------------------------------ + timeout conf.timeout / 2 conf.timeout / 2 + timeout action send a ping via msock Abort communication + and close all sockets +*/ + +/* + * you must have down()ed the appropriate [m]sock_mutex elsewhere! + */ +int drbd_send(struct drbd_connection *connection, struct socket *sock, + void *buf, size_t size, unsigned msg_flags) +{ + struct kvec iov; + struct msghdr msg; + int rv, sent = 0; + + if (!sock) + return -EBADR; + + /* THINK if (signal_pending) return ... ? */ + + iov.iov_base = buf; + iov.iov_len = size; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = msg_flags | MSG_NOSIGNAL; + + if (sock == connection->data.socket) { + rcu_read_lock(); + connection->ko_count = rcu_dereference(connection->net_conf)->ko_count; + rcu_read_unlock(); + drbd_update_congested(connection); + } + do { + /* STRANGE + * tcp_sendmsg does _not_ use its size parameter at all ? + * + * -EAGAIN on timeout, -EINTR on signal. + */ +/* THINK + * do we need to block DRBD_SIG if sock == &meta.socket ?? + * otherwise wake_asender() might interrupt some send_*Ack ! + */ + rv = kernel_sendmsg(sock, &msg, &iov, 1, size); + if (rv == -EAGAIN) { + if (we_should_drop_the_connection(connection, sock)) + break; + else + continue; + } + if (rv == -EINTR) { + flush_signals(current); + rv = 0; + } + if (rv < 0) + break; + sent += rv; + iov.iov_base += rv; + iov.iov_len -= rv; + } while (sent < size); + + if (sock == connection->data.socket) + clear_bit(NET_CONGESTED, &connection->flags); + + if (rv <= 0) { + if (rv != -EAGAIN) { + drbd_err(connection, "%s_sendmsg returned %d\n", + sock == connection->meta.socket ? "msock" : "sock", + rv); + conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD); + } else + conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD); + } + + return sent; +} + +/** + * drbd_send_all - Send an entire buffer + * + * Returns 0 upon success and a negative error value otherwise. 
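
drbd_send() below loops until the whole buffer is out, restarting after -EINTR and deciding on -EAGAIN whether the connection should be dropped, and drbd_send_all() then turns any short result into an error. The same "send it all or fail" pattern over a plain user-space socket, as a hedged sketch without the ko-count/ping logic:

#include <errno.h>
#include <stddef.h>
#include <sys/types.h>
#include <sys/socket.h>

/* Keep calling send(2) until every byte has been written.
 * Returns 0 on success, -1 with errno set on a hard error. */
static int send_all(int fd, const void *buf, size_t len)
{
	const char *p = buf;

	while (len > 0) {
		ssize_t n = send(fd, p, len, MSG_NOSIGNAL);

		if (n < 0) {
			if (errno == EINTR)
				continue;	/* interrupted: just retry */
			return -1;		/* EAGAIN etc.: let the caller decide */
		}
		p += n;
		len -= n;
	}
	return 0;
}
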
+ */ +int drbd_send_all(struct drbd_connection *connection, struct socket *sock, void *buffer, + size_t size, unsigned msg_flags) +{ + int err; + + err = drbd_send(connection, sock, buffer, size, msg_flags); + if (err < 0) + return err; + if (err != size) + return -EIO; + return 0; +} + +static int drbd_open(struct block_device *bdev, fmode_t mode) +{ + struct drbd_device *device = bdev->bd_disk->private_data; + unsigned long flags; + int rv = 0; + + mutex_lock(&drbd_main_mutex); + spin_lock_irqsave(&device->resource->req_lock, flags); + /* to have a stable device->state.role + * and no race with updating open_cnt */ + + if (device->state.role != R_PRIMARY) { + if (mode & FMODE_WRITE) + rv = -EROFS; + else if (!allow_oos) + rv = -EMEDIUMTYPE; + } + + if (!rv) + device->open_cnt++; + spin_unlock_irqrestore(&device->resource->req_lock, flags); + mutex_unlock(&drbd_main_mutex); + + return rv; +} + +static void drbd_release(struct gendisk *gd, fmode_t mode) +{ + struct drbd_device *device = gd->private_data; + mutex_lock(&drbd_main_mutex); + device->open_cnt--; + mutex_unlock(&drbd_main_mutex); +} + +static void drbd_set_defaults(struct drbd_device *device) +{ + /* Beware! The actual layout differs + * between big endian and little endian */ + device->state = (union drbd_dev_state) { + { .role = R_SECONDARY, + .peer = R_UNKNOWN, + .conn = C_STANDALONE, + .disk = D_DISKLESS, + .pdsk = D_UNKNOWN, + } }; +} + +void drbd_init_set_defaults(struct drbd_device *device) +{ + /* the memset(,0,) did most of this. + * note: only assignments, no allocation in here */ + + drbd_set_defaults(device); + + atomic_set(&device->ap_bio_cnt, 0); + atomic_set(&device->ap_actlog_cnt, 0); + atomic_set(&device->ap_pending_cnt, 0); + atomic_set(&device->rs_pending_cnt, 0); + atomic_set(&device->unacked_cnt, 0); + atomic_set(&device->local_cnt, 0); + atomic_set(&device->pp_in_use_by_net, 0); + atomic_set(&device->rs_sect_in, 0); + atomic_set(&device->rs_sect_ev, 0); + atomic_set(&device->ap_in_flight, 0); + atomic_set(&device->md_io.in_use, 0); + + mutex_init(&device->own_state_mutex); + device->state_mutex = &device->own_state_mutex; + + spin_lock_init(&device->al_lock); + spin_lock_init(&device->peer_seq_lock); + + INIT_LIST_HEAD(&device->active_ee); + INIT_LIST_HEAD(&device->sync_ee); + INIT_LIST_HEAD(&device->done_ee); + INIT_LIST_HEAD(&device->read_ee); + INIT_LIST_HEAD(&device->net_ee); + INIT_LIST_HEAD(&device->resync_reads); + INIT_LIST_HEAD(&device->resync_work.list); + INIT_LIST_HEAD(&device->unplug_work.list); + INIT_LIST_HEAD(&device->bm_io_work.w.list); + INIT_LIST_HEAD(&device->pending_master_completion[0]); + INIT_LIST_HEAD(&device->pending_master_completion[1]); + INIT_LIST_HEAD(&device->pending_completion[0]); + INIT_LIST_HEAD(&device->pending_completion[1]); + + device->resync_work.cb = w_resync_timer; + device->unplug_work.cb = w_send_write_hint; + device->bm_io_work.w.cb = w_bitmap_io; + + init_timer(&device->resync_timer); + init_timer(&device->md_sync_timer); + init_timer(&device->start_resync_timer); + init_timer(&device->request_timer); + device->resync_timer.function = resync_timer_fn; + device->resync_timer.data = (unsigned long) device; + device->md_sync_timer.function = md_sync_timer_fn; + device->md_sync_timer.data = (unsigned long) device; + device->start_resync_timer.function = start_resync_timer_fn; + device->start_resync_timer.data = (unsigned long) device; + device->request_timer.function = request_timer_fn; + device->request_timer.data = (unsigned long) device; + + 
init_waitqueue_head(&device->misc_wait); + init_waitqueue_head(&device->state_wait); + init_waitqueue_head(&device->ee_wait); + init_waitqueue_head(&device->al_wait); + init_waitqueue_head(&device->seq_wait); + + device->resync_wenr = LC_FREE; + device->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; + device->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; +} + +void drbd_device_cleanup(struct drbd_device *device) +{ + int i; + if (first_peer_device(device)->connection->receiver.t_state != NONE) + drbd_err(device, "ASSERT FAILED: receiver t_state == %d expected 0.\n", + first_peer_device(device)->connection->receiver.t_state); + + device->al_writ_cnt = + device->bm_writ_cnt = + device->read_cnt = + device->recv_cnt = + device->send_cnt = + device->writ_cnt = + device->p_size = + device->rs_start = + device->rs_total = + device->rs_failed = 0; + device->rs_last_events = 0; + device->rs_last_sect_ev = 0; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + device->rs_mark_left[i] = 0; + device->rs_mark_time[i] = 0; + } + D_ASSERT(device, first_peer_device(device)->connection->net_conf == NULL); + + drbd_set_my_capacity(device, 0); + if (device->bitmap) { + /* maybe never allocated. */ + drbd_bm_resize(device, 0, 1); + drbd_bm_cleanup(device); + } + + drbd_free_ldev(device->ldev); + device->ldev = NULL; + + clear_bit(AL_SUSPENDED, &device->flags); + + D_ASSERT(device, list_empty(&device->active_ee)); + D_ASSERT(device, list_empty(&device->sync_ee)); + D_ASSERT(device, list_empty(&device->done_ee)); + D_ASSERT(device, list_empty(&device->read_ee)); + D_ASSERT(device, list_empty(&device->net_ee)); + D_ASSERT(device, list_empty(&device->resync_reads)); + D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q)); + D_ASSERT(device, list_empty(&device->resync_work.list)); + D_ASSERT(device, list_empty(&device->unplug_work.list)); + + drbd_set_defaults(device); +} + + +static void drbd_destroy_mempools(void) +{ + struct page *page; + + while (drbd_pp_pool) { + page = drbd_pp_pool; + drbd_pp_pool = (struct page *)page_private(page); + __free_page(page); + drbd_pp_vacant--; + } + + /* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */ + + if (drbd_md_io_bio_set) + bioset_free(drbd_md_io_bio_set); + if (drbd_md_io_page_pool) + mempool_destroy(drbd_md_io_page_pool); + if (drbd_ee_mempool) + mempool_destroy(drbd_ee_mempool); + if (drbd_request_mempool) + mempool_destroy(drbd_request_mempool); + if (drbd_ee_cache) + kmem_cache_destroy(drbd_ee_cache); + if (drbd_request_cache) + kmem_cache_destroy(drbd_request_cache); + if (drbd_bm_ext_cache) + kmem_cache_destroy(drbd_bm_ext_cache); + if (drbd_al_ext_cache) + kmem_cache_destroy(drbd_al_ext_cache); + + drbd_md_io_bio_set = NULL; + drbd_md_io_page_pool = NULL; + drbd_ee_mempool = NULL; + drbd_request_mempool = NULL; + drbd_ee_cache = NULL; + drbd_request_cache = NULL; + drbd_bm_ext_cache = NULL; + drbd_al_ext_cache = NULL; + + return; +} + +static int drbd_create_mempools(void) +{ + struct page *page; + const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count; + int i; + + /* prepare our caches and mempools */ + drbd_request_mempool = NULL; + drbd_ee_cache = NULL; + drbd_request_cache = NULL; + drbd_bm_ext_cache = NULL; + drbd_al_ext_cache = NULL; + drbd_pp_pool = NULL; + drbd_md_io_page_pool = NULL; + drbd_md_io_bio_set = NULL; + + /* caches */ + drbd_request_cache = kmem_cache_create( + "drbd_req", sizeof(struct drbd_request), 0, 0, NULL); + if (drbd_request_cache == NULL) + goto Enomem; + + drbd_ee_cache = kmem_cache_create( + 
"drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL); + if (drbd_ee_cache == NULL) + goto Enomem; + + drbd_bm_ext_cache = kmem_cache_create( + "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL); + if (drbd_bm_ext_cache == NULL) + goto Enomem; + + drbd_al_ext_cache = kmem_cache_create( + "drbd_al", sizeof(struct lc_element), 0, 0, NULL); + if (drbd_al_ext_cache == NULL) + goto Enomem; + + /* mempools */ + drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_bio_set == NULL) + goto Enomem; + + drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_page_pool == NULL) + goto Enomem; + + drbd_request_mempool = mempool_create_slab_pool(number, + drbd_request_cache); + if (drbd_request_mempool == NULL) + goto Enomem; + + drbd_ee_mempool = mempool_create_slab_pool(number, drbd_ee_cache); + if (drbd_ee_mempool == NULL) + goto Enomem; + + /* drbd's page pool */ + spin_lock_init(&drbd_pp_lock); + + for (i = 0; i < number; i++) { + page = alloc_page(GFP_HIGHUSER); + if (!page) + goto Enomem; + set_page_private(page, (unsigned long)drbd_pp_pool); + drbd_pp_pool = page; + } + drbd_pp_vacant = number; + + return 0; + +Enomem: + drbd_destroy_mempools(); /* in case we allocated some */ + return -ENOMEM; +} + +static void drbd_release_all_peer_reqs(struct drbd_device *device) +{ + int rr; + + rr = drbd_free_peer_reqs(device, &device->active_ee); + if (rr) + drbd_err(device, "%d EEs in active list found!\n", rr); + + rr = drbd_free_peer_reqs(device, &device->sync_ee); + if (rr) + drbd_err(device, "%d EEs in sync list found!\n", rr); + + rr = drbd_free_peer_reqs(device, &device->read_ee); + if (rr) + drbd_err(device, "%d EEs in read list found!\n", rr); + + rr = drbd_free_peer_reqs(device, &device->done_ee); + if (rr) + drbd_err(device, "%d EEs in done list found!\n", rr); + + rr = drbd_free_peer_reqs(device, &device->net_ee); + if (rr) + drbd_err(device, "%d EEs in net list found!\n", rr); +} + +/* caution. no locking. */ +void drbd_destroy_device(struct kref *kref) +{ + struct drbd_device *device = container_of(kref, struct drbd_device, kref); + struct drbd_resource *resource = device->resource; + struct drbd_peer_device *peer_device, *tmp_peer_device; + + del_timer_sync(&device->request_timer); + + /* paranoia asserts */ + D_ASSERT(device, device->open_cnt == 0); + /* end paranoia asserts */ + + /* cleanup stuff that may have been allocated during + * device (re-)configuration or state changes */ + + if (device->this_bdev) + bdput(device->this_bdev); + + drbd_free_ldev(device->ldev); + device->ldev = NULL; + + drbd_release_all_peer_reqs(device); + + lc_destroy(device->act_log); + lc_destroy(device->resync); + + kfree(device->p_uuid); + /* device->p_uuid = NULL; */ + + if (device->bitmap) /* should no longer be there. */ + drbd_bm_cleanup(device); + __free_page(device->md_io.page); + put_disk(device->vdisk); + blk_cleanup_queue(device->rq_queue); + kfree(device->rs_plan_s); + + /* not for_each_connection(connection, resource): + * those may have been cleaned up and disassociated already. + */ + for_each_peer_device_safe(peer_device, tmp_peer_device, device) { + kref_put(&peer_device->connection->kref, drbd_destroy_connection); + kfree(peer_device); + } + memset(device, 0xfd, sizeof(*device)); + kfree(device); + kref_put(&resource->kref, drbd_destroy_resource); +} + +/* One global retry thread, if we need to push back some bio and have it + * reinserted through our make request function. 
+ */ +static struct retry_worker { + struct workqueue_struct *wq; + struct work_struct worker; + + spinlock_t lock; + struct list_head writes; +} retry; + +static void do_retry(struct work_struct *ws) +{ + struct retry_worker *retry = container_of(ws, struct retry_worker, worker); + LIST_HEAD(writes); + struct drbd_request *req, *tmp; + + spin_lock_irq(&retry->lock); + list_splice_init(&retry->writes, &writes); + spin_unlock_irq(&retry->lock); + + list_for_each_entry_safe(req, tmp, &writes, tl_requests) { + struct drbd_device *device = req->device; + struct bio *bio = req->master_bio; + unsigned long start_jif = req->start_jif; + bool expected; + + expected = + expect(atomic_read(&req->completion_ref) == 0) && + expect(req->rq_state & RQ_POSTPONED) && + expect((req->rq_state & RQ_LOCAL_PENDING) == 0 || + (req->rq_state & RQ_LOCAL_ABORTED) != 0); + + if (!expected) + drbd_err(device, "req=%p completion_ref=%d rq_state=%x\n", + req, atomic_read(&req->completion_ref), + req->rq_state); + + /* We still need to put one kref associated with the + * "completion_ref" going zero in the code path that queued it + * here. The request object may still be referenced by a + * frozen local req->private_bio, in case we force-detached. + */ + kref_put(&req->kref, drbd_req_destroy); + + /* A single suspended or otherwise blocking device may stall + * all others as well. Fortunately, this code path is to + * recover from a situation that "should not happen": + * concurrent writes in multi-primary setup. + * In a "normal" lifecycle, this workqueue is supposed to be + * destroyed without ever doing anything. + * If it turns out to be an issue anyways, we can do per + * resource (replication group) or per device (minor) retry + * workqueues instead. + */ + + /* We are not just doing generic_make_request(), + * as we want to keep the start_time information. */ + inc_ap_bio(device); + __drbd_make_request(device, bio, start_jif); + } +} + +/* called via drbd_req_put_completion_ref(), + * holds resource->req_lock */ +void drbd_restart_request(struct drbd_request *req) +{ + unsigned long flags; + spin_lock_irqsave(&retry.lock, flags); + list_move_tail(&req->tl_requests, &retry.writes); + spin_unlock_irqrestore(&retry.lock, flags); + + /* Drop the extra reference that would otherwise + * have been dropped by complete_master_bio. + * do_retry() needs to grab a new one. */ + dec_ap_bio(req->device); + + queue_work(retry.wq, &retry.worker); +} + +void drbd_destroy_resource(struct kref *kref) +{ + struct drbd_resource *resource = + container_of(kref, struct drbd_resource, kref); + + idr_destroy(&resource->devices); + free_cpumask_var(resource->cpu_mask); + kfree(resource->name); + memset(resource, 0xf2, sizeof(*resource)); + kfree(resource); +} + +void drbd_free_resource(struct drbd_resource *resource) +{ + struct drbd_connection *connection, *tmp; + + for_each_connection_safe(connection, tmp, resource) { + list_del(&connection->connections); + drbd_debugfs_connection_cleanup(connection); + kref_put(&connection->kref, drbd_destroy_connection); + } + drbd_debugfs_resource_cleanup(resource); + kref_put(&resource->kref, drbd_destroy_resource); +} + +static void drbd_cleanup(void) +{ + unsigned int i; + struct drbd_device *device; + struct drbd_resource *resource, *tmp; + + /* first remove proc, + * drbdsetup uses it's presence to detect + * whether DRBD is loaded. + * If we would get stuck in proc removal, + * but have netlink already deregistered, + * some drbdsetup commands may wait forever + * for an answer. 
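
do_retry() above drains the shared list by splicing it onto a stack-local list while holding the spinlock and only then walking it, so the lock is never held while requests are re-submitted. A user-space sketch of that splice-then-process pattern with a pthread mutex and a minimal singly-linked list (all names hypothetical):

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct work_item {
	struct work_item *next;
	int id;
};

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static struct work_item *queue_head;	/* shared, protected by queue_lock */

static void process_pending(void)
{
	struct work_item *local;

	/* splice the whole queue onto a local pointer under the lock ... */
	pthread_mutex_lock(&queue_lock);
	local = queue_head;
	queue_head = NULL;
	pthread_mutex_unlock(&queue_lock);

	/* ... and do the slow work without holding it */
	while (local) {
		struct work_item *next = local->next;

		printf("re-submitting request %d\n", local->id);
		local = next;
	}
}
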
+ */ + if (drbd_proc) + remove_proc_entry("drbd", NULL); + + if (retry.wq) + destroy_workqueue(retry.wq); + + drbd_genl_unregister(); + drbd_debugfs_cleanup(); + + idr_for_each_entry(&drbd_devices, device, i) + drbd_delete_device(device); + + /* not _rcu since, no other updater anymore. Genl already unregistered */ + for_each_resource_safe(resource, tmp, &drbd_resources) { + list_del(&resource->resources); + drbd_free_resource(resource); + } + + drbd_destroy_mempools(); + unregister_blkdev(DRBD_MAJOR, "drbd"); + + idr_destroy(&drbd_devices); + + pr_info("module cleanup done.\n"); +} + +/** + * drbd_congested() - Callback for the flusher thread + * @congested_data: User data + * @bdi_bits: Bits the BDI flusher thread is currently interested in + * + * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested. + */ +static int drbd_congested(void *congested_data, int bdi_bits) +{ + struct drbd_device *device = congested_data; + struct request_queue *q; + char reason = '-'; + int r = 0; + + if (!may_inc_ap_bio(device)) { + /* DRBD has frozen IO */ + r = bdi_bits; + reason = 'd'; + goto out; + } + + if (test_bit(CALLBACK_PENDING, &first_peer_device(device)->connection->flags)) { + r |= (1 << BDI_async_congested); + /* Without good local data, we would need to read from remote, + * and that would need the worker thread as well, which is + * currently blocked waiting for that usermode helper to + * finish. + */ + if (!get_ldev_if_state(device, D_UP_TO_DATE)) + r |= (1 << BDI_sync_congested); + else + put_ldev(device); + r &= bdi_bits; + reason = 'c'; + goto out; + } + + if (get_ldev(device)) { + q = bdev_get_queue(device->ldev->backing_bdev); + r = bdi_congested(&q->backing_dev_info, bdi_bits); + put_ldev(device); + if (r) + reason = 'b'; + } + + if (bdi_bits & (1 << BDI_async_congested) && + test_bit(NET_CONGESTED, &first_peer_device(device)->connection->flags)) { + r |= (1 << BDI_async_congested); + reason = reason == 'b' ? 
'a' : 'n'; + } + +out: + device->congestion_reason = reason; + return r; +} + +static void drbd_init_workqueue(struct drbd_work_queue* wq) +{ + spin_lock_init(&wq->q_lock); + INIT_LIST_HEAD(&wq->q); + init_waitqueue_head(&wq->q_wait); +} + +struct completion_work { + struct drbd_work w; + struct completion done; +}; + +static int w_complete(struct drbd_work *w, int cancel) +{ + struct completion_work *completion_work = + container_of(w, struct completion_work, w); + + complete(&completion_work->done); + return 0; +} + +void drbd_flush_workqueue(struct drbd_work_queue *work_queue) +{ + struct completion_work completion_work; + + completion_work.w.cb = w_complete; + init_completion(&completion_work.done); + drbd_queue_work(work_queue, &completion_work.w); + wait_for_completion(&completion_work.done); +} + +struct drbd_resource *drbd_find_resource(const char *name) +{ + struct drbd_resource *resource; + + if (!name || !name[0]) + return NULL; + + rcu_read_lock(); + for_each_resource_rcu(resource, &drbd_resources) { + if (!strcmp(resource->name, name)) { + kref_get(&resource->kref); + goto found; + } + } + resource = NULL; +found: + rcu_read_unlock(); + return resource; +} + +struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len, + void *peer_addr, int peer_addr_len) +{ + struct drbd_resource *resource; + struct drbd_connection *connection; + + rcu_read_lock(); + for_each_resource_rcu(resource, &drbd_resources) { + for_each_connection_rcu(connection, resource) { + if (connection->my_addr_len == my_addr_len && + connection->peer_addr_len == peer_addr_len && + !memcmp(&connection->my_addr, my_addr, my_addr_len) && + !memcmp(&connection->peer_addr, peer_addr, peer_addr_len)) { + kref_get(&connection->kref); + goto found; + } + } + } + connection = NULL; +found: + rcu_read_unlock(); + return connection; +} + +static int drbd_alloc_socket(struct drbd_socket *socket) +{ + socket->rbuf = (void *) __get_free_page(GFP_KERNEL); + if (!socket->rbuf) + return -ENOMEM; + socket->sbuf = (void *) __get_free_page(GFP_KERNEL); + if (!socket->sbuf) + return -ENOMEM; + return 0; +} + +static void drbd_free_socket(struct drbd_socket *socket) +{ + free_page((unsigned long) socket->sbuf); + free_page((unsigned long) socket->rbuf); +} + +void conn_free_crypto(struct drbd_connection *connection) +{ + drbd_free_sock(connection); + + crypto_free_hash(connection->csums_tfm); + crypto_free_hash(connection->verify_tfm); + crypto_free_hash(connection->cram_hmac_tfm); + crypto_free_hash(connection->integrity_tfm); + crypto_free_hash(connection->peer_integrity_tfm); + kfree(connection->int_dig_in); + kfree(connection->int_dig_vv); + + connection->csums_tfm = NULL; + connection->verify_tfm = NULL; + connection->cram_hmac_tfm = NULL; + connection->integrity_tfm = NULL; + connection->peer_integrity_tfm = NULL; + connection->int_dig_in = NULL; + connection->int_dig_vv = NULL; +} + +int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts) +{ + struct drbd_connection *connection; + cpumask_var_t new_cpu_mask; + int err; + + if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + /* silently ignore cpu mask on UP kernel */ + if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) { + err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE, + cpumask_bits(new_cpu_mask), nr_cpu_ids); + if (err == -EOVERFLOW) { + /* So what. mask it out. 
*/ + cpumask_var_t tmp_cpu_mask; + if (zalloc_cpumask_var(&tmp_cpu_mask, GFP_KERNEL)) { + cpumask_setall(tmp_cpu_mask); + cpumask_and(new_cpu_mask, new_cpu_mask, tmp_cpu_mask); + drbd_warn(resource, "Overflow in bitmap_parse(%.12s%s), truncating to %u bits\n", + res_opts->cpu_mask, + strlen(res_opts->cpu_mask) > 12 ? "..." : "", + nr_cpu_ids); + free_cpumask_var(tmp_cpu_mask); + err = 0; + } + } + if (err) { + drbd_warn(resource, "bitmap_parse() failed with %d\n", err); + /* retcode = ERR_CPU_MASK_PARSE; */ + goto fail; + } + } + resource->res_opts = *res_opts; + if (cpumask_empty(new_cpu_mask)) + drbd_calc_cpu_mask(&new_cpu_mask); + if (!cpumask_equal(resource->cpu_mask, new_cpu_mask)) { + cpumask_copy(resource->cpu_mask, new_cpu_mask); + for_each_connection_rcu(connection, resource) { + connection->receiver.reset_cpu_mask = 1; + connection->asender.reset_cpu_mask = 1; + connection->worker.reset_cpu_mask = 1; + } + } + err = 0; + +fail: + free_cpumask_var(new_cpu_mask); + return err; + +} + +struct drbd_resource *drbd_create_resource(const char *name) +{ + struct drbd_resource *resource; + + resource = kzalloc(sizeof(struct drbd_resource), GFP_KERNEL); + if (!resource) + goto fail; + resource->name = kstrdup(name, GFP_KERNEL); + if (!resource->name) + goto fail_free_resource; + if (!zalloc_cpumask_var(&resource->cpu_mask, GFP_KERNEL)) + goto fail_free_name; + kref_init(&resource->kref); + idr_init(&resource->devices); + INIT_LIST_HEAD(&resource->connections); + resource->write_ordering = WO_bdev_flush; + list_add_tail_rcu(&resource->resources, &drbd_resources); + mutex_init(&resource->conf_update); + mutex_init(&resource->adm_mutex); + spin_lock_init(&resource->req_lock); + drbd_debugfs_resource_add(resource); + return resource; + +fail_free_name: + kfree(resource->name); +fail_free_resource: + kfree(resource); +fail: + return NULL; +} + +/* caller must be under adm_mutex */ +struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts) +{ + struct drbd_resource *resource; + struct drbd_connection *connection; + + connection = kzalloc(sizeof(struct drbd_connection), GFP_KERNEL); + if (!connection) + return NULL; + + if (drbd_alloc_socket(&connection->data)) + goto fail; + if (drbd_alloc_socket(&connection->meta)) + goto fail; + + connection->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); + if (!connection->current_epoch) + goto fail; + + INIT_LIST_HEAD(&connection->transfer_log); + + INIT_LIST_HEAD(&connection->current_epoch->list); + connection->epochs = 1; + spin_lock_init(&connection->epoch_lock); + + connection->send.seen_any_write_yet = false; + connection->send.current_epoch_nr = 0; + connection->send.current_epoch_writes = 0; + + resource = drbd_create_resource(name); + if (!resource) + goto fail; + + connection->cstate = C_STANDALONE; + mutex_init(&connection->cstate_mutex); + init_waitqueue_head(&connection->ping_wait); + idr_init(&connection->peer_devices); + + drbd_init_workqueue(&connection->sender_work); + mutex_init(&connection->data.mutex); + mutex_init(&connection->meta.mutex); + + drbd_thread_init(resource, &connection->receiver, drbd_receiver, "receiver"); + connection->receiver.connection = connection; + drbd_thread_init(resource, &connection->worker, drbd_worker, "worker"); + connection->worker.connection = connection; + drbd_thread_init(resource, &connection->asender, drbd_asender, "asender"); + connection->asender.connection = connection; + + kref_init(&connection->kref); + + connection->resource = resource; + + if 
(set_resource_options(resource, res_opts)) + goto fail_resource; + + kref_get(&resource->kref); + list_add_tail_rcu(&connection->connections, &resource->connections); + drbd_debugfs_connection_add(connection); + return connection; + +fail_resource: + list_del(&resource->resources); + drbd_free_resource(resource); +fail: + kfree(connection->current_epoch); + drbd_free_socket(&connection->meta); + drbd_free_socket(&connection->data); + kfree(connection); + return NULL; +} + +void drbd_destroy_connection(struct kref *kref) +{ + struct drbd_connection *connection = container_of(kref, struct drbd_connection, kref); + struct drbd_resource *resource = connection->resource; + + if (atomic_read(&connection->current_epoch->epoch_size) != 0) + drbd_err(connection, "epoch_size:%d\n", atomic_read(&connection->current_epoch->epoch_size)); + kfree(connection->current_epoch); + + idr_destroy(&connection->peer_devices); + + drbd_free_socket(&connection->meta); + drbd_free_socket(&connection->data); + kfree(connection->int_dig_in); + kfree(connection->int_dig_vv); + memset(connection, 0xfc, sizeof(*connection)); + kfree(connection); + kref_put(&resource->kref, drbd_destroy_resource); +} + +static int init_submitter(struct drbd_device *device) +{ + /* opencoded create_singlethread_workqueue(), + * to be able to say "drbd%d", ..., minor */ + device->submit.wq = alloc_workqueue("drbd%u_submit", + WQ_UNBOUND | WQ_MEM_RECLAIM, 1, device->minor); + if (!device->submit.wq) + return -ENOMEM; + + INIT_WORK(&device->submit.worker, do_submit); + INIT_LIST_HEAD(&device->submit.writes); + return 0; +} + +enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor) +{ + struct drbd_resource *resource = adm_ctx->resource; + struct drbd_connection *connection; + struct drbd_device *device; + struct drbd_peer_device *peer_device, *tmp_peer_device; + struct gendisk *disk; + struct request_queue *q; + int id; + int vnr = adm_ctx->volume; + enum drbd_ret_code err = ERR_NOMEM; + + device = minor_to_device(minor); + if (device) + return ERR_MINOR_OR_VOLUME_EXISTS; + + /* GFP_KERNEL, we are outside of all write-out paths */ + device = kzalloc(sizeof(struct drbd_device), GFP_KERNEL); + if (!device) + return ERR_NOMEM; + kref_init(&device->kref); + + kref_get(&resource->kref); + device->resource = resource; + device->minor = minor; + device->vnr = vnr; + + drbd_init_set_defaults(device); + + q = blk_alloc_queue(GFP_KERNEL); + if (!q) + goto out_no_q; + device->rq_queue = q; + q->queuedata = device; + + disk = alloc_disk(1); + if (!disk) + goto out_no_disk; + device->vdisk = disk; + + set_disk_ro(disk, true); + + disk->queue = q; + disk->major = DRBD_MAJOR; + disk->first_minor = minor; + disk->fops = &drbd_ops; + sprintf(disk->disk_name, "drbd%d", minor); + disk->private_data = device; + + device->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor)); + /* we have no partitions. we contain only ourselves. 
*/ + device->this_bdev->bd_contains = device->this_bdev; + + q->backing_dev_info.congested_fn = drbd_congested; + q->backing_dev_info.congested_data = device; + + blk_queue_make_request(q, drbd_make_request); + blk_queue_flush(q, REQ_FLUSH | REQ_FUA); + /* Setting the max_hw_sectors to an odd value of 8kibyte here + This triggers a max_bio_size message upon first attach or connect */ + blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); + blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); + blk_queue_merge_bvec(q, drbd_merge_bvec); + q->queue_lock = &resource->req_lock; + + device->md_io.page = alloc_page(GFP_KERNEL); + if (!device->md_io.page) + goto out_no_io_page; + + if (drbd_bm_init(device)) + goto out_no_bitmap; + device->read_requests = RB_ROOT; + device->write_requests = RB_ROOT; + + id = idr_alloc(&drbd_devices, device, minor, minor + 1, GFP_KERNEL); + if (id < 0) { + if (id == -ENOSPC) + err = ERR_MINOR_OR_VOLUME_EXISTS; + goto out_no_minor_idr; + } + kref_get(&device->kref); + + id = idr_alloc(&resource->devices, device, vnr, vnr + 1, GFP_KERNEL); + if (id < 0) { + if (id == -ENOSPC) + err = ERR_MINOR_OR_VOLUME_EXISTS; + goto out_idr_remove_minor; + } + kref_get(&device->kref); + + INIT_LIST_HEAD(&device->peer_devices); + INIT_LIST_HEAD(&device->pending_bitmap_io); + for_each_connection(connection, resource) { + peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL); + if (!peer_device) + goto out_idr_remove_from_resource; + peer_device->connection = connection; + peer_device->device = device; + + list_add(&peer_device->peer_devices, &device->peer_devices); + kref_get(&device->kref); + + id = idr_alloc(&connection->peer_devices, peer_device, vnr, vnr + 1, GFP_KERNEL); + if (id < 0) { + if (id == -ENOSPC) + err = ERR_INVALID_REQUEST; + goto out_idr_remove_from_resource; + } + kref_get(&connection->kref); + } + + if (init_submitter(device)) { + err = ERR_NOMEM; + goto out_idr_remove_vol; + } + + add_disk(disk); + + /* inherit the connection state */ + device->state.conn = first_connection(resource)->cstate; + if (device->state.conn == C_WF_REPORT_PARAMS) { + for_each_peer_device(peer_device, device) + drbd_connected(peer_device); + } + /* move to create_peer_device() */ + for_each_peer_device(peer_device, device) + drbd_debugfs_peer_device_add(peer_device); + drbd_debugfs_device_add(device); + return NO_ERROR; + +out_idr_remove_vol: + idr_remove(&connection->peer_devices, vnr); +out_idr_remove_from_resource: + for_each_connection(connection, resource) { + peer_device = idr_find(&connection->peer_devices, vnr); + if (peer_device) { + idr_remove(&connection->peer_devices, vnr); + kref_put(&connection->kref, drbd_destroy_connection); + } + } + for_each_peer_device_safe(peer_device, tmp_peer_device, device) { + list_del(&peer_device->peer_devices); + kfree(peer_device); + } + idr_remove(&resource->devices, vnr); +out_idr_remove_minor: + idr_remove(&drbd_devices, minor); + synchronize_rcu(); +out_no_minor_idr: + drbd_bm_cleanup(device); +out_no_bitmap: + __free_page(device->md_io.page); +out_no_io_page: + put_disk(disk); +out_no_disk: + blk_cleanup_queue(q); +out_no_q: + kref_put(&resource->kref, drbd_destroy_resource); + kfree(device); + return err; +} + +void drbd_delete_device(struct drbd_device *device) +{ + struct drbd_resource *resource = device->resource; + struct drbd_connection *connection; + struct drbd_peer_device *peer_device; + int refs = 3; + + /* move to free_peer_device() */ + for_each_peer_device(peer_device, device) + 
drbd_debugfs_peer_device_cleanup(peer_device); + drbd_debugfs_device_cleanup(device); + for_each_connection(connection, resource) { + idr_remove(&connection->peer_devices, device->vnr); + refs++; + } + idr_remove(&resource->devices, device->vnr); + idr_remove(&drbd_devices, device_to_minor(device)); + del_gendisk(device->vdisk); + synchronize_rcu(); + kref_sub(&device->kref, refs, drbd_destroy_device); +} + +static int __init drbd_init(void) +{ + int err; + + if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) { + pr_err("invalid minor_count (%d)\n", minor_count); +#ifdef MODULE + return -EINVAL; +#else + minor_count = DRBD_MINOR_COUNT_DEF; +#endif + } + + err = register_blkdev(DRBD_MAJOR, "drbd"); + if (err) { + pr_err("unable to register block device major %d\n", + DRBD_MAJOR); + return err; + } + + /* + * allocate all necessary structs + */ + init_waitqueue_head(&drbd_pp_wait); + + drbd_proc = NULL; /* play safe for drbd_cleanup */ + idr_init(&drbd_devices); + + rwlock_init(&global_state_lock); + INIT_LIST_HEAD(&drbd_resources); + + err = drbd_genl_register(); + if (err) { + pr_err("unable to register generic netlink family\n"); + goto fail; + } + + err = drbd_create_mempools(); + if (err) + goto fail; + + err = -ENOMEM; + drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); + if (!drbd_proc) { + pr_err("unable to register proc file\n"); + goto fail; + } + + retry.wq = create_singlethread_workqueue("drbd-reissue"); + if (!retry.wq) { + pr_err("unable to create retry workqueue\n"); + goto fail; + } + INIT_WORK(&retry.worker, do_retry); + spin_lock_init(&retry.lock); + INIT_LIST_HEAD(&retry.writes); + + if (drbd_debugfs_init()) + pr_notice("failed to initialize debugfs -- will not be available\n"); + + pr_info("initialized. " + "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", + API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX); + pr_info("%s\n", drbd_buildtag()); + pr_info("registered as block device major %d\n", DRBD_MAJOR); + return 0; /* Success! */ + +fail: + drbd_cleanup(); + if (err == -ENOMEM) + pr_err("ran out of memory\n"); + else + pr_err("initialization failure\n"); + return err; +} + +void drbd_free_ldev(struct drbd_backing_dev *ldev) +{ + if (ldev == NULL) + return; + + blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + + kfree(ldev->disk_conf); + kfree(ldev); +} + +static void drbd_free_one_sock(struct drbd_socket *ds) +{ + struct socket *s; + mutex_lock(&ds->mutex); + s = ds->socket; + ds->socket = NULL; + mutex_unlock(&ds->mutex); + if (s) { + /* so debugfs does not need to mutex_lock() */ + synchronize_rcu(); + kernel_sock_shutdown(s, SHUT_RDWR); + sock_release(s); + } +} + +void drbd_free_sock(struct drbd_connection *connection) +{ + if (connection->data.socket) + drbd_free_one_sock(&connection->data); + if (connection->meta.socket) + drbd_free_one_sock(&connection->meta); +} + +/* meta data management */ + +void conn_md_sync(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + + kref_get(&device->kref); + rcu_read_unlock(); + drbd_md_sync(device); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); +} + +/* aligned 4kByte */ +struct meta_data_on_disk { + u64 la_size_sect; /* last agreed size. 
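+				 * In units of 512-byte sectors; drbd_md_write() fills it in
+				 * from drbd_get_capacity() of the DRBD device.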
*/ + u64 uuid[UI_SIZE]; /* UUIDs. */ + u64 device_uuid; + u64 reserved_u64_1; + u32 flags; /* MDF */ + u32 magic; + u32 md_size_sect; + u32 al_offset; /* offset to this block */ + u32 al_nr_extents; /* important for restoring the AL (userspace) */ + /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ + u32 bm_offset; /* offset to the bitmap, from here */ + u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ + u32 la_peer_max_bio_size; /* last peer max_bio_size */ + + /* see al_tr_number_to_on_disk_sector() */ + u32 al_stripes; + u32 al_stripe_size_4k; + + u8 reserved_u8[4096 - (7*8 + 10*4)]; +} __packed; + + + +void drbd_md_write(struct drbd_device *device, void *b) +{ + struct meta_data_on_disk *buffer = b; + sector_t sector; + int i; + + memset(buffer, 0, sizeof(*buffer)); + + buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(device->this_bdev)); + for (i = UI_CURRENT; i < UI_SIZE; i++) + buffer->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]); + buffer->flags = cpu_to_be32(device->ldev->md.flags); + buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN); + + buffer->md_size_sect = cpu_to_be32(device->ldev->md.md_size_sect); + buffer->al_offset = cpu_to_be32(device->ldev->md.al_offset); + buffer->al_nr_extents = cpu_to_be32(device->act_log->nr_elements); + buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE); + buffer->device_uuid = cpu_to_be64(device->ldev->md.device_uuid); + + buffer->bm_offset = cpu_to_be32(device->ldev->md.bm_offset); + buffer->la_peer_max_bio_size = cpu_to_be32(device->peer_max_bio_size); + + buffer->al_stripes = cpu_to_be32(device->ldev->md.al_stripes); + buffer->al_stripe_size_4k = cpu_to_be32(device->ldev->md.al_stripe_size_4k); + + D_ASSERT(device, drbd_md_ss(device->ldev) == device->ldev->md.md_offset); + sector = device->ldev->md.md_offset; + + if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { + /* this was a try anyways ... */ + drbd_err(device, "meta data update failed!\n"); + drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); + } +} + +/** + * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set + * @device: DRBD device. + */ +void drbd_md_sync(struct drbd_device *device) +{ + struct meta_data_on_disk *buffer; + + /* Don't accidentally change the DRBD meta data layout. */ + BUILD_BUG_ON(UI_SIZE != 4); + BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096); + + del_timer(&device->md_sync_timer); + /* timer may be rearmed by drbd_md_mark_dirty() now. */ + if (!test_and_clear_bit(MD_DIRTY, &device->flags)) + return; + + /* We use here D_FAILED and not D_ATTACHING because we try to write + * metadata even if we detach due to a disk failure! */ + if (!get_ldev_if_state(device, D_FAILED)) + return; + + buffer = drbd_md_get_buffer(device, __func__); + if (!buffer) + goto out; + + drbd_md_write(device, buffer); + + /* Update device->ldev->md.la_size_sect, + * since we updated it on metadata. 
*/ + device->ldev->md.la_size_sect = drbd_get_capacity(device->this_bdev); + + drbd_md_put_buffer(device); +out: + put_ldev(device); +} + +static int check_activity_log_stripe_size(struct drbd_device *device, + struct meta_data_on_disk *on_disk, + struct drbd_md *in_core) +{ + u32 al_stripes = be32_to_cpu(on_disk->al_stripes); + u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k); + u64 al_size_4k; + + /* both not set: default to old fixed size activity log */ + if (al_stripes == 0 && al_stripe_size_4k == 0) { + al_stripes = 1; + al_stripe_size_4k = MD_32kB_SECT/8; + } + + /* some paranoia plausibility checks */ + + /* we need both values to be set */ + if (al_stripes == 0 || al_stripe_size_4k == 0) + goto err; + + al_size_4k = (u64)al_stripes * al_stripe_size_4k; + + /* Upper limit of activity log area, to avoid potential overflow + * problems in al_tr_number_to_on_disk_sector(). As right now, more + * than 72 * 4k blocks total only increases the amount of history, + * limiting this arbitrarily to 16 GB is not a real limitation ;-) */ + if (al_size_4k > (16 * 1024 * 1024/4)) + goto err; + + /* Lower limit: we need at least 8 transaction slots (32kB) + * to not break existing setups */ + if (al_size_4k < MD_32kB_SECT/8) + goto err; + + in_core->al_stripe_size_4k = al_stripe_size_4k; + in_core->al_stripes = al_stripes; + in_core->al_size_4k = al_size_4k; + + return 0; +err: + drbd_err(device, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n", + al_stripes, al_stripe_size_4k); + return -EINVAL; +} + +static int check_offsets_and_sizes(struct drbd_device *device, struct drbd_backing_dev *bdev) +{ + sector_t capacity = drbd_get_capacity(bdev->md_bdev); + struct drbd_md *in_core = &bdev->md; + s32 on_disk_al_sect; + s32 on_disk_bm_sect; + + /* The on-disk size of the activity log, calculated from offsets, and + * the size of the activity log calculated from the stripe settings, + * should match. + * Though we could relax this a bit: it is ok, if the striped activity log + * fits in the available on-disk activity log size. + * Right now, that would break how resize is implemented. + * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware + * of possible unused padding space in the on disk layout. */ + if (in_core->al_offset < 0) { + if (in_core->bm_offset > in_core->al_offset) + goto err; + on_disk_al_sect = -in_core->al_offset; + on_disk_bm_sect = in_core->al_offset - in_core->bm_offset; + } else { + if (in_core->al_offset != MD_4kB_SECT) + goto err; + if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT) + goto err; + + on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT; + on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset; + } + + /* old fixed size meta data is exactly that: fixed. */ + if (in_core->meta_dev_idx >= 0) { + if (in_core->md_size_sect != MD_128MB_SECT + || in_core->al_offset != MD_4kB_SECT + || in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT + || in_core->al_stripes != 1 + || in_core->al_stripe_size_4k != MD_32kB_SECT/8) + goto err; + } + + if (capacity < in_core->md_size_sect) + goto err; + if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev)) + goto err; + + /* should be aligned, and at least 32k */ + if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT)) + goto err; + + /* should fit (for now: exactly) into the available on-disk space; + * overflow prevention is in check_activity_log_stripe_size() above. 
*/ + if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT) + goto err; + + /* again, should be aligned */ + if (in_core->bm_offset & 7) + goto err; + + /* FIXME check for device grow with flex external meta data? */ + + /* can the available bitmap space cover the last agreed device size? */ + if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512) + goto err; + + return 0; + +err: + drbd_err(device, "meta data offsets don't make sense: idx=%d " + "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, " + "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n", + in_core->meta_dev_idx, + in_core->al_stripes, in_core->al_stripe_size_4k, + in_core->al_offset, in_core->bm_offset, in_core->md_size_sect, + (unsigned long long)in_core->la_size_sect, + (unsigned long long)capacity); + + return -EINVAL; +} + + +/** + * drbd_md_read() - Reads in the meta data super block + * @device: DRBD device. + * @bdev: Device from which the meta data should be read in. + * + * Return NO_ERROR on success, and an enum drbd_ret_code in case + * something goes wrong. + * + * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS, + * even before @bdev is assigned to @device->ldev. + */ +int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev) +{ + struct meta_data_on_disk *buffer; + u32 magic, flags; + int i, rv = NO_ERROR; + + if (device->state.disk != D_DISKLESS) + return ERR_DISK_CONFIGURED; + + buffer = drbd_md_get_buffer(device, __func__); + if (!buffer) + return ERR_NOMEM; + + /* First, figure out where our meta data superblock is located, + * and read it. */ + bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; + bdev->md.md_offset = drbd_md_ss(bdev); + + if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) { + /* NOTE: can't do normal error processing here as this is + called BEFORE disk is attached */ + drbd_err(device, "Error while reading metadata.\n"); + rv = ERR_IO_MD_DISK; + goto err; + } + + magic = be32_to_cpu(buffer->magic); + flags = be32_to_cpu(buffer->flags); + if (magic == DRBD_MD_MAGIC_84_UNCLEAN || + (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) { + /* btw: that's Activity Log clean, not "all" clean. */ + drbd_err(device, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n"); + rv = ERR_MD_UNCLEAN; + goto err; + } + + rv = ERR_MD_INVALID; + if (magic != DRBD_MD_MAGIC_08) { + if (magic == DRBD_MD_MAGIC_07) + drbd_err(device, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n"); + else + drbd_err(device, "Meta data magic not found. 
Did you \"drbdadm create-md\"?\n"); + goto err; + } + + if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { + drbd_err(device, "unexpected bm_bytes_per_bit: %u (expected %u)\n", + be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); + goto err; + } + + + /* convert to in_core endian */ + bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect); + for (i = UI_CURRENT; i < UI_SIZE; i++) + bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); + bdev->md.flags = be32_to_cpu(buffer->flags); + bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); + + bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect); + bdev->md.al_offset = be32_to_cpu(buffer->al_offset); + bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset); + + if (check_activity_log_stripe_size(device, buffer, &bdev->md)) + goto err; + if (check_offsets_and_sizes(device, bdev)) + goto err; + + if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { + drbd_err(device, "unexpected bm_offset: %d (expected %d)\n", + be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); + goto err; + } + if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { + drbd_err(device, "unexpected md_size: %u (expected %u)\n", + be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); + goto err; + } + + rv = NO_ERROR; + + spin_lock_irq(&device->resource->req_lock); + if (device->state.conn < C_CONNECTED) { + unsigned int peer; + peer = be32_to_cpu(buffer->la_peer_max_bio_size); + peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE); + device->peer_max_bio_size = peer; + } + spin_unlock_irq(&device->resource->req_lock); + + err: + drbd_md_put_buffer(device); + + return rv; +} + +/** + * drbd_md_mark_dirty() - Mark meta data super block as dirty + * @device: DRBD device. + * + * Call this function if you change anything that should be written to + * the meta-data super block. This function sets MD_DIRTY, and starts a + * timer that ensures that within five seconds you have to call drbd_md_sync(). 
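+ *
+ * A typical caller changes an in-core md member and then marks it dirty;
+ * e.g. drbd_md_set_flag() further below does
+ *	drbd_md_mark_dirty(device);
+ *	device->ldev->md.flags |= flag;
+ * and relies on the timer (or an explicit drbd_md_sync()) to get the change
+ * to stable storage.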
+ */ +#ifdef DEBUG +void drbd_md_mark_dirty_(struct drbd_device *device, unsigned int line, const char *func) +{ + if (!test_and_set_bit(MD_DIRTY, &device->flags)) { + mod_timer(&device->md_sync_timer, jiffies + HZ); + device->last_md_mark_dirty.line = line; + device->last_md_mark_dirty.func = func; + } +} +#else +void drbd_md_mark_dirty(struct drbd_device *device) +{ + if (!test_and_set_bit(MD_DIRTY, &device->flags)) + mod_timer(&device->md_sync_timer, jiffies + 5*HZ); +} +#endif + +void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local) +{ + int i; + + for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) + device->ldev->md.uuid[i+1] = device->ldev->md.uuid[i]; +} + +void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local) +{ + if (idx == UI_CURRENT) { + if (device->state.role == R_PRIMARY) + val |= 1; + else + val &= ~((u64)1); + + drbd_set_ed_uuid(device, val); + } + + device->ldev->md.uuid[idx] = val; + drbd_md_mark_dirty(device); +} + +void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local) +{ + unsigned long flags; + spin_lock_irqsave(&device->ldev->md.uuid_lock, flags); + __drbd_uuid_set(device, idx, val); + spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags); +} + +void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local) +{ + unsigned long flags; + spin_lock_irqsave(&device->ldev->md.uuid_lock, flags); + if (device->ldev->md.uuid[idx]) { + drbd_uuid_move_history(device); + device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[idx]; + } + __drbd_uuid_set(device, idx, val); + spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags); +} + +/** + * drbd_uuid_new_current() - Creates a new current UUID + * @device: DRBD device. + * + * Creates a new current UUID, and rotates the old current UUID into + * the bitmap slot. Causes an incremental resync upon next connect. + */ +void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local) +{ + u64 val; + unsigned long long bm_uuid; + + get_random_bytes(&val, sizeof(u64)); + + spin_lock_irq(&device->ldev->md.uuid_lock); + bm_uuid = device->ldev->md.uuid[UI_BITMAP]; + + if (bm_uuid) + drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid); + + device->ldev->md.uuid[UI_BITMAP] = device->ldev->md.uuid[UI_CURRENT]; + __drbd_uuid_set(device, UI_CURRENT, val); + spin_unlock_irq(&device->ldev->md.uuid_lock); + + drbd_print_uuids(device, "new current UUID"); + /* get it to stable storage _now_ */ + drbd_md_sync(device); +} + +void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local) +{ + unsigned long flags; + if (device->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) + return; + + spin_lock_irqsave(&device->ldev->md.uuid_lock, flags); + if (val == 0) { + drbd_uuid_move_history(device); + device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP]; + device->ldev->md.uuid[UI_BITMAP] = 0; + } else { + unsigned long long bm_uuid = device->ldev->md.uuid[UI_BITMAP]; + if (bm_uuid) + drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid); + + device->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1); + } + spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags); + + drbd_md_mark_dirty(device); +} + +/** + * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() + * @device: DRBD device. + * + * Sets all bits in the bitmap and writes the whole bitmap to stable storage. 
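+ * MDF_FULL_SYNC is set and synced to disk before the bitmap is written, and
+ * only cleared (and synced again) if the bitmap write succeeded, so a crash
+ * in between still leaves the full-sync marker on stable storage.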
+ */ +int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local) +{ + int rv = -EIO; + + drbd_md_set_flag(device, MDF_FULL_SYNC); + drbd_md_sync(device); + drbd_bm_set_all(device); + + rv = drbd_bm_write(device); + + if (!rv) { + drbd_md_clear_flag(device, MDF_FULL_SYNC); + drbd_md_sync(device); + } + + return rv; +} + +/** + * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() + * @device: DRBD device. + * + * Clears all bits in the bitmap and writes the whole bitmap to stable storage. + */ +int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local) +{ + drbd_resume_al(device); + drbd_bm_clear_all(device); + return drbd_bm_write(device); +} + +static int w_bitmap_io(struct drbd_work *w, int unused) +{ + struct drbd_device *device = + container_of(w, struct drbd_device, bm_io_work.w); + struct bm_io_work *work = &device->bm_io_work; + int rv = -EIO; + + D_ASSERT(device, atomic_read(&device->ap_bio_cnt) == 0); + + if (get_ldev(device)) { + drbd_bm_lock(device, work->why, work->flags); + rv = work->io_fn(device); + drbd_bm_unlock(device); + put_ldev(device); + } + + clear_bit_unlock(BITMAP_IO, &device->flags); + wake_up(&device->misc_wait); + + if (work->done) + work->done(device, rv); + + clear_bit(BITMAP_IO_QUEUED, &device->flags); + work->why = NULL; + work->flags = 0; + + return 0; +} + +/** + * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap + * @device: DRBD device. + * @io_fn: IO callback to be called when bitmap IO is possible + * @done: callback to be called after the bitmap IO was performed + * @why: Descriptive text of the reason for doing the IO + * + * While IO on the bitmap happens we freeze application IO thus we ensure + * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be + * called from worker context. It MUST NOT be used while a previous such + * work is still pending! + * + * Its worker function encloses the call of io_fn() by get_ldev() and + * put_ldev(). + */ +void drbd_queue_bitmap_io(struct drbd_device *device, + int (*io_fn)(struct drbd_device *), + void (*done)(struct drbd_device *, int), + char *why, enum bm_flag flags) +{ + D_ASSERT(device, current == first_peer_device(device)->connection->worker.task); + + D_ASSERT(device, !test_bit(BITMAP_IO_QUEUED, &device->flags)); + D_ASSERT(device, !test_bit(BITMAP_IO, &device->flags)); + D_ASSERT(device, list_empty(&device->bm_io_work.w.list)); + if (device->bm_io_work.why) + drbd_err(device, "FIXME going to queue '%s' but '%s' still pending?\n", + why, device->bm_io_work.why); + + device->bm_io_work.io_fn = io_fn; + device->bm_io_work.done = done; + device->bm_io_work.why = why; + device->bm_io_work.flags = flags; + + spin_lock_irq(&device->resource->req_lock); + set_bit(BITMAP_IO, &device->flags); + if (atomic_read(&device->ap_bio_cnt) == 0) { + if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags)) + drbd_queue_work(&first_peer_device(device)->connection->sender_work, + &device->bm_io_work.w); + } + spin_unlock_irq(&device->resource->req_lock); +} + +/** + * drbd_bitmap_io() - Does an IO operation on the whole bitmap + * @device: DRBD device. + * @io_fn: IO callback to be called when bitmap IO is possible + * @why: Descriptive text of the reason for doing the IO + * + * freezes application IO while that the actual IO operations runs. This + * functions MAY NOT be called from worker context. 
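+ * (Unless BM_LOCKED_SET_ALLOWED is set in @flags, in which case application
+ * IO is not suspended around the bitmap IO; see the body below.)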
+ */ +int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *), + char *why, enum bm_flag flags) +{ + int rv; + + D_ASSERT(device, current != first_peer_device(device)->connection->worker.task); + + if ((flags & BM_LOCKED_SET_ALLOWED) == 0) + drbd_suspend_io(device); + + drbd_bm_lock(device, why, flags); + rv = io_fn(device); + drbd_bm_unlock(device); + + if ((flags & BM_LOCKED_SET_ALLOWED) == 0) + drbd_resume_io(device); + + return rv; +} + +void drbd_md_set_flag(struct drbd_device *device, int flag) __must_hold(local) +{ + if ((device->ldev->md.flags & flag) != flag) { + drbd_md_mark_dirty(device); + device->ldev->md.flags |= flag; + } +} + +void drbd_md_clear_flag(struct drbd_device *device, int flag) __must_hold(local) +{ + if ((device->ldev->md.flags & flag) != 0) { + drbd_md_mark_dirty(device); + device->ldev->md.flags &= ~flag; + } +} +int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag) +{ + return (bdev->md.flags & flag) != 0; +} + +static void md_sync_timer_fn(unsigned long data) +{ + struct drbd_device *device = (struct drbd_device *) data; + drbd_device_post_work(device, MD_SYNC); +} + +const char *cmdname(enum drbd_packet cmd) +{ + /* THINK may need to become several global tables + * when we want to support more than + * one PRO_VERSION */ + static const char *cmdnames[] = { + [P_DATA] = "Data", + [P_DATA_REPLY] = "DataReply", + [P_RS_DATA_REPLY] = "RSDataReply", + [P_BARRIER] = "Barrier", + [P_BITMAP] = "ReportBitMap", + [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", + [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", + [P_UNPLUG_REMOTE] = "UnplugRemote", + [P_DATA_REQUEST] = "DataRequest", + [P_RS_DATA_REQUEST] = "RSDataRequest", + [P_SYNC_PARAM] = "SyncParam", + [P_SYNC_PARAM89] = "SyncParam89", + [P_PROTOCOL] = "ReportProtocol", + [P_UUIDS] = "ReportUUIDs", + [P_SIZES] = "ReportSizes", + [P_STATE] = "ReportState", + [P_SYNC_UUID] = "ReportSyncUUID", + [P_AUTH_CHALLENGE] = "AuthChallenge", + [P_AUTH_RESPONSE] = "AuthResponse", + [P_PING] = "Ping", + [P_PING_ACK] = "PingAck", + [P_RECV_ACK] = "RecvAck", + [P_WRITE_ACK] = "WriteAck", + [P_RS_WRITE_ACK] = "RSWriteAck", + [P_SUPERSEDED] = "Superseded", + [P_NEG_ACK] = "NegAck", + [P_NEG_DREPLY] = "NegDReply", + [P_NEG_RS_DREPLY] = "NegRSDReply", + [P_BARRIER_ACK] = "BarrierAck", + [P_STATE_CHG_REQ] = "StateChgRequest", + [P_STATE_CHG_REPLY] = "StateChgReply", + [P_OV_REQUEST] = "OVRequest", + [P_OV_REPLY] = "OVReply", + [P_OV_RESULT] = "OVResult", + [P_CSUM_RS_REQUEST] = "CsumRSRequest", + [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", + [P_COMPRESSED_BITMAP] = "CBitmap", + [P_DELAY_PROBE] = "DelayProbe", + [P_OUT_OF_SYNC] = "OutOfSync", + [P_RETRY_WRITE] = "RetryWrite", + [P_RS_CANCEL] = "RSCancel", + [P_CONN_ST_CHG_REQ] = "conn_st_chg_req", + [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply", + [P_RETRY_WRITE] = "retry_write", + [P_PROTOCOL_UPDATE] = "protocol_update", + + /* enum drbd_packet, but not commands - obsoleted flags: + * P_MAY_IGNORE + * P_MAX_OPT_CMD + */ + }; + + /* too big for the array: 0xfffX */ + if (cmd == P_INITIAL_META) + return "InitialMeta"; + if (cmd == P_INITIAL_DATA) + return "InitialData"; + if (cmd == P_CONNECTION_FEATURES) + return "ConnectionFeatures"; + if (cmd >= ARRAY_SIZE(cmdnames)) + return "Unknown"; + return cmdnames[cmd]; +} + +/** + * drbd_wait_misc - wait for a request to make progress + * @device: device associated with the request + * @i: the struct drbd_interval embedded in struct drbd_request or + * struct drbd_peer_request + */ +int drbd_wait_misc(struct drbd_device 
*device, struct drbd_interval *i) +{ + struct net_conf *nc; + DEFINE_WAIT(wait); + long timeout; + + rcu_read_lock(); + nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + if (!nc) { + rcu_read_unlock(); + return -ETIMEDOUT; + } + timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT; + rcu_read_unlock(); + + /* Indicate to wake up device->misc_wait on progress. */ + i->waiting = true; + prepare_to_wait(&device->misc_wait, &wait, TASK_INTERRUPTIBLE); + spin_unlock_irq(&device->resource->req_lock); + timeout = schedule_timeout(timeout); + finish_wait(&device->misc_wait, &wait); + spin_lock_irq(&device->resource->req_lock); + if (!timeout || device->state.conn < C_CONNECTED) + return -ETIMEDOUT; + if (signal_pending(current)) + return -ERESTARTSYS; + return 0; +} + +#ifdef CONFIG_DRBD_FAULT_INJECTION +/* Fault insertion support including random number generator shamelessly + * stolen from kernel/rcutorture.c */ +struct fault_random_state { + unsigned long state; + unsigned long count; +}; + +#define FAULT_RANDOM_MULT 39916801 /* prime */ +#define FAULT_RANDOM_ADD 479001701 /* prime */ +#define FAULT_RANDOM_REFRESH 10000 + +/* + * Crude but fast random-number generator. Uses a linear congruential + * generator, with occasional help from get_random_bytes(). + */ +static unsigned long +_drbd_fault_random(struct fault_random_state *rsp) +{ + long refresh; + + if (!rsp->count--) { + get_random_bytes(&refresh, sizeof(refresh)); + rsp->state += refresh; + rsp->count = FAULT_RANDOM_REFRESH; + } + rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD; + return swahw32(rsp->state); +} + +static char * +_drbd_fault_str(unsigned int type) { + static char *_faults[] = { + [DRBD_FAULT_MD_WR] = "Meta-data write", + [DRBD_FAULT_MD_RD] = "Meta-data read", + [DRBD_FAULT_RS_WR] = "Resync write", + [DRBD_FAULT_RS_RD] = "Resync read", + [DRBD_FAULT_DT_WR] = "Data write", + [DRBD_FAULT_DT_RD] = "Data read", + [DRBD_FAULT_DT_RA] = "Data read ahead", + [DRBD_FAULT_BM_ALLOC] = "BM allocation", + [DRBD_FAULT_AL_EE] = "EE allocation", + [DRBD_FAULT_RECEIVE] = "receive data corruption", + }; + + return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**"; +} + +unsigned int +_drbd_insert_fault(struct drbd_device *device, unsigned int type) +{ + static struct fault_random_state rrs = {0, 0}; + + unsigned int ret = ( + (fault_devs == 0 || + ((1 << device_to_minor(device)) & fault_devs) != 0) && + (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate)); + + if (ret) { + fault_count++; + + if (__ratelimit(&drbd_ratelimit_state)) + drbd_warn(device, "***Simulating %s failure\n", + _drbd_fault_str(type)); + } + + return ret; +} +#endif + +const char *drbd_buildtag(void) +{ + /* DRBD built from external sources has here a reference to the + git hash of the source code. */ + + static char buildtag[38] = "\0uilt-in"; + + if (buildtag[0] == 0) { +#ifdef MODULE + sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); +#else + buildtag[0] = 'b'; +#endif + } + + return buildtag; +} + +module_init(drbd_init) +module_exit(drbd_cleanup) + +EXPORT_SYMBOL(drbd_conn_str); +EXPORT_SYMBOL(drbd_role_str); +EXPORT_SYMBOL(drbd_disk_str); +EXPORT_SYMBOL(drbd_set_st_err_str); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c new file mode 100644 index 000000000..74df8cfad --- /dev/null +++ b/drivers/block/drbd/drbd_nl.c @@ -0,0 +1,3674 @@ +/* + drbd_nl.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 
+ + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/drbd.h> +#include <linux/in.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/blkpg.h> +#include <linux/cpumask.h> +#include "drbd_int.h" +#include "drbd_protocol.h" +#include "drbd_req.h" +#include <asm/unaligned.h> +#include <linux/drbd_limits.h> +#include <linux/kthread.h> + +#include <net/genetlink.h> + +/* .doit */ +// int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info); +// int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info); + +int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info); + +int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_down(struct sk_buff *skb, struct genl_info *info); + +int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info); +/* .dumpit */ +int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); + +#include <linux/drbd_genl_api.h> +#include "drbd_nla.h" +#include <linux/genl_magic_func.h> + +/* used blkdev_get_by_path, to claim our meta data device(s) */ +static char *drbd_m_holder = 
"Hands off! this is DRBD's meta data device."; + +static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) +{ + genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb)))); + if (genlmsg_reply(skb, info)) + pr_err("error sending genl reply\n"); +} + +/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only + * reason it could fail was no space in skb, and there are 4k available. */ +static int drbd_msg_put_info(struct sk_buff *skb, const char *info) +{ + struct nlattr *nla; + int err = -EMSGSIZE; + + if (!info || !info[0]) + return 0; + + nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY); + if (!nla) + return err; + + err = nla_put_string(skb, T_info_text, info); + if (err) { + nla_nest_cancel(skb, nla); + return err; + } else + nla_nest_end(skb, nla); + return 0; +} + +/* This would be a good candidate for a "pre_doit" hook, + * and per-family private info->pointers. + * But we need to stay compatible with older kernels. + * If it returns successfully, adm_ctx members are valid. + * + * At this point, we still rely on the global genl_lock(). + * If we want to avoid that, and allow "genl_family.parallel_ops", we may need + * to add additional synchronization against object destruction/modification. + */ +#define DRBD_ADM_NEED_MINOR 1 +#define DRBD_ADM_NEED_RESOURCE 2 +#define DRBD_ADM_NEED_CONNECTION 4 +static int drbd_adm_prepare(struct drbd_config_context *adm_ctx, + struct sk_buff *skb, struct genl_info *info, unsigned flags) +{ + struct drbd_genlmsghdr *d_in = info->userhdr; + const u8 cmd = info->genlhdr->cmd; + int err; + + memset(adm_ctx, 0, sizeof(*adm_ctx)); + + /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */ + if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN)) + return -EPERM; + + adm_ctx->reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!adm_ctx->reply_skb) { + err = -ENOMEM; + goto fail; + } + + adm_ctx->reply_dh = genlmsg_put_reply(adm_ctx->reply_skb, + info, &drbd_genl_family, 0, cmd); + /* put of a few bytes into a fresh skb of >= 4k will always succeed. + * but anyways */ + if (!adm_ctx->reply_dh) { + err = -ENOMEM; + goto fail; + } + + adm_ctx->reply_dh->minor = d_in->minor; + adm_ctx->reply_dh->ret_code = NO_ERROR; + + adm_ctx->volume = VOLUME_UNSPECIFIED; + if (info->attrs[DRBD_NLA_CFG_CONTEXT]) { + struct nlattr *nla; + /* parse and validate only */ + err = drbd_cfg_context_from_attrs(NULL, info); + if (err) + goto fail; + + /* It was present, and valid, + * copy it over to the reply skb. */ + err = nla_put_nohdr(adm_ctx->reply_skb, + info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len, + info->attrs[DRBD_NLA_CFG_CONTEXT]); + if (err) + goto fail; + + /* and assign stuff to the adm_ctx */ + nla = nested_attr_tb[__nla_type(T_ctx_volume)]; + if (nla) + adm_ctx->volume = nla_get_u32(nla); + nla = nested_attr_tb[__nla_type(T_ctx_resource_name)]; + if (nla) + adm_ctx->resource_name = nla_data(nla); + adm_ctx->my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; + adm_ctx->peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; + if ((adm_ctx->my_addr && + nla_len(adm_ctx->my_addr) > sizeof(adm_ctx->connection->my_addr)) || + (adm_ctx->peer_addr && + nla_len(adm_ctx->peer_addr) > sizeof(adm_ctx->connection->peer_addr))) { + err = -EINVAL; + goto fail; + } + } + + adm_ctx->minor = d_in->minor; + adm_ctx->device = minor_to_device(d_in->minor); + + /* We are protected by the global genl_lock(). + * But we may explicitly drop it/retake it in drbd_adm_set_role(), + * so make sure this object stays around. 
*/ + if (adm_ctx->device) + kref_get(&adm_ctx->device->kref); + + if (adm_ctx->resource_name) { + adm_ctx->resource = drbd_find_resource(adm_ctx->resource_name); + } + + if (!adm_ctx->device && (flags & DRBD_ADM_NEED_MINOR)) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown minor"); + return ERR_MINOR_INVALID; + } + if (!adm_ctx->resource && (flags & DRBD_ADM_NEED_RESOURCE)) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown resource"); + if (adm_ctx->resource_name) + return ERR_RES_NOT_KNOWN; + return ERR_INVALID_REQUEST; + } + + if (flags & DRBD_ADM_NEED_CONNECTION) { + if (adm_ctx->resource) { + drbd_msg_put_info(adm_ctx->reply_skb, "no resource name expected"); + return ERR_INVALID_REQUEST; + } + if (adm_ctx->device) { + drbd_msg_put_info(adm_ctx->reply_skb, "no minor number expected"); + return ERR_INVALID_REQUEST; + } + if (adm_ctx->my_addr && adm_ctx->peer_addr) + adm_ctx->connection = conn_get_by_addrs(nla_data(adm_ctx->my_addr), + nla_len(adm_ctx->my_addr), + nla_data(adm_ctx->peer_addr), + nla_len(adm_ctx->peer_addr)); + if (!adm_ctx->connection) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown connection"); + return ERR_INVALID_REQUEST; + } + } + + /* some more paranoia, if the request was over-determined */ + if (adm_ctx->device && adm_ctx->resource && + adm_ctx->device->resource != adm_ctx->resource) { + pr_warning("request: minor=%u, resource=%s; but that minor belongs to resource %s\n", + adm_ctx->minor, adm_ctx->resource->name, + adm_ctx->device->resource->name); + drbd_msg_put_info(adm_ctx->reply_skb, "minor exists in different resource"); + return ERR_INVALID_REQUEST; + } + if (adm_ctx->device && + adm_ctx->volume != VOLUME_UNSPECIFIED && + adm_ctx->volume != adm_ctx->device->vnr) { + pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n", + adm_ctx->minor, adm_ctx->volume, + adm_ctx->device->vnr, + adm_ctx->device->resource->name); + drbd_msg_put_info(adm_ctx->reply_skb, "minor exists as different volume"); + return ERR_INVALID_REQUEST; + } + + /* still, provide adm_ctx->resource always, if possible. */ + if (!adm_ctx->resource) { + adm_ctx->resource = adm_ctx->device ? adm_ctx->device->resource + : adm_ctx->connection ? adm_ctx->connection->resource : NULL; + if (adm_ctx->resource) + kref_get(&adm_ctx->resource->kref); + } + + return NO_ERROR; + +fail: + nlmsg_free(adm_ctx->reply_skb); + adm_ctx->reply_skb = NULL; + return err; +} + +static int drbd_adm_finish(struct drbd_config_context *adm_ctx, + struct genl_info *info, int retcode) +{ + if (adm_ctx->device) { + kref_put(&adm_ctx->device->kref, drbd_destroy_device); + adm_ctx->device = NULL; + } + if (adm_ctx->connection) { + kref_put(&adm_ctx->connection->kref, &drbd_destroy_connection); + adm_ctx->connection = NULL; + } + if (adm_ctx->resource) { + kref_put(&adm_ctx->resource->kref, drbd_destroy_resource); + adm_ctx->resource = NULL; + } + + if (!adm_ctx->reply_skb) + return -ENOMEM; + + adm_ctx->reply_dh->ret_code = retcode; + drbd_adm_send_reply(adm_ctx->reply_skb, info); + return 0; +} + +static void setup_khelper_env(struct drbd_connection *connection, char **envp) +{ + char *afs; + + /* FIXME: A future version will not allow this case. 
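+	 * Without both addresses we cannot provide DRBD_PEER_AF and
+	 * DRBD_PEER_ADDRESS to the helper, so those environment slots stay empty.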
*/ + if (connection->my_addr_len == 0 || connection->peer_addr_len == 0) + return; + + switch (((struct sockaddr *)&connection->peer_addr)->sa_family) { + case AF_INET6: + afs = "ipv6"; + snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6", + &((struct sockaddr_in6 *)&connection->peer_addr)->sin6_addr); + break; + case AF_INET: + afs = "ipv4"; + snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", + &((struct sockaddr_in *)&connection->peer_addr)->sin_addr); + break; + default: + afs = "ssocks"; + snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", + &((struct sockaddr_in *)&connection->peer_addr)->sin_addr); + } + snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs); +} + +int drbd_khelper(struct drbd_device *device, char *cmd) +{ + char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + (char[20]) { }, /* address family */ + (char[60]) { }, /* address */ + NULL }; + char mb[12]; + char *argv[] = {usermode_helper, cmd, mb, NULL }; + struct drbd_connection *connection = first_peer_device(device)->connection; + struct sib_info sib; + int ret; + + if (current == connection->worker.task) + set_bit(CALLBACK_PENDING, &connection->flags); + + snprintf(mb, 12, "minor-%d", device_to_minor(device)); + setup_khelper_env(connection, envp); + + /* The helper may take some time. + * write out any unsynced meta data changes now */ + drbd_md_sync(device); + + drbd_info(device, "helper command: %s %s %s\n", usermode_helper, cmd, mb); + sib.sib_reason = SIB_HELPER_PRE; + sib.helper_name = cmd; + drbd_bcast_event(device, &sib); + ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); + if (ret) + drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, mb, + (ret >> 8) & 0xff, ret); + else + drbd_info(device, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, mb, + (ret >> 8) & 0xff, ret); + sib.sib_reason = SIB_HELPER_POST; + sib.helper_exit_code = ret; + drbd_bcast_event(device, &sib); + + if (current == connection->worker.task) + clear_bit(CALLBACK_PENDING, &connection->flags); + + if (ret < 0) /* Ignore any ERRNOs we got. */ + ret = 0; + + return ret; +} + +static int conn_khelper(struct drbd_connection *connection, char *cmd) +{ + char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + (char[20]) { }, /* address family */ + (char[60]) { }, /* address */ + NULL }; + char *resource_name = connection->resource->name; + char *argv[] = {usermode_helper, cmd, resource_name, NULL }; + int ret; + + setup_khelper_env(connection, envp); + conn_md_sync(connection); + + drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name); + /* TODO: conn_bcast_event() ?? */ + + ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); + if (ret) + drbd_warn(connection, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, resource_name, + (ret >> 8) & 0xff, ret); + else + drbd_info(connection, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, resource_name, + (ret >> 8) & 0xff, ret); + /* TODO: conn_bcast_event() ?? */ + + if (ret < 0) /* Ignore any ERRNOs we got. 
*/ + ret = 0; + + return ret; +} + +static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connection) +{ + enum drbd_fencing_p fp = FP_NOT_AVAIL; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (get_ldev_if_state(device, D_CONSISTENT)) { + struct disk_conf *disk_conf = + rcu_dereference(peer_device->device->ldev->disk_conf); + fp = max_t(enum drbd_fencing_p, fp, disk_conf->fencing); + put_ldev(device); + } + } + rcu_read_unlock(); + + if (fp == FP_NOT_AVAIL) { + /* IO Suspending works on the whole resource. + Do it only for one device. */ + vnr = 0; + peer_device = idr_get_next(&connection->peer_devices, &vnr); + drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0)); + } + + return fp; +} + +bool conn_try_outdate_peer(struct drbd_connection *connection) +{ + unsigned int connect_cnt; + union drbd_state mask = { }; + union drbd_state val = { }; + enum drbd_fencing_p fp; + char *ex_to_string; + int r; + + spin_lock_irq(&connection->resource->req_lock); + if (connection->cstate >= C_WF_REPORT_PARAMS) { + drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n"); + spin_unlock_irq(&connection->resource->req_lock); + return false; + } + + connect_cnt = connection->connect_cnt; + spin_unlock_irq(&connection->resource->req_lock); + + fp = highest_fencing_policy(connection); + switch (fp) { + case FP_NOT_AVAIL: + drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n"); + goto out; + case FP_DONT_CARE: + return true; + default: ; + } + + r = conn_khelper(connection, "fence-peer"); + + switch ((r>>8) & 0xff) { + case 3: /* peer is inconsistent */ + ex_to_string = "peer is inconsistent or worse"; + mask.pdsk = D_MASK; + val.pdsk = D_INCONSISTENT; + break; + case 4: /* peer got outdated, or was already outdated */ + ex_to_string = "peer was fenced"; + mask.pdsk = D_MASK; + val.pdsk = D_OUTDATED; + break; + case 5: /* peer was down */ + if (conn_highest_disk(connection) == D_UP_TO_DATE) { + /* we will(have) create(d) a new UUID anyways... */ + ex_to_string = "peer is unreachable, assumed to be dead"; + mask.pdsk = D_MASK; + val.pdsk = D_OUTDATED; + } else { + ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; + } + break; + case 6: /* Peer is primary, voluntarily outdate myself. + * This is useful when an unconnected R_SECONDARY is asked to + * become R_PRIMARY, but finds the other peer being active. */ + ex_to_string = "peer is active"; + drbd_warn(connection, "Peer is primary, outdating myself.\n"); + mask.disk = D_MASK; + val.disk = D_OUTDATED; + break; + case 7: + if (fp != FP_STONITH) + drbd_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n"); + ex_to_string = "peer was stonithed"; + mask.pdsk = D_MASK; + val.pdsk = D_OUTDATED; + break; + default: + /* The script is broken ... */ + drbd_err(connection, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); + return false; /* Eventually leave IO frozen */ + } + + drbd_info(connection, "fence-peer helper returned %d (%s)\n", + (r>>8) & 0xff, ex_to_string); + + out: + + /* Not using + conn_request_state(connection, mask, val, CS_VERBOSE); + here, because we might were able to re-establish the connection in the + meantime. 
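+	   In that case the helper verdict would refer to a stale incarnation of
+	   the connection; connect_cnt is compared below to detect and ignore it.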
*/ + spin_lock_irq(&connection->resource->req_lock); + if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) { + if (connection->connect_cnt != connect_cnt) + /* In case the connection was established and droped + while the fence-peer handler was running, ignore it */ + drbd_info(connection, "Ignoring fence-peer exit code\n"); + else + _conn_request_state(connection, mask, val, CS_VERBOSE); + } + spin_unlock_irq(&connection->resource->req_lock); + + return conn_highest_pdsk(connection) <= D_OUTDATED; +} + +static int _try_outdate_peer_async(void *data) +{ + struct drbd_connection *connection = (struct drbd_connection *)data; + + conn_try_outdate_peer(connection); + + kref_put(&connection->kref, drbd_destroy_connection); + return 0; +} + +void conn_try_outdate_peer_async(struct drbd_connection *connection) +{ + struct task_struct *opa; + + kref_get(&connection->kref); + /* We may just have force_sig()'ed this thread + * to get it out of some blocking network function. + * Clear signals; otherwise kthread_run(), which internally uses + * wait_on_completion_killable(), will mistake our pending signal + * for a new fatal signal and fail. */ + flush_signals(current); + opa = kthread_run(_try_outdate_peer_async, connection, "drbd_async_h"); + if (IS_ERR(opa)) { + drbd_err(connection, "out of mem, failed to invoke fence-peer helper\n"); + kref_put(&connection->kref, drbd_destroy_connection); + } +} + +enum drbd_state_rv +drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int force) +{ + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; + const int max_tries = 4; + enum drbd_state_rv rv = SS_UNKNOWN_ERROR; + struct net_conf *nc; + int try = 0; + int forced = 0; + union drbd_state mask, val; + + if (new_role == R_PRIMARY) { + struct drbd_connection *connection; + + /* Detect dead peers as soon as possible. */ + + rcu_read_lock(); + for_each_connection(connection, device->resource) + request_ping(connection); + rcu_read_unlock(); + } + + mutex_lock(device->state_mutex); + + mask.i = 0; mask.role = R_MASK; + val.i = 0; val.role = new_role; + + while (try++ < max_tries) { + rv = _drbd_request_state_holding_state_mutex(device, mask, val, CS_WAIT_COMPLETE); + + /* in case we first succeeded to outdate, + * but now suddenly could establish a connection */ + if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) { + val.pdsk = 0; + mask.pdsk = 0; + continue; + } + + if (rv == SS_NO_UP_TO_DATE_DISK && force && + (device->state.disk < D_UP_TO_DATE && + device->state.disk >= D_INCONSISTENT)) { + mask.disk = D_MASK; + val.disk = D_UP_TO_DATE; + forced = 1; + continue; + } + + if (rv == SS_NO_UP_TO_DATE_DISK && + device->state.disk == D_CONSISTENT && mask.pdsk == 0) { + D_ASSERT(device, device->state.pdsk == D_UNKNOWN); + + if (conn_try_outdate_peer(connection)) { + val.disk = D_UP_TO_DATE; + mask.disk = D_MASK; + } + continue; + } + + if (rv == SS_NOTHING_TO_DO) + goto out; + if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) { + if (!conn_try_outdate_peer(connection) && force) { + drbd_warn(device, "Forced into split brain situation!\n"); + mask.pdsk = D_MASK; + val.pdsk = D_OUTDATED; + + } + continue; + } + if (rv == SS_TWO_PRIMARIES) { + /* Maybe the peer is detected as dead very soon... + retry at most once more in this case. */ + int timeo; + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + timeo = nc ? 
(nc->ping_timeo + 1) * HZ / 10 : 1; + rcu_read_unlock(); + schedule_timeout_interruptible(timeo); + if (try < max_tries) + try = max_tries - 1; + continue; + } + if (rv < SS_SUCCESS) { + rv = _drbd_request_state(device, mask, val, + CS_VERBOSE + CS_WAIT_COMPLETE); + if (rv < SS_SUCCESS) + goto out; + } + break; + } + + if (rv < SS_SUCCESS) + goto out; + + if (forced) + drbd_warn(device, "Forced to consider local data as UpToDate!\n"); + + /* Wait until nothing is on the fly :) */ + wait_event(device->misc_wait, atomic_read(&device->ap_pending_cnt) == 0); + + /* FIXME also wait for all pending P_BARRIER_ACK? */ + + if (new_role == R_SECONDARY) { + if (get_ldev(device)) { + device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; + put_ldev(device); + } + } else { + mutex_lock(&device->resource->conf_update); + nc = connection->net_conf; + if (nc) + nc->discard_my_data = 0; /* without copy; single bit op is atomic */ + mutex_unlock(&device->resource->conf_update); + + if (get_ldev(device)) { + if (((device->state.conn < C_CONNECTED || + device->state.pdsk <= D_FAILED) + && device->ldev->md.uuid[UI_BITMAP] == 0) || forced) + drbd_uuid_new_current(device); + + device->ldev->md.uuid[UI_CURRENT] |= (u64)1; + put_ldev(device); + } + } + + /* writeout of activity log covered areas of the bitmap + * to stable storage done in after state change already */ + + if (device->state.conn >= C_WF_REPORT_PARAMS) { + /* if this was forced, we should consider sync */ + if (forced) + drbd_send_uuids(peer_device); + drbd_send_current_state(peer_device); + } + + drbd_md_sync(device); + set_disk_ro(device->vdisk, new_role == R_SECONDARY); + kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); +out: + mutex_unlock(device->state_mutex); + return rv; +} + +static const char *from_attrs_err_to_txt(int err) +{ + return err == -ENOMSG ? "required attribute missing" : + err == -EOPNOTSUPP ? "unknown mandatory attribute" : + err == -EEXIST ? "can not change invariant setting" : + "invalid attribute value"; +} + +int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct set_role_parms parms; + int err; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + memset(&parms, 0, sizeof(parms)); + if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) { + err = set_role_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto out; + } + } + genl_unlock(); + mutex_lock(&adm_ctx.resource->adm_mutex); + + if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) + retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate); + else + retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0); + + mutex_unlock(&adm_ctx.resource->adm_mutex); + genl_lock(); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +/* Initializes the md.*_offset members, so we are able to find + * the on disk meta data. 
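+ * md_offset is the absolute sector of the 4k superblock (drbd_md_ss()),
+ * while al_offset and bm_offset are sector offsets relative to that
+ * superblock.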
+ * + * We currently have two possible layouts: + * external: + * |----------- md_size_sect ------------------| + * [ 4k superblock ][ activity log ][ Bitmap ] + * | al_offset == 8 | + * | bm_offset = al_offset + X | + * ==> bitmap sectors = md_size_sect - bm_offset + * + * internal: + * |----------- md_size_sect ------------------| + * [data.....][ Bitmap ][ activity log ][ 4k superblock ] + * | al_offset < 0 | + * | bm_offset = al_offset - Y | + * ==> bitmap sectors = Y = al_offset - bm_offset + * + * Activity log size used to be fixed 32kB, + * but is about to become configurable. + */ +static void drbd_md_set_sector_offsets(struct drbd_device *device, + struct drbd_backing_dev *bdev) +{ + sector_t md_size_sect = 0; + unsigned int al_size_sect = bdev->md.al_size_4k * 8; + + bdev->md.md_offset = drbd_md_ss(bdev); + + switch (bdev->md.meta_dev_idx) { + default: + /* v07 style fixed size indexed meta data */ + bdev->md.md_size_sect = MD_128MB_SECT; + bdev->md.al_offset = MD_4kB_SECT; + bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; + break; + case DRBD_MD_INDEX_FLEX_EXT: + /* just occupy the full device; unit: sectors */ + bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); + bdev->md.al_offset = MD_4kB_SECT; + bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; + break; + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + /* al size is still fixed */ + bdev->md.al_offset = -al_size_sect; + /* we need (slightly less than) ~ this much bitmap sectors: */ + md_size_sect = drbd_get_capacity(bdev->backing_bdev); + md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); + md_size_sect = BM_SECT_TO_EXT(md_size_sect); + md_size_sect = ALIGN(md_size_sect, 8); + + /* plus the "drbd meta data super block", + * and the activity log; */ + md_size_sect += MD_4kB_SECT + al_size_sect; + + bdev->md.md_size_sect = md_size_sect; + /* bitmap offset is adjusted by 'super' block size */ + bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT; + break; + } +} + +/* input size is expected to be in KB */ +char *ppsize(char *buf, unsigned long long size) +{ + /* Needs 9 bytes at max including trailing NUL: + * -1ULL ==> "16384 EB" */ + static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' }; + int base = 0; + while (size >= 10000 && base < sizeof(units)-1) { + /* shift + round */ + size = (size >> 10) + !!(size & (1<<9)); + base++; + } + sprintf(buf, "%u %cB", (unsigned)size, units[base]); + + return buf; +} + +/* there is still a theoretical deadlock when called from receiver + * on an D_INCONSISTENT R_PRIMARY: + * remote READ does inc_ap_bio, receiver would need to receive answer + * packet from remote to dec_ap_bio again. + * receiver receive_sizes(), comes here, + * waits for ap_bio_cnt == 0. -> deadlock. + * but this cannot happen, actually, because: + * R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable + * (not connected, or bad/no disk on peer): + * see drbd_fail_request_early, ap_bio_cnt is zero. + * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET: + * peer may not initiate a resize. + */ +/* Note these are not to be confused with + * drbd_adm_suspend_io/drbd_adm_resume_io, + * which are (sub) state changes triggered by admin (drbdsetup), + * and can be long lived. + * This changes an device->flag, is triggered by drbd internals, + * and should be short-lived. 
*/ +void drbd_suspend_io(struct drbd_device *device) +{ + set_bit(SUSPEND_IO, &device->flags); + if (drbd_suspended(device)) + return; + wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt)); +} + +void drbd_resume_io(struct drbd_device *device) +{ + clear_bit(SUSPEND_IO, &device->flags); + wake_up(&device->misc_wait); +} + +/** + * drbd_determine_dev_size() - Sets the right device size obeying all constraints + * @device: DRBD device. + * + * Returns 0 on success, negative return values indicate errors. + * You should call drbd_md_sync() after calling this function. + */ +enum determine_dev_size +drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local) +{ + sector_t prev_first_sect, prev_size; /* previous meta location */ + sector_t la_size_sect, u_size; + struct drbd_md *md = &device->ldev->md; + u32 prev_al_stripe_size_4k; + u32 prev_al_stripes; + sector_t size; + char ppb[10]; + void *buffer; + + int md_moved, la_size_changed; + enum determine_dev_size rv = DS_UNCHANGED; + + /* race: + * application request passes inc_ap_bio, + * but then cannot get an AL-reference. + * this function later may wait on ap_bio_cnt == 0. -> deadlock. + * + * to avoid that: + * Suspend IO right here. + * still lock the act_log to not trigger ASSERTs there. + */ + drbd_suspend_io(device); + buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */ + if (!buffer) { + drbd_resume_io(device); + return DS_ERROR; + } + + /* no wait necessary anymore, actually we could assert that */ + wait_event(device->al_wait, lc_try_lock(device->act_log)); + + prev_first_sect = drbd_md_first_sector(device->ldev); + prev_size = device->ldev->md.md_size_sect; + la_size_sect = device->ldev->md.la_size_sect; + + if (rs) { + /* rs is non NULL if we should change the AL layout only */ + + prev_al_stripes = md->al_stripes; + prev_al_stripe_size_4k = md->al_stripe_size_4k; + + md->al_stripes = rs->al_stripes; + md->al_stripe_size_4k = rs->al_stripe_size / 4; + md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4; + } + + drbd_md_set_sector_offsets(device, device->ldev); + + rcu_read_lock(); + u_size = rcu_dereference(device->ldev->disk_conf)->disk_size; + rcu_read_unlock(); + size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED); + + if (size < la_size_sect) { + if (rs && u_size == 0) { + /* Remove "rs &&" later. This check should always be active, but + right now the receiver expects the permissive behavior */ + drbd_warn(device, "Implicit shrink not allowed. " + "Use --size=%llus for explicit shrink.\n", + (unsigned long long)size); + rv = DS_ERROR_SHRINK; + } + if (u_size > size) + rv = DS_ERROR_SPACE_MD; + if (rv != DS_UNCHANGED) + goto err_out; + } + + if (drbd_get_capacity(device->this_bdev) != size || + drbd_bm_capacity(device) != size) { + int err; + err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC)); + if (unlikely(err)) { + /* currently there is only one error: ENOMEM! */ + size = drbd_bm_capacity(device)>>1; + if (size == 0) { + drbd_err(device, "OUT OF MEMORY! " + "Could not allocate bitmap!\n"); + } else { + drbd_err(device, "BM resizing failed. " + "Leaving size unchanged at size = %lu KB\n", + (unsigned long)size); + } + rv = DS_ERROR; + } + /* racy, see comments above. 
*/ + drbd_set_my_capacity(device, size); + device->ldev->md.la_size_sect = size; + drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), + (unsigned long long)size>>1); + } + if (rv <= DS_ERROR) + goto err_out; + + la_size_changed = (la_size_sect != device->ldev->md.la_size_sect); + + md_moved = prev_first_sect != drbd_md_first_sector(device->ldev) + || prev_size != device->ldev->md.md_size_sect; + + if (la_size_changed || md_moved || rs) { + u32 prev_flags; + + /* We do some synchronous IO below, which may take some time. + * Clear the timer, to avoid scary "timer expired!" messages, + * "Superblock" is written out at least twice below, anyways. */ + del_timer(&device->md_sync_timer); + drbd_al_shrink(device); /* All extents inactive. */ + + prev_flags = md->flags; + md->flags &= ~MDF_PRIMARY_IND; + drbd_md_write(device, buffer); + + drbd_info(device, "Writing the whole bitmap, %s\n", + la_size_changed && md_moved ? "size changed and md moved" : + la_size_changed ? "size changed" : "md moved"); + /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ + drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write, + "size changed", BM_LOCKED_MASK); + drbd_initialize_al(device, buffer); + + md->flags = prev_flags; + drbd_md_write(device, buffer); + + if (rs) + drbd_info(device, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n", + md->al_stripes, md->al_stripe_size_4k * 4); + } + + if (size > la_size_sect) + rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO; + if (size < la_size_sect) + rv = DS_SHRUNK; + + if (0) { + err_out: + if (rs) { + md->al_stripes = prev_al_stripes; + md->al_stripe_size_4k = prev_al_stripe_size_4k; + md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k; + + drbd_md_set_sector_offsets(device, device->ldev); + } + } + lc_unlock(device->act_log); + wake_up(&device->al_wait); + drbd_md_put_buffer(device); + drbd_resume_io(device); + + return rv; +} + +sector_t +drbd_new_dev_size(struct drbd_device *device, struct drbd_backing_dev *bdev, + sector_t u_size, int assume_peer_has_space) +{ + sector_t p_size = device->p_size; /* partner's disk size. */ + sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */ + sector_t m_size; /* my size */ + sector_t size = 0; + + m_size = drbd_get_max_capacity(bdev); + + if (device->state.conn < C_CONNECTED && assume_peer_has_space) { + drbd_warn(device, "Resize while not connected was forced by the user!\n"); + p_size = m_size; + } + + if (p_size && m_size) { + size = min_t(sector_t, p_size, m_size); + } else { + if (la_size_sect) { + size = la_size_sect; + if (m_size && m_size < size) + size = m_size; + if (p_size && p_size < size) + size = p_size; + } else { + if (m_size) + size = m_size; + if (p_size) + size = p_size; + } + } + + if (size == 0) + drbd_err(device, "Both nodes diskless!\n"); + + if (u_size) { + if (u_size > size) + drbd_err(device, "Requested disk size is too big (%lu > %lu)\n", + (unsigned long)u_size>>1, (unsigned long)size>>1); + else + size = u_size; + } + + return size; +} + +/** + * drbd_check_al_size() - Ensures that the AL is of the right size + * @device: DRBD device. + * + * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation + * failed, and 0 on success. You should call drbd_md_sync() after you called + * this function. 
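+ *
+ * For reference, drbd_adm_disk_opts() below changes the AL size with
+ * roughly this pattern:
+ *
+ *	drbd_suspend_io(device);
+ *	wait_event(device->al_wait, lc_try_lock(device->act_log));
+ *	drbd_al_shrink(device);
+ *	err = drbd_check_al_size(device, new_disk_conf);
+ *	lc_unlock(device->act_log);
+ *	wake_up(&device->al_wait);
+ *	drbd_resume_io(device);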
+ */ +static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc) +{ + struct lru_cache *n, *t; + struct lc_element *e; + unsigned int in_use; + int i; + + if (device->act_log && + device->act_log->nr_elements == dc->al_extents) + return 0; + + in_use = 0; + t = device->act_log; + n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION, + dc->al_extents, sizeof(struct lc_element), 0); + + if (n == NULL) { + drbd_err(device, "Cannot allocate act_log lru!\n"); + return -ENOMEM; + } + spin_lock_irq(&device->al_lock); + if (t) { + for (i = 0; i < t->nr_elements; i++) { + e = lc_element_by_index(t, i); + if (e->refcnt) + drbd_err(device, "refcnt(%d)==%d\n", + e->lc_number, e->refcnt); + in_use += e->refcnt; + } + } + if (!in_use) + device->act_log = n; + spin_unlock_irq(&device->al_lock); + if (in_use) { + drbd_err(device, "Activity log still in use!\n"); + lc_destroy(n); + return -EBUSY; + } else { + if (t) + lc_destroy(t); + } + drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */ + return 0; +} + +static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev, + unsigned int max_bio_size) +{ + struct request_queue * const q = device->rq_queue; + unsigned int max_hw_sectors = max_bio_size >> 9; + unsigned int max_segments = 0; + struct request_queue *b = NULL; + + if (bdev) { + b = bdev->backing_bdev->bd_disk->queue; + + max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); + rcu_read_lock(); + max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs; + rcu_read_unlock(); + + blk_set_stacking_limits(&q->limits); + blk_queue_max_write_same_sectors(q, 0); + } + + blk_queue_logical_block_size(q, 512); + blk_queue_max_hw_sectors(q, max_hw_sectors); + /* This is the workaround for "bio would need to, but cannot, be split" */ + blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); + blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1); + + if (b) { + struct drbd_connection *connection = first_peer_device(device)->connection; + + if (blk_queue_discard(b) && + (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) { + /* For now, don't allow more than one activity log extent worth of data + * to be discarded in one go. We may need to rework drbd_al_begin_io() + * to allow for even larger discard ranges */ + q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS; + + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + /* REALLY? Is stacking secdiscard "legal"? 
*/ + if (blk_queue_secdiscard(b)) + queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q); + } else { + q->limits.max_discard_sectors = 0; + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); + queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q); + } + + blk_queue_stack_limits(q, b); + + if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { + drbd_info(device, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", + q->backing_dev_info.ra_pages, + b->backing_dev_info.ra_pages); + q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; + } + } +} + +void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev) +{ + unsigned int now, new, local, peer; + + now = queue_max_hw_sectors(device->rq_queue) << 9; + local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */ + peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */ + + if (bdev) { + local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9; + device->local_max_bio_size = local; + } + local = min(local, DRBD_MAX_BIO_SIZE); + + /* We may ignore peer limits if the peer is modern enough. + Because new from 8.3.8 onwards the peer can use multiple + BIOs for a single peer_request */ + if (device->state.conn >= C_WF_REPORT_PARAMS) { + if (first_peer_device(device)->connection->agreed_pro_version < 94) + peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ + else if (first_peer_device(device)->connection->agreed_pro_version == 94) + peer = DRBD_MAX_SIZE_H80_PACKET; + else if (first_peer_device(device)->connection->agreed_pro_version < 100) + peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ + else + peer = DRBD_MAX_BIO_SIZE; + + /* We may later detach and re-attach on a disconnected Primary. + * Avoid this setting to jump back in that case. + * We want to store what we know the peer DRBD can handle, + * not what the peer IO backend can handle. */ + if (peer > device->peer_max_bio_size) + device->peer_max_bio_size = peer; + } + new = min(local, peer); + + if (device->state.role == R_PRIMARY && new < now) + drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", new, now); + + if (new != now) + drbd_info(device, "max BIO size = %u\n", new); + + drbd_setup_queue_param(device, bdev, new); +} + +/* Starts the worker thread */ +static void conn_reconfig_start(struct drbd_connection *connection) +{ + drbd_thread_start(&connection->worker); + drbd_flush_workqueue(&connection->sender_work); +} + +/* if still unconfigured, stops worker again. */ +static void conn_reconfig_done(struct drbd_connection *connection) +{ + bool stop_threads; + spin_lock_irq(&connection->resource->req_lock); + stop_threads = conn_all_vols_unconf(connection) && + connection->cstate == C_STANDALONE; + spin_unlock_irq(&connection->resource->req_lock); + if (stop_threads) { + /* asender is implicitly stopped by receiver + * in conn_disconnect() */ + drbd_thread_stop(&connection->receiver); + drbd_thread_stop(&connection->worker); + } +} + +/* Make sure IO is suspended before calling this function(). 
*/ +static void drbd_suspend_al(struct drbd_device *device) +{ + int s = 0; + + if (!lc_try_lock(device->act_log)) { + drbd_warn(device, "Failed to lock al in drbd_suspend_al()\n"); + return; + } + + drbd_al_shrink(device); + spin_lock_irq(&device->resource->req_lock); + if (device->state.conn < C_CONNECTED) + s = !test_and_set_bit(AL_SUSPENDED, &device->flags); + spin_unlock_irq(&device->resource->req_lock); + lc_unlock(device->act_log); + + if (s) + drbd_info(device, "Suspended AL updates\n"); +} + + +static bool should_set_defaults(struct genl_info *info) +{ + unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags; + return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); +} + +static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev) +{ + /* This is limited by 16 bit "slot" numbers, + * and by available on-disk context storage. + * + * Also (u16)~0 is special (denotes a "free" extent). + * + * One transaction occupies one 4kB on-disk block, + * we have n such blocks in the on disk ring buffer, + * the "current" transaction may fail (n-1), + * and there is 919 slot numbers context information per transaction. + * + * 72 transaction blocks amounts to more than 2**16 context slots, + * so cap there first. + */ + const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX; + const unsigned int sufficient_on_disk = + (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1) + /AL_CONTEXT_PER_TRANSACTION; + + unsigned int al_size_4k = bdev->md.al_size_4k; + + if (al_size_4k > sufficient_on_disk) + return max_al_nr; + + return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION; +} + +static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b) +{ + return a->disk_barrier != b->disk_barrier || + a->disk_flushes != b->disk_flushes || + a->disk_drain != b->disk_drain; +} + +int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct drbd_device *device; + struct disk_conf *new_disk_conf, *old_disk_conf; + struct fifo_buffer *old_plan = NULL, *new_plan = NULL; + int err, fifo_size; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + + device = adm_ctx.device; + mutex_lock(&adm_ctx.resource->adm_mutex); + + /* we also need a disk + * to change the options on */ + if (!get_ldev(device)) { + retcode = ERR_NO_DISK; + goto out; + } + + new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + retcode = ERR_NOMEM; + goto fail; + } + + mutex_lock(&device->resource->conf_update); + old_disk_conf = device->ldev->disk_conf; + *new_disk_conf = *old_disk_conf; + if (should_set_defaults(info)) + set_disk_conf_defaults(new_disk_conf); + + err = disk_conf_from_attrs_for_change(new_disk_conf, info); + if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail_unlock; + } + + if (!expect(new_disk_conf->resync_rate >= 1)) + new_disk_conf->resync_rate = 1; + + if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) + new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; + if (new_disk_conf->al_extents > drbd_al_extents_max(device->ldev)) + new_disk_conf->al_extents = drbd_al_extents_max(device->ldev); + + if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) + new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; + + fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; + if (fifo_size != 
device->rs_plan_s->size) { + new_plan = fifo_alloc(fifo_size); + if (!new_plan) { + drbd_err(device, "kmalloc of fifo_buffer failed"); + retcode = ERR_NOMEM; + goto fail_unlock; + } + } + + drbd_suspend_io(device); + wait_event(device->al_wait, lc_try_lock(device->act_log)); + drbd_al_shrink(device); + err = drbd_check_al_size(device, new_disk_conf); + lc_unlock(device->act_log); + wake_up(&device->al_wait); + drbd_resume_io(device); + + if (err) { + retcode = ERR_NOMEM; + goto fail_unlock; + } + + write_lock_irq(&global_state_lock); + retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); + if (retcode == NO_ERROR) { + rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); + drbd_resync_after_changed(device); + } + write_unlock_irq(&global_state_lock); + + if (retcode != NO_ERROR) + goto fail_unlock; + + if (new_plan) { + old_plan = device->rs_plan_s; + rcu_assign_pointer(device->rs_plan_s, new_plan); + } + + mutex_unlock(&device->resource->conf_update); + + if (new_disk_conf->al_updates) + device->ldev->md.flags &= ~MDF_AL_DISABLED; + else + device->ldev->md.flags |= MDF_AL_DISABLED; + + if (new_disk_conf->md_flushes) + clear_bit(MD_NO_FUA, &device->flags); + else + set_bit(MD_NO_FUA, &device->flags); + + if (write_ordering_changed(old_disk_conf, new_disk_conf)) + drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush); + + drbd_md_sync(device); + + if (device->state.conn >= C_CONNECTED) { + struct drbd_peer_device *peer_device; + + for_each_peer_device(peer_device, device) + drbd_send_sync_param(peer_device); + } + + synchronize_rcu(); + kfree(old_disk_conf); + kfree(old_plan); + mod_timer(&device->request_timer, jiffies + HZ); + goto success; + +fail_unlock: + mutex_unlock(&device->resource->conf_update); + fail: + kfree(new_disk_conf); + kfree(new_plan); +success: + put_ldev(device); + out: + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_device *device; + struct drbd_peer_device *peer_device; + struct drbd_connection *connection; + int err; + enum drbd_ret_code retcode; + enum determine_dev_size dd; + sector_t max_possible_sectors; + sector_t min_md_device_sectors; + struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ + struct disk_conf *new_disk_conf = NULL; + struct block_device *bdev; + struct lru_cache *resync_lru = NULL; + struct fifo_buffer *new_plan = NULL; + union drbd_state ns, os; + enum drbd_state_rv rv; + struct net_conf *nc; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + + device = adm_ctx.device; + mutex_lock(&adm_ctx.resource->adm_mutex); + peer_device = first_peer_device(device); + connection = peer_device ? peer_device->connection : NULL; + conn_reconfig_start(connection); + + /* if you want to reconfigure, please tear down first */ + if (device->state.disk > D_DISKLESS) { + retcode = ERR_DISK_CONFIGURED; + goto fail; + } + /* It may just now have detached because of IO error. Make sure + * drbd_ldev_destroy is done already, we may end up here very fast, + * e.g. 
if someone calls attach from the on-io-error handler, + * to realize a "hot spare" feature (not that I'd recommend that) */ + wait_event(device->misc_wait, !test_bit(GOING_DISKLESS, &device->flags)); + + /* make sure there is no leftover from previous force-detach attempts */ + clear_bit(FORCE_DETACH, &device->flags); + clear_bit(WAS_IO_ERROR, &device->flags); + clear_bit(WAS_READ_ERROR, &device->flags); + + /* and no leftover from previously aborted resync or verify, either */ + device->rs_total = 0; + device->rs_failed = 0; + atomic_set(&device->rs_pending_cnt, 0); + + /* allocation not in the IO path, drbdsetup context */ + nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); + if (!nbc) { + retcode = ERR_NOMEM; + goto fail; + } + spin_lock_init(&nbc->md.uuid_lock); + + new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + retcode = ERR_NOMEM; + goto fail; + } + nbc->disk_conf = new_disk_conf; + + set_disk_conf_defaults(new_disk_conf); + err = disk_conf_from_attrs(new_disk_conf, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail; + } + + if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) + new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; + + new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ); + if (!new_plan) { + retcode = ERR_NOMEM; + goto fail; + } + + if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { + retcode = ERR_MD_IDX_INVALID; + goto fail; + } + + write_lock_irq(&global_state_lock); + retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); + write_unlock_irq(&global_state_lock); + if (retcode != NO_ERROR) + goto fail; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (nc) { + if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) { + rcu_read_unlock(); + retcode = ERR_STONITH_AND_PROT_A; + goto fail; + } + } + rcu_read_unlock(); + + bdev = blkdev_get_by_path(new_disk_conf->backing_dev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, device); + if (IS_ERR(bdev)) { + drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev, + PTR_ERR(bdev)); + retcode = ERR_OPEN_DISK; + goto fail; + } + nbc->backing_bdev = bdev; + + /* + * meta_dev_idx >= 0: external fixed size, possibly multiple + * drbd sharing one meta device. TODO in that case, paranoia + * check that [md_bdev, meta_dev_idx] is not yet used by some + * other drbd minor! (if you use drbd.conf + drbdadm, that + * should check it for you already; but if you don't, or + * someone fooled it, we need to double check here) + */ + bdev = blkdev_get_by_path(new_disk_conf->meta_dev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, + (new_disk_conf->meta_dev_idx < 0) ? + (void *)device : (void *)drbd_m_holder); + if (IS_ERR(bdev)) { + drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev, + PTR_ERR(bdev)); + retcode = ERR_OPEN_MD_DISK; + goto fail; + } + nbc->md_bdev = bdev; + + if ((nbc->backing_bdev == nbc->md_bdev) != + (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL || + new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { + retcode = ERR_MD_IDX_INVALID; + goto fail; + } + + resync_lru = lc_create("resync", drbd_bm_ext_cache, + 1, 61, sizeof(struct bm_extent), + offsetof(struct bm_extent, lce)); + if (!resync_lru) { + retcode = ERR_NOMEM; + goto fail; + } + + /* Read our meta data super block early. + * This also sets other on-disk offsets. 
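+ *
+ * For internal meta data the resulting layout is the one sketched above
+ * drbd_md_set_sector_offsets(): bitmap, activity log and 4k super block at
+ * the end of the backing device.  As a rough rule of thumb that is one bit
+ * of bitmap per 4kB data block (about 32kB of bitmap per GB of data), plus
+ * 8 sectors of super block and, with the traditional 32kB activity log,
+ * 64 AL sectors.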
*/ + retcode = drbd_md_read(device, nbc); + if (retcode != NO_ERROR) + goto fail; + + if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) + new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; + if (new_disk_conf->al_extents > drbd_al_extents_max(nbc)) + new_disk_conf->al_extents = drbd_al_extents_max(nbc); + + if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { + drbd_err(device, "max capacity %llu smaller than disk size %llu\n", + (unsigned long long) drbd_get_max_capacity(nbc), + (unsigned long long) new_disk_conf->disk_size); + retcode = ERR_DISK_TOO_SMALL; + goto fail; + } + + if (new_disk_conf->meta_dev_idx < 0) { + max_possible_sectors = DRBD_MAX_SECTORS_FLEX; + /* at least one MB, otherwise it does not make sense */ + min_md_device_sectors = (2<<10); + } else { + max_possible_sectors = DRBD_MAX_SECTORS; + min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1); + } + + if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { + retcode = ERR_MD_DISK_TOO_SMALL; + drbd_warn(device, "refusing attach: md-device too small, " + "at least %llu sectors needed for this meta-disk type\n", + (unsigned long long) min_md_device_sectors); + goto fail; + } + + /* Make sure the new disk is big enough + * (we may currently be R_PRIMARY with no local disk...) */ + if (drbd_get_max_capacity(nbc) < + drbd_get_capacity(device->this_bdev)) { + retcode = ERR_DISK_TOO_SMALL; + goto fail; + } + + nbc->known_size = drbd_get_capacity(nbc->backing_bdev); + + if (nbc->known_size > max_possible_sectors) { + drbd_warn(device, "==> truncating very big lower level device " + "to currently maximum possible %llu sectors <==\n", + (unsigned long long) max_possible_sectors); + if (new_disk_conf->meta_dev_idx >= 0) + drbd_warn(device, "==>> using internal or flexible " + "meta data may help <<==\n"); + } + + drbd_suspend_io(device); + /* also wait for the last barrier ack. */ + /* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171 + * We need a way to either ignore barrier acks for barriers sent before a device + * was attached, or a way to wait for all pending barrier acks to come in. + * As barriers are counted per resource, + * we'd need to suspend io on all devices of a resource. + */ + wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device)); + /* and for any other previously queued work */ + drbd_flush_workqueue(&connection->sender_work); + + rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE); + retcode = rv; /* FIXME: Type mismatch. */ + drbd_resume_io(device); + if (rv < SS_SUCCESS) + goto fail; + + if (!get_ldev_if_state(device, D_ATTACHING)) + goto force_diskless; + + if (!device->bitmap) { + if (drbd_bm_init(device)) { + retcode = ERR_NOMEM; + goto force_diskless_dec; + } + } + + if (device->state.conn < C_CONNECTED && + device->state.role == R_PRIMARY && device->ed_uuid && + (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { + drbd_err(device, "Can only attach to data with current UUID=%016llX\n", + (unsigned long long)device->ed_uuid); + retcode = ERR_DATA_NOT_CURRENT; + goto force_diskless_dec; + } + + /* Since we are diskless, fix the activity log first... */ + if (drbd_check_al_size(device, new_disk_conf)) { + retcode = ERR_NOMEM; + goto force_diskless_dec; + } + + /* Prevent shrinking of consistent devices ! 
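+ * That is: if the meta data claims MDF_CONSISTENT, refuse to attach when the
+ * effective new size (derived from the configured disk_size and the peer's
+ * size) would be smaller than the last agreed size recorded in the meta data.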
*/ + if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && + drbd_new_dev_size(device, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) { + drbd_warn(device, "refusing to truncate a consistent device\n"); + retcode = ERR_DISK_TOO_SMALL; + goto force_diskless_dec; + } + + /* Reset the "barriers don't work" bits here, then force meta data to + * be written, to ensure we determine if barriers are supported. */ + if (new_disk_conf->md_flushes) + clear_bit(MD_NO_FUA, &device->flags); + else + set_bit(MD_NO_FUA, &device->flags); + + /* Point of no return reached. + * Devices and memory are no longer released by error cleanup below. + * now device takes over responsibility, and the state engine should + * clean it up somewhere. */ + D_ASSERT(device, device->ldev == NULL); + device->ldev = nbc; + device->resync = resync_lru; + device->rs_plan_s = new_plan; + nbc = NULL; + resync_lru = NULL; + new_disk_conf = NULL; + new_plan = NULL; + + drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush); + + if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY)) + set_bit(CRASHED_PRIMARY, &device->flags); + else + clear_bit(CRASHED_PRIMARY, &device->flags); + + if (drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) && + !(device->state.role == R_PRIMARY && device->resource->susp_nod)) + set_bit(CRASHED_PRIMARY, &device->flags); + + device->send_cnt = 0; + device->recv_cnt = 0; + device->read_cnt = 0; + device->writ_cnt = 0; + + drbd_reconsider_max_bio_size(device, device->ldev); + + /* If I am currently not R_PRIMARY, + * but meta data primary indicator is set, + * I just now recover from a hard crash, + * and have been R_PRIMARY before that crash. + * + * Now, if I had no connection before that crash + * (have been degraded R_PRIMARY), chances are that + * I won't find my peer now either. + * + * In that case, and _only_ in that case, + * we use the degr-wfc-timeout instead of the default, + * so we can automatically recover from a crash of a + * degraded but active "cluster" after a certain timeout. + */ + clear_bit(USE_DEGR_WFC_T, &device->flags); + if (device->state.role != R_PRIMARY && + drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) && + !drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND)) + set_bit(USE_DEGR_WFC_T, &device->flags); + + dd = drbd_determine_dev_size(device, 0, NULL); + if (dd <= DS_ERROR) { + retcode = ERR_NOMEM_BITMAP; + goto force_diskless_dec; + } else if (dd == DS_GREW) + set_bit(RESYNC_AFTER_NEG, &device->flags); + + if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) || + (test_bit(CRASHED_PRIMARY, &device->flags) && + drbd_md_test_flag(device->ldev, MDF_AL_DISABLED))) { + drbd_info(device, "Assuming that all blocks are out of sync " + "(aka FullSync)\n"); + if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, + "set_n_write from attaching", BM_LOCKED_MASK)) { + retcode = ERR_IO_MD_DISK; + goto force_diskless_dec; + } + } else { + if (drbd_bitmap_io(device, &drbd_bm_read, + "read from attaching", BM_LOCKED_MASK)) { + retcode = ERR_IO_MD_DISK; + goto force_diskless_dec; + } + } + + if (_drbd_bm_total_weight(device) == drbd_bm_bits(device)) + drbd_suspend_al(device); /* IO is still suspended here... */ + + spin_lock_irq(&device->resource->req_lock); + os = drbd_read_state(device); + ns = os; + /* If MDF_CONSISTENT is not set go into inconsistent state, + otherwise investigate MDF_WasUpToDate... + If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, + otherwise into D_CONSISTENT state. 
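+
+	   In short:
+	     !MDF_CONSISTENT                         -> D_INCONSISTENT
+	      MDF_CONSISTENT && !MDF_WAS_UP_TO_DATE  -> D_OUTDATED
+	      MDF_CONSISTENT &&  MDF_WAS_UP_TO_DATE  -> D_CONSISTENT
+	   (possibly promoted to D_UP_TO_DATE right below, if the peer is
+	   known to be outdated or fencing is FP_DONT_CARE).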
+ */ + if (drbd_md_test_flag(device->ldev, MDF_CONSISTENT)) { + if (drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE)) + ns.disk = D_CONSISTENT; + else + ns.disk = D_OUTDATED; + } else { + ns.disk = D_INCONSISTENT; + } + + if (drbd_md_test_flag(device->ldev, MDF_PEER_OUT_DATED)) + ns.pdsk = D_OUTDATED; + + rcu_read_lock(); + if (ns.disk == D_CONSISTENT && + (ns.pdsk == D_OUTDATED || rcu_dereference(device->ldev->disk_conf)->fencing == FP_DONT_CARE)) + ns.disk = D_UP_TO_DATE; + + /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, + MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before + this point, because drbd_request_state() modifies these + flags. */ + + if (rcu_dereference(device->ldev->disk_conf)->al_updates) + device->ldev->md.flags &= ~MDF_AL_DISABLED; + else + device->ldev->md.flags |= MDF_AL_DISABLED; + + rcu_read_unlock(); + + /* In case we are C_CONNECTED postpone any decision on the new disk + state after the negotiation phase. */ + if (device->state.conn == C_CONNECTED) { + device->new_state_tmp.i = ns.i; + ns.i = os.i; + ns.disk = D_NEGOTIATING; + + /* We expect to receive up-to-date UUIDs soon. + To avoid a race in receive_state, free p_uuid while + holding req_lock. I.e. atomic with the state change */ + kfree(device->p_uuid); + device->p_uuid = NULL; + } + + rv = _drbd_set_state(device, ns, CS_VERBOSE, NULL); + spin_unlock_irq(&device->resource->req_lock); + + if (rv < SS_SUCCESS) + goto force_diskless_dec; + + mod_timer(&device->request_timer, jiffies + HZ); + + if (device->state.role == R_PRIMARY) + device->ldev->md.uuid[UI_CURRENT] |= (u64)1; + else + device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; + + drbd_md_mark_dirty(device); + drbd_md_sync(device); + + kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); + put_ldev(device); + conn_reconfig_done(connection); + mutex_unlock(&adm_ctx.resource->adm_mutex); + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; + + force_diskless_dec: + put_ldev(device); + force_diskless: + drbd_force_state(device, NS(disk, D_DISKLESS)); + drbd_md_sync(device); + fail: + conn_reconfig_done(connection); + if (nbc) { + if (nbc->backing_bdev) + blkdev_put(nbc->backing_bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL); + if (nbc->md_bdev) + blkdev_put(nbc->md_bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL); + kfree(nbc); + } + kfree(new_disk_conf); + lc_destroy(resync_lru); + kfree(new_plan); + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static int adm_detach(struct drbd_device *device, int force) +{ + enum drbd_state_rv retcode; + int ret; + + if (force) { + set_bit(FORCE_DETACH, &device->flags); + drbd_force_state(device, NS(disk, D_FAILED)); + retcode = SS_SUCCESS; + goto out; + } + + drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */ + drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */ + retcode = drbd_request_state(device, NS(disk, D_FAILED)); + drbd_md_put_buffer(device); + /* D_FAILED will transition to DISKLESS. */ + ret = wait_event_interruptible(device->misc_wait, + device->state.disk != D_FAILED); + drbd_resume_io(device); + if ((int)retcode == (int)SS_IS_DISKLESS) + retcode = SS_NOTHING_TO_DO; + if (ret) + retcode = ERR_INTR; +out: + return retcode; +} + +/* Detaching the disk is a process in multiple stages. First we need to lock + * out application IO, in-flight IO, IO stuck in drbd_al_begin_io. 
+ * Then we transition to D_DISKLESS, and wait for put_ldev() to return all + * internal references as well. + * Only then we have finally detached. */ +int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct detach_parms parms = { }; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + if (info->attrs[DRBD_NLA_DETACH_PARMS]) { + err = detach_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto out; + } + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + retcode = adm_detach(adm_ctx.device, parms.force_detach); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static bool conn_resync_running(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + bool rv = false; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (device->state.conn == C_SYNC_SOURCE || + device->state.conn == C_SYNC_TARGET || + device->state.conn == C_PAUSED_SYNC_S || + device->state.conn == C_PAUSED_SYNC_T) { + rv = true; + break; + } + } + rcu_read_unlock(); + + return rv; +} + +static bool conn_ov_running(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + bool rv = false; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (device->state.conn == C_VERIFY_S || + device->state.conn == C_VERIFY_T) { + rv = true; + break; + } + } + rcu_read_unlock(); + + return rv; +} + +static enum drbd_ret_code +_check_net_options(struct drbd_connection *connection, struct net_conf *old_net_conf, struct net_conf *new_net_conf) +{ + struct drbd_peer_device *peer_device; + int i; + + if (old_net_conf && connection->cstate == C_WF_REPORT_PARAMS && connection->agreed_pro_version < 100) { + if (new_net_conf->wire_protocol != old_net_conf->wire_protocol) + return ERR_NEED_APV_100; + + if (new_net_conf->two_primaries != old_net_conf->two_primaries) + return ERR_NEED_APV_100; + + if (strcmp(new_net_conf->integrity_alg, old_net_conf->integrity_alg)) + return ERR_NEED_APV_100; + } + + if (!new_net_conf->two_primaries && + conn_highest_role(connection) == R_PRIMARY && + conn_highest_peer(connection) == R_PRIMARY) + return ERR_NEED_ALLOW_TWO_PRI; + + if (new_net_conf->two_primaries && + (new_net_conf->wire_protocol != DRBD_PROT_C)) + return ERR_NOT_PROTO_C; + + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + struct drbd_device *device = peer_device->device; + if (get_ldev(device)) { + enum drbd_fencing_p fp = rcu_dereference(device->ldev->disk_conf)->fencing; + put_ldev(device); + if (new_net_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) + return ERR_STONITH_AND_PROT_A; + } + if (device->state.role == R_PRIMARY && new_net_conf->discard_my_data) + return ERR_DISCARD_IMPOSSIBLE; + } + + if (new_net_conf->on_congestion != OC_BLOCK && new_net_conf->wire_protocol != DRBD_PROT_A) + return ERR_CONG_NOT_PROTO_A; + + return NO_ERROR; +} + +static enum drbd_ret_code +check_net_options(struct drbd_connection *connection, struct net_conf *new_net_conf) +{ + static enum drbd_ret_code rv; + struct 
drbd_peer_device *peer_device; + int i; + + rcu_read_lock(); + rv = _check_net_options(connection, rcu_dereference(connection->net_conf), new_net_conf); + rcu_read_unlock(); + + /* connection->peer_devices protected by genl_lock() here */ + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + struct drbd_device *device = peer_device->device; + if (!device->bitmap) { + if (drbd_bm_init(device)) + return ERR_NOMEM; + } + } + + return rv; +} + +struct crypto { + struct crypto_hash *verify_tfm; + struct crypto_hash *csums_tfm; + struct crypto_hash *cram_hmac_tfm; + struct crypto_hash *integrity_tfm; +}; + +static int +alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg) +{ + if (!tfm_name[0]) + return NO_ERROR; + + *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(*tfm)) { + *tfm = NULL; + return err_alg; + } + + return NO_ERROR; +} + +static enum drbd_ret_code +alloc_crypto(struct crypto *crypto, struct net_conf *new_net_conf) +{ + char hmac_name[CRYPTO_MAX_ALG_NAME]; + enum drbd_ret_code rv; + + rv = alloc_hash(&crypto->csums_tfm, new_net_conf->csums_alg, + ERR_CSUMS_ALG); + if (rv != NO_ERROR) + return rv; + rv = alloc_hash(&crypto->verify_tfm, new_net_conf->verify_alg, + ERR_VERIFY_ALG); + if (rv != NO_ERROR) + return rv; + rv = alloc_hash(&crypto->integrity_tfm, new_net_conf->integrity_alg, + ERR_INTEGRITY_ALG); + if (rv != NO_ERROR) + return rv; + if (new_net_conf->cram_hmac_alg[0] != 0) { + snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", + new_net_conf->cram_hmac_alg); + + rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name, + ERR_AUTH_ALG); + } + + return rv; +} + +static void free_crypto(struct crypto *crypto) +{ + crypto_free_hash(crypto->cram_hmac_tfm); + crypto_free_hash(crypto->integrity_tfm); + crypto_free_hash(crypto->csums_tfm); + crypto_free_hash(crypto->verify_tfm); +} + +int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct drbd_connection *connection; + struct net_conf *old_net_conf, *new_net_conf = NULL; + int err; + int ovr; /* online verify running */ + int rsr; /* re-sync running */ + struct crypto crypto = { }; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + + connection = adm_ctx.connection; + mutex_lock(&adm_ctx.resource->adm_mutex); + + new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); + if (!new_net_conf) { + retcode = ERR_NOMEM; + goto out; + } + + conn_reconfig_start(connection); + + mutex_lock(&connection->data.mutex); + mutex_lock(&connection->resource->conf_update); + old_net_conf = connection->net_conf; + + if (!old_net_conf) { + drbd_msg_put_info(adm_ctx.reply_skb, "net conf missing, try connect"); + retcode = ERR_INVALID_REQUEST; + goto fail; + } + + *new_net_conf = *old_net_conf; + if (should_set_defaults(info)) + set_net_conf_defaults(new_net_conf); + + err = net_conf_from_attrs_for_change(new_net_conf, info); + if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail; + } + + retcode = check_net_options(connection, new_net_conf); + if (retcode != NO_ERROR) + goto fail; + + /* re-sync running */ + rsr = conn_resync_running(connection); + if (rsr && strcmp(new_net_conf->csums_alg, old_net_conf->csums_alg)) { + retcode = ERR_CSUMS_RESYNC_RUNNING; + goto fail; + } + + /* online verify running */ + ovr = 
conn_ov_running(connection); + if (ovr && strcmp(new_net_conf->verify_alg, old_net_conf->verify_alg)) { + retcode = ERR_VERIFY_RUNNING; + goto fail; + } + + retcode = alloc_crypto(&crypto, new_net_conf); + if (retcode != NO_ERROR) + goto fail; + + rcu_assign_pointer(connection->net_conf, new_net_conf); + + if (!rsr) { + crypto_free_hash(connection->csums_tfm); + connection->csums_tfm = crypto.csums_tfm; + crypto.csums_tfm = NULL; + } + if (!ovr) { + crypto_free_hash(connection->verify_tfm); + connection->verify_tfm = crypto.verify_tfm; + crypto.verify_tfm = NULL; + } + + crypto_free_hash(connection->integrity_tfm); + connection->integrity_tfm = crypto.integrity_tfm; + if (connection->cstate >= C_WF_REPORT_PARAMS && connection->agreed_pro_version >= 100) + /* Do this without trying to take connection->data.mutex again. */ + __drbd_send_protocol(connection, P_PROTOCOL_UPDATE); + + crypto_free_hash(connection->cram_hmac_tfm); + connection->cram_hmac_tfm = crypto.cram_hmac_tfm; + + mutex_unlock(&connection->resource->conf_update); + mutex_unlock(&connection->data.mutex); + synchronize_rcu(); + kfree(old_net_conf); + + if (connection->cstate >= C_WF_REPORT_PARAMS) { + struct drbd_peer_device *peer_device; + int vnr; + + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + drbd_send_sync_param(peer_device); + } + + goto done; + + fail: + mutex_unlock(&connection->resource->conf_update); + mutex_unlock(&connection->data.mutex); + free_crypto(&crypto); + kfree(new_net_conf); + done: + conn_reconfig_done(connection); + out: + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_peer_device *peer_device; + struct net_conf *old_net_conf, *new_net_conf = NULL; + struct crypto crypto = { }; + struct drbd_resource *resource; + struct drbd_connection *connection; + enum drbd_ret_code retcode; + int i; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); + + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) { + drbd_msg_put_info(adm_ctx.reply_skb, "connection endpoint(s) missing"); + retcode = ERR_INVALID_REQUEST; + goto out; + } + + /* No need for _rcu here. All reconfiguration is + * strictly serialized on genl_lock(). 
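+ * The loop below rejects the new connection if either endpoint is already
+ * in use by an existing connection of any resource: a matching local
+ * address yields ERR_LOCAL_ADDR, a matching peer address ERR_PEER_ADDR.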
We are protected against + * concurrent reconfiguration/addition/deletion */ + for_each_resource(resource, &drbd_resources) { + for_each_connection(connection, resource) { + if (nla_len(adm_ctx.my_addr) == connection->my_addr_len && + !memcmp(nla_data(adm_ctx.my_addr), &connection->my_addr, + connection->my_addr_len)) { + retcode = ERR_LOCAL_ADDR; + goto out; + } + + if (nla_len(adm_ctx.peer_addr) == connection->peer_addr_len && + !memcmp(nla_data(adm_ctx.peer_addr), &connection->peer_addr, + connection->peer_addr_len)) { + retcode = ERR_PEER_ADDR; + goto out; + } + } + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + connection = first_connection(adm_ctx.resource); + conn_reconfig_start(connection); + + if (connection->cstate > C_STANDALONE) { + retcode = ERR_NET_CONFIGURED; + goto fail; + } + + /* allocation not in the IO path, drbdsetup / netlink process context */ + new_net_conf = kzalloc(sizeof(*new_net_conf), GFP_KERNEL); + if (!new_net_conf) { + retcode = ERR_NOMEM; + goto fail; + } + + set_net_conf_defaults(new_net_conf); + + err = net_conf_from_attrs(new_net_conf, info); + if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail; + } + + retcode = check_net_options(connection, new_net_conf); + if (retcode != NO_ERROR) + goto fail; + + retcode = alloc_crypto(&crypto, new_net_conf); + if (retcode != NO_ERROR) + goto fail; + + ((char *)new_net_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; + + drbd_flush_workqueue(&connection->sender_work); + + mutex_lock(&adm_ctx.resource->conf_update); + old_net_conf = connection->net_conf; + if (old_net_conf) { + retcode = ERR_NET_CONFIGURED; + mutex_unlock(&adm_ctx.resource->conf_update); + goto fail; + } + rcu_assign_pointer(connection->net_conf, new_net_conf); + + conn_free_crypto(connection); + connection->cram_hmac_tfm = crypto.cram_hmac_tfm; + connection->integrity_tfm = crypto.integrity_tfm; + connection->csums_tfm = crypto.csums_tfm; + connection->verify_tfm = crypto.verify_tfm; + + connection->my_addr_len = nla_len(adm_ctx.my_addr); + memcpy(&connection->my_addr, nla_data(adm_ctx.my_addr), connection->my_addr_len); + connection->peer_addr_len = nla_len(adm_ctx.peer_addr); + memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len); + + mutex_unlock(&adm_ctx.resource->conf_update); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + struct drbd_device *device = peer_device->device; + device->send_cnt = 0; + device->recv_cnt = 0; + } + rcu_read_unlock(); + + retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); + + conn_reconfig_done(connection); + mutex_unlock(&adm_ctx.resource->adm_mutex); + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; + +fail: + free_crypto(&crypto); + kfree(new_net_conf); + + conn_reconfig_done(connection); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection, bool force) +{ + enum drbd_state_rv rv; + + rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), + force ? CS_HARD : 0); + + switch (rv) { + case SS_NOTHING_TO_DO: + break; + case SS_ALREADY_STANDALONE: + return SS_SUCCESS; + case SS_PRIMARY_NOP: + /* Our state checking code wants to see the peer outdated. 
*/ + rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0); + + if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */ + rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_VERBOSE); + + break; + case SS_CW_FAILED_BY_PEER: + /* The peer probably wants to see us outdated. */ + rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING, + disk, D_OUTDATED), 0); + if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) { + rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), + CS_HARD); + } + break; + default:; + /* no special handling necessary */ + } + + if (rv >= SS_SUCCESS) { + enum drbd_state_rv rv2; + /* No one else can reconfigure the network while I am here. + * The state handling only uses drbd_thread_stop_nowait(), + * we want to really wait here until the receiver is no more. + */ + drbd_thread_stop(&connection->receiver); + + /* Race breaker. This additional state change request may be + * necessary, if this was a forced disconnect during a receiver + * restart. We may have "killed" the receiver thread just + * after drbd_receiver() returned. Typically, we should be + * C_STANDALONE already, now, and this becomes a no-op. + */ + rv2 = conn_request_state(connection, NS(conn, C_STANDALONE), + CS_VERBOSE | CS_HARD); + if (rv2 < SS_SUCCESS) + drbd_err(connection, + "unexpected rv2=%d in conn_try_disconnect()\n", + rv2); + } + return rv; +} + +int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct disconnect_parms parms; + struct drbd_connection *connection; + enum drbd_state_rv rv; + enum drbd_ret_code retcode; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto fail; + + connection = adm_ctx.connection; + memset(&parms, 0, sizeof(parms)); + if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) { + err = disconnect_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail; + } + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + rv = conn_try_disconnect(connection, parms.force_disconnect); + if (rv < SS_SUCCESS) + retcode = rv; /* FIXME: Type mismatch. 
*/ + else + retcode = NO_ERROR; + mutex_unlock(&adm_ctx.resource->adm_mutex); + fail: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +void resync_after_online_grow(struct drbd_device *device) +{ + int iass; /* I am sync source */ + + drbd_info(device, "Resync of new storage after online grow\n"); + if (device->state.role != device->state.peer) + iass = (device->state.role == R_PRIMARY); + else + iass = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags); + + if (iass) + drbd_start_resync(device, C_SYNC_SOURCE); + else + _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); +} + +int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct disk_conf *old_disk_conf, *new_disk_conf = NULL; + struct resize_parms rs; + struct drbd_device *device; + enum drbd_ret_code retcode; + enum determine_dev_size dd; + bool change_al_layout = false; + enum dds_flags ddsf; + sector_t u_size; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + + mutex_lock(&adm_ctx.resource->adm_mutex); + device = adm_ctx.device; + if (!get_ldev(device)) { + retcode = ERR_NO_DISK; + goto fail; + } + + memset(&rs, 0, sizeof(struct resize_parms)); + rs.al_stripes = device->ldev->md.al_stripes; + rs.al_stripe_size = device->ldev->md.al_stripe_size_4k * 4; + if (info->attrs[DRBD_NLA_RESIZE_PARMS]) { + err = resize_parms_from_attrs(&rs, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail_ldev; + } + } + + if (device->state.conn > C_CONNECTED) { + retcode = ERR_RESIZE_RESYNC; + goto fail_ldev; + } + + if (device->state.role == R_SECONDARY && + device->state.peer == R_SECONDARY) { + retcode = ERR_NO_PRIMARY; + goto fail_ldev; + } + + if (rs.no_resync && first_peer_device(device)->connection->agreed_pro_version < 93) { + retcode = ERR_NEED_APV_93; + goto fail_ldev; + } + + rcu_read_lock(); + u_size = rcu_dereference(device->ldev->disk_conf)->disk_size; + rcu_read_unlock(); + if (u_size != (sector_t)rs.resize_size) { + new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + retcode = ERR_NOMEM; + goto fail_ldev; + } + } + + if (device->ldev->md.al_stripes != rs.al_stripes || + device->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) { + u32 al_size_k = rs.al_stripes * rs.al_stripe_size; + + if (al_size_k > (16 * 1024 * 1024)) { + retcode = ERR_MD_LAYOUT_TOO_BIG; + goto fail_ldev; + } + + if (al_size_k < MD_32kB_SECT/2) { + retcode = ERR_MD_LAYOUT_TOO_SMALL; + goto fail_ldev; + } + + if (device->state.conn != C_CONNECTED && !rs.resize_force) { + retcode = ERR_MD_LAYOUT_CONNECTED; + goto fail_ldev; + } + + change_al_layout = true; + } + + if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) + device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev); + + if (new_disk_conf) { + mutex_lock(&device->resource->conf_update); + old_disk_conf = device->ldev->disk_conf; + *new_disk_conf = *old_disk_conf; + new_disk_conf->disk_size = (sector_t)rs.resize_size; + rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); + mutex_unlock(&device->resource->conf_update); + synchronize_rcu(); + kfree(old_disk_conf); + } + + ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? 
DDSF_NO_RESYNC : 0); + dd = drbd_determine_dev_size(device, ddsf, change_al_layout ? &rs : NULL); + drbd_md_sync(device); + put_ldev(device); + if (dd == DS_ERROR) { + retcode = ERR_NOMEM_BITMAP; + goto fail; + } else if (dd == DS_ERROR_SPACE_MD) { + retcode = ERR_MD_LAYOUT_NO_FIT; + goto fail; + } else if (dd == DS_ERROR_SHRINK) { + retcode = ERR_IMPLICIT_SHRINK; + goto fail; + } + + if (device->state.conn == C_CONNECTED) { + if (dd == DS_GREW) + set_bit(RESIZE_PENDING, &device->flags); + + drbd_send_uuids(first_peer_device(device)); + drbd_send_sizes(first_peer_device(device), 1, ddsf); + } + + fail: + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; + + fail_ldev: + put_ldev(device); + goto fail; +} + +int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct res_opts res_opts; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto fail; + + res_opts = adm_ctx.resource->res_opts; + if (should_set_defaults(info)) + set_res_opts_defaults(&res_opts); + + err = res_opts_from_attrs(&res_opts, info); + if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail; + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + err = set_resource_options(adm_ctx.resource, &res_opts); + if (err) { + retcode = ERR_INVALID_REQUEST; + if (err == -ENOMEM) + retcode = ERR_NOMEM; + } + mutex_unlock(&adm_ctx.resource->adm_mutex); + +fail: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_device *device; + int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + device = adm_ctx.device; + if (!get_ldev(device)) { + retcode = ERR_NO_DISK; + goto out; + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + + /* If there is still bitmap IO pending, probably because of a previous + * resync just being finished, wait for it before requesting a new resync. + * Also wait for it's after_state_ch(). */ + drbd_suspend_io(device); + wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); + drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work); + + /* If we happen to be C_STANDALONE R_SECONDARY, just change to + * D_INCONSISTENT, and set all bits in the bitmap. Otherwise, + * try to start a resync handshake as sync target for full sync. 
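+ *
+ * (The mirror image of this, forcing a full sync towards the peer with the
+ *  local node as sync source, is drbd_adm_invalidate_peer() below.)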
+ */ + if (device->state.conn == C_STANDALONE && device->state.role == R_SECONDARY) { + retcode = drbd_request_state(device, NS(disk, D_INCONSISTENT)); + if (retcode >= SS_SUCCESS) { + if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, + "set_n_write from invalidate", BM_LOCKED_MASK)) + retcode = ERR_IO_MD_DISK; + } + } else + retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T)); + drbd_resume_io(device); + mutex_unlock(&adm_ctx.resource->adm_mutex); + put_ldev(device); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info, + union drbd_state mask, union drbd_state val) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mutex_lock(&adm_ctx.resource->adm_mutex); + retcode = drbd_request_state(adm_ctx.device, mask, val); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static int drbd_bmio_set_susp_al(struct drbd_device *device) __must_hold(local) +{ + int rv; + + rv = drbd_bmio_set_n_write(device); + drbd_suspend_al(device); + return rv; +} + +int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + int retcode; /* drbd_ret_code, drbd_state_rv */ + struct drbd_device *device; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + device = adm_ctx.device; + if (!get_ldev(device)) { + retcode = ERR_NO_DISK; + goto out; + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + + /* If there is still bitmap IO pending, probably because of a previous + * resync just being finished, wait for it before requesting a new resync. + * Also wait for it's after_state_ch(). */ + drbd_suspend_io(device); + wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); + drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work); + + /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits + * in the bitmap. Otherwise, try to start a resync handshake + * as sync source for full sync. + */ + if (device->state.conn == C_STANDALONE && device->state.role == R_PRIMARY) { + /* The peer will get a resync upon connect anyways. Just make that + into a full resync. 
*/ + retcode = drbd_request_state(device, NS(pdsk, D_INCONSISTENT)); + if (retcode >= SS_SUCCESS) { + if (drbd_bitmap_io(device, &drbd_bmio_set_susp_al, + "set_n_write from invalidate_peer", + BM_LOCKED_SET_ALLOWED)) + retcode = ERR_IO_MD_DISK; + } + } else + retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S)); + drbd_resume_io(device); + mutex_unlock(&adm_ctx.resource->adm_mutex); + put_ldev(device); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mutex_lock(&adm_ctx.resource->adm_mutex); + if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO) + retcode = ERR_PAUSE_IS_SET; + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + union drbd_dev_state s; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mutex_lock(&adm_ctx.resource->adm_mutex); + if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { + s = adm_ctx.device->state; + if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { + retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP : + s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR; + } else { + retcode = ERR_PAUSE_IS_CLEAR; + } + } + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info) +{ + return drbd_adm_simple_request_state(skb, info, NS(susp, 1)); +} + +int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_device *device; + int retcode; /* enum drbd_ret_code rsp. 
enum drbd_state_rv */ + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mutex_lock(&adm_ctx.resource->adm_mutex); + device = adm_ctx.device; + if (test_bit(NEW_CUR_UUID, &device->flags)) { + drbd_uuid_new_current(device); + clear_bit(NEW_CUR_UUID, &device->flags); + } + drbd_suspend_io(device); + retcode = drbd_request_state(device, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); + if (retcode == SS_SUCCESS) { + if (device->state.conn < C_CONNECTED) + tl_clear(first_peer_device(device)->connection); + if (device->state.disk == D_DISKLESS || device->state.disk == D_FAILED) + tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO); + } + drbd_resume_io(device); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info) +{ + return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED)); +} + +static int nla_put_drbd_cfg_context(struct sk_buff *skb, + struct drbd_resource *resource, + struct drbd_connection *connection, + struct drbd_device *device) +{ + struct nlattr *nla; + nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT); + if (!nla) + goto nla_put_failure; + if (device && + nla_put_u32(skb, T_ctx_volume, device->vnr)) + goto nla_put_failure; + if (nla_put_string(skb, T_ctx_resource_name, resource->name)) + goto nla_put_failure; + if (connection) { + if (connection->my_addr_len && + nla_put(skb, T_ctx_my_addr, connection->my_addr_len, &connection->my_addr)) + goto nla_put_failure; + if (connection->peer_addr_len && + nla_put(skb, T_ctx_peer_addr, connection->peer_addr_len, &connection->peer_addr)) + goto nla_put_failure; + } + nla_nest_end(skb, nla); + return 0; + +nla_put_failure: + if (nla) + nla_nest_cancel(skb, nla); + return -EMSGSIZE; +} + +/* + * Return the connection of @resource if @resource has exactly one connection. + */ +static struct drbd_connection *the_only_connection(struct drbd_resource *resource) +{ + struct list_head *connections = &resource->connections; + + if (list_empty(connections) || connections->next->next != connections) + return NULL; + return list_first_entry(&resource->connections, struct drbd_connection, connections); +} + +static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device, + const struct sib_info *sib) +{ + struct drbd_resource *resource = device->resource; + struct state_info *si = NULL; /* for sizeof(si->member); */ + struct nlattr *nla; + int got_ldev; + int err = 0; + int exclude_sensitive; + + /* If sib != NULL, this is drbd_bcast_event, which anyone can listen + * to. So we better exclude_sensitive information. + * + * If sib == NULL, this is drbd_adm_get_status, executed synchronously + * in the context of the requesting user process. Exclude sensitive + * information, unless current has superuser. + * + * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and + * relies on the current implementation of netlink_dump(), which + * executes the dump callback successively from netlink_recvmsg(), + * always in the context of the receiving process */ + exclude_sensitive = sib || !capable(CAP_SYS_ADMIN); + + got_ldev = get_ldev(device); + + /* We need to add connection name and volume number information still. + * Minor number is in drbd_genlmsghdr. 
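+ *
+ * The reply skb assembled below roughly looks like:
+ *
+ *	drbd_genlmsghdr (minor, ret_code)
+ *	DRBD_NLA_CFG_CONTEXT
+ *		T_ctx_volume, T_ctx_resource_name,
+ *		[T_ctx_my_addr, T_ctx_peer_addr]
+ *	resource options, disk options, net options (as far as configured,
+ *	sensitive fields possibly excluded)
+ *	DRBD_NLA_STATE_INFO
+ *		T_current_state, T_capacity, IO counters, uuids, ...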
*/ + if (nla_put_drbd_cfg_context(skb, resource, the_only_connection(resource), device)) + goto nla_put_failure; + + if (res_opts_to_skb(skb, &device->resource->res_opts, exclude_sensitive)) + goto nla_put_failure; + + rcu_read_lock(); + if (got_ldev) { + struct disk_conf *disk_conf; + + disk_conf = rcu_dereference(device->ldev->disk_conf); + err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive); + } + if (!err) { + struct net_conf *nc; + + nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + if (nc) + err = net_conf_to_skb(skb, nc, exclude_sensitive); + } + rcu_read_unlock(); + if (err) + goto nla_put_failure; + + nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO); + if (!nla) + goto nla_put_failure; + if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) || + nla_put_u32(skb, T_current_state, device->state.i) || + nla_put_u64(skb, T_ed_uuid, device->ed_uuid) || + nla_put_u64(skb, T_capacity, drbd_get_capacity(device->this_bdev)) || + nla_put_u64(skb, T_send_cnt, device->send_cnt) || + nla_put_u64(skb, T_recv_cnt, device->recv_cnt) || + nla_put_u64(skb, T_read_cnt, device->read_cnt) || + nla_put_u64(skb, T_writ_cnt, device->writ_cnt) || + nla_put_u64(skb, T_al_writ_cnt, device->al_writ_cnt) || + nla_put_u64(skb, T_bm_writ_cnt, device->bm_writ_cnt) || + nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&device->ap_bio_cnt)) || + nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&device->ap_pending_cnt)) || + nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&device->rs_pending_cnt))) + goto nla_put_failure; + + if (got_ldev) { + int err; + + spin_lock_irq(&device->ldev->md.uuid_lock); + err = nla_put(skb, T_uuids, sizeof(si->uuids), device->ldev->md.uuid); + spin_unlock_irq(&device->ldev->md.uuid_lock); + + if (err) + goto nla_put_failure; + + if (nla_put_u32(skb, T_disk_flags, device->ldev->md.flags) || + nla_put_u64(skb, T_bits_total, drbd_bm_bits(device)) || + nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(device))) + goto nla_put_failure; + if (C_SYNC_SOURCE <= device->state.conn && + C_PAUSED_SYNC_T >= device->state.conn) { + if (nla_put_u64(skb, T_bits_rs_total, device->rs_total) || + nla_put_u64(skb, T_bits_rs_failed, device->rs_failed)) + goto nla_put_failure; + } + } + + if (sib) { + switch(sib->sib_reason) { + case SIB_SYNC_PROGRESS: + case SIB_GET_STATUS_REPLY: + break; + case SIB_STATE_CHANGE: + if (nla_put_u32(skb, T_prev_state, sib->os.i) || + nla_put_u32(skb, T_new_state, sib->ns.i)) + goto nla_put_failure; + break; + case SIB_HELPER_POST: + if (nla_put_u32(skb, T_helper_exit_code, + sib->helper_exit_code)) + goto nla_put_failure; + /* fall through */ + case SIB_HELPER_PRE: + if (nla_put_string(skb, T_helper, sib->helper_name)) + goto nla_put_failure; + break; + } + } + nla_nest_end(skb, nla); + + if (0) +nla_put_failure: + err = -EMSGSIZE; + if (got_ldev) + put_ldev(device); + return err; +} + +int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.device, NULL); + if (err) { + nlmsg_free(adm_ctx.reply_skb); + return err; + } +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static int get_one_status(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct drbd_device *device; + struct drbd_genlmsghdr *dh; + 
struct drbd_resource *pos = (struct drbd_resource *)cb->args[0]; + struct drbd_resource *resource = NULL; + struct drbd_resource *tmp; + unsigned volume = cb->args[1]; + + /* Open coded, deferred, iteration: + * for_each_resource_safe(resource, tmp, &drbd_resources) { + * connection = "first connection of resource or undefined"; + * idr_for_each_entry(&resource->devices, device, i) { + * ... + * } + * } + * where resource is cb->args[0]; + * and i is cb->args[1]; + * + * cb->args[2] indicates if we shall loop over all resources, + * or just dump all volumes of a single resource. + * + * This may miss entries inserted after this dump started, + * or entries deleted before they are reached. + * + * We need to make sure the device won't disappear while + * we are looking at it, and revalidate our iterators + * on each iteration. + */ + + /* synchronize with conn_create()/drbd_destroy_connection() */ + rcu_read_lock(); + /* revalidate iterator position */ + for_each_resource_rcu(tmp, &drbd_resources) { + if (pos == NULL) { + /* first iteration */ + pos = tmp; + resource = pos; + break; + } + if (tmp == pos) { + resource = pos; + break; + } + } + if (resource) { +next_resource: + device = idr_get_next(&resource->devices, &volume); + if (!device) { + /* No more volumes to dump on this resource. + * Advance resource iterator. */ + pos = list_entry_rcu(resource->resources.next, + struct drbd_resource, resources); + /* Did we dump any volume of this resource yet? */ + if (volume != 0) { + /* If we reached the end of the list, + * or only a single resource dump was requested, + * we are done. */ + if (&pos->resources == &drbd_resources || cb->args[2]) + goto out; + volume = 0; + resource = pos; + goto next_resource; + } + } + + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_STATUS); + if (!dh) + goto out; + + if (!device) { + /* This is a connection without a single volume. + * Suprisingly enough, it may have a network + * configuration. */ + struct drbd_connection *connection; + + dh->minor = -1U; + dh->ret_code = NO_ERROR; + connection = the_only_connection(resource); + if (nla_put_drbd_cfg_context(skb, resource, connection, NULL)) + goto cancel; + if (connection) { + struct net_conf *nc; + + nc = rcu_dereference(connection->net_conf); + if (nc && net_conf_to_skb(skb, nc, 1) != 0) + goto cancel; + } + goto done; + } + + D_ASSERT(device, device->vnr == volume); + D_ASSERT(device, device->resource == resource); + + dh->minor = device_to_minor(device); + dh->ret_code = NO_ERROR; + + if (nla_put_status_info(skb, device, NULL)) { +cancel: + genlmsg_cancel(skb, dh); + goto out; + } +done: + genlmsg_end(skb, dh); + } + +out: + rcu_read_unlock(); + /* where to start the next iteration */ + cb->args[0] = (long)pos; + cb->args[1] = (pos == resource) ? volume + 1 : 0; + + /* No more resources/volumes/minors found results in an empty skb. + * Which will terminate the dump. */ + return skb->len; +} + +/* + * Request status of all resources, or of all volumes within a single resource. + * + * This is a dump, as the answer may not fit in a single reply skb otherwise. + * Which means we cannot use the family->attrbuf or other such members, because + * dump is NOT protected by the genl_lock(). During dump, we only have access + * to the incoming skb, and need to opencode "parsing" of the nlattr payload. + * + * Once things are setup properly, we call into get_one_status(). 
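+ *
+ * As a rough sketch, the iterator state carried in cb->args[] between
+ * successive dump calls (and consumed by get_one_status() above) is:
+ *   cb->args[0]  current resource position (a struct drbd_resource *)
+ *   cb->args[1]  next volume number to dump within that resource
+ *   cb->args[2]  if set, dump only this one resource ("filter" mode)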
+ */ +int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb) +{ + const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ; + struct nlattr *nla; + const char *resource_name; + struct drbd_resource *resource; + int maxtype; + + /* Is this a followup call? */ + if (cb->args[0]) { + /* ... of a single resource dump, + * and the resource iterator has been advanced already? */ + if (cb->args[2] && cb->args[2] != cb->args[0]) + return 0; /* DONE. */ + goto dump; + } + + /* First call (from netlink_dump_start). We need to figure out + * which resource(s) the user wants us to dump. */ + nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen), + nlmsg_attrlen(cb->nlh, hdrlen), + DRBD_NLA_CFG_CONTEXT); + + /* No explicit context given. Dump all. */ + if (!nla) + goto dump; + maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; + nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name)); + if (IS_ERR(nla)) + return PTR_ERR(nla); + /* context given, but no name present? */ + if (!nla) + return -EINVAL; + resource_name = nla_data(nla); + if (!*resource_name) + return -ENODEV; + resource = drbd_find_resource(resource_name); + if (!resource) + return -ENODEV; + + kref_put(&resource->kref, drbd_destroy_resource); /* get_one_status() revalidates the resource */ + + /* prime iterators, and set "filter" mode mark: + * only dump this connection. */ + cb->args[0] = (long)resource; + /* cb->args[1] = 0; passed in this way. */ + cb->args[2] = (long)resource; + +dump: + return get_one_status(skb, cb); +} + +int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct timeout_parms tp; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + tp.timeout_type = + adm_ctx.device->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : + test_bit(USE_DEGR_WFC_T, &adm_ctx.device->flags) ? UT_DEGRADED : + UT_DEFAULT; + + err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp); + if (err) { + nlmsg_free(adm_ctx.reply_skb); + return err; + } +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_device *device; + enum drbd_ret_code retcode; + struct start_ov_parms parms; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + device = adm_ctx.device; + + /* resume from last known position, if possible */ + parms.ov_start_sector = device->ov_start_sector; + parms.ov_stop_sector = ULLONG_MAX; + if (info->attrs[DRBD_NLA_START_OV_PARMS]) { + int err = start_ov_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto out; + } + } + mutex_lock(&adm_ctx.resource->adm_mutex); + + /* w_make_ov_request expects position to be aligned */ + device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1); + device->ov_stop_sector = parms.ov_stop_sector; + + /* If there is still bitmap IO pending, e.g. previous resync or verify + * just being finished, wait for it before requesting a new resync. 
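+ *
+ * Suspend application IO first so nothing new gets submitted while we
+ * wait for BITMAP_IO to clear, then request C_VERIFY_S and resume.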
*/ + drbd_suspend_io(device); + wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); + retcode = drbd_request_state(device, NS(conn, C_VERIFY_S)); + drbd_resume_io(device); + + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + + +int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_device *device; + enum drbd_ret_code retcode; + int skip_initial_sync = 0; + int err; + struct new_c_uuid_parms args; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out_nolock; + + device = adm_ctx.device; + memset(&args, 0, sizeof(args)); + if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) { + err = new_c_uuid_parms_from_attrs(&args, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto out_nolock; + } + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */ + + if (!get_ldev(device)) { + retcode = ERR_NO_DISK; + goto out; + } + + /* this is "skip initial sync", assume to be clean */ + if (device->state.conn == C_CONNECTED && + first_peer_device(device)->connection->agreed_pro_version >= 90 && + device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { + drbd_info(device, "Preparing to skip initial sync\n"); + skip_initial_sync = 1; + } else if (device->state.conn != C_STANDALONE) { + retcode = ERR_CONNECTED; + goto out_dec; + } + + drbd_uuid_set(device, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */ + drbd_uuid_new_current(device); /* New current, previous to UI_BITMAP */ + + if (args.clear_bm) { + err = drbd_bitmap_io(device, &drbd_bmio_clear_n_write, + "clear_n_write from new_c_uuid", BM_LOCKED_MASK); + if (err) { + drbd_err(device, "Writing bitmap failed with %d\n", err); + retcode = ERR_IO_MD_DISK; + } + if (skip_initial_sync) { + drbd_send_uuids_skip_initial_sync(first_peer_device(device)); + _drbd_uuid_set(device, UI_BITMAP, 0); + drbd_print_uuids(device, "cleared bitmap UUID"); + spin_lock_irq(&device->resource->req_lock); + _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), + CS_VERBOSE, NULL); + spin_unlock_irq(&device->resource->req_lock); + } + } + + drbd_md_sync(device); +out_dec: + put_ldev(device); +out: + mutex_unlock(device->state_mutex); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out_nolock: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static enum drbd_ret_code +drbd_check_resource_name(struct drbd_config_context *adm_ctx) +{ + const char *name = adm_ctx->resource_name; + if (!name || !name[0]) { + drbd_msg_put_info(adm_ctx->reply_skb, "resource name missing"); + return ERR_MANDATORY_TAG; + } + /* if we want to use these in sysfs/configfs/debugfs some day, + * we must not allow slashes */ + if (strchr(name, '/')) { + drbd_msg_put_info(adm_ctx->reply_skb, "invalid resource name"); + return ERR_INVALID_REQUEST; + } + return NO_ERROR; +} + +int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct res_opts res_opts; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, 0); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + set_res_opts_defaults(&res_opts); + err = res_opts_from_attrs(&res_opts, info); 
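+ /* -ENOMSG: no resource options were given, keep the defaults set above */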
+ if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto out; + } + + retcode = drbd_check_resource_name(&adm_ctx); + if (retcode != NO_ERROR) + goto out; + + if (adm_ctx.resource) { + if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) { + retcode = ERR_INVALID_REQUEST; + drbd_msg_put_info(adm_ctx.reply_skb, "resource exists"); + } + /* else: still NO_ERROR */ + goto out; + } + + /* not yet safe for genl_family.parallel_ops */ + if (!conn_create(adm_ctx.resource_name, &res_opts)) + retcode = ERR_NOMEM; +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_genlmsghdr *dh = info->userhdr; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + if (dh->minor > MINORMASK) { + drbd_msg_put_info(adm_ctx.reply_skb, "requested minor out of range"); + retcode = ERR_INVALID_REQUEST; + goto out; + } + if (adm_ctx.volume > DRBD_VOLUME_MAX) { + drbd_msg_put_info(adm_ctx.reply_skb, "requested volume id out of range"); + retcode = ERR_INVALID_REQUEST; + goto out; + } + + /* drbd_adm_prepare made sure already + * that first_peer_device(device)->connection and device->vnr match the request. */ + if (adm_ctx.device) { + if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) + retcode = ERR_MINOR_OR_VOLUME_EXISTS; + /* else: still NO_ERROR */ + goto out; + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + retcode = drbd_create_device(&adm_ctx, dh->minor); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static enum drbd_ret_code adm_del_minor(struct drbd_device *device) +{ + if (device->state.disk == D_DISKLESS && + /* no need to be device->state.conn == C_STANDALONE && + * we may want to delete a minor from a live replication group. + */ + device->state.role == R_SECONDARY) { + _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS), + CS_VERBOSE + CS_WAIT_COMPLETE); + drbd_delete_device(device); + return NO_ERROR; + } else + return ERR_MINOR_CONFIGURED; +} + +int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mutex_lock(&adm_ctx.resource->adm_mutex); + retcode = adm_del_minor(adm_ctx.device); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static int adm_del_resource(struct drbd_resource *resource) +{ + struct drbd_connection *connection; + + for_each_connection(connection, resource) { + if (connection->cstate > C_STANDALONE) + return ERR_NET_CONFIGURED; + } + if (!idr_is_empty(&resource->devices)) + return ERR_RES_IN_USE; + + list_del_rcu(&resource->resources); + /* Make sure all threads have actually stopped: state handling only + * does drbd_thread_stop_nowait(). 
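+ * After the list_del_rcu() above, the synchronize_rcu() below ensures
+ * that concurrent for_each_resource_rcu() walkers are done with this
+ * resource before it is freed.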
*/ + list_for_each_entry(connection, &resource->connections, connections) + drbd_thread_stop(&connection->worker); + synchronize_rcu(); + drbd_free_resource(resource); + return NO_ERROR; +} + +int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_resource *resource; + struct drbd_connection *connection; + struct drbd_device *device; + int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ + unsigned i; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + + resource = adm_ctx.resource; + mutex_lock(&resource->adm_mutex); + /* demote */ + for_each_connection(connection, resource) { + struct drbd_peer_device *peer_device; + + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0); + if (retcode < SS_SUCCESS) { + drbd_msg_put_info(adm_ctx.reply_skb, "failed to demote"); + goto out; + } + } + + retcode = conn_try_disconnect(connection, 0); + if (retcode < SS_SUCCESS) { + drbd_msg_put_info(adm_ctx.reply_skb, "failed to disconnect"); + goto out; + } + } + + /* detach */ + idr_for_each_entry(&resource->devices, device, i) { + retcode = adm_detach(device, 0); + if (retcode < SS_SUCCESS || retcode > NO_ERROR) { + drbd_msg_put_info(adm_ctx.reply_skb, "failed to detach"); + goto out; + } + } + + /* delete volumes */ + idr_for_each_entry(&resource->devices, device, i) { + retcode = adm_del_minor(device); + if (retcode != NO_ERROR) { + /* "can not happen" */ + drbd_msg_put_info(adm_ctx.reply_skb, "failed to delete volume"); + goto out; + } + } + + retcode = adm_del_resource(resource); +out: + mutex_unlock(&resource->adm_mutex); +finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_resource *resource; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + resource = adm_ctx.resource; + + mutex_lock(&resource->adm_mutex); + retcode = adm_del_resource(resource); + mutex_unlock(&resource->adm_mutex); +finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib) +{ + static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ + struct sk_buff *msg; + struct drbd_genlmsghdr *d_out; + unsigned seq; + int err = -ENOMEM; + + seq = atomic_inc_return(&drbd_genl_seq); + msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + if (!msg) + goto failed; + + err = -EMSGSIZE; + d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT); + if (!d_out) /* cannot happen, but anyways. */ + goto nla_put_failure; + d_out->minor = device_to_minor(device); + d_out->ret_code = NO_ERROR; + + if (nla_put_status_info(msg, device, sib)) + goto nla_put_failure; + genlmsg_end(msg, d_out); + err = drbd_genl_multicast_events(msg, 0); + /* msg has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + + return; + +nla_put_failure: + nlmsg_free(msg); +failed: + drbd_err(device, "Error %d while broadcasting event. 
" + "Event seq:%u sib_reason:%u\n", + err, seq, sib->sib_reason); +} diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c new file mode 100644 index 000000000..b2d479149 --- /dev/null +++ b/drivers/block/drbd/drbd_nla.c @@ -0,0 +1,54 @@ +#include <linux/kernel.h> +#include <net/netlink.h> +#include <linux/drbd_genl_api.h> +#include "drbd_nla.h" + +static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla) +{ + struct nlattr *head = nla_data(nla); + int len = nla_len(nla); + int rem; + + /* + * validate_nla (called from nla_parse_nested) ignores attributes + * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag. + * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY + * flag set also, check and remove that flag before calling + * nla_parse_nested. + */ + + nla_for_each_attr(nla, head, len, rem) { + if (nla->nla_type & DRBD_GENLA_F_MANDATORY) { + nla->nla_type &= ~DRBD_GENLA_F_MANDATORY; + if (nla_type(nla) > maxtype) + return -EOPNOTSUPP; + } + } + return 0; +} + +int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, + const struct nla_policy *policy) +{ + int err; + + err = drbd_nla_check_mandatory(maxtype, nla); + if (!err) + err = nla_parse_nested(tb, maxtype, nla, policy); + + return err; +} + +struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype) +{ + int err; + /* + * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and + * we don't know about that attribute, reject all the nested + * attributes. + */ + err = drbd_nla_check_mandatory(maxtype, nla); + if (err) + return ERR_PTR(err); + return nla_find_nested(nla, attrtype); +} diff --git a/drivers/block/drbd/drbd_nla.h b/drivers/block/drbd/drbd_nla.h new file mode 100644 index 000000000..679c2d5b4 --- /dev/null +++ b/drivers/block/drbd/drbd_nla.h @@ -0,0 +1,8 @@ +#ifndef __DRBD_NLA_H +#define __DRBD_NLA_H + +extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, + const struct nla_policy *policy); +extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype); + +#endif /* __DRBD_NLA_H */ diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c new file mode 100644 index 000000000..3b10fa6cb --- /dev/null +++ b/drivers/block/drbd/drbd_proc.c @@ -0,0 +1,368 @@ +/* + drbd_proc.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ + */ + +#include <linux/module.h> + +#include <asm/uaccess.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/drbd.h> +#include "drbd_int.h" + +static int drbd_proc_open(struct inode *inode, struct file *file); +static int drbd_proc_release(struct inode *inode, struct file *file); + + +struct proc_dir_entry *drbd_proc; +const struct file_operations drbd_proc_fops = { + .owner = THIS_MODULE, + .open = drbd_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = drbd_proc_release, +}; + +static void seq_printf_with_thousands_grouping(struct seq_file *seq, long v) +{ + /* v is in kB/sec. We don't expect TiByte/sec yet. */ + if (unlikely(v >= 1000000)) { + /* cool: > GiByte/s */ + seq_printf(seq, "%ld,", v / 1000000); + v %= 1000000; + seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000); + } else if (likely(v >= 1000)) + seq_printf(seq, "%ld,%03ld", v/1000, v % 1000); + else + seq_printf(seq, "%ld", v); +} + +static void drbd_get_syncer_progress(struct drbd_device *device, + union drbd_dev_state state, unsigned long *rs_total, + unsigned long *bits_left, unsigned int *per_mil_done) +{ + /* this is to break it at compile time when we change that, in case we + * want to support more than (1<<32) bits on a 32bit arch. */ + typecheck(unsigned long, device->rs_total); + *rs_total = device->rs_total; + + /* note: both rs_total and rs_left are in bits, i.e. in + * units of BM_BLOCK_SIZE. + * for the percentage, we don't care. */ + + if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T) + *bits_left = device->ov_left; + else + *bits_left = drbd_bm_total_weight(device) - device->rs_failed; + /* >> 10 to prevent overflow, + * +1 to prevent division by zero */ + if (*bits_left > *rs_total) { + /* D'oh. Maybe a logic bug somewhere. More likely just a race + * between state change and reset of rs_total. + */ + *bits_left = *rs_total; + *per_mil_done = *rs_total ? 0 : 1000; + } else { + /* Make sure the division happens in long context. + * We allow up to one petabyte storage right now, + * at a granularity of 4k per bit that is 2**38 bits. + * After shift right and multiplication by 1000, + * this should still fit easily into a 32bit long, + * so we don't need a 64bit division on 32bit arch. + * Note: currently we don't support such large bitmaps on 32bit + * arch anyways, but no harm done to be prepared for it here. + */ + unsigned int shift = *rs_total > UINT_MAX ? 16 : 10; + unsigned long left = *bits_left >> shift; + unsigned long total = 1UL + (*rs_total >> shift); + unsigned long tmp = 1000UL - left * 1000UL/total; + *per_mil_done = tmp; + } +} + + +/*lge + * progress bars shamelessly adapted from driver/md/md.c + * output looks like + * [=====>..............] 
33.5% (23456/123456) + * finish: 2:20:20 speed: 6,345 (6,456) K/sec + */ +static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq, + union drbd_dev_state state) +{ + unsigned long db, dt, dbdt, rt, rs_total, rs_left; + unsigned int res; + int i, x, y; + int stalled = 0; + + drbd_get_syncer_progress(device, state, &rs_total, &rs_left, &res); + + x = res/50; + y = 20-x; + seq_printf(seq, "\t["); + for (i = 1; i < x; i++) + seq_printf(seq, "="); + seq_printf(seq, ">"); + for (i = 0; i < y; i++) + seq_printf(seq, "."); + seq_printf(seq, "] "); + + if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T) + seq_printf(seq, "verified:"); + else + seq_printf(seq, "sync'ed:"); + seq_printf(seq, "%3u.%u%% ", res / 10, res % 10); + + /* if more than a few GB, display in MB */ + if (rs_total > (4UL << (30 - BM_BLOCK_SHIFT))) + seq_printf(seq, "(%lu/%lu)M", + (unsigned long) Bit2KB(rs_left >> 10), + (unsigned long) Bit2KB(rs_total >> 10)); + else + seq_printf(seq, "(%lu/%lu)K", + (unsigned long) Bit2KB(rs_left), + (unsigned long) Bit2KB(rs_total)); + + seq_printf(seq, "\n\t"); + + /* see drivers/md/md.c + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + /* Rolling marks. last_mark+1 may just now be modified. last_mark+2 is + * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at + * least DRBD_SYNC_MARK_STEP time before it will be modified. */ + /* ------------------------ ~18s average ------------------------ */ + i = (device->rs_last_mark + 2) % DRBD_SYNC_MARKS; + dt = (jiffies - device->rs_mark_time[i]) / HZ; + if (dt > 180) + stalled = 1; + + if (!dt) + dt++; + db = device->rs_mark_left[i] - rs_left; + rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ + + seq_printf(seq, "finish: %lu:%02lu:%02lu", + rt / 3600, (rt % 3600) / 60, rt % 60); + + dbdt = Bit2KB(db/dt); + seq_printf(seq, " speed: "); + seq_printf_with_thousands_grouping(seq, dbdt); + seq_printf(seq, " ("); + /* ------------------------- ~3s average ------------------------ */ + if (proc_details >= 1) { + /* this is what drbd_rs_should_slow_down() uses */ + i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS; + dt = (jiffies - device->rs_mark_time[i]) / HZ; + if (!dt) + dt++; + db = device->rs_mark_left[i] - rs_left; + dbdt = Bit2KB(db/dt); + seq_printf_with_thousands_grouping(seq, dbdt); + seq_printf(seq, " -- "); + } + + /* --------------------- long term average ---------------------- */ + /* mean speed since syncer started + * we do account for PausedSync periods */ + dt = (jiffies - device->rs_start - device->rs_paused) / HZ; + if (dt == 0) + dt = 1; + db = rs_total - rs_left; + dbdt = Bit2KB(db/dt); + seq_printf_with_thousands_grouping(seq, dbdt); + seq_printf(seq, ")"); + + if (state.conn == C_SYNC_TARGET || + state.conn == C_VERIFY_S) { + seq_printf(seq, " want: "); + seq_printf_with_thousands_grouping(seq, device->c_sync_rate); + } + seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : ""); + + if (proc_details >= 1) { + /* 64 bit: + * we convert to sectors in the display below. 
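+ * One bitmap bit covers BM_BLOCK_SIZE of storage, i.e. BM_SECT_PER_BIT
+ * sectors, which is what the conversion below multiplies by.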
*/ + unsigned long bm_bits = drbd_bm_bits(device); + unsigned long bit_pos; + unsigned long long stop_sector = 0; + if (state.conn == C_VERIFY_S || + state.conn == C_VERIFY_T) { + bit_pos = bm_bits - device->ov_left; + if (verify_can_do_stop_sector(device)) + stop_sector = device->ov_stop_sector; + } else + bit_pos = device->bm_resync_fo; + /* Total sectors may be slightly off for oddly + * sized devices. So what. */ + seq_printf(seq, + "\t%3d%% sector pos: %llu/%llu", + (int)(bit_pos / (bm_bits/100+1)), + (unsigned long long)bit_pos * BM_SECT_PER_BIT, + (unsigned long long)bm_bits * BM_SECT_PER_BIT); + if (stop_sector != 0 && stop_sector != ULLONG_MAX) + seq_printf(seq, " stop sector: %llu", stop_sector); + seq_printf(seq, "\n"); + } +} + +static int drbd_seq_show(struct seq_file *seq, void *v) +{ + int i, prev_i = -1; + const char *sn; + struct drbd_device *device; + struct net_conf *nc; + union drbd_dev_state state; + char wp; + + static char write_ordering_chars[] = { + [WO_none] = 'n', + [WO_drain_io] = 'd', + [WO_bdev_flush] = 'f', + }; + + seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", + API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag()); + + /* + cs .. connection state + ro .. node role (local/remote) + ds .. disk state (local/remote) + protocol + various flags + ns .. network send + nr .. network receive + dw .. disk write + dr .. disk read + al .. activity log write count + bm .. bitmap update write count + pe .. pending (waiting for ack or data reply) + ua .. unack'd (still need to send ack or data reply) + ap .. application requests accepted, but not yet completed + ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending + wo .. write ordering mode currently in use + oos .. known out-of-sync kB + */ + + rcu_read_lock(); + idr_for_each_entry(&drbd_devices, device, i) { + if (prev_i != i - 1) + seq_printf(seq, "\n"); + prev_i = i; + + state = device->state; + sn = drbd_conn_str(state.conn); + + if (state.conn == C_STANDALONE && + state.disk == D_DISKLESS && + state.role == R_SECONDARY) { + seq_printf(seq, "%2d: cs:Unconfigured\n", i); + } else { + /* reset device->congestion_reason */ + bdi_rw_congested(&device->rq_queue->backing_dev_info); + + nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' '; + seq_printf(seq, + "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n" + " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " + "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c", + i, sn, + drbd_role_str(state.role), + drbd_role_str(state.peer), + drbd_disk_str(state.disk), + drbd_disk_str(state.pdsk), + wp, + drbd_suspended(device) ? 's' : 'r', + state.aftr_isp ? 'a' : '-', + state.peer_isp ? 'p' : '-', + state.user_isp ? 'u' : '-', + device->congestion_reason ?: '-', + test_bit(AL_SUSPENDED, &device->flags) ? 
's' : '-', + device->send_cnt/2, + device->recv_cnt/2, + device->writ_cnt/2, + device->read_cnt/2, + device->al_writ_cnt, + device->bm_writ_cnt, + atomic_read(&device->local_cnt), + atomic_read(&device->ap_pending_cnt) + + atomic_read(&device->rs_pending_cnt), + atomic_read(&device->unacked_cnt), + atomic_read(&device->ap_bio_cnt), + first_peer_device(device)->connection->epochs, + write_ordering_chars[device->resource->write_ordering] + ); + seq_printf(seq, " oos:%llu\n", + Bit2KB((unsigned long long) + drbd_bm_total_weight(device))); + } + if (state.conn == C_SYNC_SOURCE || + state.conn == C_SYNC_TARGET || + state.conn == C_VERIFY_S || + state.conn == C_VERIFY_T) + drbd_syncer_progress(device, seq, state); + + if (proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) { + lc_seq_printf_stats(seq, device->resync); + lc_seq_printf_stats(seq, device->act_log); + put_ldev(device); + } + + if (proc_details >= 2) + seq_printf(seq, "\tblocked on activity log: %d\n", atomic_read(&device->ap_actlog_cnt)); + } + rcu_read_unlock(); + + return 0; +} + +static int drbd_proc_open(struct inode *inode, struct file *file) +{ + int err; + + if (try_module_get(THIS_MODULE)) { + err = single_open(file, drbd_seq_show, NULL); + if (err) + module_put(THIS_MODULE); + return err; + } + return -ENODEV; +} + +static int drbd_proc_release(struct inode *inode, struct file *file) +{ + module_put(THIS_MODULE); + return single_release(inode, file); +} + +/* PROC FS stuff end */ diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h new file mode 100644 index 000000000..2da9104a3 --- /dev/null +++ b/drivers/block/drbd/drbd_protocol.h @@ -0,0 +1,307 @@ +#ifndef __DRBD_PROTOCOL_H +#define __DRBD_PROTOCOL_H + +enum drbd_packet { + /* receiver (data socket) */ + P_DATA = 0x00, + P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */ + P_RS_DATA_REPLY = 0x02, /* Response to P_RS_DATA_REQUEST */ + P_BARRIER = 0x03, + P_BITMAP = 0x04, + P_BECOME_SYNC_TARGET = 0x05, + P_BECOME_SYNC_SOURCE = 0x06, + P_UNPLUG_REMOTE = 0x07, /* Used at various times to hint the peer */ + P_DATA_REQUEST = 0x08, /* Used to ask for a data block */ + P_RS_DATA_REQUEST = 0x09, /* Used to ask for a data block for resync */ + P_SYNC_PARAM = 0x0a, + P_PROTOCOL = 0x0b, + P_UUIDS = 0x0c, + P_SIZES = 0x0d, + P_STATE = 0x0e, + P_SYNC_UUID = 0x0f, + P_AUTH_CHALLENGE = 0x10, + P_AUTH_RESPONSE = 0x11, + P_STATE_CHG_REQ = 0x12, + + /* asender (meta socket */ + P_PING = 0x13, + P_PING_ACK = 0x14, + P_RECV_ACK = 0x15, /* Used in protocol B */ + P_WRITE_ACK = 0x16, /* Used in protocol C */ + P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */ + P_SUPERSEDED = 0x18, /* Used in proto C, two-primaries conflict detection */ + P_NEG_ACK = 0x19, /* Sent if local disk is unusable */ + P_NEG_DREPLY = 0x1a, /* Local disk is broken... */ + P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... 
*/ + P_BARRIER_ACK = 0x1c, + P_STATE_CHG_REPLY = 0x1d, + + /* "new" commands, no longer fitting into the ordering scheme above */ + + P_OV_REQUEST = 0x1e, /* data socket */ + P_OV_REPLY = 0x1f, + P_OV_RESULT = 0x20, /* meta socket */ + P_CSUM_RS_REQUEST = 0x21, /* data socket */ + P_RS_IS_IN_SYNC = 0x22, /* meta socket */ + P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */ + P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */ + /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */ + /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */ + P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */ + P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */ + P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */ + P_CONN_ST_CHG_REQ = 0x2a, /* data sock: Connection wide state request */ + P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */ + P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */ + P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */ + /* 0x2e to 0x30 reserved, used in drbd 9 */ + + /* REQ_DISCARD. We used "discard" in different contexts before, + * which is why I chose TRIM here, to disambiguate. */ + P_TRIM = 0x31, + + P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ + P_MAX_OPT_CMD = 0x101, + + /* special command ids for handshake */ + + P_INITIAL_META = 0xfff1, /* First Packet on the MetaSock */ + P_INITIAL_DATA = 0xfff2, /* First Packet on the Socket */ + + P_CONNECTION_FEATURES = 0xfffe /* FIXED for the next century! */ +}; + +#ifndef __packed +#define __packed __attribute__((packed)) +#endif + +/* This is the layout for a packet on the wire. + * The byteorder is the network byte order. + * (except block_id and barrier fields. + * these are pointers to local structs + * and have no relevance for the partner, + * which just echoes them as received.) + * + * NOTE that the payload starts at a long aligned offset, + * regardless of 32 or 64 bit arch! 
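+ *
+ * For example, the classic 8 byte header (struct p_header80 below) is
+ *   u32 magic | u16 command | u16 length
+ * followed by `length` bytes of payload, while struct p_header100 extends
+ * this to 16 bytes (magic, volume, command, length, pad), so the payload
+ * starts 8-byte aligned in either case.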
+ */ +struct p_header80 { + u32 magic; + u16 command; + u16 length; /* bytes of data after this header */ +} __packed; + +/* Header for big packets, Used for data packets exceeding 64kB */ +struct p_header95 { + u16 magic; /* use DRBD_MAGIC_BIG here */ + u16 command; + u32 length; +} __packed; + +struct p_header100 { + u32 magic; + u16 volume; + u16 command; + u32 length; + u32 pad; +} __packed; + +/* these defines must not be changed without changing the protocol version */ +#define DP_HARDBARRIER 1 /* depricated */ +#define DP_RW_SYNC 2 /* equals REQ_SYNC */ +#define DP_MAY_SET_IN_SYNC 4 +#define DP_UNPLUG 8 /* not used anymore */ +#define DP_FUA 16 /* equals REQ_FUA */ +#define DP_FLUSH 32 /* equals REQ_FLUSH */ +#define DP_DISCARD 64 /* equals REQ_DISCARD */ +#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */ +#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */ + +struct p_data { + u64 sector; /* 64 bits sector number */ + u64 block_id; /* to identify the request in protocol B&C */ + u32 seq_num; + u32 dp_flags; +} __packed; + +struct p_trim { + struct p_data p_data; + u32 size; /* == bio->bi_size */ +} __packed; + +/* + * commands which share a struct: + * p_block_ack: + * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), + * P_SUPERSEDED (proto C, two-primaries conflict detection) + * p_block_req: + * P_DATA_REQUEST, P_RS_DATA_REQUEST + */ +struct p_block_ack { + u64 sector; + u64 block_id; + u32 blksize; + u32 seq_num; +} __packed; + +struct p_block_req { + u64 sector; + u64 block_id; + u32 blksize; + u32 pad; /* to multiple of 8 Byte */ +} __packed; + +/* + * commands with their own struct for additional fields: + * P_CONNECTION_FEATURES + * P_BARRIER + * P_BARRIER_ACK + * P_SYNC_PARAM + * ReportParams + */ + +#define FF_TRIM 1 + +struct p_connection_features { + u32 protocol_min; + u32 feature_flags; + u32 protocol_max; + + /* should be more than enough for future enhancements + * for now, feature_flags and the reserved array shall be zero. + */ + + u32 _pad; + u64 reserved[7]; +} __packed; + +struct p_barrier { + u32 barrier; /* barrier number _handle_ only */ + u32 pad; /* to multiple of 8 Byte */ +} __packed; + +struct p_barrier_ack { + u32 barrier; + u32 set_size; +} __packed; + +struct p_rs_param { + u32 resync_rate; + + /* Since protocol version 88 and higher. */ + char verify_alg[0]; +} __packed; + +struct p_rs_param_89 { + u32 resync_rate; + /* protocol version 89: */ + char verify_alg[SHARED_SECRET_MAX]; + char csums_alg[SHARED_SECRET_MAX]; +} __packed; + +struct p_rs_param_95 { + u32 resync_rate; + char verify_alg[SHARED_SECRET_MAX]; + char csums_alg[SHARED_SECRET_MAX]; + u32 c_plan_ahead; + u32 c_delay_target; + u32 c_fill_target; + u32 c_max_rate; +} __packed; + +enum drbd_conn_flags { + CF_DISCARD_MY_DATA = 1, + CF_DRY_RUN = 2, +}; + +struct p_protocol { + u32 protocol; + u32 after_sb_0p; + u32 after_sb_1p; + u32 after_sb_2p; + u32 conn_flags; + u32 two_primaries; + + /* Since protocol version 87 and higher. */ + char integrity_alg[0]; + +} __packed; + +struct p_uuids { + u64 uuid[UI_EXTENDED_SIZE]; +} __packed; + +struct p_rs_uuid { + u64 uuid; +} __packed; + +struct p_sizes { + u64 d_size; /* size of disk */ + u64 u_size; /* user requested size */ + u64 c_size; /* current exported size */ + u32 max_bio_size; /* Maximal size of a BIO */ + u16 queue_order_type; /* not yet implemented in DRBD*/ + u16 dds_flags; /* use enum dds_flags here. 
*/ +} __packed; + +struct p_state { + u32 state; +} __packed; + +struct p_req_state { + u32 mask; + u32 val; +} __packed; + +struct p_req_state_reply { + u32 retcode; +} __packed; + +struct p_drbd06_param { + u64 size; + u32 state; + u32 blksize; + u32 protocol; + u32 version; + u32 gen_cnt[5]; + u32 bit_map_gen[5]; +} __packed; + +struct p_block_desc { + u64 sector; + u32 blksize; + u32 pad; /* to multiple of 8 Byte */ +} __packed; + +/* Valid values for the encoding field. + * Bump proto version when changing this. */ +enum drbd_bitmap_code { + /* RLE_VLI_Bytes = 0, + * and other bit variants had been defined during + * algorithm evaluation. */ + RLE_VLI_Bits = 2, +}; + +struct p_compressed_bm { + /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code + * (encoding & 0x80): polarity (set/unset) of first runlength + * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits + * used to pad up to head.length bytes + */ + u8 encoding; + + u8 code[0]; +} __packed; + +struct p_delay_probe93 { + u32 seq_num; /* sequence number to match the two probe packets */ + u32 offset; /* usecs the probe got sent after the reference time point */ +} __packed; + +/* + * Bitmap packets need to fit within a single page on the sender and receiver, + * so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger). + */ +#define DRBD_SOCKET_BUFFER_SIZE 4096 + +#endif /* __DRBD_PROTOCOL_H */ diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c new file mode 100644 index 000000000..cee20354a --- /dev/null +++ b/drivers/block/drbd/drbd_receiver.c @@ -0,0 +1,5661 @@ +/* + drbd_receiver.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + + +#include <linux/module.h> + +#include <asm/uaccess.h> +#include <net/sock.h> + +#include <linux/drbd.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/in.h> +#include <linux/mm.h> +#include <linux/memcontrol.h> +#include <linux/mm_inline.h> +#include <linux/slab.h> +#include <linux/pkt_sched.h> +#define __KERNEL_SYSCALLS__ +#include <linux/unistd.h> +#include <linux/vmalloc.h> +#include <linux/random.h> +#include <linux/string.h> +#include <linux/scatterlist.h> +#include "drbd_int.h" +#include "drbd_protocol.h" +#include "drbd_req.h" +#include "drbd_vli.h" + +#define PRO_FEATURES (FF_TRIM) + +struct packet_info { + enum drbd_packet cmd; + unsigned int size; + unsigned int vnr; + void *data; +}; + +enum finish_epoch { + FE_STILL_LIVE, + FE_DESTROYED, + FE_RECYCLED, +}; + +static int drbd_do_features(struct drbd_connection *connection); +static int drbd_do_auth(struct drbd_connection *connection); +static int drbd_disconnected(struct drbd_peer_device *); +static void conn_wait_active_ee_empty(struct drbd_connection *connection); +static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event); +static int e_end_block(struct drbd_work *, int); + + +#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) + +/* + * some helper functions to deal with single linked page lists, + * page->private being our "next" pointer. + */ + +/* If at least n pages are linked at head, get n pages off. + * Otherwise, don't modify head, and return NULL. + * Locking is the responsibility of the caller. + */ +static struct page *page_chain_del(struct page **head, int n) +{ + struct page *page; + struct page *tmp; + + BUG_ON(!n); + BUG_ON(!head); + + page = *head; + + if (!page) + return NULL; + + while (page) { + tmp = page_chain_next(page); + if (--n == 0) + break; /* found sufficient pages */ + if (tmp == NULL) + /* insufficient pages, don't use any of them. */ + return NULL; + page = tmp; + } + + /* add end of list marker for the returned list */ + set_page_private(page, 0); + /* actual return value, and adjustment of head */ + page = *head; + *head = tmp; + return page; +} + +/* may be used outside of locks to find the tail of a (usually short) + * "private" page chain, before adding it back to a global chain head + * with page_chain_add() under a spinlock. */ +static struct page *page_chain_tail(struct page *page, int *len) +{ + struct page *tmp; + int i = 1; + while ((tmp = page_chain_next(page))) + ++i, page = tmp; + if (len) + *len = i; + return page; +} + +static int page_chain_free(struct page *page) +{ + struct page *tmp; + int i = 0; + page_chain_for_each_safe(page, tmp) { + put_page(page); + ++i; + } + return i; +} + +static void page_chain_add(struct page **head, + struct page *chain_first, struct page *chain_last) +{ +#if 1 + struct page *tmp; + tmp = page_chain_tail(chain_first, NULL); + BUG_ON(tmp != chain_last); +#endif + + /* add chain to head */ + set_page_private(chain_last, (unsigned long)*head); + *head = chain_first; +} + +static struct page *__drbd_alloc_pages(struct drbd_device *device, + unsigned int number) +{ + struct page *page = NULL; + struct page *tmp = NULL; + unsigned int i = 0; + + /* Yes, testing drbd_pp_vacant outside the lock is racy. + * So what. It saves a spin_lock. 
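+ * Worst case we either skip the pool below although pages had just
+ * become available, or we take the lock and find the pool drained
+ * after all; both cases are handled.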
*/ + if (drbd_pp_vacant >= number) { + spin_lock(&drbd_pp_lock); + page = page_chain_del(&drbd_pp_pool, number); + if (page) + drbd_pp_vacant -= number; + spin_unlock(&drbd_pp_lock); + if (page) + return page; + } + + /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD + * "criss-cross" setup, that might cause write-out on some other DRBD, + * which in turn might block on the other node at this very place. */ + for (i = 0; i < number; i++) { + tmp = alloc_page(GFP_TRY); + if (!tmp) + break; + set_page_private(tmp, (unsigned long)page); + page = tmp; + } + + if (i == number) + return page; + + /* Not enough pages immediately available this time. + * No need to jump around here, drbd_alloc_pages will retry this + * function "soon". */ + if (page) { + tmp = page_chain_tail(page, NULL); + spin_lock(&drbd_pp_lock); + page_chain_add(&drbd_pp_pool, page, tmp); + drbd_pp_vacant += i; + spin_unlock(&drbd_pp_lock); + } + return NULL; +} + +static void reclaim_finished_net_peer_reqs(struct drbd_device *device, + struct list_head *to_be_freed) +{ + struct drbd_peer_request *peer_req, *tmp; + + /* The EEs are always appended to the end of the list. Since + they are sent in order over the wire, they have to finish + in order. As soon as we see the first not finished we can + stop to examine the list... */ + + list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) { + if (drbd_peer_req_has_active_page(peer_req)) + break; + list_move(&peer_req->w.list, to_be_freed); + } +} + +static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) +{ + LIST_HEAD(reclaimed); + struct drbd_peer_request *peer_req, *t; + + spin_lock_irq(&device->resource->req_lock); + reclaim_finished_net_peer_reqs(device, &reclaimed); + spin_unlock_irq(&device->resource->req_lock); + + list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) + drbd_free_net_peer_req(device, peer_req); +} + +/** + * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) + * @device: DRBD device. + * @number: number of pages requested + * @retry: whether to retry, if not enough pages are available right now + * + * Tries to allocate number pages, first from our own page pool, then from + * the kernel. + * Possibly retry until DRBD frees sufficient pages somewhere else. + * + * If this allocation would exceed the max_buffers setting, we throttle + * allocation (schedule_timeout) to give the system some room to breathe. + * + * We do not use max-buffers as hard limit, because it could lead to + * congestion and further to a distributed deadlock during online-verify or + * (checksum based) resync, if the max-buffers, socket buffer sizes and + * resync-rate settings are mis-configured. + * + * Returns a page chain linked via page->private. + */ +struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number, + bool retry) +{ + struct drbd_device *device = peer_device->device; + struct page *page = NULL; + struct net_conf *nc; + DEFINE_WAIT(wait); + unsigned int mxb; + + rcu_read_lock(); + nc = rcu_dereference(peer_device->connection->net_conf); + mxb = nc ? 
nc->max_buffers : 1000000; + rcu_read_unlock(); + + if (atomic_read(&device->pp_in_use) < mxb) + page = __drbd_alloc_pages(device, number); + + while (page == NULL) { + prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); + + drbd_kick_lo_and_reclaim_net(device); + + if (atomic_read(&device->pp_in_use) < mxb) { + page = __drbd_alloc_pages(device, number); + if (page) + break; + } + + if (!retry) + break; + + if (signal_pending(current)) { + drbd_warn(device, "drbd_alloc_pages interrupted!\n"); + break; + } + + if (schedule_timeout(HZ/10) == 0) + mxb = UINT_MAX; + } + finish_wait(&drbd_pp_wait, &wait); + + if (page) + atomic_add(number, &device->pp_in_use); + return page; +} + +/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages. + * Is also used from inside an other spin_lock_irq(&resource->req_lock); + * Either links the page chain back to the global pool, + * or returns all pages to the system. */ +static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net) +{ + atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use; + int i; + + if (page == NULL) + return; + + if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count) + i = page_chain_free(page); + else { + struct page *tmp; + tmp = page_chain_tail(page, &i); + spin_lock(&drbd_pp_lock); + page_chain_add(&drbd_pp_pool, page, tmp); + drbd_pp_vacant += i; + spin_unlock(&drbd_pp_lock); + } + i = atomic_sub_return(i, a); + if (i < 0) + drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n", + is_net ? "pp_in_use_by_net" : "pp_in_use", i); + wake_up(&drbd_pp_wait); +} + +/* +You need to hold the req_lock: + _drbd_wait_ee_list_empty() + +You must not have the req_lock: + drbd_free_peer_req() + drbd_alloc_peer_req() + drbd_free_peer_reqs() + drbd_ee_fix_bhs() + drbd_finish_peer_reqs() + drbd_clear_done_ee() + drbd_wait_ee_list_empty() +*/ + +struct drbd_peer_request * +drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector, + unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + struct drbd_peer_request *peer_req; + struct page *page = NULL; + unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; + + if (drbd_insert_fault(device, DRBD_FAULT_AL_EE)) + return NULL; + + peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); + if (!peer_req) { + if (!(gfp_mask & __GFP_NOWARN)) + drbd_err(device, "%s: allocation failed\n", __func__); + return NULL; + } + + if (has_payload && data_size) { + page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT)); + if (!page) + goto fail; + } + + memset(peer_req, 0, sizeof(*peer_req)); + INIT_LIST_HEAD(&peer_req->w.list); + drbd_clear_interval(&peer_req->i); + peer_req->i.size = data_size; + peer_req->i.sector = sector; + peer_req->submit_jif = jiffies; + peer_req->peer_device = peer_device; + peer_req->pages = page; + /* + * The block_id is opaque to the receiver. It is not endianness + * converted, and sent back to the sender unchanged. 
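+ * (See the wire format note in drbd_protocol.h: block_id is a pointer
+ * to a local struct, which the peer just echoes back as received.)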
+ */ + peer_req->block_id = id; + + return peer_req; + + fail: + mempool_free(peer_req, drbd_ee_mempool); + return NULL; +} + +void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req, + int is_net) +{ + might_sleep(); + if (peer_req->flags & EE_HAS_DIGEST) + kfree(peer_req->digest); + drbd_free_pages(device, peer_req->pages, is_net); + D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0); + D_ASSERT(device, drbd_interval_empty(&peer_req->i)); + if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) { + peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; + drbd_al_complete_io(device, &peer_req->i); + } + mempool_free(peer_req, drbd_ee_mempool); +} + +int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list) +{ + LIST_HEAD(work_list); + struct drbd_peer_request *peer_req, *t; + int count = 0; + int is_net = list == &device->net_ee; + + spin_lock_irq(&device->resource->req_lock); + list_splice_init(list, &work_list); + spin_unlock_irq(&device->resource->req_lock); + + list_for_each_entry_safe(peer_req, t, &work_list, w.list) { + __drbd_free_peer_req(device, peer_req, is_net); + count++; + } + return count; +} + +/* + * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier. + */ +static int drbd_finish_peer_reqs(struct drbd_device *device) +{ + LIST_HEAD(work_list); + LIST_HEAD(reclaimed); + struct drbd_peer_request *peer_req, *t; + int err = 0; + + spin_lock_irq(&device->resource->req_lock); + reclaim_finished_net_peer_reqs(device, &reclaimed); + list_splice_init(&device->done_ee, &work_list); + spin_unlock_irq(&device->resource->req_lock); + + list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) + drbd_free_net_peer_req(device, peer_req); + + /* possible callbacks here: + * e_end_block, and e_end_resync_block, e_send_superseded. + * all ignore the last argument. + */ + list_for_each_entry_safe(peer_req, t, &work_list, w.list) { + int err2; + + /* list_del not necessary, next/prev members not touched */ + err2 = peer_req->w.cb(&peer_req->w, !!err); + if (!err) + err = err2; + drbd_free_peer_req(device, peer_req); + } + wake_up(&device->ee_wait); + + return err; +} + +static void _drbd_wait_ee_list_empty(struct drbd_device *device, + struct list_head *head) +{ + DEFINE_WAIT(wait); + + /* avoids spin_lock/unlock + * and calling prepare_to_wait in the fast path */ + while (!list_empty(head)) { + prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&device->resource->req_lock); + io_schedule(); + finish_wait(&device->ee_wait, &wait); + spin_lock_irq(&device->resource->req_lock); + } +} + +static void drbd_wait_ee_list_empty(struct drbd_device *device, + struct list_head *head) +{ + spin_lock_irq(&device->resource->req_lock); + _drbd_wait_ee_list_empty(device, head); + spin_unlock_irq(&device->resource->req_lock); +} + +static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags) +{ + struct kvec iov = { + .iov_base = buf, + .iov_len = size, + }; + struct msghdr msg = { + .msg_flags = (flags ? 
flags : MSG_WAITALL | MSG_NOSIGNAL) + }; + return kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags); +} + +static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size) +{ + int rv; + + rv = drbd_recv_short(connection->data.socket, buf, size, 0); + + if (rv < 0) { + if (rv == -ECONNRESET) + drbd_info(connection, "sock was reset by peer\n"); + else if (rv != -ERESTARTSYS) + drbd_err(connection, "sock_recvmsg returned %d\n", rv); + } else if (rv == 0) { + if (test_bit(DISCONNECT_SENT, &connection->flags)) { + long t; + rcu_read_lock(); + t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10; + rcu_read_unlock(); + + t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t); + + if (t) + goto out; + } + drbd_info(connection, "sock was shut down by peer\n"); + } + + if (rv != size) + conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD); + +out: + return rv; +} + +static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size) +{ + int err; + + err = drbd_recv(connection, buf, size); + if (err != size) { + if (err >= 0) + err = -EIO; + } else + err = 0; + return err; +} + +static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size) +{ + int err; + + err = drbd_recv_all(connection, buf, size); + if (err && !signal_pending(current)) + drbd_warn(connection, "short read (expected size %d)\n", (int)size); + return err; +} + +/* quoting tcp(7): + * On individual connections, the socket buffer size must be set prior to the + * listen(2) or connect(2) calls in order to have it take effect. + * This is our wrapper to do so. + */ +static void drbd_setbufsize(struct socket *sock, unsigned int snd, + unsigned int rcv) +{ + /* open coded SO_SNDBUF, SO_RCVBUF */ + if (snd) { + sock->sk->sk_sndbuf = snd; + sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + } + if (rcv) { + sock->sk->sk_rcvbuf = rcv; + sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + } +} + +static struct socket *drbd_try_connect(struct drbd_connection *connection) +{ + const char *what; + struct socket *sock; + struct sockaddr_in6 src_in6; + struct sockaddr_in6 peer_in6; + struct net_conf *nc; + int err, peer_addr_len, my_addr_len; + int sndbuf_size, rcvbuf_size, connect_int; + int disconnect_on_error = 1; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (!nc) { + rcu_read_unlock(); + return NULL; + } + sndbuf_size = nc->sndbuf_size; + rcvbuf_size = nc->rcvbuf_size; + connect_int = nc->connect_int; + rcu_read_unlock(); + + my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6)); + memcpy(&src_in6, &connection->my_addr, my_addr_len); + + if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6) + src_in6.sin6_port = 0; + else + ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ + + peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6)); + memcpy(&peer_in6, &connection->peer_addr, peer_addr_len); + + what = "sock_create_kern"; + err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family, + SOCK_STREAM, IPPROTO_TCP, &sock); + if (err < 0) { + sock = NULL; + goto out; + } + + sock->sk->sk_rcvtimeo = + sock->sk->sk_sndtimeo = connect_int * HZ; + drbd_setbufsize(sock, sndbuf_size, rcvbuf_size); + + /* explicitly bind to the configured IP as source IP + * for the outgoing connections. + * This is needed for multihomed hosts and to be + * able to use lo: interfaces for drbd. 
+ * Make sure to use 0 as port number, so linux selects + * a free one dynamically. + */ + what = "bind before connect"; + err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len); + if (err < 0) + goto out; + + /* connect may fail, peer not yet available. + * stay C_WF_CONNECTION, don't go Disconnecting! */ + disconnect_on_error = 0; + what = "connect"; + err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0); + +out: + if (err < 0) { + if (sock) { + sock_release(sock); + sock = NULL; + } + switch (-err) { + /* timeout, busy, signal pending */ + case ETIMEDOUT: case EAGAIN: case EINPROGRESS: + case EINTR: case ERESTARTSYS: + /* peer not (yet) available, network problem */ + case ECONNREFUSED: case ENETUNREACH: + case EHOSTDOWN: case EHOSTUNREACH: + disconnect_on_error = 0; + break; + default: + drbd_err(connection, "%s failed, err = %d\n", what, err); + } + if (disconnect_on_error) + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + } + + return sock; +} + +struct accept_wait_data { + struct drbd_connection *connection; + struct socket *s_listen; + struct completion door_bell; + void (*original_sk_state_change)(struct sock *sk); + +}; + +static void drbd_incoming_connection(struct sock *sk) +{ + struct accept_wait_data *ad = sk->sk_user_data; + void (*state_change)(struct sock *sk); + + state_change = ad->original_sk_state_change; + if (sk->sk_state == TCP_ESTABLISHED) + complete(&ad->door_bell); + state_change(sk); +} + +static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad) +{ + int err, sndbuf_size, rcvbuf_size, my_addr_len; + struct sockaddr_in6 my_addr; + struct socket *s_listen; + struct net_conf *nc; + const char *what; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (!nc) { + rcu_read_unlock(); + return -EIO; + } + sndbuf_size = nc->sndbuf_size; + rcvbuf_size = nc->rcvbuf_size; + rcu_read_unlock(); + + my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6)); + memcpy(&my_addr, &connection->my_addr, my_addr_len); + + what = "sock_create_kern"; + err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family, + SOCK_STREAM, IPPROTO_TCP, &s_listen); + if (err) { + s_listen = NULL; + goto out; + } + + s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ + drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size); + + what = "bind before listen"; + err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len); + if (err < 0) + goto out; + + ad->s_listen = s_listen; + write_lock_bh(&s_listen->sk->sk_callback_lock); + ad->original_sk_state_change = s_listen->sk->sk_state_change; + s_listen->sk->sk_state_change = drbd_incoming_connection; + s_listen->sk->sk_user_data = ad; + write_unlock_bh(&s_listen->sk->sk_callback_lock); + + what = "listen"; + err = s_listen->ops->listen(s_listen, 5); + if (err < 0) + goto out; + + return 0; +out: + if (s_listen) + sock_release(s_listen); + if (err < 0) { + if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { + drbd_err(connection, "%s failed, err = %d\n", what, err); + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + } + } + + return -EIO; +} + +static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad) +{ + write_lock_bh(&sk->sk_callback_lock); + sk->sk_state_change = ad->original_sk_state_change; + sk->sk_user_data = NULL; + write_unlock_bh(&sk->sk_callback_lock); +} + +static struct socket *drbd_wait_for_connect(struct drbd_connection 
*connection, struct accept_wait_data *ad) +{ + int timeo, connect_int, err = 0; + struct socket *s_estab = NULL; + struct net_conf *nc; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (!nc) { + rcu_read_unlock(); + return NULL; + } + connect_int = nc->connect_int; + rcu_read_unlock(); + + timeo = connect_int * HZ; + /* 28.5% random jitter */ + timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7; + + err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo); + if (err <= 0) + return NULL; + + err = kernel_accept(ad->s_listen, &s_estab, 0); + if (err < 0) { + if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { + drbd_err(connection, "accept failed, err = %d\n", err); + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + } + } + + if (s_estab) + unregister_state_change(s_estab->sk, ad); + + return s_estab; +} + +static int decode_header(struct drbd_connection *, void *, struct packet_info *); + +static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock, + enum drbd_packet cmd) +{ + if (!conn_prepare_command(connection, sock)) + return -EIO; + return conn_send_command(connection, sock, cmd, 0, NULL, 0); +} + +static int receive_first_packet(struct drbd_connection *connection, struct socket *sock) +{ + unsigned int header_size = drbd_header_size(connection); + struct packet_info pi; + struct net_conf *nc; + int err; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (!nc) { + rcu_read_unlock(); + return -EIO; + } + sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10; + rcu_read_unlock(); + + err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0); + if (err != header_size) { + if (err >= 0) + err = -EIO; + return err; + } + err = decode_header(connection, connection->data.rbuf, &pi); + if (err) + return err; + return pi.cmd; +} + +/** + * drbd_socket_okay() - Free the socket if its connection is not okay + * @sock: pointer to the pointer to the socket. + */ +static bool drbd_socket_okay(struct socket **sock) +{ + int rr; + char tb[4]; + + if (!*sock) + return false; + + rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); + + if (rr > 0 || rr == -EAGAIN) { + return true; + } else { + sock_release(*sock); + *sock = NULL; + return false; + } +} + +static bool connection_established(struct drbd_connection *connection, + struct socket **sock1, + struct socket **sock2) +{ + struct net_conf *nc; + int timeout; + bool ok; + + if (!*sock1 || !*sock2) + return false; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10; + rcu_read_unlock(); + schedule_timeout_interruptible(timeout); + + ok = drbd_socket_okay(sock1); + ok = drbd_socket_okay(sock2) && ok; + + return ok; +} + +/* Gets called if a connection is established, or if a new minor gets created + in a connection */ +int drbd_connected(struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + int err; + + atomic_set(&device->packet_seq, 0); + device->peer_seq = 0; + + device->state_mutex = peer_device->connection->agreed_pro_version < 100 ? 
+ &peer_device->connection->cstate_mutex : + &device->own_state_mutex; + + err = drbd_send_sync_param(peer_device); + if (!err) + err = drbd_send_sizes(peer_device, 0, 0); + if (!err) + err = drbd_send_uuids(peer_device); + if (!err) + err = drbd_send_current_state(peer_device); + clear_bit(USE_DEGR_WFC_T, &device->flags); + clear_bit(RESIZE_PENDING, &device->flags); + atomic_set(&device->ap_in_flight, 0); + mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */ + return err; +} + +/* + * return values: + * 1 yes, we have a valid connection + * 0 oops, did not work out, please try again + * -1 peer talks different language, + * no point in trying again, please go standalone. + * -2 We do not have a network config... + */ +static int conn_connect(struct drbd_connection *connection) +{ + struct drbd_socket sock, msock; + struct drbd_peer_device *peer_device; + struct net_conf *nc; + int vnr, timeout, h; + bool discard_my_data, ok; + enum drbd_state_rv rv; + struct accept_wait_data ad = { + .connection = connection, + .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell), + }; + + clear_bit(DISCONNECT_SENT, &connection->flags); + if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS) + return -2; + + mutex_init(&sock.mutex); + sock.sbuf = connection->data.sbuf; + sock.rbuf = connection->data.rbuf; + sock.socket = NULL; + mutex_init(&msock.mutex); + msock.sbuf = connection->meta.sbuf; + msock.rbuf = connection->meta.rbuf; + msock.socket = NULL; + + /* Assume that the peer only understands protocol 80 until we know better. */ + connection->agreed_pro_version = 80; + + if (prepare_listen_socket(connection, &ad)) + return 0; + + do { + struct socket *s; + + s = drbd_try_connect(connection); + if (s) { + if (!sock.socket) { + sock.socket = s; + send_first_packet(connection, &sock, P_INITIAL_DATA); + } else if (!msock.socket) { + clear_bit(RESOLVE_CONFLICTS, &connection->flags); + msock.socket = s; + send_first_packet(connection, &msock, P_INITIAL_META); + } else { + drbd_err(connection, "Logic error in conn_connect()\n"); + goto out_release_sockets; + } + } + + if (connection_established(connection, &sock.socket, &msock.socket)) + break; + +retry: + s = drbd_wait_for_connect(connection, &ad); + if (s) { + int fp = receive_first_packet(connection, s); + drbd_socket_okay(&sock.socket); + drbd_socket_okay(&msock.socket); + switch (fp) { + case P_INITIAL_DATA: + if (sock.socket) { + drbd_warn(connection, "initial packet S crossed\n"); + sock_release(sock.socket); + sock.socket = s; + goto randomize; + } + sock.socket = s; + break; + case P_INITIAL_META: + set_bit(RESOLVE_CONFLICTS, &connection->flags); + if (msock.socket) { + drbd_warn(connection, "initial packet M crossed\n"); + sock_release(msock.socket); + msock.socket = s; + goto randomize; + } + msock.socket = s; + break; + default: + drbd_warn(connection, "Error receiving initial packet\n"); + sock_release(s); +randomize: + if (prandom_u32() & 1) + goto retry; + } + } + + if (connection->cstate <= C_DISCONNECTING) + goto out_release_sockets; + if (signal_pending(current)) { + flush_signals(current); + smp_rmb(); + if (get_t_state(&connection->receiver) == EXITING) + goto out_release_sockets; + } + + ok = connection_established(connection, &sock.socket, &msock.socket); + } while (!ok); + + if (ad.s_listen) + sock_release(ad.s_listen); + + sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ + msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ + + 
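+	/* Both sockets are up: "sock" carries the bulk data stream, "msock"
+	 * the acks and pings.  Use GFP_NOIO for socket memory so that send
+	 * and receive paths cannot recurse into block I/O via memory reclaim,
+	 * and give the latency-critical meta socket the higher priority. */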
sock.socket->sk->sk_allocation = GFP_NOIO; + msock.socket->sk->sk_allocation = GFP_NOIO; + + sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; + msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE; + + /* NOT YET ... + * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10; + * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + * first set it to the P_CONNECTION_FEATURES timeout, + * which we set to 4x the configured ping_timeout. */ + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + + sock.socket->sk->sk_sndtimeo = + sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10; + + msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ; + timeout = nc->timeout * HZ / 10; + discard_my_data = nc->discard_my_data; + rcu_read_unlock(); + + msock.socket->sk->sk_sndtimeo = timeout; + + /* we don't want delays. + * we use TCP_CORK where appropriate, though */ + drbd_tcp_nodelay(sock.socket); + drbd_tcp_nodelay(msock.socket); + + connection->data.socket = sock.socket; + connection->meta.socket = msock.socket; + connection->last_received = jiffies; + + h = drbd_do_features(connection); + if (h <= 0) + return h; + + if (connection->cram_hmac_tfm) { + /* drbd_request_state(device, NS(conn, WFAuth)); */ + switch (drbd_do_auth(connection)) { + case -1: + drbd_err(connection, "Authentication of peer failed\n"); + return -1; + case 0: + drbd_err(connection, "Authentication of peer failed, trying again.\n"); + return 0; + } + } + + connection->data.socket->sk->sk_sndtimeo = timeout; + connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + + if (drbd_send_protocol(connection) == -EOPNOTSUPP) + return -1; + + /* Prevent a race between resync-handshake and + * being promoted to Primary. + * + * Grab and release the state mutex, so we know that any current + * drbd_set_role() is finished, and any incoming drbd_set_role + * will see the STATE_SENT flag, and wait for it to be cleared. + */ + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + mutex_lock(peer_device->device->state_mutex); + + set_bit(STATE_SENT, &connection->flags); + + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + mutex_unlock(peer_device->device->state_mutex); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + kref_get(&device->kref); + rcu_read_unlock(); + + if (discard_my_data) + set_bit(DISCARD_MY_DATA, &device->flags); + else + clear_bit(DISCARD_MY_DATA, &device->flags); + + drbd_connected(peer_device); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); + + rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE); + if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) { + clear_bit(STATE_SENT, &connection->flags); + return 0; + } + + drbd_thread_start(&connection->asender); + + mutex_lock(&connection->resource->conf_update); + /* The discard_my_data flag is a single-shot modifier to the next + * connection attempt, the handshake of which is now well underway. + * No need for rcu style copying of the whole struct + * just to clear a single value. 
*/ + connection->net_conf->discard_my_data = 0; + mutex_unlock(&connection->resource->conf_update); + + return h; + +out_release_sockets: + if (ad.s_listen) + sock_release(ad.s_listen); + if (sock.socket) + sock_release(sock.socket); + if (msock.socket) + sock_release(msock.socket); + return -1; +} + +static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi) +{ + unsigned int header_size = drbd_header_size(connection); + + if (header_size == sizeof(struct p_header100) && + *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) { + struct p_header100 *h = header; + if (h->pad != 0) { + drbd_err(connection, "Header padding is not zero\n"); + return -EINVAL; + } + pi->vnr = be16_to_cpu(h->volume); + pi->cmd = be16_to_cpu(h->command); + pi->size = be32_to_cpu(h->length); + } else if (header_size == sizeof(struct p_header95) && + *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) { + struct p_header95 *h = header; + pi->cmd = be16_to_cpu(h->command); + pi->size = be32_to_cpu(h->length); + pi->vnr = 0; + } else if (header_size == sizeof(struct p_header80) && + *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) { + struct p_header80 *h = header; + pi->cmd = be16_to_cpu(h->command); + pi->size = be16_to_cpu(h->length); + pi->vnr = 0; + } else { + drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n", + be32_to_cpu(*(__be32 *)header), + connection->agreed_pro_version); + return -EINVAL; + } + pi->data = header + header_size; + return 0; +} + +static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi) +{ + void *buffer = connection->data.rbuf; + int err; + + err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection)); + if (err) + return err; + + err = decode_header(connection, buffer, pi); + connection->last_received = jiffies; + + return err; +} + +static void drbd_flush(struct drbd_connection *connection) +{ + int rv; + struct drbd_peer_device *peer_device; + int vnr; + + if (connection->resource->write_ordering >= WO_bdev_flush) { + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + + if (!get_ldev(device)) + continue; + kref_get(&device->kref); + rcu_read_unlock(); + + /* Right now, we have only this one synchronous code path + * for flushes between request epochs. + * We may want to make those asynchronous, + * or at least parallelize the flushes to the volume devices. + */ + device->flush_jif = jiffies; + set_bit(FLUSH_PENDING, &device->flags); + rv = blkdev_issue_flush(device->ldev->backing_bdev, + GFP_NOIO, NULL); + clear_bit(FLUSH_PENDING, &device->flags); + if (rv) { + drbd_info(device, "local disk flush failed with status %d\n", rv); + /* would rather check on EOPNOTSUPP, but that is not reliable. + * don't try again for ANY return value != 0 + * if (rv == -EOPNOTSUPP) */ + drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io); + } + put_ldev(device); + kref_put(&device->kref, drbd_destroy_device); + + rcu_read_lock(); + if (rv) + break; + } + rcu_read_unlock(); + } +} + +/** + * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it. + * @device: DRBD device. + * @epoch: Epoch object. + * @ev: Epoch event. 
+ */ +static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection, + struct drbd_epoch *epoch, + enum epoch_event ev) +{ + int epoch_size; + struct drbd_epoch *next_epoch; + enum finish_epoch rv = FE_STILL_LIVE; + + spin_lock(&connection->epoch_lock); + do { + next_epoch = NULL; + + epoch_size = atomic_read(&epoch->epoch_size); + + switch (ev & ~EV_CLEANUP) { + case EV_PUT: + atomic_dec(&epoch->active); + break; + case EV_GOT_BARRIER_NR: + set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags); + break; + case EV_BECAME_LAST: + /* nothing to do*/ + break; + } + + if (epoch_size != 0 && + atomic_read(&epoch->active) == 0 && + (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { + if (!(ev & EV_CLEANUP)) { + spin_unlock(&connection->epoch_lock); + drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size); + spin_lock(&connection->epoch_lock); + } +#if 0 + /* FIXME: dec unacked on connection, once we have + * something to count pending connection packets in. */ + if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) + dec_unacked(epoch->connection); +#endif + + if (connection->current_epoch != epoch) { + next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); + list_del(&epoch->list); + ev = EV_BECAME_LAST | (ev & EV_CLEANUP); + connection->epochs--; + kfree(epoch); + + if (rv == FE_STILL_LIVE) + rv = FE_DESTROYED; + } else { + epoch->flags = 0; + atomic_set(&epoch->epoch_size, 0); + /* atomic_set(&epoch->active, 0); is already zero */ + if (rv == FE_STILL_LIVE) + rv = FE_RECYCLED; + } + } + + if (!next_epoch) + break; + + epoch = next_epoch; + } while (1); + + spin_unlock(&connection->epoch_lock); + + return rv; +} + +static enum write_ordering_e +max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo) +{ + struct disk_conf *dc; + + dc = rcu_dereference(bdev->disk_conf); + + if (wo == WO_bdev_flush && !dc->disk_flushes) + wo = WO_drain_io; + if (wo == WO_drain_io && !dc->disk_drain) + wo = WO_none; + + return wo; +} + +/** + * drbd_bump_write_ordering() - Fall back to an other write ordering method + * @connection: DRBD connection. + * @wo: Write ordering method to try. + */ +void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev, + enum write_ordering_e wo) +{ + struct drbd_device *device; + enum write_ordering_e pwo; + int vnr; + static char *write_ordering_str[] = { + [WO_none] = "none", + [WO_drain_io] = "drain", + [WO_bdev_flush] = "flush", + }; + + pwo = resource->write_ordering; + if (wo != WO_bdev_flush) + wo = min(pwo, wo); + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, vnr) { + if (get_ldev(device)) { + wo = max_allowed_wo(device->ldev, wo); + if (device->ldev == bdev) + bdev = NULL; + put_ldev(device); + } + } + + if (bdev) + wo = max_allowed_wo(bdev, wo); + + rcu_read_unlock(); + + resource->write_ordering = wo; + if (pwo != resource->write_ordering || wo == WO_bdev_flush) + drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); +} + +/** + * drbd_submit_peer_request() + * @device: DRBD device. + * @peer_req: peer request + * @rw: flag field, see bio->bi_rw + * + * May spread the pages to multiple bios, + * depending on bio_add_page restrictions. + * + * Returns 0 if all bios have been submitted, + * -ENOMEM if we could not allocate enough bios, + * -ENOSPC (any better suggestion?) 
if we have not been able to bio_add_page a + * single page to an empty bio (which should never happen and likely indicates + * that the lower level IO stack is in some way broken). This has been observed + * on certain Xen deployments. + */ +/* TODO allocate from our own bio_set. */ +int drbd_submit_peer_request(struct drbd_device *device, + struct drbd_peer_request *peer_req, + const unsigned rw, const int fault_type) +{ + struct bio *bios = NULL; + struct bio *bio; + struct page *page = peer_req->pages; + sector_t sector = peer_req->i.sector; + unsigned data_size = peer_req->i.size; + unsigned n_bios = 0; + unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; + int err = -ENOMEM; + + if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { + /* wait for all pending IO completions, before we start + * zeroing things out. */ + conn_wait_active_ee_empty(first_peer_device(device)->connection); + /* add it to the active list now, + * so we can find it to present it in debugfs */ + peer_req->submit_jif = jiffies; + peer_req->flags |= EE_SUBMITTED; + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->active_ee); + spin_unlock_irq(&device->resource->req_lock); + if (blkdev_issue_zeroout(device->ldev->backing_bdev, + sector, data_size >> 9, GFP_NOIO, false)) + peer_req->flags |= EE_WAS_ERROR; + drbd_endio_write_sec_final(peer_req); + return 0; + } + + /* Discards don't have any payload. + * But the scsi layer still expects a bio_vec it can use internally, + * see sd_setup_discard_cmnd() and blk_add_request_payload(). */ + if (peer_req->flags & EE_IS_TRIM) + nr_pages = 1; + + /* In most cases, we will only need one bio. But in case the lower + * level restrictions happen to be different at this offset on this + * side than those of the sending peer, we may need to submit the + * request in more than one bio. + * + * Plain bio_alloc is good enough here, this is no DRBD internally + * generated bio, but a bio allocated on behalf of the peer. + */ +next_bio: + bio = bio_alloc(GFP_NOIO, nr_pages); + if (!bio) { + drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages); + goto fail; + } + /* > peer_req->i.sector, unless this is the first bio */ + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = device->ldev->backing_bdev; + bio->bi_rw = rw; + bio->bi_private = peer_req; + bio->bi_end_io = drbd_peer_request_endio; + + bio->bi_next = bios; + bios = bio; + ++n_bios; + + if (rw & REQ_DISCARD) { + bio->bi_iter.bi_size = data_size; + goto submit; + } + + page_chain_for_each(page) { + unsigned len = min_t(unsigned, data_size, PAGE_SIZE); + if (!bio_add_page(bio, page, len, 0)) { + /* A single page must always be possible! + * But in case it fails anyways, + * we deal with it, and complain (below). 
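+			 * If this bio already holds at least one page, we simply
+			 * start a new bio (next_bio); only a completely empty bio
+			 * is treated as fatal and mapped to -ENOSPC.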
*/ + if (bio->bi_vcnt == 0) { + drbd_err(device, + "bio_add_page failed for len=%u, " + "bi_vcnt=0 (bi_sector=%llu)\n", + len, (uint64_t)bio->bi_iter.bi_sector); + err = -ENOSPC; + goto fail; + } + goto next_bio; + } + data_size -= len; + sector += len >> 9; + --nr_pages; + } + D_ASSERT(device, data_size == 0); +submit: + D_ASSERT(device, page == NULL); + + atomic_set(&peer_req->pending_bios, n_bios); + /* for debugfs: update timestamp, mark as submitted */ + peer_req->submit_jif = jiffies; + peer_req->flags |= EE_SUBMITTED; + do { + bio = bios; + bios = bios->bi_next; + bio->bi_next = NULL; + + drbd_generic_make_request(device, fault_type, bio); + } while (bios); + return 0; + +fail: + while (bios) { + bio = bios; + bios = bios->bi_next; + bio_put(bio); + } + return err; +} + +static void drbd_remove_epoch_entry_interval(struct drbd_device *device, + struct drbd_peer_request *peer_req) +{ + struct drbd_interval *i = &peer_req->i; + + drbd_remove_interval(&device->write_requests, i); + drbd_clear_interval(i); + + /* Wake up any processes waiting for this peer request to complete. */ + if (i->waiting) + wake_up(&device->misc_wait); +} + +static void conn_wait_active_ee_empty(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + + kref_get(&device->kref); + rcu_read_unlock(); + drbd_wait_ee_list_empty(device, &device->active_ee); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); +} + +static struct drbd_peer_device * +conn_peer_device(struct drbd_connection *connection, int volume_number) +{ + return idr_find(&connection->peer_devices, volume_number); +} + +static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi) +{ + int rv; + struct p_barrier *p = pi->data; + struct drbd_epoch *epoch; + + /* FIXME these are unacked on connection, + * not a specific (peer)device. + */ + connection->current_epoch->barrier_nr = p->barrier; + connection->current_epoch->connection = connection; + rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR); + + /* P_BARRIER_ACK may imply that the corresponding extent is dropped from + * the activity log, which means it would not be resynced in case the + * R_PRIMARY crashes now. + * Therefore we must send the barrier_ack after the barrier request was + * completed. */ + switch (connection->resource->write_ordering) { + case WO_none: + if (rv == FE_RECYCLED) + return 0; + + /* receiver context, in the writeout path of the other node. 
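+		 * Use a GFP_NOIO allocation so we do not recurse into writeout
+		 * ourselves; if even that fails, warn and fall through to the
+		 * drain/flush path below instead of blocking for memory.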
+ * avoid potential distributed deadlock */ + epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); + if (epoch) + break; + else + drbd_warn(connection, "Allocation of an epoch failed, slowing down\n"); + /* Fall through */ + + case WO_bdev_flush: + case WO_drain_io: + conn_wait_active_ee_empty(connection); + drbd_flush(connection); + + if (atomic_read(&connection->current_epoch->epoch_size)) { + epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); + if (epoch) + break; + } + + return 0; + default: + drbd_err(connection, "Strangeness in connection->write_ordering %d\n", + connection->resource->write_ordering); + return -EIO; + } + + epoch->flags = 0; + atomic_set(&epoch->epoch_size, 0); + atomic_set(&epoch->active, 0); + + spin_lock(&connection->epoch_lock); + if (atomic_read(&connection->current_epoch->epoch_size)) { + list_add(&epoch->list, &connection->current_epoch->list); + connection->current_epoch = epoch; + connection->epochs++; + } else { + /* The current_epoch got recycled while we allocated this one... */ + kfree(epoch); + } + spin_unlock(&connection->epoch_lock); + + return 0; +} + +/* used from receive_RSDataReply (recv_resync_read) + * and from receive_Data */ +static struct drbd_peer_request * +read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, + struct packet_info *pi) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + const sector_t capacity = drbd_get_capacity(device->this_bdev); + struct drbd_peer_request *peer_req; + struct page *page; + int digest_size, err; + unsigned int data_size = pi->size, ds; + void *dig_in = peer_device->connection->int_dig_in; + void *dig_vv = peer_device->connection->int_dig_vv; + unsigned long *data; + struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; + + digest_size = 0; + if (!trim && peer_device->connection->peer_integrity_tfm) { + digest_size = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); + /* + * FIXME: Receive the incoming digest into the receive buffer + * here, together with its struct p_data? + */ + err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size); + if (err) + return NULL; + data_size -= digest_size; + } + + if (trim) { + D_ASSERT(peer_device, data_size == 0); + data_size = be32_to_cpu(trim->size); + } + + if (!expect(IS_ALIGNED(data_size, 512))) + return NULL; + /* prepare for larger trim requests. */ + if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE)) + return NULL; + + /* even though we trust out peer, + * we sometimes have to double check. */ + if (sector + (data_size>>9) > capacity) { + drbd_err(device, "request from peer beyond end of local disk: " + "capacity: %llus < sector: %llus + size: %u\n", + (unsigned long long)capacity, + (unsigned long long)sector, data_size); + return NULL; + } + + /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD + * "criss-cross" setup, that might cause write-out on some other DRBD, + * which in turn might block on the other node at this very place. 
*/ + peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO); + if (!peer_req) + return NULL; + + peer_req->flags |= EE_WRITE; + if (trim) + return peer_req; + + ds = data_size; + page = peer_req->pages; + page_chain_for_each(page) { + unsigned len = min_t(int, ds, PAGE_SIZE); + data = kmap(page); + err = drbd_recv_all_warn(peer_device->connection, data, len); + if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) { + drbd_err(device, "Fault injection: Corrupting data on receive\n"); + data[0] = data[0] ^ (unsigned long)-1; + } + kunmap(page); + if (err) { + drbd_free_peer_req(device, peer_req); + return NULL; + } + ds -= len; + } + + if (digest_size) { + drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv); + if (memcmp(dig_in, dig_vv, digest_size)) { + drbd_err(device, "Digest integrity check FAILED: %llus +%u\n", + (unsigned long long)sector, data_size); + drbd_free_peer_req(device, peer_req); + return NULL; + } + } + device->recv_cnt += data_size >> 9; + return peer_req; +} + +/* drbd_drain_block() just takes a data block + * out of the socket input buffer, and discards it. + */ +static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size) +{ + struct page *page; + int err = 0; + void *data; + + if (!data_size) + return 0; + + page = drbd_alloc_pages(peer_device, 1, 1); + + data = kmap(page); + while (data_size) { + unsigned int len = min_t(int, data_size, PAGE_SIZE); + + err = drbd_recv_all_warn(peer_device->connection, data, len); + if (err) + break; + data_size -= len; + } + kunmap(page); + drbd_free_pages(peer_device->device, page, 0); + return err; +} + +static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req, + sector_t sector, int data_size) +{ + struct bio_vec bvec; + struct bvec_iter iter; + struct bio *bio; + int digest_size, err, expect; + void *dig_in = peer_device->connection->int_dig_in; + void *dig_vv = peer_device->connection->int_dig_vv; + + digest_size = 0; + if (peer_device->connection->peer_integrity_tfm) { + digest_size = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); + err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size); + if (err) + return err; + data_size -= digest_size; + } + + /* optimistically update recv_cnt. if receiving fails below, + * we disconnect anyways, and counters will be reset. */ + peer_device->device->recv_cnt += data_size>>9; + + bio = req->master_bio; + D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector); + + bio_for_each_segment(bvec, bio, iter) { + void *mapped = kmap(bvec.bv_page) + bvec.bv_offset; + expect = min_t(int, data_size, bvec.bv_len); + err = drbd_recv_all_warn(peer_device->connection, mapped, expect); + kunmap(bvec.bv_page); + if (err) + return err; + data_size -= expect; + } + + if (digest_size) { + drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv); + if (memcmp(dig_in, dig_vv, digest_size)) { + drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n"); + return -EINVAL; + } + } + + D_ASSERT(peer_device->device, data_size == 0); + return 0; +} + +/* + * e_end_resync_block() is called in asender context via + * drbd_finish_peer_reqs(). 
+ */ +static int e_end_resync_block(struct drbd_work *w, int unused) +{ + struct drbd_peer_request *peer_req = + container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + sector_t sector = peer_req->i.sector; + int err; + + D_ASSERT(device, drbd_interval_empty(&peer_req->i)); + + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + drbd_set_in_sync(device, sector, peer_req->i.size); + err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req); + } else { + /* Record failure to sync */ + drbd_rs_failed_io(device, sector, peer_req->i.size); + + err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req); + } + dec_unacked(device); + + return err; +} + +static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector, + struct packet_info *pi) __releases(local) +{ + struct drbd_device *device = peer_device->device; + struct drbd_peer_request *peer_req; + + peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi); + if (!peer_req) + goto fail; + + dec_rs_pending(device); + + inc_unacked(device); + /* corresponding dec_unacked() in e_end_resync_block() + * respective _drbd_clear_done_ee */ + + peer_req->w.cb = e_end_resync_block; + peer_req->submit_jif = jiffies; + + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->sync_ee); + spin_unlock_irq(&device->resource->req_lock); + + atomic_add(pi->size >> 9, &device->rs_sect_ev); + if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0) + return 0; + + /* don't care for the reason here */ + drbd_err(device, "submit failed, triggering re-connect\n"); + spin_lock_irq(&device->resource->req_lock); + list_del(&peer_req->w.list); + spin_unlock_irq(&device->resource->req_lock); + + drbd_free_peer_req(device, peer_req); +fail: + put_ldev(device); + return -EIO; +} + +static struct drbd_request * +find_request(struct drbd_device *device, struct rb_root *root, u64 id, + sector_t sector, bool missing_ok, const char *func) +{ + struct drbd_request *req; + + /* Request object according to our peer */ + req = (struct drbd_request *)(unsigned long)id; + if (drbd_contains_interval(root, sector, &req->i) && req->i.local) + return req; + if (!missing_ok) { + drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func, + (unsigned long)id, (unsigned long long)sector); + } + return NULL; +} + +static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct drbd_request *req; + sector_t sector; + int err; + struct p_data *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + sector = be64_to_cpu(p->sector); + + spin_lock_irq(&device->resource->req_lock); + req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__); + spin_unlock_irq(&device->resource->req_lock); + if (unlikely(!req)) + return -EIO; + + /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid + * special casing it there for the various failure cases. + * still no race with drbd_fail_pending_reads */ + err = recv_dless_read(peer_device, req, sector, pi->size); + if (!err) + req_mod(req, DATA_RECEIVED); + /* else: nothing. handled from drbd_disconnect... 
+ * I don't think we may complete this just yet + * in case we are "on-disconnect: freeze" */ + + return err; +} + +static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + sector_t sector; + int err; + struct p_data *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + sector = be64_to_cpu(p->sector); + D_ASSERT(device, p->block_id == ID_SYNCER); + + if (get_ldev(device)) { + /* data is submitted to disk within recv_resync_read. + * corresponding put_ldev done below on error, + * or in drbd_peer_request_endio. */ + err = recv_resync_read(peer_device, sector, pi); + } else { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Can not write resync data to local disk.\n"); + + err = drbd_drain_block(peer_device, pi->size); + + drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size); + } + + atomic_add(pi->size >> 9, &device->rs_sect_in); + + return err; +} + +static void restart_conflicting_writes(struct drbd_device *device, + sector_t sector, int size) +{ + struct drbd_interval *i; + struct drbd_request *req; + + drbd_for_each_overlap(i, &device->write_requests, sector, size) { + if (!i->local) + continue; + req = container_of(i, struct drbd_request, i); + if (req->rq_state & RQ_LOCAL_PENDING || + !(req->rq_state & RQ_POSTPONED)) + continue; + /* as it is RQ_POSTPONED, this will cause it to + * be queued on the retry workqueue. */ + __req_mod(req, CONFLICT_RESOLVED, NULL); + } +} + +/* + * e_end_block() is called in asender context via drbd_finish_peer_reqs(). + */ +static int e_end_block(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = + container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + sector_t sector = peer_req->i.sector; + int err = 0, pcmd; + + if (peer_req->flags & EE_SEND_WRITE_ACK) { + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + pcmd = (device->state.conn >= C_SYNC_SOURCE && + device->state.conn <= C_PAUSED_SYNC_T && + peer_req->flags & EE_MAY_SET_IN_SYNC) ? + P_RS_WRITE_ACK : P_WRITE_ACK; + err = drbd_send_ack(peer_device, pcmd, peer_req); + if (pcmd == P_RS_WRITE_ACK) + drbd_set_in_sync(device, sector, peer_req->i.size); + } else { + err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req); + /* we expect it to be marked out of sync anyways... + * maybe assert this? */ + } + dec_unacked(device); + } + + /* we delete from the conflict detection hash _after_ we sent out the + * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ + if (peer_req->flags & EE_IN_INTERVAL_TREE) { + spin_lock_irq(&device->resource->req_lock); + D_ASSERT(device, !drbd_interval_empty(&peer_req->i)); + drbd_remove_epoch_entry_interval(device, peer_req); + if (peer_req->flags & EE_RESTART_REQUESTS) + restart_conflicting_writes(device, sector, peer_req->i.size); + spin_unlock_irq(&device->resource->req_lock); + } else + D_ASSERT(device, drbd_interval_empty(&peer_req->i)); + + drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? 
EV_CLEANUP : 0)); + + return err; +} + +static int e_send_ack(struct drbd_work *w, enum drbd_packet ack) +{ + struct drbd_peer_request *peer_req = + container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + int err; + + err = drbd_send_ack(peer_device, ack, peer_req); + dec_unacked(peer_device->device); + + return err; +} + +static int e_send_superseded(struct drbd_work *w, int unused) +{ + return e_send_ack(w, P_SUPERSEDED); +} + +static int e_send_retry_write(struct drbd_work *w, int unused) +{ + struct drbd_peer_request *peer_req = + container_of(w, struct drbd_peer_request, w); + struct drbd_connection *connection = peer_req->peer_device->connection; + + return e_send_ack(w, connection->agreed_pro_version >= 100 ? + P_RETRY_WRITE : P_SUPERSEDED); +} + +static bool seq_greater(u32 a, u32 b) +{ + /* + * We assume 32-bit wrap-around here. + * For 24-bit wrap-around, we would have to shift: + * a <<= 8; b <<= 8; + */ + return (s32)a - (s32)b > 0; +} + +static u32 seq_max(u32 a, u32 b) +{ + return seq_greater(a, b) ? a : b; +} + +static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq) +{ + struct drbd_device *device = peer_device->device; + unsigned int newest_peer_seq; + + if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) { + spin_lock(&device->peer_seq_lock); + newest_peer_seq = seq_max(device->peer_seq, peer_seq); + device->peer_seq = newest_peer_seq; + spin_unlock(&device->peer_seq_lock); + /* wake up only if we actually changed device->peer_seq */ + if (peer_seq == newest_peer_seq) + wake_up(&device->seq_wait); + } +} + +static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) +{ + return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); +} + +/* maybe change sync_ee into interval trees as well? */ +static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req) +{ + struct drbd_peer_request *rs_req; + bool rv = 0; + + spin_lock_irq(&device->resource->req_lock); + list_for_each_entry(rs_req, &device->sync_ee, w.list) { + if (overlaps(peer_req->i.sector, peer_req->i.size, + rs_req->i.sector, rs_req->i.size)) { + rv = 1; + break; + } + } + spin_unlock_irq(&device->resource->req_lock); + + return rv; +} + +/* Called from receive_Data. + * Synchronize packets on sock with packets on msock. + * + * This is here so even when a P_DATA packet traveling via sock overtook an Ack + * packet traveling on msock, they are still processed in the order they have + * been sent. + * + * Note: we don't care for Ack packets overtaking P_DATA packets. + * + * In case packet_seq is larger than device->peer_seq number, there are + * outstanding packets on the msock. We wait for them to arrive. + * In case we are the logically next packet, we update device->peer_seq + * ourselves. Correctly handles 32bit wrap around. + * + * Assume we have a 10 GBit connection, that is about 1<<30 byte per second, + * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds + * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have + * 1<<9 == 512 seconds aka ages for the 32bit wrap around... + * + * returns 0 if we may process the packet, + * -ERESTARTSYS if we were interrupted (by disconnect signal). 
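+ * -ETIMEDOUT if the missing packets did not show up within the
+ *  configured ping timeout; the caller then tears down the connection.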
*/ +static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq) +{ + struct drbd_device *device = peer_device->device; + DEFINE_WAIT(wait); + long timeout; + int ret = 0, tp; + + if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) + return 0; + + spin_lock(&device->peer_seq_lock); + for (;;) { + if (!seq_greater(peer_seq - 1, device->peer_seq)) { + device->peer_seq = seq_max(device->peer_seq, peer_seq); + break; + } + + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + rcu_read_lock(); + tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries; + rcu_read_unlock(); + + if (!tp) + break; + + /* Only need to wait if two_primaries is enabled */ + prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE); + spin_unlock(&device->peer_seq_lock); + rcu_read_lock(); + timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10; + rcu_read_unlock(); + timeout = schedule_timeout(timeout); + spin_lock(&device->peer_seq_lock); + if (!timeout) { + ret = -ETIMEDOUT; + drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n"); + break; + } + } + spin_unlock(&device->peer_seq_lock); + finish_wait(&device->seq_wait, &wait); + return ret; +} + +/* see also bio_flags_to_wire() + * DRBD_REQ_*, because we need to semantically map the flags to data packet + * flags and back. We may replicate to other kernel versions. */ +static unsigned long wire_flags_to_bio(u32 dpf) +{ + return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | + (dpf & DP_FUA ? REQ_FUA : 0) | + (dpf & DP_FLUSH ? REQ_FLUSH : 0) | + (dpf & DP_DISCARD ? REQ_DISCARD : 0); +} + +static void fail_postponed_requests(struct drbd_device *device, sector_t sector, + unsigned int size) +{ + struct drbd_interval *i; + + repeat: + drbd_for_each_overlap(i, &device->write_requests, sector, size) { + struct drbd_request *req; + struct bio_and_error m; + + if (!i->local) + continue; + req = container_of(i, struct drbd_request, i); + if (!(req->rq_state & RQ_POSTPONED)) + continue; + req->rq_state &= ~RQ_POSTPONED; + __req_mod(req, NEG_ACKED, &m); + spin_unlock_irq(&device->resource->req_lock); + if (m.bio) + complete_master_bio(device, &m); + spin_lock_irq(&device->resource->req_lock); + goto repeat; + } +} + +static int handle_write_conflicts(struct drbd_device *device, + struct drbd_peer_request *peer_req) +{ + struct drbd_connection *connection = peer_req->peer_device->connection; + bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags); + sector_t sector = peer_req->i.sector; + const unsigned int size = peer_req->i.size; + struct drbd_interval *i; + bool equal; + int err; + + /* + * Inserting the peer request into the write_requests tree will prevent + * new conflicting local requests from being added. + */ + drbd_insert_interval(&device->write_requests, &peer_req->i); + + repeat: + drbd_for_each_overlap(i, &device->write_requests, sector, size) { + if (i == &peer_req->i) + continue; + if (i->completed) + continue; + + if (!i->local) { + /* + * Our peer has sent a conflicting remote request; this + * should not happen in a two-node setup. Wait for the + * earlier peer request to complete. 
+ */ + err = drbd_wait_misc(device, i); + if (err) + goto out; + goto repeat; + } + + equal = i->sector == sector && i->size == size; + if (resolve_conflicts) { + /* + * If the peer request is fully contained within the + * overlapping request, it can be considered overwritten + * and thus superseded; otherwise, it will be retried + * once all overlapping requests have completed. + */ + bool superseded = i->sector <= sector && i->sector + + (i->size >> 9) >= sector + (size >> 9); + + if (!equal) + drbd_alert(device, "Concurrent writes detected: " + "local=%llus +%u, remote=%llus +%u, " + "assuming %s came first\n", + (unsigned long long)i->sector, i->size, + (unsigned long long)sector, size, + superseded ? "local" : "remote"); + + peer_req->w.cb = superseded ? e_send_superseded : + e_send_retry_write; + list_add_tail(&peer_req->w.list, &device->done_ee); + wake_asender(connection); + + err = -ENOENT; + goto out; + } else { + struct drbd_request *req = + container_of(i, struct drbd_request, i); + + if (!equal) + drbd_alert(device, "Concurrent writes detected: " + "local=%llus +%u, remote=%llus +%u\n", + (unsigned long long)i->sector, i->size, + (unsigned long long)sector, size); + + if (req->rq_state & RQ_LOCAL_PENDING || + !(req->rq_state & RQ_POSTPONED)) { + /* + * Wait for the node with the discard flag to + * decide if this request has been superseded + * or needs to be retried. + * Requests that have been superseded will + * disappear from the write_requests tree. + * + * In addition, wait for the conflicting + * request to finish locally before submitting + * the conflicting peer request. + */ + err = drbd_wait_misc(device, &req->i); + if (err) { + _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD); + fail_postponed_requests(device, sector, size); + goto out; + } + goto repeat; + } + /* + * Remember to restart the conflicting requests after + * the new peer request has completed. + */ + peer_req->flags |= EE_RESTART_REQUESTS; + } + } + err = 0; + + out: + if (err) + drbd_remove_epoch_entry_interval(device, peer_req); + return err; +} + +/* mirrored write */ +static int receive_Data(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct net_conf *nc; + sector_t sector; + struct drbd_peer_request *peer_req; + struct p_data *p = pi->data; + u32 peer_seq = be32_to_cpu(p->seq_num); + int rw = WRITE; + u32 dp_flags; + int err, tp; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + if (!get_ldev(device)) { + int err2; + + err = wait_for_and_update_peer_seq(peer_device, peer_seq); + drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size); + atomic_inc(&connection->current_epoch->epoch_size); + err2 = drbd_drain_block(peer_device, pi->size); + if (!err) + err = err2; + return err; + } + + /* + * Corresponding put_ldev done either below (on various errors), or in + * drbd_peer_request_endio, if we successfully submit the data at the + * end of this function. 
+ */ + + sector = be64_to_cpu(p->sector); + peer_req = read_in_block(peer_device, p->block_id, sector, pi); + if (!peer_req) { + put_ldev(device); + return -EIO; + } + + peer_req->w.cb = e_end_block; + peer_req->submit_jif = jiffies; + peer_req->flags |= EE_APPLICATION; + + dp_flags = be32_to_cpu(p->dp_flags); + rw |= wire_flags_to_bio(dp_flags); + if (pi->cmd == P_TRIM) { + struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); + peer_req->flags |= EE_IS_TRIM; + if (!blk_queue_discard(q)) + peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT; + D_ASSERT(peer_device, peer_req->i.size > 0); + D_ASSERT(peer_device, rw & REQ_DISCARD); + D_ASSERT(peer_device, peer_req->pages == NULL); + } else if (peer_req->pages == NULL) { + D_ASSERT(device, peer_req->i.size == 0); + D_ASSERT(device, dp_flags & DP_FLUSH); + } + + if (dp_flags & DP_MAY_SET_IN_SYNC) + peer_req->flags |= EE_MAY_SET_IN_SYNC; + + spin_lock(&connection->epoch_lock); + peer_req->epoch = connection->current_epoch; + atomic_inc(&peer_req->epoch->epoch_size); + atomic_inc(&peer_req->epoch->active); + spin_unlock(&connection->epoch_lock); + + rcu_read_lock(); + nc = rcu_dereference(peer_device->connection->net_conf); + tp = nc->two_primaries; + if (peer_device->connection->agreed_pro_version < 100) { + switch (nc->wire_protocol) { + case DRBD_PROT_C: + dp_flags |= DP_SEND_WRITE_ACK; + break; + case DRBD_PROT_B: + dp_flags |= DP_SEND_RECEIVE_ACK; + break; + } + } + rcu_read_unlock(); + + if (dp_flags & DP_SEND_WRITE_ACK) { + peer_req->flags |= EE_SEND_WRITE_ACK; + inc_unacked(device); + /* corresponding dec_unacked() in e_end_block() + * respective _drbd_clear_done_ee */ + } + + if (dp_flags & DP_SEND_RECEIVE_ACK) { + /* I really don't like it that the receiver thread + * sends on the msock, but anyways */ + drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req); + } + + if (tp) { + /* two primaries implies protocol C */ + D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK); + peer_req->flags |= EE_IN_INTERVAL_TREE; + err = wait_for_and_update_peer_seq(peer_device, peer_seq); + if (err) + goto out_interrupted; + spin_lock_irq(&device->resource->req_lock); + err = handle_write_conflicts(device, peer_req); + if (err) { + spin_unlock_irq(&device->resource->req_lock); + if (err == -ENOENT) { + put_ldev(device); + return 0; + } + goto out_interrupted; + } + } else { + update_peer_seq(peer_device, peer_seq); + spin_lock_irq(&device->resource->req_lock); + } + /* if we use the zeroout fallback code, we process synchronously + * and we wait for all pending requests, respectively wait for + * active_ee to become empty in drbd_submit_peer_request(); + * better not add ourselves here. 
*/ + if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0) + list_add_tail(&peer_req->w.list, &device->active_ee); + spin_unlock_irq(&device->resource->req_lock); + + if (device->state.conn == C_SYNC_TARGET) + wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req)); + + if (device->state.pdsk < D_INCONSISTENT) { + /* In case we have the only disk of the cluster, */ + drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size); + peer_req->flags &= ~EE_MAY_SET_IN_SYNC; + drbd_al_begin_io(device, &peer_req->i); + peer_req->flags |= EE_CALL_AL_COMPLETE_IO; + } + + err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR); + if (!err) + return 0; + + /* don't care for the reason here */ + drbd_err(device, "submit failed, triggering re-connect\n"); + spin_lock_irq(&device->resource->req_lock); + list_del(&peer_req->w.list); + drbd_remove_epoch_entry_interval(device, peer_req); + spin_unlock_irq(&device->resource->req_lock); + if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) { + peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; + drbd_al_complete_io(device, &peer_req->i); + } + +out_interrupted: + drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP); + put_ldev(device); + drbd_free_peer_req(device, peer_req); + return err; +} + +/* We may throttle resync, if the lower device seems to be busy, + * and current sync rate is above c_min_rate. + * + * To decide whether or not the lower device is busy, we use a scheme similar + * to MD RAID is_mddev_idle(): if the partition stats reveal "significant" + * (more than 64 sectors) of activity we cannot account for with our own resync + * activity, it obviously is "busy". + * + * The current sync rate used here uses only the most recent two step marks, + * to have a short time average so we can react faster. + */ +bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, + bool throttle_if_app_is_waiting) +{ + struct lc_element *tmp; + bool throttle = drbd_rs_c_min_rate_throttle(device); + + if (!throttle || throttle_if_app_is_waiting) + return throttle; + + spin_lock_irq(&device->al_lock); + tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector)); + if (tmp) { + struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); + if (test_bit(BME_PRIORITY, &bm_ext->flags)) + throttle = false; + /* Do not slow down if app IO is already waiting for this extent, + * and our progress is necessary for application IO to complete. */ + } + spin_unlock_irq(&device->al_lock); + + return throttle; +} + +bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) +{ + struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; + unsigned long db, dt, dbdt; + unsigned int c_min_rate; + int curr_events; + + rcu_read_lock(); + c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate; + rcu_read_unlock(); + + /* feature disabled? */ + if (c_min_rate == 0) + return false; + + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]) - + atomic_read(&device->rs_sect_ev); + + if (atomic_read(&device->ap_actlog_cnt) + || curr_events - device->rs_last_events > 64) { + unsigned long rs_left; + int i; + + device->rs_last_events = curr_events; + + /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP, + * approx. 
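+		 * rs_mark_left[] is in bitmap bits, so db/dt is the recent
+		 * resync rate; Bit2KB() converts it to KiB/s for the comparison
+		 * against the configured c_min_rate below.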
*/ + i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS; + + if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T) + rs_left = device->ov_left; + else + rs_left = drbd_bm_total_weight(device) - device->rs_failed; + + dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ; + if (!dt) + dt++; + db = device->rs_mark_left[i] - rs_left; + dbdt = Bit2KB(db/dt); + + if (dbdt > c_min_rate) + return true; + } + return false; +} + +static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + sector_t sector; + sector_t capacity; + struct drbd_peer_request *peer_req; + struct digest_info *di = NULL; + int size, verb; + unsigned int fault_type; + struct p_block_req *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + capacity = drbd_get_capacity(device->this_bdev); + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, + (unsigned long long)sector, size); + return -EINVAL; + } + if (sector + (size>>9) > capacity) { + drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, + (unsigned long long)sector, size); + return -EINVAL; + } + + if (!get_ldev_if_state(device, D_UP_TO_DATE)) { + verb = 1; + switch (pi->cmd) { + case P_DATA_REQUEST: + drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p); + break; + case P_RS_DATA_REQUEST: + case P_CSUM_RS_REQUEST: + case P_OV_REQUEST: + drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p); + break; + case P_OV_REPLY: + verb = 0; + dec_rs_pending(device); + drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC); + break; + default: + BUG(); + } + if (verb && __ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Can not satisfy peer's read request, " + "no local data.\n"); + + /* drain possibly payload */ + return drbd_drain_block(peer_device, pi->size); + } + + /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD + * "criss-cross" setup, that might cause write-out on some other DRBD, + * which in turn might block on the other node at this very place. 
*/ + peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, + true /* has real payload */, GFP_NOIO); + if (!peer_req) { + put_ldev(device); + return -ENOMEM; + } + + switch (pi->cmd) { + case P_DATA_REQUEST: + peer_req->w.cb = w_e_end_data_req; + fault_type = DRBD_FAULT_DT_RD; + /* application IO, don't drbd_rs_begin_io */ + peer_req->flags |= EE_APPLICATION; + goto submit; + + case P_RS_DATA_REQUEST: + peer_req->w.cb = w_e_end_rsdata_req; + fault_type = DRBD_FAULT_RS_RD; + /* used in the sector offset progress display */ + device->bm_resync_fo = BM_SECT_TO_BIT(sector); + break; + + case P_OV_REPLY: + case P_CSUM_RS_REQUEST: + fault_type = DRBD_FAULT_RS_RD; + di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO); + if (!di) + goto out_free_e; + + di->digest_size = pi->size; + di->digest = (((char *)di)+sizeof(struct digest_info)); + + peer_req->digest = di; + peer_req->flags |= EE_HAS_DIGEST; + + if (drbd_recv_all(peer_device->connection, di->digest, pi->size)) + goto out_free_e; + + if (pi->cmd == P_CSUM_RS_REQUEST) { + D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89); + peer_req->w.cb = w_e_end_csum_rs_req; + /* used in the sector offset progress display */ + device->bm_resync_fo = BM_SECT_TO_BIT(sector); + /* remember to report stats in drbd_resync_finished */ + device->use_csums = true; + } else if (pi->cmd == P_OV_REPLY) { + /* track progress, we may need to throttle */ + atomic_add(size >> 9, &device->rs_sect_in); + peer_req->w.cb = w_e_end_ov_reply; + dec_rs_pending(device); + /* drbd_rs_begin_io done when we sent this request, + * but accounting still needs to be done. */ + goto submit_for_resync; + } + break; + + case P_OV_REQUEST: + if (device->ov_start_sector == ~(sector_t)0 && + peer_device->connection->agreed_pro_version >= 90) { + unsigned long now = jiffies; + int i; + device->ov_start_sector = sector; + device->ov_position = sector; + device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector); + device->rs_total = device->ov_left; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + device->rs_mark_left[i] = device->ov_left; + device->rs_mark_time[i] = now; + } + drbd_info(device, "Online Verify start sector: %llu\n", + (unsigned long long)sector); + } + peer_req->w.cb = w_e_end_ov_req; + fault_type = DRBD_FAULT_RS_RD; + break; + + default: + BUG(); + } + + /* Throttle, drbd_rs_begin_io and submit should become asynchronous + * wrt the receiver, but it is not as straightforward as it may seem. + * Various places in the resync start and stop logic assume resync + * requests are processed in order, requeuing this on the worker thread + * introduces a bunch of new code for synchronization between threads. + * + * Unlimited throttling before drbd_rs_begin_io may stall the resync + * "forever", throttling after drbd_rs_begin_io will lock that extent + * for application writes for the same time. For now, just throttle + * here, where the rest of the code expects the receiver to sleep for + * a while, anyways. + */ + + /* Throttle before drbd_rs_begin_io, as that locks out application IO; + * this defers syncer requests for some time, before letting at least + * on request through. The resync controller on the receiving side + * will adapt to the incoming rate accordingly. + * + * We cannot throttle here if remote is Primary/SyncTarget: + * we would also throttle its application reads. + * In that case, throttling is done on the SyncTarget only. 
+ */ + + /* Even though this may be a resync request, we do add to "read_ee"; + * "sync_ee" is only used for resync WRITEs. + * Add to list early, so debugfs can find this request + * even if we have to sleep below. */ + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->read_ee); + spin_unlock_irq(&device->resource->req_lock); + + update_receiver_timing_details(connection, drbd_rs_should_slow_down); + if (device->state.peer != R_PRIMARY + && drbd_rs_should_slow_down(device, sector, false)) + schedule_timeout_uninterruptible(HZ/10); + update_receiver_timing_details(connection, drbd_rs_begin_io); + if (drbd_rs_begin_io(device, sector)) + goto out_free_e; + +submit_for_resync: + atomic_add(size >> 9, &device->rs_sect_ev); + +submit: + update_receiver_timing_details(connection, drbd_submit_peer_request); + inc_unacked(device); + if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0) + return 0; + + /* don't care for the reason here */ + drbd_err(device, "submit failed, triggering re-connect\n"); + +out_free_e: + spin_lock_irq(&device->resource->req_lock); + list_del(&peer_req->w.list); + spin_unlock_irq(&device->resource->req_lock); + /* no drbd_rs_complete_io(), we are dropping the connection anyways */ + + put_ldev(device); + drbd_free_peer_req(device, peer_req); + return -EIO; +} + +/** + * drbd_asb_recover_0p - Recover after split-brain with no remaining primaries + */ +static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + int self, peer, rv = -100; + unsigned long ch_self, ch_peer; + enum drbd_after_sb_p after_sb_0p; + + self = device->ldev->md.uuid[UI_BITMAP] & 1; + peer = device->p_uuid[UI_BITMAP] & 1; + + ch_peer = device->p_uuid[UI_SIZE]; + ch_self = device->comm_bm_set; + + rcu_read_lock(); + after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p; + rcu_read_unlock(); + switch (after_sb_0p) { + case ASB_CONSENSUS: + case ASB_DISCARD_SECONDARY: + case ASB_CALL_HELPER: + case ASB_VIOLENTLY: + drbd_err(device, "Configuration error.\n"); + break; + case ASB_DISCONNECT: + break; + case ASB_DISCARD_YOUNGER_PRI: + if (self == 0 && peer == 1) { + rv = -1; + break; + } + if (self == 1 && peer == 0) { + rv = 1; + break; + } + /* Else fall through to one of the other strategies... */ + case ASB_DISCARD_OLDER_PRI: + if (self == 0 && peer == 1) { + rv = 1; + break; + } + if (self == 1 && peer == 0) { + rv = -1; + break; + } + /* Else fall through to one of the other strategies... */ + drbd_warn(device, "Discard younger/older primary did not find a decision\n" + "Using discard-least-changes instead\n"); + case ASB_DISCARD_ZERO_CHG: + if (ch_peer == 0 && ch_self == 0) { + rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) + ? -1 : 1; + break; + } else { + if (ch_peer == 0) { rv = 1; break; } + if (ch_self == 0) { rv = -1; break; } + } + if (after_sb_0p == ASB_DISCARD_ZERO_CHG) + break; + case ASB_DISCARD_LEAST_CHG: + if (ch_self < ch_peer) + rv = -1; + else if (ch_self > ch_peer) + rv = 1; + else /* ( ch_self == ch_peer ) */ + /* Well, then use something else. */ + rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) + ? 
-1 : 1; + break; + case ASB_DISCARD_LOCAL: + rv = -1; + break; + case ASB_DISCARD_REMOTE: + rv = 1; + } + + return rv; +} + +/** + * drbd_asb_recover_1p - Recover after split-brain with one remaining primary + */ +static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + int hg, rv = -100; + enum drbd_after_sb_p after_sb_1p; + + rcu_read_lock(); + after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p; + rcu_read_unlock(); + switch (after_sb_1p) { + case ASB_DISCARD_YOUNGER_PRI: + case ASB_DISCARD_OLDER_PRI: + case ASB_DISCARD_LEAST_CHG: + case ASB_DISCARD_LOCAL: + case ASB_DISCARD_REMOTE: + case ASB_DISCARD_ZERO_CHG: + drbd_err(device, "Configuration error.\n"); + break; + case ASB_DISCONNECT: + break; + case ASB_CONSENSUS: + hg = drbd_asb_recover_0p(peer_device); + if (hg == -1 && device->state.role == R_SECONDARY) + rv = hg; + if (hg == 1 && device->state.role == R_PRIMARY) + rv = hg; + break; + case ASB_VIOLENTLY: + rv = drbd_asb_recover_0p(peer_device); + break; + case ASB_DISCARD_SECONDARY: + return device->state.role == R_PRIMARY ? 1 : -1; + case ASB_CALL_HELPER: + hg = drbd_asb_recover_0p(peer_device); + if (hg == -1 && device->state.role == R_PRIMARY) { + enum drbd_state_rv rv2; + + /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, + * we might be here in C_WF_REPORT_PARAMS which is transient. + * we do not need to wait for the after state change work either. */ + rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY)); + if (rv2 != SS_SUCCESS) { + drbd_khelper(device, "pri-lost-after-sb"); + } else { + drbd_warn(device, "Successfully gave up primary role.\n"); + rv = hg; + } + } else + rv = hg; + } + + return rv; +} + +/** + * drbd_asb_recover_2p - Recover after split-brain with two remaining primaries + */ +static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + int hg, rv = -100; + enum drbd_after_sb_p after_sb_2p; + + rcu_read_lock(); + after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p; + rcu_read_unlock(); + switch (after_sb_2p) { + case ASB_DISCARD_YOUNGER_PRI: + case ASB_DISCARD_OLDER_PRI: + case ASB_DISCARD_LEAST_CHG: + case ASB_DISCARD_LOCAL: + case ASB_DISCARD_REMOTE: + case ASB_CONSENSUS: + case ASB_DISCARD_SECONDARY: + case ASB_DISCARD_ZERO_CHG: + drbd_err(device, "Configuration error.\n"); + break; + case ASB_VIOLENTLY: + rv = drbd_asb_recover_0p(peer_device); + break; + case ASB_DISCONNECT: + break; + case ASB_CALL_HELPER: + hg = drbd_asb_recover_0p(peer_device); + if (hg == -1) { + enum drbd_state_rv rv2; + + /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, + * we might be here in C_WF_REPORT_PARAMS which is transient. + * we do not need to wait for the after state change work either. 
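A simplified re-expression of the zero-primaries recovery policies implemented by drbd_asb_recover_0p() above; the return convention mirrors the code (1: keep the local data, -1: discard it and sync from the peer, -100: no automatic decision), while the enum, the parameter names and the helper are inventions of this sketch:

#include <stdbool.h>
#include <stdint.h>

enum sketch_asb_policy {
        SKETCH_DISCARD_YOUNGER_PRI,
        SKETCH_DISCARD_OLDER_PRI,
        SKETCH_DISCARD_ZERO_CHG,
        SKETCH_DISCARD_LEAST_CHG,
};

static int sketch_recover_0p(enum sketch_asb_policy policy,
                             bool self_was_pri, bool peer_was_pri,
                             uint64_t ch_self, uint64_t ch_peer,
                             bool resolve_conflicts_bit)
{
        switch (policy) {
        case SKETCH_DISCARD_YOUNGER_PRI:
                /* decided purely from the two primary-role bits, as above */
                if (!self_was_pri && peer_was_pri) return -1;
                if (self_was_pri && !peer_was_pri) return 1;
                break;
        case SKETCH_DISCARD_OLDER_PRI:
                if (!self_was_pri && peer_was_pri) return 1;
                if (self_was_pri && !peer_was_pri) return -1;
                break;
        default:
                break;
        }

        /* zero-changes policy: decide only if exactly one side changed data */
        if (policy == SKETCH_DISCARD_ZERO_CHG && (ch_self || ch_peer)) {
                if (ch_peer == 0) return 1;
                if (ch_self == 0) return -1;
                return -100;                    /* both changed: give up */
        }

        /* least-changes fallback */
        if (ch_self < ch_peer) return -1;
        if (ch_self > ch_peer) return 1;

        /* perfectly symmetric: break the tie with a flag both nodes agree on */
        return resolve_conflicts_bit ? -1 : 1;
}

As in the driver, the younger/older-primary policies fall back to comparing the amount of changed data when the role bits give no decision, and a perfectly symmetric tie is broken by the shared RESOLVE_CONFLICTS-style flag.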
*/ + rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY)); + if (rv2 != SS_SUCCESS) { + drbd_khelper(device, "pri-lost-after-sb"); + } else { + drbd_warn(device, "Successfully gave up primary role.\n"); + rv = hg; + } + } else + rv = hg; + } + + return rv; +} + +static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid, + u64 bits, u64 flags) +{ + if (!uuid) { + drbd_info(device, "%s uuid info vanished while I was looking!\n", text); + return; + } + drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n", + text, + (unsigned long long)uuid[UI_CURRENT], + (unsigned long long)uuid[UI_BITMAP], + (unsigned long long)uuid[UI_HISTORY_START], + (unsigned long long)uuid[UI_HISTORY_END], + (unsigned long long)bits, + (unsigned long long)flags); +} + +/* + 100 after split brain try auto recover + 2 C_SYNC_SOURCE set BitMap + 1 C_SYNC_SOURCE use BitMap + 0 no Sync + -1 C_SYNC_TARGET use BitMap + -2 C_SYNC_TARGET set BitMap + -100 after split brain, disconnect +-1000 unrelated data +-1091 requires proto 91 +-1096 requires proto 96 + */ +static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local) +{ + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; + u64 self, peer; + int i, j; + + self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1); + peer = device->p_uuid[UI_CURRENT] & ~((u64)1); + + *rule_nr = 10; + if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED) + return 0; + + *rule_nr = 20; + if ((self == UUID_JUST_CREATED || self == (u64)0) && + peer != UUID_JUST_CREATED) + return -2; + + *rule_nr = 30; + if (self != UUID_JUST_CREATED && + (peer == UUID_JUST_CREATED || peer == (u64)0)) + return 2; + + if (self == peer) { + int rct, dc; /* roles at crash time */ + + if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) { + + if (connection->agreed_pro_version < 91) + return -1091; + + if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) && + (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) { + drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n"); + drbd_uuid_move_history(device); + device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP]; + device->ldev->md.uuid[UI_BITMAP] = 0; + + drbd_uuid_dump(device, "self", device->ldev->md.uuid, + device->state.disk >= D_NEGOTIATING ? 
drbd_bm_total_weight(device) : 0, 0); + *rule_nr = 34; + } else { + drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n"); + *rule_nr = 36; + } + + return 1; + } + + if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) { + + if (connection->agreed_pro_version < 91) + return -1091; + + if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) && + (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) { + drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n"); + + device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START]; + device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP]; + device->p_uuid[UI_BITMAP] = 0UL; + + drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); + *rule_nr = 35; + } else { + drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n"); + *rule_nr = 37; + } + + return -1; + } + + /* Common power [off|failure] */ + rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) + + (device->p_uuid[UI_FLAGS] & 2); + /* lowest bit is set when we were primary, + * next bit (weight 2) is set when peer was primary */ + *rule_nr = 40; + + switch (rct) { + case 0: /* !self_pri && !peer_pri */ return 0; + case 1: /* self_pri && !peer_pri */ return 1; + case 2: /* !self_pri && peer_pri */ return -1; + case 3: /* self_pri && peer_pri */ + dc = test_bit(RESOLVE_CONFLICTS, &connection->flags); + return dc ? -1 : 1; + } + } + + *rule_nr = 50; + peer = device->p_uuid[UI_BITMAP] & ~((u64)1); + if (self == peer) + return -1; + + *rule_nr = 51; + peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1); + if (self == peer) { + if (connection->agreed_pro_version < 96 ? + (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == + (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) : + peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) { + /* The last P_SYNC_UUID did not get though. Undo the last start of + resync as sync source modifications of the peer's UUIDs. */ + + if (connection->agreed_pro_version < 91) + return -1091; + + device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START]; + device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1]; + + drbd_info(device, "Lost last syncUUID packet, corrected:\n"); + drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); + + return -1; + } + } + + *rule_nr = 60; + self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1); + for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { + peer = device->p_uuid[i] & ~((u64)1); + if (self == peer) + return -2; + } + + *rule_nr = 70; + self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1); + peer = device->p_uuid[UI_CURRENT] & ~((u64)1); + if (self == peer) + return 1; + + *rule_nr = 71; + self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); + if (self == peer) { + if (connection->agreed_pro_version < 96 ? + (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == + (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) : + self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) { + /* The last P_SYNC_UUID did not get though. Undo the last start of + resync as sync source modifications of our UUIDs. 
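A heavily reduced sketch of the generation-identifier comparison performed by drbd_uuid_compare() above. UUIDs are compared with their lowest bit masked off, exactly as in every rule above (the recovery code earlier reads that bit as a primary-role marker), and the return convention follows the comment block above this function (positive: sync source, negative: sync target, |2|: full sync, -1000: unrelated data). The struct, the placeholder constant and the one-line readings of individual rules are assumptions of this sketch; the driver distinguishes many more cases:

#include <stddef.h>
#include <stdint.h>

#define SKETCH_UUID_JUST_CREATED ((uint64_t)4)  /* assumed placeholder value */

struct sketch_uuid_set {
        uint64_t current;
        uint64_t bitmap;
        uint64_t history[2];
};

static uint64_t sketch_strip_role_bit(uint64_t uuid)
{
        return uuid & ~(uint64_t)1;
}

static int sketch_uuid_compare(const struct sketch_uuid_set *self,
                               const struct sketch_uuid_set *peer)
{
        uint64_t s = sketch_strip_role_bit(self->current);
        uint64_t p = sketch_strip_role_bit(peer->current);
        size_t i;

        if (s == SKETCH_UUID_JUST_CREATED && p == SKETCH_UUID_JUST_CREATED)
                return 0;               /* both brand new, nothing to do */
        if (s == SKETCH_UUID_JUST_CREATED || s == 0)
                return -2;              /* we never had data: full sync target */
        if (p == SKETCH_UUID_JUST_CREATED || p == 0)
                return 2;               /* peer never had data: full sync source */
        if (s == p)
                return 0;               /* same generation; the driver also inspects crash flags here */

        /* our current generation is what the peer's bitmap is tracked against:
         * bitmap-based resync with us as target (cf. rule 50 above) */
        if (s == sketch_strip_role_bit(peer->bitmap))
                return -1;
        if (sketch_strip_role_bit(self->bitmap) == p)
                return 1;               /* mirror case (cf. rule 70 above) */

        /* one side's current generation only survives in the other side's
         * history: a bitmap is no longer enough, request a full sync */
        for (i = 0; i < 2; i++) {
                if (s == sketch_strip_role_bit(peer->history[i]))
                        return -2;
                if (sketch_strip_role_bit(self->history[i]) == p)
                        return 2;
        }

        return -1000;                   /* unrelated data sets */
}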
*/ + + if (connection->agreed_pro_version < 91) + return -1091; + + __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]); + __drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]); + + drbd_info(device, "Last syncUUID did not get through, corrected:\n"); + drbd_uuid_dump(device, "self", device->ldev->md.uuid, + device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0); + + return 1; + } + } + + + *rule_nr = 80; + peer = device->p_uuid[UI_CURRENT] & ~((u64)1); + for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { + self = device->ldev->md.uuid[i] & ~((u64)1); + if (self == peer) + return 2; + } + + *rule_nr = 90; + self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1); + peer = device->p_uuid[UI_BITMAP] & ~((u64)1); + if (self == peer && self != ((u64)0)) + return 100; + + *rule_nr = 100; + for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { + self = device->ldev->md.uuid[i] & ~((u64)1); + for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) { + peer = device->p_uuid[j] & ~((u64)1); + if (self == peer) + return -100; + } + } + + return -1000; +} + +/* drbd_sync_handshake() returns the new conn state on success, or + CONN_MASK (-1) on failure. + */ +static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, + enum drbd_role peer_role, + enum drbd_disk_state peer_disk) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + enum drbd_conns rv = C_MASK; + enum drbd_disk_state mydisk; + struct net_conf *nc; + int hg, rule_nr, rr_conflict, tentative; + + mydisk = device->state.disk; + if (mydisk == D_NEGOTIATING) + mydisk = device->new_state_tmp.disk; + + drbd_info(device, "drbd_sync_handshake:\n"); + + spin_lock_irq(&device->ldev->md.uuid_lock); + drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0); + drbd_uuid_dump(device, "peer", device->p_uuid, + device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); + + hg = drbd_uuid_compare(device, &rule_nr); + spin_unlock_irq(&device->ldev->md.uuid_lock); + + drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr); + + if (hg == -1000) { + drbd_alert(device, "Unrelated data, aborting!\n"); + return C_MASK; + } + if (hg < -1000) { + drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000); + return C_MASK; + } + + if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) || + (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) { + int f = (hg == -100) || abs(hg) == 2; + hg = mydisk > D_INCONSISTENT ? 1 : -1; + if (f) + hg = hg*2; + drbd_info(device, "Becoming sync %s due to disk states.\n", + hg > 0 ? "source" : "target"); + } + + if (abs(hg) == 100) + drbd_khelper(device, "initial-split-brain"); + + rcu_read_lock(); + nc = rcu_dereference(peer_device->connection->net_conf); + + if (hg == 100 || (hg == -100 && nc->always_asbp)) { + int pcount = (device->state.role == R_PRIMARY) + + (peer_role == R_PRIMARY); + int forced = (hg == -100); + + switch (pcount) { + case 0: + hg = drbd_asb_recover_0p(peer_device); + break; + case 1: + hg = drbd_asb_recover_1p(peer_device); + break; + case 2: + hg = drbd_asb_recover_2p(peer_device); + break; + } + if (abs(hg) < 100) { + drbd_warn(device, "Split-Brain detected, %d primaries, " + "automatically solved. Sync from %s node\n", + pcount, (hg < 0) ? 
"peer" : "this"); + if (forced) { + drbd_warn(device, "Doing a full sync, since" + " UUIDs where ambiguous.\n"); + hg = hg*2; + } + } + } + + if (hg == -100) { + if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1)) + hg = -1; + if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1)) + hg = 1; + + if (abs(hg) < 100) + drbd_warn(device, "Split-Brain detected, manually solved. " + "Sync from %s node\n", + (hg < 0) ? "peer" : "this"); + } + rr_conflict = nc->rr_conflict; + tentative = nc->tentative; + rcu_read_unlock(); + + if (hg == -100) { + /* FIXME this log message is not correct if we end up here + * after an attempted attach on a diskless node. + * We just refuse to attach -- well, we drop the "connection" + * to that disk, in a way... */ + drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n"); + drbd_khelper(device, "split-brain"); + return C_MASK; + } + + if (hg > 0 && mydisk <= D_INCONSISTENT) { + drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n"); + return C_MASK; + } + + if (hg < 0 && /* by intention we do not use mydisk here. */ + device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) { + switch (rr_conflict) { + case ASB_CALL_HELPER: + drbd_khelper(device, "pri-lost"); + /* fall through */ + case ASB_DISCONNECT: + drbd_err(device, "I shall become SyncTarget, but I am primary!\n"); + return C_MASK; + case ASB_VIOLENTLY: + drbd_warn(device, "Becoming SyncTarget, violating the stable-data" + "assumption\n"); + } + } + + if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) { + if (hg == 0) + drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n"); + else + drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.", + drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET), + abs(hg) >= 2 ? "full" : "bit-map based"); + return C_MASK; + } + + if (abs(hg) >= 2) { + drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n"); + if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake", + BM_LOCKED_SET_ALLOWED)) + return C_MASK; + } + + if (hg > 0) { /* become sync source. */ + rv = C_WF_BITMAP_S; + } else if (hg < 0) { /* become sync target */ + rv = C_WF_BITMAP_T; + } else { + rv = C_CONNECTED; + if (drbd_bm_total_weight(device)) { + drbd_info(device, "No resync, but %lu bits in bitmap!\n", + drbd_bm_total_weight(device)); + } + } + + return rv; +} + +static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer) +{ + /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ + if (peer == ASB_DISCARD_REMOTE) + return ASB_DISCARD_LOCAL; + + /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ + if (peer == ASB_DISCARD_LOCAL) + return ASB_DISCARD_REMOTE; + + /* everything else is valid if they are equal on both sides. 
*/ + return peer; +} + +static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi) +{ + struct p_protocol *p = pi->data; + enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; + int p_proto, p_discard_my_data, p_two_primaries, cf; + struct net_conf *nc, *old_net_conf, *new_net_conf = NULL; + char integrity_alg[SHARED_SECRET_MAX] = ""; + struct crypto_hash *peer_integrity_tfm = NULL; + void *int_dig_in = NULL, *int_dig_vv = NULL; + + p_proto = be32_to_cpu(p->protocol); + p_after_sb_0p = be32_to_cpu(p->after_sb_0p); + p_after_sb_1p = be32_to_cpu(p->after_sb_1p); + p_after_sb_2p = be32_to_cpu(p->after_sb_2p); + p_two_primaries = be32_to_cpu(p->two_primaries); + cf = be32_to_cpu(p->conn_flags); + p_discard_my_data = cf & CF_DISCARD_MY_DATA; + + if (connection->agreed_pro_version >= 87) { + int err; + + if (pi->size > sizeof(integrity_alg)) + return -EIO; + err = drbd_recv_all(connection, integrity_alg, pi->size); + if (err) + return err; + integrity_alg[SHARED_SECRET_MAX - 1] = 0; + } + + if (pi->cmd != P_PROTOCOL_UPDATE) { + clear_bit(CONN_DRY_RUN, &connection->flags); + + if (cf & CF_DRY_RUN) + set_bit(CONN_DRY_RUN, &connection->flags); + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + + if (p_proto != nc->wire_protocol) { + drbd_err(connection, "incompatible %s settings\n", "protocol"); + goto disconnect_rcu_unlock; + } + + if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) { + drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri"); + goto disconnect_rcu_unlock; + } + + if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) { + drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri"); + goto disconnect_rcu_unlock; + } + + if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) { + drbd_err(connection, "incompatible %s settings\n", "after-sb-2pri"); + goto disconnect_rcu_unlock; + } + + if (p_discard_my_data && nc->discard_my_data) { + drbd_err(connection, "incompatible %s settings\n", "discard-my-data"); + goto disconnect_rcu_unlock; + } + + if (p_two_primaries != nc->two_primaries) { + drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries"); + goto disconnect_rcu_unlock; + } + + if (strcmp(integrity_alg, nc->integrity_alg)) { + drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg"); + goto disconnect_rcu_unlock; + } + + rcu_read_unlock(); + } + + if (integrity_alg[0]) { + int hash_size; + + /* + * We can only change the peer data integrity algorithm + * here. Changing our own data integrity algorithm + * requires that we send a P_PROTOCOL_UPDATE packet at + * the same time; otherwise, the peer has no way to + * tell between which packets the algorithm should + * change. 
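The compatibility checks in receive_protocol() above rely on convert_after_sb(): the peer's "discard my side" is our "discard the other side" and vice versa, and every other policy is only compatible when both sides configured the same value. A small sketch of that mirroring, with invented enum and helper names:

enum sketch_after_sb {
        SKETCH_ASB_DISCONNECT,
        SKETCH_ASB_DISCARD_LOCAL,
        SKETCH_ASB_DISCARD_REMOTE,
        SKETCH_ASB_DISCARD_LEAST_CHG,
};

static enum sketch_after_sb sketch_convert_after_sb(enum sketch_after_sb peer)
{
        /* the peer expresses the policy from its own point of view */
        if (peer == SKETCH_ASB_DISCARD_REMOTE)
                return SKETCH_ASB_DISCARD_LOCAL;
        if (peer == SKETCH_ASB_DISCARD_LOCAL)
                return SKETCH_ASB_DISCARD_REMOTE;
        return peer;
}

/* 1 = the two configurations agree, 0 = incompatible, drop the connection */
static int sketch_after_sb_compatible(enum sketch_after_sb mine,
                                      enum sketch_after_sb peers)
{
        return sketch_convert_after_sb(peers) == mine;
}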
+ */ + + peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC); + if (!peer_integrity_tfm) { + drbd_err(connection, "peer data-integrity-alg %s not supported\n", + integrity_alg); + goto disconnect; + } + + hash_size = crypto_hash_digestsize(peer_integrity_tfm); + int_dig_in = kmalloc(hash_size, GFP_KERNEL); + int_dig_vv = kmalloc(hash_size, GFP_KERNEL); + if (!(int_dig_in && int_dig_vv)) { + drbd_err(connection, "Allocation of buffers for data integrity checking failed\n"); + goto disconnect; + } + } + + new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); + if (!new_net_conf) { + drbd_err(connection, "Allocation of new net_conf failed\n"); + goto disconnect; + } + + mutex_lock(&connection->data.mutex); + mutex_lock(&connection->resource->conf_update); + old_net_conf = connection->net_conf; + *new_net_conf = *old_net_conf; + + new_net_conf->wire_protocol = p_proto; + new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p); + new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p); + new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p); + new_net_conf->two_primaries = p_two_primaries; + + rcu_assign_pointer(connection->net_conf, new_net_conf); + mutex_unlock(&connection->resource->conf_update); + mutex_unlock(&connection->data.mutex); + + crypto_free_hash(connection->peer_integrity_tfm); + kfree(connection->int_dig_in); + kfree(connection->int_dig_vv); + connection->peer_integrity_tfm = peer_integrity_tfm; + connection->int_dig_in = int_dig_in; + connection->int_dig_vv = int_dig_vv; + + if (strcmp(old_net_conf->integrity_alg, integrity_alg)) + drbd_info(connection, "peer data-integrity-alg: %s\n", + integrity_alg[0] ? integrity_alg : "(none)"); + + synchronize_rcu(); + kfree(old_net_conf); + return 0; + +disconnect_rcu_unlock: + rcu_read_unlock(); +disconnect: + crypto_free_hash(peer_integrity_tfm); + kfree(int_dig_in); + kfree(int_dig_vv); + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; +} + +/* helper function + * input: alg name, feature name + * return: NULL (alg name was "") + * ERR_PTR(error) if something goes wrong + * or the crypto hash ptr, if it worked out ok. */ +static struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device, + const char *alg, const char *name) +{ + struct crypto_hash *tfm; + + if (!alg[0]) + return NULL; + + tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) { + drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n", + alg, name, PTR_ERR(tfm)); + return tfm; + } + return tfm; +} + +static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi) +{ + void *buffer = connection->data.rbuf; + int size = pi->size; + + while (size) { + int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE); + s = drbd_recv(connection, buffer, s); + if (s <= 0) { + if (s < 0) + return s; + break; + } + size -= s; + } + if (size) + return -EIO; + return 0; +} + +/* + * config_unknown_volume - device configuration command for unknown volume + * + * When a device is added to an existing connection, the node on which the + * device is added first will send configuration commands to its peer but the + * peer will not know about the device yet. It will warn and ignore these + * commands. Once the device is added on the second node, the second node will + * send the same device configuration commands, but in the other direction. + * + * (We can also end up here if drbd is misconfigured.) 
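ignore_remaining_packet() above shows the usual pattern for a payload that has to be consumed but not acted upon: read it in buffer-sized chunks and discard it, so the stream stays aligned for the next packet header. A userspace sketch of the same idea, assuming a plain file descriptor, read(2) and an arbitrary 4 KiB buffer:

#include <errno.h>
#include <unistd.h>

#define SKETCH_RX_BUFFER_SIZE 4096

/* returns 0 when the payload was fully consumed, -1 on a stream error */
static int sketch_drain_payload(int fd, size_t size)
{
        char buffer[SKETCH_RX_BUFFER_SIZE];

        while (size) {
                size_t want = size < sizeof(buffer) ? size : sizeof(buffer);
                ssize_t got = read(fd, buffer, want);

                if (got < 0) {
                        if (errno == EINTR)
                                continue;       /* retry interrupted reads */
                        return -1;
                }
                if (got == 0)
                        return -1;              /* peer closed mid-packet */
                size -= (size_t)got;
        }
        return 0;
}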
+ */ +static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi) +{ + drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n", + cmdname(pi->cmd), pi->vnr); + return ignore_remaining_packet(connection, pi); +} + +static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_rs_param_95 *p; + unsigned int header_size, data_size, exp_max_sz; + struct crypto_hash *verify_tfm = NULL; + struct crypto_hash *csums_tfm = NULL; + struct net_conf *old_net_conf, *new_net_conf = NULL; + struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL; + const int apv = connection->agreed_pro_version; + struct fifo_buffer *old_plan = NULL, *new_plan = NULL; + int fifo_size = 0; + int err; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return config_unknown_volume(connection, pi); + device = peer_device->device; + + exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) + : apv == 88 ? sizeof(struct p_rs_param) + + SHARED_SECRET_MAX + : apv <= 94 ? sizeof(struct p_rs_param_89) + : /* apv >= 95 */ sizeof(struct p_rs_param_95); + + if (pi->size > exp_max_sz) { + drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n", + pi->size, exp_max_sz); + return -EIO; + } + + if (apv <= 88) { + header_size = sizeof(struct p_rs_param); + data_size = pi->size - header_size; + } else if (apv <= 94) { + header_size = sizeof(struct p_rs_param_89); + data_size = pi->size - header_size; + D_ASSERT(device, data_size == 0); + } else { + header_size = sizeof(struct p_rs_param_95); + data_size = pi->size - header_size; + D_ASSERT(device, data_size == 0); + } + + /* initialize verify_alg and csums_alg */ + p = pi->data; + memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + + err = drbd_recv_all(peer_device->connection, p, header_size); + if (err) + return err; + + mutex_lock(&connection->resource->conf_update); + old_net_conf = peer_device->connection->net_conf; + if (get_ldev(device)) { + new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + put_ldev(device); + mutex_unlock(&connection->resource->conf_update); + drbd_err(device, "Allocation of new disk_conf failed\n"); + return -ENOMEM; + } + + old_disk_conf = device->ldev->disk_conf; + *new_disk_conf = *old_disk_conf; + + new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate); + } + + if (apv >= 88) { + if (apv == 88) { + if (data_size > SHARED_SECRET_MAX || data_size == 0) { + drbd_err(device, "verify-alg of wrong size, " + "peer wants %u, accepting only up to %u byte\n", + data_size, SHARED_SECRET_MAX); + err = -EIO; + goto reconnect; + } + + err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size); + if (err) + goto reconnect; + /* we expect NUL terminated string */ + /* but just in case someone tries to be evil */ + D_ASSERT(device, p->verify_alg[data_size-1] == 0); + p->verify_alg[data_size-1] = 0; + + } else /* apv >= 89 */ { + /* we still expect NUL terminated strings */ + /* but just in case someone tries to be evil */ + D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0); + D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0); + p->verify_alg[SHARED_SECRET_MAX-1] = 0; + p->csums_alg[SHARED_SECRET_MAX-1] = 0; + } + + if (strcmp(old_net_conf->verify_alg, p->verify_alg)) { + if (device->state.conn == C_WF_REPORT_PARAMS) { + drbd_err(device, "Different verify-alg settings. 
me=\"%s\" peer=\"%s\"\n", + old_net_conf->verify_alg, p->verify_alg); + goto disconnect; + } + verify_tfm = drbd_crypto_alloc_digest_safe(device, + p->verify_alg, "verify-alg"); + if (IS_ERR(verify_tfm)) { + verify_tfm = NULL; + goto disconnect; + } + } + + if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) { + if (device->state.conn == C_WF_REPORT_PARAMS) { + drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n", + old_net_conf->csums_alg, p->csums_alg); + goto disconnect; + } + csums_tfm = drbd_crypto_alloc_digest_safe(device, + p->csums_alg, "csums-alg"); + if (IS_ERR(csums_tfm)) { + csums_tfm = NULL; + goto disconnect; + } + } + + if (apv > 94 && new_disk_conf) { + new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead); + new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target); + new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target); + new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate); + + fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; + if (fifo_size != device->rs_plan_s->size) { + new_plan = fifo_alloc(fifo_size); + if (!new_plan) { + drbd_err(device, "kmalloc of fifo_buffer failed"); + put_ldev(device); + goto disconnect; + } + } + } + + if (verify_tfm || csums_tfm) { + new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); + if (!new_net_conf) { + drbd_err(device, "Allocation of new net_conf failed\n"); + goto disconnect; + } + + *new_net_conf = *old_net_conf; + + if (verify_tfm) { + strcpy(new_net_conf->verify_alg, p->verify_alg); + new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1; + crypto_free_hash(peer_device->connection->verify_tfm); + peer_device->connection->verify_tfm = verify_tfm; + drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg); + } + if (csums_tfm) { + strcpy(new_net_conf->csums_alg, p->csums_alg); + new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1; + crypto_free_hash(peer_device->connection->csums_tfm); + peer_device->connection->csums_tfm = csums_tfm; + drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg); + } + rcu_assign_pointer(connection->net_conf, new_net_conf); + } + } + + if (new_disk_conf) { + rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); + put_ldev(device); + } + + if (new_plan) { + old_plan = device->rs_plan_s; + rcu_assign_pointer(device->rs_plan_s, new_plan); + } + + mutex_unlock(&connection->resource->conf_update); + synchronize_rcu(); + if (new_net_conf) + kfree(old_net_conf); + kfree(old_disk_conf); + kfree(old_plan); + + return 0; + +reconnect: + if (new_disk_conf) { + put_ldev(device); + kfree(new_disk_conf); + } + mutex_unlock(&connection->resource->conf_update); + return -EIO; + +disconnect: + kfree(new_plan); + if (new_disk_conf) { + put_ldev(device); + kfree(new_disk_conf); + } + mutex_unlock(&connection->resource->conf_update); + /* just for completeness: actually not needed, + * as this is not reached if csums_tfm was ok. */ + crypto_free_hash(csums_tfm); + /* but free the verify_tfm again, if csums_tfm did not work out */ + crypto_free_hash(verify_tfm); + conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; +} + +/* warn if the arguments differ by more than 12.5% */ +static void warn_if_differ_considerably(struct drbd_device *device, + const char *s, sector_t a, sector_t b) +{ + sector_t d; + if (a == 0 || b == 0) + return; + d = (a > b) ? (a - b) : (b - a); + if (d > (a>>3) || d > (b>>3)) + drbd_warn(device, "Considerable difference in %s: %llus vs. 
%llus\n", s, + (unsigned long long)a, (unsigned long long)b); +} + +static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_sizes *p = pi->data; + enum determine_dev_size dd = DS_UNCHANGED; + sector_t p_size, p_usize, p_csize, my_usize; + int ldsc = 0; /* local disk size changed */ + enum dds_flags ddsf; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return config_unknown_volume(connection, pi); + device = peer_device->device; + + p_size = be64_to_cpu(p->d_size); + p_usize = be64_to_cpu(p->u_size); + p_csize = be64_to_cpu(p->c_size); + + /* just store the peer's disk size for now. + * we still need to figure out whether we accept that. */ + device->p_size = p_size; + + if (get_ldev(device)) { + rcu_read_lock(); + my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size; + rcu_read_unlock(); + + warn_if_differ_considerably(device, "lower level device sizes", + p_size, drbd_get_max_capacity(device->ldev)); + warn_if_differ_considerably(device, "user requested size", + p_usize, my_usize); + + /* if this is the first connect, or an otherwise expected + * param exchange, choose the minimum */ + if (device->state.conn == C_WF_REPORT_PARAMS) + p_usize = min_not_zero(my_usize, p_usize); + + /* Never shrink a device with usable data during connect. + But allow online shrinking if we are connected. */ + if (drbd_new_dev_size(device, device->ldev, p_usize, 0) < + drbd_get_capacity(device->this_bdev) && + device->state.disk >= D_OUTDATED && + device->state.conn < C_CONNECTED) { + drbd_err(device, "The peer's disk size is too small!\n"); + conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); + put_ldev(device); + return -EIO; + } + + if (my_usize != p_usize) { + struct disk_conf *old_disk_conf, *new_disk_conf = NULL; + + new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + drbd_err(device, "Allocation of new disk_conf failed\n"); + put_ldev(device); + return -ENOMEM; + } + + mutex_lock(&connection->resource->conf_update); + old_disk_conf = device->ldev->disk_conf; + *new_disk_conf = *old_disk_conf; + new_disk_conf->disk_size = p_usize; + + rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); + mutex_unlock(&connection->resource->conf_update); + synchronize_rcu(); + kfree(old_disk_conf); + + drbd_info(device, "Peer sets u_size to %lu sectors\n", + (unsigned long)my_usize); + } + + put_ldev(device); + } + + device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); + /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size(). + In case we cleared the QUEUE_FLAG_DISCARD from our queue in + drbd_reconsider_max_bio_size(), we can be sure that after + drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */ + + ddsf = be16_to_cpu(p->dds_flags); + if (get_ldev(device)) { + drbd_reconsider_max_bio_size(device, device->ldev); + dd = drbd_determine_dev_size(device, ddsf, NULL); + put_ldev(device); + if (dd == DS_ERROR) + return -EIO; + drbd_md_sync(device); + } else { + /* + * I am diskless, need to accept the peer's *current* size. + * I must NOT accept the peers backing disk size, + * it may have been larger than mine all along... + * + * At this point, the peer knows more about my disk, or at + * least about what we last agreed upon, than myself. + * So if his c_size is less than his d_size, the most likely + * reason is that *my* d_size was smaller last time we checked. 
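Three helpers sketching the size handling in this area: warn_if_differ_considerably() above flags sizes that differ by more than one eighth (12.5 %), the initial parameter exchange in receive_sizes() picks the smaller non-zero size request, and a diskless node falls back from the peer's current size to its requested size to its backing size (the fallback chain appears just below). The helper names are this sketch's own:

#include <stdbool.h>
#include <stdint.h>

static bool sketch_differ_considerably(uint64_t a, uint64_t b)
{
        uint64_t d;

        if (a == 0 || b == 0)
                return false;           /* zero means "unknown", never warn */
        d = (a > b) ? a - b : b - a;
        return d > (a >> 3) || d > (b >> 3);
}

static uint64_t sketch_min_not_zero(uint64_t a, uint64_t b)
{
        if (a == 0)
                return b;
        if (b == 0)
                return a;
        return a < b ? a : b;
}

/* mirrors the "peer current size, else requested size, else backing size"
 * fallback used for a diskless node just below */
static uint64_t sketch_diskless_capacity(uint64_t peer_current,
                                         uint64_t peer_requested,
                                         uint64_t peer_backing)
{
        if (peer_current)
                return peer_current;
        if (peer_requested)
                return peer_requested;
        return peer_backing;
}

For instance, 1000 and 1200 sectors differ by 200, which exceeds 1000/8 = 125, so the "considerable difference" warning would fire.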
+ * + * However, if he sends a zero current size, + * take his (user-capped or) backing disk size anyways. + */ + drbd_reconsider_max_bio_size(device, NULL); + drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size); + } + + if (get_ldev(device)) { + if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) { + device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev); + ldsc = 1; + } + + put_ldev(device); + } + + if (device->state.conn > C_WF_REPORT_PARAMS) { + if (be64_to_cpu(p->c_size) != + drbd_get_capacity(device->this_bdev) || ldsc) { + /* we have different sizes, probably peer + * needs to know my new size... */ + drbd_send_sizes(peer_device, 0, ddsf); + } + if (test_and_clear_bit(RESIZE_PENDING, &device->flags) || + (dd == DS_GREW && device->state.conn == C_CONNECTED)) { + if (device->state.pdsk >= D_INCONSISTENT && + device->state.disk >= D_INCONSISTENT) { + if (ddsf & DDSF_NO_RESYNC) + drbd_info(device, "Resync of new storage suppressed with --assume-clean\n"); + else + resync_after_online_grow(device); + } else + set_bit(RESYNC_AFTER_NEG, &device->flags); + } + } + + return 0; +} + +static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_uuids *p = pi->data; + u64 *p_uuid; + int i, updated_uuids = 0; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return config_unknown_volume(connection, pi); + device = peer_device->device; + + p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); + if (!p_uuid) { + drbd_err(device, "kmalloc of p_uuid failed\n"); + return false; + } + + for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) + p_uuid[i] = be64_to_cpu(p->uuid[i]); + + kfree(device->p_uuid); + device->p_uuid = p_uuid; + + if (device->state.conn < C_CONNECTED && + device->state.disk < D_INCONSISTENT && + device->state.role == R_PRIMARY && + (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { + drbd_err(device, "Can only connect to data with current UUID=%016llX\n", + (unsigned long long)device->ed_uuid); + conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; + } + + if (get_ldev(device)) { + int skip_initial_sync = + device->state.conn == C_CONNECTED && + peer_device->connection->agreed_pro_version >= 90 && + device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && + (p_uuid[UI_FLAGS] & 8); + if (skip_initial_sync) { + drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n"); + drbd_bitmap_io(device, &drbd_bmio_clear_n_write, + "clear_n_write from receive_uuids", + BM_LOCKED_TEST_ALLOWED); + _drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]); + _drbd_uuid_set(device, UI_BITMAP, 0); + _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), + CS_VERBOSE, NULL); + drbd_md_sync(device); + updated_uuids = 1; + } + put_ldev(device); + } else if (device->state.disk < D_INCONSISTENT && + device->state.role == R_PRIMARY) { + /* I am a diskless primary, the peer just created a new current UUID + for me. */ + updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]); + } + + /* Before we test for the disk state, we should wait until an eventually + ongoing cluster wide state change is finished. That is important if + we are primary and are detaching from our disk. We need to see the + new disk state... 
*/ + mutex_lock(device->state_mutex); + mutex_unlock(device->state_mutex); + if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT) + updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]); + + if (updated_uuids) + drbd_print_uuids(device, "receiver updated UUIDs to"); + + return 0; +} + +/** + * convert_state() - Converts the peer's view of the cluster state to our point of view + * @ps: The state as seen by the peer. + */ +static union drbd_state convert_state(union drbd_state ps) +{ + union drbd_state ms; + + static enum drbd_conns c_tab[] = { + [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS, + [C_CONNECTED] = C_CONNECTED, + + [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, + [C_STARTING_SYNC_T] = C_STARTING_SYNC_S, + [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */ + [C_VERIFY_S] = C_VERIFY_T, + [C_MASK] = C_MASK, + }; + + ms.i = ps.i; + + ms.conn = c_tab[ps.conn]; + ms.peer = ps.role; + ms.role = ps.peer; + ms.pdsk = ps.disk; + ms.disk = ps.pdsk; + ms.peer_isp = (ps.aftr_isp | ps.user_isp); + + return ms; +} + +static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_req_state *p = pi->data; + union drbd_state mask, val; + enum drbd_state_rv rv; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + mask.i = be32_to_cpu(p->mask); + val.i = be32_to_cpu(p->val); + + if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) && + mutex_is_locked(device->state_mutex)) { + drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG); + return 0; + } + + mask = convert_state(mask); + val = convert_state(val); + + rv = drbd_change_state(device, CS_VERBOSE, mask, val); + drbd_send_sr_reply(peer_device, rv); + + drbd_md_sync(device); + + return 0; +} + +static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi) +{ + struct p_req_state *p = pi->data; + union drbd_state mask, val; + enum drbd_state_rv rv; + + mask.i = be32_to_cpu(p->mask); + val.i = be32_to_cpu(p->val); + + if (test_bit(RESOLVE_CONFLICTS, &connection->flags) && + mutex_is_locked(&connection->cstate_mutex)) { + conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG); + return 0; + } + + mask = convert_state(mask); + val = convert_state(val); + + rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL); + conn_send_sr_reply(connection, rv); + + return 0; +} + +static int receive_state(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_state *p = pi->data; + union drbd_state os, ns, peer_state; + enum drbd_disk_state real_peer_disk; + enum chg_state_flags cs_flags; + int rv; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return config_unknown_volume(connection, pi); + device = peer_device->device; + + peer_state.i = be32_to_cpu(p->state); + + real_peer_disk = peer_state.disk; + if (peer_state.disk == D_NEGOTIATING) { + real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? 
D_INCONSISTENT : D_CONSISTENT; + drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk)); + } + + spin_lock_irq(&device->resource->req_lock); + retry: + os = ns = drbd_read_state(device); + spin_unlock_irq(&device->resource->req_lock); + + /* If some other part of the code (asender thread, timeout) + * already decided to close the connection again, + * we must not "re-establish" it here. */ + if (os.conn <= C_TEAR_DOWN) + return -ECONNRESET; + + /* If this is the "end of sync" confirmation, usually the peer disk + * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits + * set) resync started in PausedSyncT, or if the timing of pause-/ + * unpause-sync events has been "just right", the peer disk may + * transition from D_CONSISTENT to D_UP_TO_DATE as well. + */ + if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) && + real_peer_disk == D_UP_TO_DATE && + os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) { + /* If we are (becoming) SyncSource, but peer is still in sync + * preparation, ignore its uptodate-ness to avoid flapping, it + * will change to inconsistent once the peer reaches active + * syncing states. + * It may have changed syncer-paused flags, however, so we + * cannot ignore this completely. */ + if (peer_state.conn > C_CONNECTED && + peer_state.conn < C_SYNC_SOURCE) + real_peer_disk = D_INCONSISTENT; + + /* if peer_state changes to connected at the same time, + * it explicitly notifies us that it finished resync. + * Maybe we should finish it up, too? */ + else if (os.conn >= C_SYNC_SOURCE && + peer_state.conn == C_CONNECTED) { + if (drbd_bm_total_weight(device) <= device->rs_failed) + drbd_resync_finished(device); + return 0; + } + } + + /* explicit verify finished notification, stop sector reached. */ + if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE && + peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) { + ov_out_of_sync_print(device); + drbd_resync_finished(device); + return 0; + } + + /* peer says his disk is inconsistent, while we think it is uptodate, + * and this happens while the peer still thinks we have a sync going on, + * but we think we are already done with the sync. + * We ignore this to avoid flapping pdsk. + * This should not happen, if the peer is a recent version of drbd. 
*/ + if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT && + os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE) + real_peer_disk = D_UP_TO_DATE; + + if (ns.conn == C_WF_REPORT_PARAMS) + ns.conn = C_CONNECTED; + + if (peer_state.conn == C_AHEAD) + ns.conn = C_BEHIND; + + if (device->p_uuid && peer_state.disk >= D_NEGOTIATING && + get_ldev_if_state(device, D_NEGOTIATING)) { + int cr; /* consider resync */ + + /* if we established a new connection */ + cr = (os.conn < C_CONNECTED); + /* if we had an established connection + * and one of the nodes newly attaches a disk */ + cr |= (os.conn == C_CONNECTED && + (peer_state.disk == D_NEGOTIATING || + os.disk == D_NEGOTIATING)); + /* if we have both been inconsistent, and the peer has been + * forced to be UpToDate with --overwrite-data */ + cr |= test_bit(CONSIDER_RESYNC, &device->flags); + /* if we had been plain connected, and the admin requested to + * start a sync by "invalidate" or "invalidate-remote" */ + cr |= (os.conn == C_CONNECTED && + (peer_state.conn >= C_STARTING_SYNC_S && + peer_state.conn <= C_WF_BITMAP_T)); + + if (cr) + ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk); + + put_ldev(device); + if (ns.conn == C_MASK) { + ns.conn = C_CONNECTED; + if (device->state.disk == D_NEGOTIATING) { + drbd_force_state(device, NS(disk, D_FAILED)); + } else if (peer_state.disk == D_NEGOTIATING) { + drbd_err(device, "Disk attach process on the peer node was aborted.\n"); + peer_state.disk = D_DISKLESS; + real_peer_disk = D_DISKLESS; + } else { + if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags)) + return -EIO; + D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS); + conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; + } + } + } + + spin_lock_irq(&device->resource->req_lock); + if (os.i != drbd_read_state(device).i) + goto retry; + clear_bit(CONSIDER_RESYNC, &device->flags); + ns.peer = peer_state.role; + ns.pdsk = real_peer_disk; + ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); + if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) + ns.disk = device->new_state_tmp.disk; + cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD); + if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED && + test_bit(NEW_CUR_UUID, &device->flags)) { + /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this + for temporal network outages! */ + spin_unlock_irq(&device->resource->req_lock); + drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n"); + tl_clear(peer_device->connection); + drbd_uuid_new_current(device); + clear_bit(NEW_CUR_UUID, &device->flags); + conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD); + return -EIO; + } + rv = _drbd_set_state(device, ns, cs_flags, NULL); + ns = drbd_read_state(device); + spin_unlock_irq(&device->resource->req_lock); + + if (rv < SS_SUCCESS) { + conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; + } + + if (os.conn > C_WF_REPORT_PARAMS) { + if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED && + peer_state.disk != D_NEGOTIATING ) { + /* we want resync, peer has not yet decided to sync... 
*/ + /* Nowadays only used when forcing a node into primary role and + setting its disk to UpToDate with that */ + drbd_send_uuids(peer_device); + drbd_send_current_state(peer_device); + } + } + + clear_bit(DISCARD_MY_DATA, &device->flags); + + drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */ + + return 0; +} + +static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_rs_uuid *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + wait_event(device->misc_wait, + device->state.conn == C_WF_SYNC_UUID || + device->state.conn == C_BEHIND || + device->state.conn < C_CONNECTED || + device->state.disk < D_NEGOTIATING); + + /* D_ASSERT(device, device->state.conn == C_WF_SYNC_UUID ); */ + + /* Here the _drbd_uuid_ functions are right, current should + _not_ be rotated into the history */ + if (get_ldev_if_state(device, D_NEGOTIATING)) { + _drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid)); + _drbd_uuid_set(device, UI_BITMAP, 0UL); + + drbd_print_uuids(device, "updated sync uuid"); + drbd_start_resync(device, C_SYNC_TARGET); + + put_ldev(device); + } else + drbd_err(device, "Ignoring SyncUUID packet!\n"); + + return 0; +} + +/** + * receive_bitmap_plain + * + * Return 0 when done, 1 when another iteration is needed, and a negative error + * code upon failure. + */ +static int +receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size, + unsigned long *p, struct bm_xfer_ctx *c) +{ + unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - + drbd_header_size(peer_device->connection); + unsigned int num_words = min_t(size_t, data_size / sizeof(*p), + c->bm_words - c->word_offset); + unsigned int want = num_words * sizeof(*p); + int err; + + if (want != size) { + drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size); + return -EIO; + } + if (want == 0) + return 0; + err = drbd_recv_all(peer_device->connection, p, want); + if (err) + return err; + + drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p); + + c->word_offset += num_words; + c->bit_offset = c->word_offset * BITS_PER_LONG; + if (c->bit_offset > c->bm_bits) + c->bit_offset = c->bm_bits; + + return 1; +} + +static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p) +{ + return (enum drbd_bitmap_code)(p->encoding & 0x0f); +} + +static int dcbp_get_start(struct p_compressed_bm *p) +{ + return (p->encoding & 0x80) != 0; +} + +static int dcbp_get_pad_bits(struct p_compressed_bm *p) +{ + return (p->encoding >> 4) & 0x7; +} + +/** + * recv_bm_rle_bits + * + * Return 0 when done, 1 when another iteration is needed, and a negative error + * code upon failure. 
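recv_bm_rle_bits() below walks a VLI-encoded bitstream of run lengths that alternately describe runs of clear and of set bits; only the set runs touch the bitmap. A minimal sketch of that toggle/run-length idea, with the VLI decoding left out and the run lengths passed in as a plain array (an assumption of this sketch):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static void sketch_bm_set_bit(uint8_t *bm, uint64_t bit)
{
        bm[bit / 8] |= (uint8_t)(1u << (bit % 8));
}

/* returns 0 on success, -1 if the runs overflow the bitmap */
static int sketch_apply_rle_runs(uint8_t *bm, uint64_t bm_bits,
                                 const uint64_t *runs, size_t nr_runs,
                                 bool first_run_is_set)
{
        uint64_t bit = 0;
        bool set = first_run_is_set;
        size_t i;

        for (i = 0; i < nr_runs; i++, set = !set) {
                uint64_t rl = runs[i];

                if (rl == 0 || bit + rl > bm_bits)
                        return -1;      /* malformed or overflowing run */
                if (set) {
                        uint64_t e = bit + rl;

                        for (; bit < e; bit++)
                                sketch_bm_set_bit(bm, bit);
                } else {
                        bit += rl;      /* clear run: just advance */
                }
        }
        return 0;
}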
+ */ +static int +recv_bm_rle_bits(struct drbd_peer_device *peer_device, + struct p_compressed_bm *p, + struct bm_xfer_ctx *c, + unsigned int len) +{ + struct bitstream bs; + u64 look_ahead; + u64 rl; + u64 tmp; + unsigned long s = c->bit_offset; + unsigned long e; + int toggle = dcbp_get_start(p); + int have; + int bits; + + bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p)); + + bits = bitstream_get_bits(&bs, &look_ahead, 64); + if (bits < 0) + return -EIO; + + for (have = bits; have > 0; s += rl, toggle = !toggle) { + bits = vli_decode_bits(&rl, look_ahead); + if (bits <= 0) + return -EIO; + + if (toggle) { + e = s + rl -1; + if (e >= c->bm_bits) { + drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e); + return -EIO; + } + _drbd_bm_set_bits(peer_device->device, s, e); + } + + if (have < bits) { + drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n", + have, bits, look_ahead, + (unsigned int)(bs.cur.b - p->code), + (unsigned int)bs.buf_len); + return -EIO; + } + /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */ + if (likely(bits < 64)) + look_ahead >>= bits; + else + look_ahead = 0; + have -= bits; + + bits = bitstream_get_bits(&bs, &tmp, 64 - have); + if (bits < 0) + return -EIO; + look_ahead |= tmp << have; + have += bits; + } + + c->bit_offset = s; + bm_xfer_ctx_bit_to_word_offset(c); + + return (s != c->bm_bits); +} + +/** + * decode_bitmap_c + * + * Return 0 when done, 1 when another iteration is needed, and a negative error + * code upon failure. + */ +static int +decode_bitmap_c(struct drbd_peer_device *peer_device, + struct p_compressed_bm *p, + struct bm_xfer_ctx *c, + unsigned int len) +{ + if (dcbp_get_code(p) == RLE_VLI_Bits) + return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p)); + + /* other variants had been implemented for evaluation, + * but have been dropped as this one turned out to be "best" + * during all our tests. */ + + drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding); + conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD); + return -EIO; +} + +void INFO_bm_xfer_stats(struct drbd_device *device, + const char *direction, struct bm_xfer_ctx *c) +{ + /* what would it take to transfer it "plaintext" */ + unsigned int header_size = drbd_header_size(first_peer_device(device)->connection); + unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; + unsigned int plain = + header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) + + c->bm_words * sizeof(unsigned long); + unsigned int total = c->bytes[0] + c->bytes[1]; + unsigned int r; + + /* total can not be zero. but just in case: */ + if (total == 0) + return; + + /* don't report if not compressed */ + if (total >= plain) + return; + + /* total < plain. check for overflow, still */ + r = (total > UINT_MAX/1000) ? (total / (plain/1000)) + : (1000 * total / plain); + + if (r > 1000) + r = 1000; + + r = 1000 - r; + drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), " + "total %u; compression: %u.%u%%\n", + direction, + c->bytes[1], c->packets[1], + c->bytes[0], c->packets[0], + total, r/10, r % 10); +} + +/* Since we are processing the bitfield from lower addresses to higher, + it does not matter if the process it in 32 bit chunks or 64 bit + chunks as long as it is little endian. (Understand it as byte stream, + beginning with the lowest byte...) 
If we would use big endian + we would need to process it from the highest address to the lowest, + in order to be agnostic to the 32 vs 64 bits issue. + + returns 0 on failure, 1 if we successfully received it. */ +static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct bm_xfer_ctx c; + int err; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED); + /* you are supposed to send additional out-of-sync information + * if you actually set bits during this phase */ + + c = (struct bm_xfer_ctx) { + .bm_bits = drbd_bm_bits(device), + .bm_words = drbd_bm_words(device), + }; + + for(;;) { + if (pi->cmd == P_BITMAP) + err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c); + else if (pi->cmd == P_COMPRESSED_BITMAP) { + /* MAYBE: sanity check that we speak proto >= 90, + * and the feature is enabled! */ + struct p_compressed_bm *p = pi->data; + + if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) { + drbd_err(device, "ReportCBitmap packet too large\n"); + err = -EIO; + goto out; + } + if (pi->size <= sizeof(*p)) { + drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size); + err = -EIO; + goto out; + } + err = drbd_recv_all(peer_device->connection, p, pi->size); + if (err) + goto out; + err = decode_bitmap_c(peer_device, p, &c, pi->size); + } else { + drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd); + err = -EIO; + goto out; + } + + c.packets[pi->cmd == P_BITMAP]++; + c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size; + + if (err <= 0) { + if (err < 0) + goto out; + break; + } + err = drbd_recv_header(peer_device->connection, pi); + if (err) + goto out; + } + + INFO_bm_xfer_stats(device, "receive", &c); + + if (device->state.conn == C_WF_BITMAP_T) { + enum drbd_state_rv rv; + + err = drbd_send_bitmap(device); + if (err) + goto out; + /* Omit CS_ORDERED with this state transition to avoid deadlocks. 
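INFO_bm_xfer_stats() above reports the saving of the compressed bitmap transfer in tenths of a percent and reorders the division when "total * 1000" would overflow a 32-bit value. A sketch of just that arithmetic:

#include <limits.h>

/* returns the saving in permille (0..1000); 0 if nothing was saved */
static unsigned int sketch_compression_permille(unsigned int total,
                                                unsigned int plain)
{
        unsigned int r;

        if (plain == 0 || total >= plain)
                return 0;               /* not compressed, or nothing sent */

        r = (total > UINT_MAX / 1000) ? total / (plain / 1000)
                                      : 1000 * total / plain;
        if (r > 1000)
                r = 1000;
        return 1000 - r;
}

For example, 250 compressed bytes against an estimated 1000 plain bytes yields 750 permille, which the driver would print as 75.0 %.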
*/ + rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); + D_ASSERT(device, rv == SS_SUCCESS); + } else if (device->state.conn != C_WF_BITMAP_S) { + /* admin may have requested C_DISCONNECTING, + * other threads may have noticed network errors */ + drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n", + drbd_conn_str(device->state.conn)); + } + err = 0; + + out: + drbd_bm_unlock(device); + if (!err && device->state.conn == C_WF_BITMAP_S) + drbd_start_resync(device, C_SYNC_SOURCE); + return err; +} + +static int receive_skip(struct drbd_connection *connection, struct packet_info *pi) +{ + drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n", + pi->cmd, pi->size); + + return ignore_remaining_packet(connection, pi); +} + +static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi) +{ + /* Make sure we've acked all the TCP data associated + * with the data requests being unplugged */ + drbd_tcp_quickack(connection->data.socket); + + return 0; +} + +static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_desc *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + switch (device->state.conn) { + case C_WF_SYNC_UUID: + case C_WF_BITMAP_T: + case C_BEHIND: + break; + default: + drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n", + drbd_conn_str(device->state.conn)); + } + + drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); + + return 0; +} + +struct data_cmd { + int expect_payload; + size_t pkt_size; + int (*fn)(struct drbd_connection *, struct packet_info *); +}; + +static struct data_cmd drbd_cmd_handler[] = { + [P_DATA] = { 1, sizeof(struct p_data), receive_Data }, + [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply }, + [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } , + [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } , + [P_BITMAP] = { 1, 0, receive_bitmap } , + [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } , + [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote }, + [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_SYNC_PARAM] = { 1, 0, receive_SyncParam }, + [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam }, + [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol }, + [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids }, + [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes }, + [P_STATE] = { 0, sizeof(struct p_state), receive_state }, + [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state }, + [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid }, + [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest }, + [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, + [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, + [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, + [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, + [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, + [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, +}; 
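The handler table above and the drbdd() loop that follows implement a classic dispatch pattern: every packet type declares its fixed sub-header size and whether a trailing payload is legal, and anything that does not match is rejected before its handler runs. A self-contained sketch of that pattern; the types and the two example commands are inventions of this sketch:

#include <stddef.h>
#include <stdio.h>

struct sketch_packet {
        unsigned int cmd;
        unsigned int size;      /* payload size following the common header */
        void *data;
};

struct sketch_cmd {
        int expect_payload;
        size_t pkt_size;
        int (*fn)(struct sketch_packet *pi);
};

static int sketch_handle_ping(struct sketch_packet *pi) { (void)pi; return 0; }
static int sketch_handle_data(struct sketch_packet *pi) { (void)pi; return 0; }

enum { SKETCH_P_PING, SKETCH_P_DATA, SKETCH_P_MAX };

static const struct sketch_cmd sketch_cmds[SKETCH_P_MAX] = {
        [SKETCH_P_PING] = { 0, 0,                 sketch_handle_ping },
        [SKETCH_P_DATA] = { 1, sizeof(long long), sketch_handle_data },
};

/* returns the handler result, or -1 for unknown or malformed packets */
static int sketch_dispatch(struct sketch_packet *pi)
{
        const struct sketch_cmd *cmd;

        if (pi->cmd >= SKETCH_P_MAX || !sketch_cmds[pi->cmd].fn) {
                fprintf(stderr, "unexpected packet 0x%04x\n", pi->cmd);
                return -1;
        }
        cmd = &sketch_cmds[pi->cmd];

        /* a packet may only carry more than its fixed header when the
         * command explicitly expects a payload */
        if (pi->size > cmd->pkt_size && !cmd->expect_payload) {
                fprintf(stderr, "no payload expected for 0x%04x\n", pi->cmd);
                return -1;
        }
        return cmd->fn(pi);
}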
+ +static void drbdd(struct drbd_connection *connection) +{ + struct packet_info pi; + size_t shs; /* sub header size */ + int err; + + while (get_t_state(&connection->receiver) == RUNNING) { + struct data_cmd *cmd; + + drbd_thread_current_set_cpu(&connection->receiver); + update_receiver_timing_details(connection, drbd_recv_header); + if (drbd_recv_header(connection, &pi)) + goto err_out; + + cmd = &drbd_cmd_handler[pi.cmd]; + if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) { + drbd_err(connection, "Unexpected data packet %s (0x%04x)", + cmdname(pi.cmd), pi.cmd); + goto err_out; + } + + shs = cmd->pkt_size; + if (pi.size > shs && !cmd->expect_payload) { + drbd_err(connection, "No payload expected %s l:%d\n", + cmdname(pi.cmd), pi.size); + goto err_out; + } + + if (shs) { + update_receiver_timing_details(connection, drbd_recv_all_warn); + err = drbd_recv_all_warn(connection, pi.data, shs); + if (err) + goto err_out; + pi.size -= shs; + } + + update_receiver_timing_details(connection, cmd->fn); + err = cmd->fn(connection, &pi); + if (err) { + drbd_err(connection, "error receiving %s, e: %d l: %d!\n", + cmdname(pi.cmd), err, pi.size); + goto err_out; + } + } + return; + + err_out: + conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD); +} + +static void conn_disconnect(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + enum drbd_conns oc; + int vnr; + + if (connection->cstate == C_STANDALONE) + return; + + /* We are about to start the cleanup after connection loss. + * Make sure drbd_make_request knows about that. + * Usually we should be in some network failure state already, + * but just in case we are not, we fix it up here. + */ + conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); + + /* asender does not clean up anything. it must not interfere, either */ + drbd_thread_stop(&connection->asender); + drbd_free_sock(connection); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + kref_get(&device->kref); + rcu_read_unlock(); + drbd_disconnected(peer_device); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); + + if (!list_empty(&connection->current_epoch->list)) + drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n"); + /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ + atomic_set(&connection->current_epoch->epoch_size, 0); + connection->send.seen_any_write_yet = false; + + drbd_info(connection, "Connection closed\n"); + + if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN) + conn_try_outdate_peer_async(connection); + + spin_lock_irq(&connection->resource->req_lock); + oc = connection->cstate; + if (oc >= C_UNCONNECTED) + _conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); + + spin_unlock_irq(&connection->resource->req_lock); + + if (oc == C_DISCONNECTING) + conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD); +} + +static int drbd_disconnected(struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + unsigned int i; + + /* wait for current activity to cease. 
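+	   Draining active_ee, sync_ee and read_ee under req_lock makes sure
+	   no peer request is still in flight when the teardown below proceeds.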
*/ + spin_lock_irq(&device->resource->req_lock); + _drbd_wait_ee_list_empty(device, &device->active_ee); + _drbd_wait_ee_list_empty(device, &device->sync_ee); + _drbd_wait_ee_list_empty(device, &device->read_ee); + spin_unlock_irq(&device->resource->req_lock); + + /* We do not have data structures that would allow us to + * get the rs_pending_cnt down to 0 again. + * * On C_SYNC_TARGET we do not have any data structures describing + * the pending RSDataRequest's we have sent. + * * On C_SYNC_SOURCE there is no data structure that tracks + * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget. + * And no, it is not the sum of the reference counts in the + * resync_LRU. The resync_LRU tracks the whole operation including + * the disk-IO, while the rs_pending_cnt only tracks the blocks + * on the fly. */ + drbd_rs_cancel_all(device); + device->rs_total = 0; + device->rs_failed = 0; + atomic_set(&device->rs_pending_cnt, 0); + wake_up(&device->misc_wait); + + del_timer_sync(&device->resync_timer); + resync_timer_fn((unsigned long)device); + + /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, + * w_make_resync_request etc. which may still be on the worker queue + * to be "canceled" */ + drbd_flush_workqueue(&peer_device->connection->sender_work); + + drbd_finish_peer_reqs(device); + + /* This second workqueue flush is necessary, since drbd_finish_peer_reqs() + might have issued a work again. The one before drbd_finish_peer_reqs() is + necessary to reclain net_ee in drbd_finish_peer_reqs(). */ + drbd_flush_workqueue(&peer_device->connection->sender_work); + + /* need to do it again, drbd_finish_peer_reqs() may have populated it + * again via drbd_try_clear_on_disk_bm(). */ + drbd_rs_cancel_all(device); + + kfree(device->p_uuid); + device->p_uuid = NULL; + + if (!drbd_suspended(device)) + tl_clear(peer_device->connection); + + drbd_md_sync(device); + + /* serialize with bitmap writeout triggered by the state change, + * if any. */ + wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); + + /* tcp_close and release of sendpage pages can be deferred. I don't + * want to use SO_LINGER, because apparently it can be deferred for + * more than 20 seconds (longest time I checked). + * + * Actually we don't care for exactly when the network stack does its + * put_page(), but release our reference on these pages right here. + */ + i = drbd_free_peer_reqs(device, &device->net_ee); + if (i) + drbd_info(device, "net_ee not empty, killed %u entries\n", i); + i = atomic_read(&device->pp_in_use_by_net); + if (i) + drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i); + i = atomic_read(&device->pp_in_use); + if (i) + drbd_info(device, "pp_in_use = %d, expected 0\n", i); + + D_ASSERT(device, list_empty(&device->read_ee)); + D_ASSERT(device, list_empty(&device->active_ee)); + D_ASSERT(device, list_empty(&device->sync_ee)); + D_ASSERT(device, list_empty(&device->done_ee)); + + return 0; +} + +/* + * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version + * we can agree on is stored in agreed_pro_version. + * + * feature flags and the reserved array should be enough room for future + * enhancements of the handshake protocol, and possible plugins... + * + * for now, they are expected to be zero, but ignored. 
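+ *
+ * The version we finally agree on is min(PRO_VERSION_MAX, peer's max),
+ * provided the two advertised ranges overlap at all, and the agreed
+ * feature set is the intersection of PRO_FEATURES and the peer's
+ * feature_flags; see drbd_do_features() below.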
+ */ +static int drbd_send_features(struct drbd_connection *connection) +{ + struct drbd_socket *sock; + struct p_connection_features *p; + + sock = &connection->data; + p = conn_prepare_command(connection, sock); + if (!p) + return -EIO; + memset(p, 0, sizeof(*p)); + p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); + p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); + p->feature_flags = cpu_to_be32(PRO_FEATURES); + return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0); +} + +/* + * return values: + * 1 yes, we have a valid connection + * 0 oops, did not work out, please try again + * -1 peer talks different language, + * no point in trying again, please go standalone. + */ +static int drbd_do_features(struct drbd_connection *connection) +{ + /* ASSERT current == connection->receiver ... */ + struct p_connection_features *p; + const int expect = sizeof(struct p_connection_features); + struct packet_info pi; + int err; + + err = drbd_send_features(connection); + if (err) + return 0; + + err = drbd_recv_header(connection, &pi); + if (err) + return 0; + + if (pi.cmd != P_CONNECTION_FEATURES) { + drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n", + cmdname(pi.cmd), pi.cmd); + return -1; + } + + if (pi.size != expect) { + drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n", + expect, pi.size); + return -1; + } + + p = pi.data; + err = drbd_recv_all_warn(connection, p, expect); + if (err) + return 0; + + p->protocol_min = be32_to_cpu(p->protocol_min); + p->protocol_max = be32_to_cpu(p->protocol_max); + if (p->protocol_max == 0) + p->protocol_max = p->protocol_min; + + if (PRO_VERSION_MAX < p->protocol_min || + PRO_VERSION_MIN > p->protocol_max) + goto incompat; + + connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); + connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags); + + drbd_info(connection, "Handshake successful: " + "Agreed network protocol version %d\n", connection->agreed_pro_version); + + drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n", + connection->agreed_features & FF_TRIM ? " " : " not "); + + return 1; + + incompat: + drbd_err(connection, "incompatible DRBD dialects: " + "I support %d-%d, peer supports %d-%d\n", + PRO_VERSION_MIN, PRO_VERSION_MAX, + p->protocol_min, p->protocol_max); + return -1; +} + +#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) +static int drbd_do_auth(struct drbd_connection *connection) +{ + drbd_err(connection, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); + drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); + return -1; +} +#else +#define CHALLENGE_LEN 64 + +/* Return value: + 1 - auth succeeded, + 0 - failed, try again (network error), + -1 - auth failed, don't try again. +*/ + +static int drbd_do_auth(struct drbd_connection *connection) +{ + struct drbd_socket *sock; + char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ + struct scatterlist sg; + char *response = NULL; + char *right_response = NULL; + char *peers_ch = NULL; + unsigned int key_len; + char secret[SHARED_SECRET_MAX]; /* 64 byte */ + unsigned int resp_size; + struct hash_desc desc; + struct packet_info pi; + struct net_conf *nc; + int err, rv; + + /* FIXME: Put the challenge/response into the preallocated socket buffer. 
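+
+   Exchange summary: send our random challenge, receive the peer's
+   (rejecting a peer that merely echoes our own challenge back), answer
+   with HMAC(shared secret, peer's challenge), and verify the peer's
+   response against HMAC(shared secret, our own challenge).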
*/ + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + key_len = strlen(nc->shared_secret); + memcpy(secret, nc->shared_secret, key_len); + rcu_read_unlock(); + + desc.tfm = connection->cram_hmac_tfm; + desc.flags = 0; + + rv = crypto_hash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len); + if (rv) { + drbd_err(connection, "crypto_hash_setkey() failed with %d\n", rv); + rv = -1; + goto fail; + } + + get_random_bytes(my_challenge, CHALLENGE_LEN); + + sock = &connection->data; + if (!conn_prepare_command(connection, sock)) { + rv = 0; + goto fail; + } + rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0, + my_challenge, CHALLENGE_LEN); + if (!rv) + goto fail; + + err = drbd_recv_header(connection, &pi); + if (err) { + rv = 0; + goto fail; + } + + if (pi.cmd != P_AUTH_CHALLENGE) { + drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n", + cmdname(pi.cmd), pi.cmd); + rv = 0; + goto fail; + } + + if (pi.size > CHALLENGE_LEN * 2) { + drbd_err(connection, "expected AuthChallenge payload too big.\n"); + rv = -1; + goto fail; + } + + if (pi.size < CHALLENGE_LEN) { + drbd_err(connection, "AuthChallenge payload too small.\n"); + rv = -1; + goto fail; + } + + peers_ch = kmalloc(pi.size, GFP_NOIO); + if (peers_ch == NULL) { + drbd_err(connection, "kmalloc of peers_ch failed\n"); + rv = -1; + goto fail; + } + + err = drbd_recv_all_warn(connection, peers_ch, pi.size); + if (err) { + rv = 0; + goto fail; + } + + if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) { + drbd_err(connection, "Peer presented the same challenge!\n"); + rv = -1; + goto fail; + } + + resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm); + response = kmalloc(resp_size, GFP_NOIO); + if (response == NULL) { + drbd_err(connection, "kmalloc of response failed\n"); + rv = -1; + goto fail; + } + + sg_init_table(&sg, 1); + sg_set_buf(&sg, peers_ch, pi.size); + + rv = crypto_hash_digest(&desc, &sg, sg.length, response); + if (rv) { + drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv); + rv = -1; + goto fail; + } + + if (!conn_prepare_command(connection, sock)) { + rv = 0; + goto fail; + } + rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0, + response, resp_size); + if (!rv) + goto fail; + + err = drbd_recv_header(connection, &pi); + if (err) { + rv = 0; + goto fail; + } + + if (pi.cmd != P_AUTH_RESPONSE) { + drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n", + cmdname(pi.cmd), pi.cmd); + rv = 0; + goto fail; + } + + if (pi.size != resp_size) { + drbd_err(connection, "expected AuthResponse payload of wrong size\n"); + rv = 0; + goto fail; + } + + err = drbd_recv_all_warn(connection, response , resp_size); + if (err) { + rv = 0; + goto fail; + } + + right_response = kmalloc(resp_size, GFP_NOIO); + if (right_response == NULL) { + drbd_err(connection, "kmalloc of right_response failed\n"); + rv = -1; + goto fail; + } + + sg_set_buf(&sg, my_challenge, CHALLENGE_LEN); + + rv = crypto_hash_digest(&desc, &sg, sg.length, right_response); + if (rv) { + drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv); + rv = -1; + goto fail; + } + + rv = !memcmp(response, right_response, resp_size); + + if (rv) + drbd_info(connection, "Peer authenticated using %d bytes HMAC\n", + resp_size); + else + rv = -1; + + fail: + kfree(peers_ch); + kfree(response); + kfree(right_response); + + return rv; +} +#endif + +int drbd_receiver(struct drbd_thread *thi) +{ + struct drbd_connection *connection = thi->connection; + int h; + + 
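+	/* Keep trying to establish a connection: h == 0 means try again
+	 * after a short sleep, h == -1 means the handshake failed fatally
+	 * and the network configuration is discarded.  Only on success
+	 * (h > 0) do we enter the drbdd() receive loop. */
+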
drbd_info(connection, "receiver (re)started\n"); + + do { + h = conn_connect(connection); + if (h == 0) { + conn_disconnect(connection); + schedule_timeout_interruptible(HZ); + } + if (h == -1) { + drbd_warn(connection, "Discarding network configuration.\n"); + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + } + } while (h == 0); + + if (h > 0) + drbdd(connection); + + conn_disconnect(connection); + + drbd_info(connection, "receiver terminated\n"); + return 0; +} + +/* ********* acknowledge sender ******** */ + +static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct p_req_state_reply *p = pi->data; + int retcode = be32_to_cpu(p->retcode); + + if (retcode >= SS_SUCCESS) { + set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags); + } else { + set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags); + drbd_err(connection, "Requested state change failed by peer: %s (%d)\n", + drbd_set_st_err_str(retcode), retcode); + } + wake_up(&connection->ping_wait); + + return 0; +} + +static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_req_state_reply *p = pi->data; + int retcode = be32_to_cpu(p->retcode); + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) { + D_ASSERT(device, connection->agreed_pro_version < 100); + return got_conn_RqSReply(connection, pi); + } + + if (retcode >= SS_SUCCESS) { + set_bit(CL_ST_CHG_SUCCESS, &device->flags); + } else { + set_bit(CL_ST_CHG_FAIL, &device->flags); + drbd_err(device, "Requested state change failed by peer: %s (%d)\n", + drbd_set_st_err_str(retcode), retcode); + } + wake_up(&device->state_wait); + + return 0; +} + +static int got_Ping(struct drbd_connection *connection, struct packet_info *pi) +{ + return drbd_send_ping_ack(connection); + +} + +static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi) +{ + /* restore idle timeout */ + connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ; + if (!test_and_set_bit(GOT_PING_ACK, &connection->flags)) + wake_up(&connection->ping_wait); + + return 0; +} + +static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_ack *p = pi->data; + sector_t sector = be64_to_cpu(p->sector); + int blksize = be32_to_cpu(p->blksize); + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89); + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + if (get_ldev(device)) { + drbd_rs_complete_io(device, sector); + drbd_set_in_sync(device, sector, blksize); + /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ + device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); + put_ldev(device); + } + dec_rs_pending(device); + atomic_add(blksize >> 9, &device->rs_sect_in); + + return 0; +} + +static int +validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector, + struct rb_root *root, const char *func, + enum drbd_req_event what, bool missing_ok) +{ + struct drbd_request *req; + struct bio_and_error m; + + spin_lock_irq(&device->resource->req_lock); + req = find_request(device, root, id, sector, missing_ok, func); 
+ if (unlikely(!req)) { + spin_unlock_irq(&device->resource->req_lock); + return -EIO; + } + __req_mod(req, what, &m); + spin_unlock_irq(&device->resource->req_lock); + + if (m.bio) + complete_master_bio(device, &m); + return 0; +} + +static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_ack *p = pi->data; + sector_t sector = be64_to_cpu(p->sector); + int blksize = be32_to_cpu(p->blksize); + enum drbd_req_event what; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + if (p->block_id == ID_SYNCER) { + drbd_set_in_sync(device, sector, blksize); + dec_rs_pending(device); + return 0; + } + switch (pi->cmd) { + case P_RS_WRITE_ACK: + what = WRITE_ACKED_BY_PEER_AND_SIS; + break; + case P_WRITE_ACK: + what = WRITE_ACKED_BY_PEER; + break; + case P_RECV_ACK: + what = RECV_ACKED_BY_PEER; + break; + case P_SUPERSEDED: + what = CONFLICT_RESOLVED; + break; + case P_RETRY_WRITE: + what = POSTPONE_WRITE; + break; + default: + BUG(); + } + + return validate_req_change_req_state(device, p->block_id, sector, + &device->write_requests, __func__, + what, false); +} + +static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_ack *p = pi->data; + sector_t sector = be64_to_cpu(p->sector); + int size = be32_to_cpu(p->blksize); + int err; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + if (p->block_id == ID_SYNCER) { + dec_rs_pending(device); + drbd_rs_failed_io(device, sector, size); + return 0; + } + + err = validate_req_change_req_state(device, p->block_id, sector, + &device->write_requests, __func__, + NEG_ACKED, true); + if (err) { + /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs. + The master bio might already be completed, therefore the + request is no longer in the collision hash. */ + /* In Protocol B we might already have got a P_RECV_ACK + but then get a P_NEG_ACK afterwards. 
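+	   Either way the request is no longer in the tree; just mark the
+	   area out of sync here and let a later resync repair it.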
*/ + drbd_set_out_of_sync(device, sector, size); + } + return 0; +} + +static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_ack *p = pi->data; + sector_t sector = be64_to_cpu(p->sector); + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n", + (unsigned long long)sector, be32_to_cpu(p->blksize)); + + return validate_req_change_req_state(device, p->block_id, sector, + &device->read_requests, __func__, + NEG_ACKED, false); +} + +static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + sector_t sector; + int size; + struct p_block_ack *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + dec_rs_pending(device); + + if (get_ldev_if_state(device, D_FAILED)) { + drbd_rs_complete_io(device, sector); + switch (pi->cmd) { + case P_NEG_RS_DREPLY: + drbd_rs_failed_io(device, sector, size); + case P_RS_CANCEL: + break; + default: + BUG(); + } + put_ldev(device); + } + + return 0; +} + +static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi) +{ + struct p_barrier_ack *p = pi->data; + struct drbd_peer_device *peer_device; + int vnr; + + tl_release(connection, p->barrier, be32_to_cpu(p->set_size)); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + + if (device->state.conn == C_AHEAD && + atomic_read(&device->ap_in_flight) == 0 && + !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) { + device->start_resync_timer.expires = jiffies + HZ; + add_timer(&device->start_resync_timer); + } + } + rcu_read_unlock(); + + return 0; +} + +static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_ack *p = pi->data; + struct drbd_device_work *dw; + sector_t sector; + int size; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) + drbd_ov_out_of_sync_found(device, sector, size); + else + ov_out_of_sync_print(device); + + if (!get_ldev(device)) + return 0; + + drbd_rs_complete_io(device, sector); + dec_rs_pending(device); + + --device->ov_left; + + /* let's advance progress step marks only for every other megabyte */ + if ((device->ov_left & 0x200) == 0x200) + drbd_advance_rs_marks(device, device->ov_left); + + if (device->ov_left == 0) { + dw = kmalloc(sizeof(*dw), GFP_NOIO); + if (dw) { + dw->w.cb = w_ov_finished; + dw->device = device; + drbd_queue_work(&peer_device->connection->sender_work, &dw->w); + } else { + drbd_err(device, "kmalloc(dw) failed."); + ov_out_of_sync_print(device); + drbd_resync_finished(device); + } + } + put_ldev(device); + return 0; +} + +static int got_skip(struct drbd_connection 
*connection, struct packet_info *pi) +{ + return 0; +} + +static int connection_finish_peer_reqs(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr, not_empty = 0; + + do { + clear_bit(SIGNAL_ASENDER, &connection->flags); + flush_signals(current); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + kref_get(&device->kref); + rcu_read_unlock(); + if (drbd_finish_peer_reqs(device)) { + kref_put(&device->kref, drbd_destroy_device); + return 1; + } + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + set_bit(SIGNAL_ASENDER, &connection->flags); + + spin_lock_irq(&connection->resource->req_lock); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + not_empty = !list_empty(&device->done_ee); + if (not_empty) + break; + } + spin_unlock_irq(&connection->resource->req_lock); + rcu_read_unlock(); + } while (not_empty); + + return 0; +} + +struct asender_cmd { + size_t pkt_size; + int (*fn)(struct drbd_connection *connection, struct packet_info *); +}; + +static struct asender_cmd asender_tbl[] = { + [P_PING] = { 0, got_Ping }, + [P_PING_ACK] = { 0, got_PingAck }, + [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_SUPERSEDED] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, + [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, + [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply }, + [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, + [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, + [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, + [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, + [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip }, + [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply }, + [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply }, + [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, +}; + +int drbd_asender(struct drbd_thread *thi) +{ + struct drbd_connection *connection = thi->connection; + struct asender_cmd *cmd = NULL; + struct packet_info pi; + int rv; + void *buf = connection->meta.rbuf; + int received = 0; + unsigned int header_size = drbd_header_size(connection); + int expect = header_size; + bool ping_timeout_active = false; + struct net_conf *nc; + int ping_timeo, tcp_cork, ping_int; + struct sched_param param = { .sched_priority = 2 }; + + rv = sched_setscheduler(current, SCHED_RR, ¶m); + if (rv < 0) + drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv); + + while (get_t_state(thi) == RUNNING) { + drbd_thread_current_set_cpu(thi); + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + ping_timeo = nc->ping_timeo; + tcp_cork = nc->tcp_cork; + ping_int = nc->ping_int; + rcu_read_unlock(); + + if (test_and_clear_bit(SEND_PING, &connection->flags)) { + if (drbd_send_ping(connection)) { + drbd_err(connection, "drbd_send_ping has failed\n"); + goto reconnect; + } + connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10; + ping_timeout_active = true; + } + + /* TODO: conditionally cork; it may hurt latency if we cork without + much to send */ + if (tcp_cork) + 
drbd_tcp_cork(connection->meta.socket); + if (connection_finish_peer_reqs(connection)) { + drbd_err(connection, "connection_finish_peer_reqs() failed\n"); + goto reconnect; + } + /* but unconditionally uncork unless disabled */ + if (tcp_cork) + drbd_tcp_uncork(connection->meta.socket); + + /* short circuit, recv_msg would return EINTR anyways. */ + if (signal_pending(current)) + continue; + + rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0); + clear_bit(SIGNAL_ASENDER, &connection->flags); + + flush_signals(current); + + /* Note: + * -EINTR (on meta) we got a signal + * -EAGAIN (on meta) rcvtimeo expired + * -ECONNRESET other side closed the connection + * -ERESTARTSYS (on data) we got a signal + * rv < 0 other than above: unexpected error! + * rv == expected: full header or command + * rv < expected: "woken" by signal during receive + * rv == 0 : "connection shut down by peer" + */ +received_more: + if (likely(rv > 0)) { + received += rv; + buf += rv; + } else if (rv == 0) { + if (test_bit(DISCONNECT_SENT, &connection->flags)) { + long t; + rcu_read_lock(); + t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10; + rcu_read_unlock(); + + t = wait_event_timeout(connection->ping_wait, + connection->cstate < C_WF_REPORT_PARAMS, + t); + if (t) + break; + } + drbd_err(connection, "meta connection shut down by peer.\n"); + goto reconnect; + } else if (rv == -EAGAIN) { + /* If the data socket received something meanwhile, + * that is good enough: peer is still alive. */ + if (time_after(connection->last_received, + jiffies - connection->meta.socket->sk->sk_rcvtimeo)) + continue; + if (ping_timeout_active) { + drbd_err(connection, "PingAck did not arrive in time.\n"); + goto reconnect; + } + set_bit(SEND_PING, &connection->flags); + continue; + } else if (rv == -EINTR) { + continue; + } else { + drbd_err(connection, "sock_recvmsg returned %d\n", rv); + goto reconnect; + } + + if (received == expect && cmd == NULL) { + if (decode_header(connection, connection->meta.rbuf, &pi)) + goto reconnect; + cmd = &asender_tbl[pi.cmd]; + if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) { + drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n", + cmdname(pi.cmd), pi.cmd); + goto disconnect; + } + expect = header_size + cmd->pkt_size; + if (pi.size != expect - header_size) { + drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n", + pi.cmd, pi.size); + goto reconnect; + } + } + if (received == expect) { + bool err; + + err = cmd->fn(connection, &pi); + if (err) { + drbd_err(connection, "%pf failed\n", cmd->fn); + goto reconnect; + } + + connection->last_received = jiffies; + + if (cmd == &asender_tbl[P_PING_ACK]) { + /* restore idle timeout */ + connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ; + ping_timeout_active = false; + } + + buf = connection->meta.rbuf; + received = 0; + expect = header_size; + cmd = NULL; + } + if (test_bit(SEND_PING, &connection->flags)) + continue; + rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT); + if (rv > 0) + goto received_more; + } + + if (0) { +reconnect: + conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); + conn_md_sync(connection); + } + if (0) { +disconnect: + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + } + clear_bit(SIGNAL_ASENDER, &connection->flags); + + drbd_info(connection, "asender terminated\n"); + + return 0; +} diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c new file mode 100644 index 
000000000..3907202fb --- /dev/null +++ b/drivers/block/drbd/drbd_req.c @@ -0,0 +1,1638 @@ +/* + drbd_req.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include <linux/module.h> + +#include <linux/slab.h> +#include <linux/drbd.h> +#include "drbd_int.h" +#include "drbd_req.h" + + +static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size); + +/* Update disk stats at start of I/O request */ +static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request *req) +{ + generic_start_io_acct(bio_data_dir(req->master_bio), req->i.size >> 9, + &device->vdisk->part0); +} + +/* Update disk stats when completing request upwards */ +static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req) +{ + generic_end_io_acct(bio_data_dir(req->master_bio), + &device->vdisk->part0, req->start_jif); +} + +static struct drbd_request *drbd_req_new(struct drbd_device *device, + struct bio *bio_src) +{ + struct drbd_request *req; + + req = mempool_alloc(drbd_request_mempool, GFP_NOIO); + if (!req) + return NULL; + memset(req, 0, sizeof(*req)); + + drbd_req_make_private_bio(req, bio_src); + req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0; + req->device = device; + req->master_bio = bio_src; + req->epoch = 0; + + drbd_clear_interval(&req->i); + req->i.sector = bio_src->bi_iter.bi_sector; + req->i.size = bio_src->bi_iter.bi_size; + req->i.local = true; + req->i.waiting = false; + + INIT_LIST_HEAD(&req->tl_requests); + INIT_LIST_HEAD(&req->w.list); + INIT_LIST_HEAD(&req->req_pending_master_completion); + INIT_LIST_HEAD(&req->req_pending_local); + + /* one reference to be put by __drbd_make_request */ + atomic_set(&req->completion_ref, 1); + /* one kref as long as completion_ref > 0 */ + kref_init(&req->kref); + return req; +} + +static void drbd_remove_request_interval(struct rb_root *root, + struct drbd_request *req) +{ + struct drbd_device *device = req->device; + struct drbd_interval *i = &req->i; + + drbd_remove_interval(root, i); + + /* Wake up any processes waiting for this request to complete. 
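+	   (e.g. conflicting writers sleeping in complete_conflicting_writes()
+	   on device->misc_wait with i->waiting set, waiting to re-check for
+	   overlaps)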
*/ + if (i->waiting) + wake_up(&device->misc_wait); +} + +void drbd_req_destroy(struct kref *kref) +{ + struct drbd_request *req = container_of(kref, struct drbd_request, kref); + struct drbd_device *device = req->device; + const unsigned s = req->rq_state; + + if ((req->master_bio && !(s & RQ_POSTPONED)) || + atomic_read(&req->completion_ref) || + (s & RQ_LOCAL_PENDING) || + ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) { + drbd_err(device, "drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d\n", + s, atomic_read(&req->completion_ref)); + return; + } + + /* If called from mod_rq_state (expected normal case) or + * drbd_send_and_submit (the less likely normal path), this holds the + * req_lock, and req->tl_requests will typicaly be on ->transfer_log, + * though it may be still empty (never added to the transfer log). + * + * If called from do_retry(), we do NOT hold the req_lock, but we are + * still allowed to unconditionally list_del(&req->tl_requests), + * because it will be on a local on-stack list only. */ + list_del_init(&req->tl_requests); + + /* finally remove the request from the conflict detection + * respective block_id verification interval tree. */ + if (!drbd_interval_empty(&req->i)) { + struct rb_root *root; + + if (s & RQ_WRITE) + root = &device->write_requests; + else + root = &device->read_requests; + drbd_remove_request_interval(root, req); + } else if (s & (RQ_NET_MASK & ~RQ_NET_DONE) && req->i.size != 0) + drbd_err(device, "drbd_req_destroy: Logic BUG: interval empty, but: rq_state=0x%x, sect=%llu, size=%u\n", + s, (unsigned long long)req->i.sector, req->i.size); + + /* if it was a write, we may have to set the corresponding + * bit(s) out-of-sync first. If it had a local part, we need to + * release the reference to the activity log. */ + if (s & RQ_WRITE) { + /* Set out-of-sync unless both OK flags are set + * (local only or remote failed). + * Other places where we set out-of-sync: + * READ with local io-error */ + + /* There is a special case: + * we may notice late that IO was suspended, + * and postpone, or schedule for retry, a write, + * before it even was submitted or sent. + * In that case we do not want to touch the bitmap at all. + */ + if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) { + if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) + drbd_set_out_of_sync(device, req->i.sector, req->i.size); + + if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) + drbd_set_in_sync(device, req->i.sector, req->i.size); + } + + /* one might be tempted to move the drbd_al_complete_io + * to the local io completion callback drbd_request_endio. + * but, if this was a mirror write, we may only + * drbd_al_complete_io after this is RQ_NET_DONE, + * otherwise the extent could be dropped from the al + * before it has actually been written on the peer. + * if we crash before our peer knows about the request, + * but after the extent has been dropped from the al, + * we would forget to resync the corresponding extent. 
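+	 * By the time we get here, RQ_NET_DONE is guaranteed for any request
+	 * that had a network part (see the sanity check at the top of this
+	 * function), so dropping the AL reference now is safe.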
+ */ + if (s & RQ_IN_ACT_LOG) { + if (get_ldev_if_state(device, D_FAILED)) { + drbd_al_complete_io(device, &req->i); + put_ldev(device); + } else if (__ratelimit(&drbd_ratelimit_state)) { + drbd_warn(device, "Should have called drbd_al_complete_io(, %llu, %u), " + "but my Disk seems to have failed :(\n", + (unsigned long long) req->i.sector, req->i.size); + } + } + } + + mempool_free(req, drbd_request_mempool); +} + +static void wake_all_senders(struct drbd_connection *connection) +{ + wake_up(&connection->sender_work.q_wait); +} + +/* must hold resource->req_lock */ +void start_new_tl_epoch(struct drbd_connection *connection) +{ + /* no point closing an epoch, if it is empty, anyways. */ + if (connection->current_tle_writes == 0) + return; + + connection->current_tle_writes = 0; + atomic_inc(&connection->current_tle_nr); + wake_all_senders(connection); +} + +void complete_master_bio(struct drbd_device *device, + struct bio_and_error *m) +{ + bio_endio(m->bio, m->error); + dec_ap_bio(device); +} + + +/* Helper for __req_mod(). + * Set m->bio to the master bio, if it is fit to be completed, + * or leave it alone (it is initialized to NULL in __req_mod), + * if it has already been completed, or cannot be completed yet. + * If m->bio is set, the error status to be returned is placed in m->error. + */ +static +void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) +{ + const unsigned s = req->rq_state; + struct drbd_device *device = req->device; + int rw; + int error, ok; + + /* we must not complete the master bio, while it is + * still being processed by _drbd_send_zc_bio (drbd_send_dblock) + * not yet acknowledged by the peer + * not yet completed by the local io subsystem + * these flags may get cleared in any order by + * the worker, + * the receiver, + * the bio_endio completion callbacks. + */ + if ((s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) || + (s & RQ_NET_QUEUED) || (s & RQ_NET_PENDING) || + (s & RQ_COMPLETION_SUSP)) { + drbd_err(device, "drbd_req_complete: Logic BUG rq_state = 0x%x\n", s); + return; + } + + if (!req->master_bio) { + drbd_err(device, "drbd_req_complete: Logic BUG, master_bio == NULL!\n"); + return; + } + + rw = bio_rw(req->master_bio); + + /* + * figure out whether to report success or failure. + * + * report success when at least one of the operations succeeded. + * or, to put the other way, + * only report failure, when both operations failed. + * + * what to do about the failures is handled elsewhere. + * what we need to do here is just: complete the master_bio. + * + * local completion error, if any, has been stored as ERR_PTR + * in private_bio within drbd_request_endio. + */ + ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); + error = PTR_ERR(req->private_bio); + + /* Before we can signal completion to the upper layers, + * we may need to close the current transfer log epoch. + * We are within the request lock, so we can simply compare + * the request epoch number with the current transfer log + * epoch number. If they match, increase the current_tle_nr, + * and reset the transfer log epoch write_cnt. 
+ */ + if (rw == WRITE && + req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr)) + start_new_tl_epoch(first_peer_device(device)->connection); + + /* Update disk stats */ + _drbd_end_io_acct(device, req); + + /* If READ failed, + * have it be pushed back to the retry work queue, + * so it will re-enter __drbd_make_request(), + * and be re-assigned to a suitable local or remote path, + * or failed if we do not have access to good data anymore. + * + * Unless it was failed early by __drbd_make_request(), + * because no path was available, in which case + * it was not even added to the transfer_log. + * + * READA may fail, and will not be retried. + * + * WRITE should have used all available paths already. + */ + if (!ok && rw == READ && !list_empty(&req->tl_requests)) + req->rq_state |= RQ_POSTPONED; + + if (!(req->rq_state & RQ_POSTPONED)) { + m->error = ok ? 0 : (error ?: -EIO); + m->bio = req->master_bio; + req->master_bio = NULL; + /* We leave it in the tree, to be able to verify later + * write-acks in protocol != C during resync. + * But we mark it as "complete", so it won't be counted as + * conflict in a multi-primary setup. */ + req->i.completed = true; + } + + if (req->i.waiting) + wake_up(&device->misc_wait); + + /* Either we are about to complete to upper layers, + * or we will restart this request. + * In either case, the request object will be destroyed soon, + * so better remove it from all lists. */ + list_del_init(&req->req_pending_master_completion); +} + +/* still holds resource->req_lock */ +static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put) +{ + struct drbd_device *device = req->device; + D_ASSERT(device, m || (req->rq_state & RQ_POSTPONED)); + + if (!atomic_sub_and_test(put, &req->completion_ref)) + return 0; + + drbd_req_complete(req, m); + + if (req->rq_state & RQ_POSTPONED) { + /* don't destroy the req object just yet, + * but queue it for retry */ + drbd_restart_request(req); + return 0; + } + + return 1; +} + +static void set_if_null_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_next == NULL) + connection->req_next = req; +} + +static void advance_conn_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_next != req) + return; + list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { + const unsigned s = req->rq_state; + if (s & RQ_NET_QUEUED) + break; + } + if (&req->tl_requests == &connection->transfer_log) + req = NULL; + connection->req_next = req; +} + +static void set_if_null_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_ack_pending == NULL) + connection->req_ack_pending = req; +} + +static void advance_conn_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? 
peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_ack_pending != req) + return; + list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { + const unsigned s = req->rq_state; + if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING)) + break; + } + if (&req->tl_requests == &connection->transfer_log) + req = NULL; + connection->req_ack_pending = req; +} + +static void set_if_null_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_not_net_done == NULL) + connection->req_not_net_done = req; +} + +static void advance_conn_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_not_net_done != req) + return; + list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { + const unsigned s = req->rq_state; + if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE)) + break; + } + if (&req->tl_requests == &connection->transfer_log) + req = NULL; + connection->req_not_net_done = req; +} + +/* I'd like this to be the only place that manipulates + * req->completion_ref and req->kref. */ +static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, + int clear, int set) +{ + struct drbd_device *device = req->device; + struct drbd_peer_device *peer_device = first_peer_device(device); + unsigned s = req->rq_state; + int c_put = 0; + int k_put = 0; + + if (drbd_suspended(device) && !((s | clear) & RQ_COMPLETION_SUSP)) + set |= RQ_COMPLETION_SUSP; + + /* apply */ + + req->rq_state &= ~clear; + req->rq_state |= set; + + /* no change? */ + if (req->rq_state == s) + return; + + /* intent: get references */ + + if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING)) + atomic_inc(&req->completion_ref); + + if (!(s & RQ_NET_PENDING) && (set & RQ_NET_PENDING)) { + inc_ap_pending(device); + atomic_inc(&req->completion_ref); + } + + if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) { + atomic_inc(&req->completion_ref); + set_if_null_req_next(peer_device, req); + } + + if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK)) + kref_get(&req->kref); /* wait for the DONE */ + + if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) { + /* potentially already completed in the asender thread */ + if (!(s & RQ_NET_DONE)) { + atomic_add(req->i.size >> 9, &device->ap_in_flight); + set_if_null_req_not_net_done(peer_device, req); + } + if (s & RQ_NET_PENDING) + set_if_null_req_ack_pending(peer_device, req); + } + + if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP)) + atomic_inc(&req->completion_ref); + + /* progress: put references */ + + if ((s & RQ_COMPLETION_SUSP) && (clear & RQ_COMPLETION_SUSP)) + ++c_put; + + if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) { + D_ASSERT(device, req->rq_state & RQ_LOCAL_PENDING); + /* local completion may still come in later, + * we need to keep the req object around. 
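+	 * The extra kref taken here is dropped again via k_put once the
+	 * aborted local completion eventually clears RQ_LOCAL_PENDING.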
*/ + kref_get(&req->kref); + ++c_put; + } + + if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) { + if (req->rq_state & RQ_LOCAL_ABORTED) + ++k_put; + else + ++c_put; + list_del_init(&req->req_pending_local); + } + + if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) { + dec_ap_pending(device); + ++c_put; + req->acked_jif = jiffies; + advance_conn_req_ack_pending(peer_device, req); + } + + if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) { + ++c_put; + advance_conn_req_next(peer_device, req); + } + + if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) { + if (s & RQ_NET_SENT) + atomic_sub(req->i.size >> 9, &device->ap_in_flight); + if (s & RQ_EXP_BARR_ACK) + ++k_put; + req->net_done_jif = jiffies; + + /* in ahead/behind mode, or just in case, + * before we finally destroy this request, + * the caching pointers must not reference it anymore */ + advance_conn_req_next(peer_device, req); + advance_conn_req_ack_pending(peer_device, req); + advance_conn_req_not_net_done(peer_device, req); + } + + /* potentially complete and destroy */ + + if (k_put || c_put) { + /* Completion does it's own kref_put. If we are going to + * kref_sub below, we need req to be still around then. */ + int at_least = k_put + !!c_put; + int refcount = atomic_read(&req->kref.refcount); + if (refcount < at_least) + drbd_err(device, + "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n", + s, req->rq_state, refcount, at_least); + } + + /* If we made progress, retry conflicting peer requests, if any. */ + if (req->i.waiting) + wake_up(&device->misc_wait); + + if (c_put) + k_put += drbd_req_put_completion_ref(req, m, c_put); + if (k_put) + kref_sub(&req->kref, k_put, drbd_req_destroy); +} + +static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req) +{ + char b[BDEVNAME_SIZE]; + + if (!__ratelimit(&drbd_ratelimit_state)) + return; + + drbd_warn(device, "local %s IO error sector %llu+%u on %s\n", + (req->rq_state & RQ_WRITE) ? "WRITE" : "READ", + (unsigned long long)req->i.sector, + req->i.size >> 9, + bdevname(device->ldev->backing_bdev, b)); +} + +/* Helper for HANDED_OVER_TO_NETWORK. + * Is this a protocol A write (neither WRITE_ACK nor RECEIVE_ACK expected)? + * Is it also still "PENDING"? + * --> If so, clear PENDING and set NET_OK below. + * If it is a protocol A write, but not RQ_PENDING anymore, neg-ack was faster + * (and we must not set RQ_NET_OK) */ +static inline bool is_pending_write_protocol_A(struct drbd_request *req) +{ + return (req->rq_state & + (RQ_WRITE|RQ_NET_PENDING|RQ_EXP_WRITE_ACK|RQ_EXP_RECEIVE_ACK)) + == (RQ_WRITE|RQ_NET_PENDING); +} + +/* obviously this could be coded as many single functions + * instead of one huge switch, + * or by putting the code directly in the respective locations + * (as it has been before). + * + * but having it this way + * enforces that it is all in this one place, where it is easier to audit, + * it makes it obvious that whatever "event" "happens" to a request should + * happen "atomically" within the req_lock, + * and it enforces that we have to think in a very structured manner + * about the "events" that may happen to a request during its life time ... + */ +int __req_mod(struct drbd_request *req, enum drbd_req_event what, + struct bio_and_error *m) +{ + struct drbd_device *const device = req->device; + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device ? 
peer_device->connection : NULL; + struct net_conf *nc; + int p, rv = 0; + + if (m) + m->bio = NULL; + + switch (what) { + default: + drbd_err(device, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__); + break; + + /* does not happen... + * initialization done in drbd_req_new + case CREATED: + break; + */ + + case TO_BE_SENT: /* via network */ + /* reached via __drbd_make_request + * and from w_read_retry_remote */ + D_ASSERT(device, !(req->rq_state & RQ_NET_MASK)); + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + p = nc->wire_protocol; + rcu_read_unlock(); + req->rq_state |= + p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK : + p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0; + mod_rq_state(req, m, 0, RQ_NET_PENDING); + break; + + case TO_BE_SUBMITTED: /* locally */ + /* reached via __drbd_make_request */ + D_ASSERT(device, !(req->rq_state & RQ_LOCAL_MASK)); + mod_rq_state(req, m, 0, RQ_LOCAL_PENDING); + break; + + case COMPLETED_OK: + if (req->rq_state & RQ_WRITE) + device->writ_cnt += req->i.size >> 9; + else + device->read_cnt += req->i.size >> 9; + + mod_rq_state(req, m, RQ_LOCAL_PENDING, + RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); + break; + + case ABORT_DISK_IO: + mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED); + break; + + case WRITE_COMPLETED_WITH_ERROR: + drbd_report_io_error(device, req); + __drbd_chk_io_error(device, DRBD_WRITE_ERROR); + mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); + break; + + case READ_COMPLETED_WITH_ERROR: + drbd_set_out_of_sync(device, req->i.sector, req->i.size); + drbd_report_io_error(device, req); + __drbd_chk_io_error(device, DRBD_READ_ERROR); + /* fall through. */ + case READ_AHEAD_COMPLETED_WITH_ERROR: + /* it is legal to fail READA, no __drbd_chk_io_error in that case. */ + mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); + break; + + case DISCARD_COMPLETED_NOTSUPP: + case DISCARD_COMPLETED_WITH_ERROR: + /* I'd rather not detach from local disk just because it + * failed a REQ_DISCARD. */ + mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); + break; + + case QUEUE_FOR_NET_READ: + /* READ or READA, and + * no local disk, + * or target area marked as invalid, + * or just got an io-error. */ + /* from __drbd_make_request + * or from bio_endio during read io-error recovery */ + + /* So we can verify the handle in the answer packet. + * Corresponding drbd_remove_request_interval is in + * drbd_req_complete() */ + D_ASSERT(device, drbd_interval_empty(&req->i)); + drbd_insert_interval(&device->read_requests, &req->i); + + set_bit(UNPLUG_REMOTE, &device->flags); + + D_ASSERT(device, req->rq_state & RQ_NET_PENDING); + D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0); + mod_rq_state(req, m, 0, RQ_NET_QUEUED); + req->w.cb = w_send_read_req; + drbd_queue_work(&connection->sender_work, + &req->w); + break; + + case QUEUE_FOR_NET_WRITE: + /* assert something? */ + /* from __drbd_make_request only */ + + /* Corresponding drbd_remove_request_interval is in + * drbd_req_complete() */ + D_ASSERT(device, drbd_interval_empty(&req->i)); + drbd_insert_interval(&device->write_requests, &req->i); + + /* NOTE + * In case the req ended up on the transfer log before being + * queued on the worker, it could lead to this request being + * missed during cleanup after connection loss. + * So we have to do both operations here, + * within the same lock that protects the transfer log. 
+ * + * _req_add_to_epoch(req); this has to be after the + * _maybe_start_new_epoch(req); which happened in + * __drbd_make_request, because we now may set the bit + * again ourselves to close the current epoch. + * + * Add req to the (now) current epoch (barrier). */ + + /* otherwise we may lose an unplug, which may cause some remote + * io-scheduler timeout to expire, increasing maximum latency, + * hurting performance. */ + set_bit(UNPLUG_REMOTE, &device->flags); + + /* queue work item to send data */ + D_ASSERT(device, req->rq_state & RQ_NET_PENDING); + mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK); + req->w.cb = w_send_dblock; + drbd_queue_work(&connection->sender_work, + &req->w); + + /* close the epoch, in case it outgrew the limit */ + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + p = nc->max_epoch_size; + rcu_read_unlock(); + if (connection->current_tle_writes >= p) + start_new_tl_epoch(connection); + + break; + + case QUEUE_FOR_SEND_OOS: + mod_rq_state(req, m, 0, RQ_NET_QUEUED); + req->w.cb = w_send_out_of_sync; + drbd_queue_work(&connection->sender_work, + &req->w); + break; + + case READ_RETRY_REMOTE_CANCELED: + case SEND_CANCELED: + case SEND_FAILED: + /* real cleanup will be done from tl_clear. just update flags + * so it is no longer marked as on the worker queue */ + mod_rq_state(req, m, RQ_NET_QUEUED, 0); + break; + + case HANDED_OVER_TO_NETWORK: + /* assert something? */ + if (is_pending_write_protocol_A(req)) + /* this is what is dangerous about protocol A: + * pretend it was successfully written on the peer. */ + mod_rq_state(req, m, RQ_NET_QUEUED|RQ_NET_PENDING, + RQ_NET_SENT|RQ_NET_OK); + else + mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT); + /* It is still not yet RQ_NET_DONE until the + * corresponding epoch barrier got acked as well, + * so we know what to dirty on connection loss. */ + break; + + case OOS_HANDED_TO_NETWORK: + /* Was not set PENDING, no longer QUEUED, so is now DONE + * as far as this connection is concerned. */ + mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_DONE); + break; + + case CONNECTION_LOST_WHILE_PENDING: + /* transfer log cleanup after connection loss */ + mod_rq_state(req, m, + RQ_NET_OK|RQ_NET_PENDING|RQ_COMPLETION_SUSP, + RQ_NET_DONE); + break; + + case CONFLICT_RESOLVED: + /* for superseded conflicting writes of multiple primaries, + * there is no need to keep anything in the tl, potential + * node crashes are covered by the activity log. + * + * If this request had been marked as RQ_POSTPONED before, + * it will actually not be completed, but "restarted", + * resubmitted from the retry worker context. */ + D_ASSERT(device, req->rq_state & RQ_NET_PENDING); + D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK); + mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_DONE|RQ_NET_OK); + break; + + case WRITE_ACKED_BY_PEER_AND_SIS: + req->rq_state |= RQ_NET_SIS; + case WRITE_ACKED_BY_PEER: + /* Normal operation protocol C: successfully written on peer. + * During resync, even in protocol != C, + * we requested an explicit write ack anyways. + * Which means we cannot even assert anything here. + * Nothing more to do here. + * We want to keep the tl in place for all protocols, to cater + * for volatile write-back caches on lower level devices. */ + goto ack_common; + case RECV_ACKED_BY_PEER: + D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK); + /* protocol B; pretends to be successfully written on peer. 
+ * see also notes above in HANDED_OVER_TO_NETWORK about + * protocol != C */ + ack_common: + mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); + break; + + case POSTPONE_WRITE: + D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK); + /* If this node has already detected the write conflict, the + * worker will be waiting on misc_wait. Wake it up once this + * request has completed locally. + */ + D_ASSERT(device, req->rq_state & RQ_NET_PENDING); + req->rq_state |= RQ_POSTPONED; + if (req->i.waiting) + wake_up(&device->misc_wait); + /* Do not clear RQ_NET_PENDING. This request will make further + * progress via restart_conflicting_writes() or + * fail_postponed_requests(). Hopefully. */ + break; + + case NEG_ACKED: + mod_rq_state(req, m, RQ_NET_OK|RQ_NET_PENDING, 0); + break; + + case FAIL_FROZEN_DISK_IO: + if (!(req->rq_state & RQ_LOCAL_COMPLETED)) + break; + mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0); + break; + + case RESTART_FROZEN_DISK_IO: + if (!(req->rq_state & RQ_LOCAL_COMPLETED)) + break; + + mod_rq_state(req, m, + RQ_COMPLETION_SUSP|RQ_LOCAL_COMPLETED, + RQ_LOCAL_PENDING); + + rv = MR_READ; + if (bio_data_dir(req->master_bio) == WRITE) + rv = MR_WRITE; + + get_ldev(device); /* always succeeds in this call path */ + req->w.cb = w_restart_disk_io; + drbd_queue_work(&connection->sender_work, + &req->w); + break; + + case RESEND: + /* Simply complete (local only) READs. */ + if (!(req->rq_state & RQ_WRITE) && !req->w.cb) { + mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0); + break; + } + + /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK + before the connection loss (B&C only); only P_BARRIER_ACK + (or the local completion?) was missing when we suspended. + Throwing them out of the TL here by pretending we got a BARRIER_ACK. + During connection handshake, we ensure that the peer was not rebooted. */ + if (!(req->rq_state & RQ_NET_OK)) { + /* FIXME could this possibly be a req->dw.cb == w_send_out_of_sync? + * in that case we must not set RQ_NET_PENDING. */ + + mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING); + if (req->w.cb) { + /* w.cb expected to be w_send_dblock, or w_send_read_req */ + drbd_queue_work(&connection->sender_work, + &req->w); + rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ; + } /* else: FIXME can this happen? */ + break; + } + /* else, fall through to BARRIER_ACKED */ + + case BARRIER_ACKED: + /* barrier ack for READ requests does not make sense */ + if (!(req->rq_state & RQ_WRITE)) + break; + + if (req->rq_state & RQ_NET_PENDING) { + /* barrier came in before all requests were acked. + * this is bad, because if the connection is lost now, + * we won't be able to clean them up... */ + drbd_err(device, "FIXME (BARRIER_ACKED but pending)\n"); + } + /* Allowed to complete requests, even while suspended. + * As this is called for all requests within a matching epoch, + * we need to filter, and only set RQ_NET_DONE for those that + * have actually been on the wire. */ + mod_rq_state(req, m, RQ_COMPLETION_SUSP, + (req->rq_state & RQ_NET_MASK) ? RQ_NET_DONE : 0); + break; + + case DATA_RECEIVED: + D_ASSERT(device, req->rq_state & RQ_NET_PENDING); + mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE); + break; + + case QUEUE_AS_DRBD_BARRIER: + start_new_tl_epoch(connection); + mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE); + break; + }; + + return rv; +} + +/* we may do a local read if: + * - we are consistent (of course), + * - or we are generally inconsistent, + * BUT we are still/already IN SYNC for this area. 
+ * since size may be bigger than BM_BLOCK_SIZE, + * we may need to check several bits. + */ +static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size) +{ + unsigned long sbnr, ebnr; + sector_t esector, nr_sectors; + + if (device->state.disk == D_UP_TO_DATE) + return true; + if (device->state.disk != D_INCONSISTENT) + return false; + esector = sector + (size >> 9) - 1; + nr_sectors = drbd_get_capacity(device->this_bdev); + D_ASSERT(device, sector < nr_sectors); + D_ASSERT(device, esector < nr_sectors); + + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + + return drbd_bm_count_bits(device, sbnr, ebnr) == 0; +} + +static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t sector, + enum drbd_read_balancing rbm) +{ + struct backing_dev_info *bdi; + int stripe_shift; + + switch (rbm) { + case RB_CONGESTED_REMOTE: + bdi = &device->ldev->backing_bdev->bd_disk->queue->backing_dev_info; + return bdi_read_congested(bdi); + case RB_LEAST_PENDING: + return atomic_read(&device->local_cnt) > + atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt); + case RB_32K_STRIPING: /* stripe_shift = 15 */ + case RB_64K_STRIPING: + case RB_128K_STRIPING: + case RB_256K_STRIPING: + case RB_512K_STRIPING: + case RB_1M_STRIPING: /* stripe_shift = 20 */ + stripe_shift = (rbm - RB_32K_STRIPING + 15); + return (sector >> (stripe_shift - 9)) & 1; + case RB_ROUND_ROBIN: + return test_and_change_bit(READ_BALANCE_RR, &device->flags); + case RB_PREFER_REMOTE: + return true; + case RB_PREFER_LOCAL: + default: + return false; + } +} + +/* + * complete_conflicting_writes - wait for any conflicting write requests + * + * The write_requests tree contains all active write requests which we + * currently know about. Wait for any requests to complete which conflict with + * the new one. + * + * Only way out: remove the conflicting intervals from the tree. + */ +static void complete_conflicting_writes(struct drbd_request *req) +{ + DEFINE_WAIT(wait); + struct drbd_device *device = req->device; + struct drbd_interval *i; + sector_t sector = req->i.sector; + int size = req->i.size; + + i = drbd_find_overlap(&device->write_requests, sector, size); + if (!i) + return; + + for (;;) { + prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE); + i = drbd_find_overlap(&device->write_requests, sector, size); + if (!i) + break; + /* Indicate to wake up device->misc_wait on progress. */ + i->waiting = true; + spin_unlock_irq(&device->resource->req_lock); + schedule(); + spin_lock_irq(&device->resource->req_lock); + } + finish_wait(&device->misc_wait, &wait); +} + +/* called within req_lock and rcu_read_lock() */ +static void maybe_pull_ahead(struct drbd_device *device) +{ + struct drbd_connection *connection = first_peer_device(device)->connection; + struct net_conf *nc; + bool congested = false; + enum drbd_on_congestion on_congestion; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + on_congestion = nc ? nc->on_congestion : OC_BLOCK; + rcu_read_unlock(); + if (on_congestion == OC_BLOCK || + connection->agreed_pro_version < 96) + return; + + if (on_congestion == OC_PULL_AHEAD && device->state.conn == C_AHEAD) + return; /* nothing to do ... */ + + /* If I don't even have good local storage, we can not reasonably try + * to pull ahead of the peer. We also need the local reference to make + * sure device->act_log is there. 
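+	 *
+	 * For reference, a minimal sketch of how the thresholds consulted
+	 * below are usually configured (assuming the commonly documented
+	 * drbd 8.4 net option names; the values are only illustrative):
+	 *
+	 *	net {
+	 *		on-congestion		pull-ahead;
+	 *		congestion-fill		1G;
+	 *		congestion-extents	1000;
+	 *	}
+	 *
+	 * on-congestion maps to nc->on_congestion, congestion-fill to
+	 * nc->cong_fill and congestion-extents to nc->cong_extents.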
+ */ + if (!get_ldev_if_state(device, D_UP_TO_DATE)) + return; + + if (nc->cong_fill && + atomic_read(&device->ap_in_flight) >= nc->cong_fill) { + drbd_info(device, "Congestion-fill threshold reached\n"); + congested = true; + } + + if (device->act_log->used >= nc->cong_extents) { + drbd_info(device, "Congestion-extents threshold reached\n"); + congested = true; + } + + if (congested) { + /* start a new epoch for non-mirrored writes */ + start_new_tl_epoch(first_peer_device(device)->connection); + + if (on_congestion == OC_PULL_AHEAD) + _drbd_set_state(_NS(device, conn, C_AHEAD), 0, NULL); + else /*nc->on_congestion == OC_DISCONNECT */ + _drbd_set_state(_NS(device, conn, C_DISCONNECTING), 0, NULL); + } + put_ldev(device); +} + +/* If this returns false, and req->private_bio is still set, + * this should be submitted locally. + * + * If it returns false, but req->private_bio is not set, + * we do not have access to good data :( + * + * Otherwise, this destroys req->private_bio, if any, + * and returns true. + */ +static bool do_remote_read(struct drbd_request *req) +{ + struct drbd_device *device = req->device; + enum drbd_read_balancing rbm; + + if (req->private_bio) { + if (!drbd_may_do_local_read(device, + req->i.sector, req->i.size)) { + bio_put(req->private_bio); + req->private_bio = NULL; + put_ldev(device); + } + } + + if (device->state.pdsk != D_UP_TO_DATE) + return false; + + if (req->private_bio == NULL) + return true; + + /* TODO: improve read balancing decisions, take into account drbd + * protocol, pending requests etc. */ + + rcu_read_lock(); + rbm = rcu_dereference(device->ldev->disk_conf)->read_balancing; + rcu_read_unlock(); + + if (rbm == RB_PREFER_LOCAL && req->private_bio) + return false; /* submit locally */ + + if (remote_due_to_read_balancing(device, req->i.sector, rbm)) { + if (req->private_bio) { + bio_put(req->private_bio); + req->private_bio = NULL; + put_ldev(device); + } + return true; + } + + return false; +} + +/* returns number of connections (== 1, for drbd 8.4) + * expected to actually write this data, + * which does NOT include those that we are L_AHEAD for. */ +static int drbd_process_write_request(struct drbd_request *req) +{ + struct drbd_device *device = req->device; + int remote, send_oos; + + remote = drbd_should_do_remote(device->state); + send_oos = drbd_should_send_out_of_sync(device->state); + + /* Need to replicate writes. Unless it is an empty flush, + * which is better mapped to a DRBD P_BARRIER packet, + * also for drbd wire protocol compatibility reasons. + * If this was a flush, just start a new epoch. + * Unless the current epoch was empty anyways, or we are not currently + * replicating, in which case there is no point. */ + if (unlikely(req->i.size == 0)) { + /* The only size==0 bios we expect are empty flushes. 
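+		 * (blkdev_issue_flush() and friends submit a bio that carries
+		 * no data but has the flush flag set, which is what the
+		 * REQ_FLUSH assert below checks for.)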
*/ + D_ASSERT(device, req->master_bio->bi_rw & REQ_FLUSH); + if (remote) + _req_mod(req, QUEUE_AS_DRBD_BARRIER); + return remote; + } + + if (!remote && !send_oos) + return 0; + + D_ASSERT(device, !(remote && send_oos)); + + if (remote) { + _req_mod(req, TO_BE_SENT); + _req_mod(req, QUEUE_FOR_NET_WRITE); + } else if (drbd_set_out_of_sync(device, req->i.sector, req->i.size)) + _req_mod(req, QUEUE_FOR_SEND_OOS); + + return remote; +} + +static void +drbd_submit_req_private_bio(struct drbd_request *req) +{ + struct drbd_device *device = req->device; + struct bio *bio = req->private_bio; + const int rw = bio_rw(bio); + + bio->bi_bdev = device->ldev->backing_bdev; + + /* State may have changed since we grabbed our reference on the + * ->ldev member. Double check, and short-circuit to endio. + * In case the last activity log transaction failed to get on + * stable storage, and this is a WRITE, we may not even submit + * this bio. */ + if (get_ldev(device)) { + req->pre_submit_jif = jiffies; + if (drbd_insert_fault(device, + rw == WRITE ? DRBD_FAULT_DT_WR + : rw == READ ? DRBD_FAULT_DT_RD + : DRBD_FAULT_DT_RA)) + bio_endio(bio, -EIO); + else + generic_make_request(bio); + put_ldev(device); + } else + bio_endio(bio, -EIO); +} + +static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req) +{ + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&req->tl_requests, &device->submit.writes); + list_add_tail(&req->req_pending_master_completion, + &device->pending_master_completion[1 /* WRITE */]); + spin_unlock_irq(&device->resource->req_lock); + queue_work(device->submit.wq, &device->submit.worker); + /* do_submit() may sleep internally on al_wait, too */ + wake_up(&device->al_wait); +} + +/* returns the new drbd_request pointer, if the caller is expected to + * drbd_send_and_submit() it (to save latency), or NULL if we queued the + * request on the submitter thread. + * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request. + */ +static struct drbd_request * +drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_jif) +{ + const int rw = bio_data_dir(bio); + struct drbd_request *req; + + /* allocate outside of all locks; */ + req = drbd_req_new(device, bio); + if (!req) { + dec_ap_bio(device); + /* only pass the error to the upper layers. + * if user cannot handle io errors, that's not our business. */ + drbd_err(device, "could not kmalloc() req\n"); + bio_endio(bio, -ENOMEM); + return ERR_PTR(-ENOMEM); + } + req->start_jif = start_jif; + + if (!get_ldev(device)) { + bio_put(req->private_bio); + req->private_bio = NULL; + } + + /* Update disk stats */ + _drbd_start_io_acct(device, req); + + if (rw == WRITE && req->private_bio && req->i.size + && !test_bit(AL_SUSPENDED, &device->flags)) { + if (!drbd_al_begin_io_fastpath(device, &req->i)) { + atomic_inc(&device->ap_actlog_cnt); + drbd_queue_write(device, req); + return NULL; + } + req->rq_state |= RQ_IN_ACT_LOG; + req->in_actlog_jif = jiffies; + } + + return req; +} + +static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req) +{ + struct drbd_resource *resource = device->resource; + const int rw = bio_rw(req->master_bio); + struct bio_and_error m = { NULL, }; + bool no_remote = false; + bool submit_private_bio = false; + + spin_lock_irq(&resource->req_lock); + if (rw == WRITE) { + /* This may temporarily give up the req_lock, + * but will re-aquire it before it returns here. 
+ * Needs to be before the check on drbd_suspended() */ + complete_conflicting_writes(req); + /* no more giving up req_lock from now on! */ + + /* check for congestion, and potentially stop sending + * full data updates, but start sending "dirty bits" only. */ + maybe_pull_ahead(device); + } + + + if (drbd_suspended(device)) { + /* push back and retry: */ + req->rq_state |= RQ_POSTPONED; + if (req->private_bio) { + bio_put(req->private_bio); + req->private_bio = NULL; + put_ldev(device); + } + goto out; + } + + /* We fail READ/READA early, if we can not serve it. + * We must do this before req is registered on any lists. + * Otherwise, drbd_req_complete() will queue failed READ for retry. */ + if (rw != WRITE) { + if (!do_remote_read(req) && !req->private_bio) + goto nodata; + } + + /* which transfer log epoch does this belong to? */ + req->epoch = atomic_read(&first_peer_device(device)->connection->current_tle_nr); + + /* no point in adding empty flushes to the transfer log, + * they are mapped to drbd barriers already. */ + if (likely(req->i.size!=0)) { + if (rw == WRITE) + first_peer_device(device)->connection->current_tle_writes++; + + list_add_tail(&req->tl_requests, &first_peer_device(device)->connection->transfer_log); + } + + if (rw == WRITE) { + if (!drbd_process_write_request(req)) + no_remote = true; + } else { + /* We either have a private_bio, or we can read from remote. + * Otherwise we had done the goto nodata above. */ + if (req->private_bio == NULL) { + _req_mod(req, TO_BE_SENT); + _req_mod(req, QUEUE_FOR_NET_READ); + } else + no_remote = true; + } + + /* If it took the fast path in drbd_request_prepare, add it here. + * The slow path has added it already. */ + if (list_empty(&req->req_pending_master_completion)) + list_add_tail(&req->req_pending_master_completion, + &device->pending_master_completion[rw == WRITE]); + if (req->private_bio) { + /* needs to be marked within the same spinlock */ + list_add_tail(&req->req_pending_local, + &device->pending_completion[rw == WRITE]); + _req_mod(req, TO_BE_SUBMITTED); + /* but we need to give up the spinlock to submit */ + submit_private_bio = true; + } else if (no_remote) { +nodata: + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "IO ERROR: neither local nor remote data, sector %llu+%u\n", + (unsigned long long)req->i.sector, req->i.size >> 9); + /* A write may have been queued for send_oos, however. + * So we can not simply free it, we must go through drbd_req_put_completion_ref() */ + } + +out: + if (drbd_req_put_completion_ref(req, &m, 1)) + kref_put(&req->kref, drbd_req_destroy); + spin_unlock_irq(&resource->req_lock); + + /* Even though above is a kref_put(), this is safe. + * As long as we still need to submit our private bio, + * we hold a completion ref, and the request cannot disappear. + * If however this request did not even have a private bio to submit + * (e.g. remote read), req may already be invalid now. + * That's why we cannot check on req->private_bio. 
*/ + if (submit_private_bio) + drbd_submit_req_private_bio(req); + if (m.bio) + complete_master_bio(device, &m); +} + +void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_jif) +{ + struct drbd_request *req = drbd_request_prepare(device, bio, start_jif); + if (IS_ERR_OR_NULL(req)) + return; + drbd_send_and_submit(device, req); +} + +static void submit_fast_path(struct drbd_device *device, struct list_head *incoming) +{ + struct drbd_request *req, *tmp; + list_for_each_entry_safe(req, tmp, incoming, tl_requests) { + const int rw = bio_data_dir(req->master_bio); + + if (rw == WRITE /* rw != WRITE should not even end up here! */ + && req->private_bio && req->i.size + && !test_bit(AL_SUSPENDED, &device->flags)) { + if (!drbd_al_begin_io_fastpath(device, &req->i)) + continue; + + req->rq_state |= RQ_IN_ACT_LOG; + req->in_actlog_jif = jiffies; + atomic_dec(&device->ap_actlog_cnt); + } + + list_del_init(&req->tl_requests); + drbd_send_and_submit(device, req); + } +} + +static bool prepare_al_transaction_nonblock(struct drbd_device *device, + struct list_head *incoming, + struct list_head *pending, + struct list_head *later) +{ + struct drbd_request *req, *tmp; + int wake = 0; + int err; + + spin_lock_irq(&device->al_lock); + list_for_each_entry_safe(req, tmp, incoming, tl_requests) { + err = drbd_al_begin_io_nonblock(device, &req->i); + if (err == -ENOBUFS) + break; + if (err == -EBUSY) + wake = 1; + if (err) + list_move_tail(&req->tl_requests, later); + else + list_move_tail(&req->tl_requests, pending); + } + spin_unlock_irq(&device->al_lock); + if (wake) + wake_up(&device->al_wait); + return !list_empty(pending); +} + +void send_and_submit_pending(struct drbd_device *device, struct list_head *pending) +{ + struct drbd_request *req, *tmp; + + list_for_each_entry_safe(req, tmp, pending, tl_requests) { + req->rq_state |= RQ_IN_ACT_LOG; + req->in_actlog_jif = jiffies; + atomic_dec(&device->ap_actlog_cnt); + list_del_init(&req->tl_requests); + drbd_send_and_submit(device, req); + } +} + +void do_submit(struct work_struct *ws) +{ + struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker); + LIST_HEAD(incoming); /* from drbd_make_request() */ + LIST_HEAD(pending); /* to be submitted after next AL-transaction commit */ + LIST_HEAD(busy); /* blocked by resync requests */ + + /* grab new incoming requests */ + spin_lock_irq(&device->resource->req_lock); + list_splice_tail_init(&device->submit.writes, &incoming); + spin_unlock_irq(&device->resource->req_lock); + + for (;;) { + DEFINE_WAIT(wait); + + /* move used-to-be-busy back to front of incoming */ + list_splice_init(&busy, &incoming); + submit_fast_path(device, &incoming); + if (list_empty(&incoming)) + break; + + for (;;) { + prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE); + + list_splice_init(&busy, &incoming); + prepare_al_transaction_nonblock(device, &incoming, &pending, &busy); + if (!list_empty(&pending)) + break; + + schedule(); + + /* If all currently "hot" activity log extents are kept busy by + * incoming requests, we still must not totally starve new + * requests to "cold" extents. + * Something left on &incoming means there had not been + * enough update slots available, and the activity log + * has been marked as "starving". + * + * Try again now, without looking for new requests, + * effectively blocking all new requests until we made + * at least _some_ progress with what we currently have. 
+ */ + if (!list_empty(&incoming)) + continue; + + /* Nothing moved to pending, but nothing left + * on incoming: all moved to busy! + * Grab new and iterate. */ + spin_lock_irq(&device->resource->req_lock); + list_splice_tail_init(&device->submit.writes, &incoming); + spin_unlock_irq(&device->resource->req_lock); + } + finish_wait(&device->al_wait, &wait); + + /* If the transaction was full, before all incoming requests + * had been processed, skip ahead to commit, and iterate + * without splicing in more incoming requests from upper layers. + * + * Else, if all incoming have been processed, + * they have become either "pending" (to be submitted after + * next transaction commit) or "busy" (blocked by resync). + * + * Maybe more was queued, while we prepared the transaction? + * Try to stuff those into this transaction as well. + * Be strictly non-blocking here, + * we already have something to commit. + * + * Commit if we don't make any more progres. + */ + + while (list_empty(&incoming)) { + LIST_HEAD(more_pending); + LIST_HEAD(more_incoming); + bool made_progress; + + /* It is ok to look outside the lock, + * it's only an optimization anyways */ + if (list_empty(&device->submit.writes)) + break; + + spin_lock_irq(&device->resource->req_lock); + list_splice_tail_init(&device->submit.writes, &more_incoming); + spin_unlock_irq(&device->resource->req_lock); + + if (list_empty(&more_incoming)) + break; + + made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending, &busy); + + list_splice_tail_init(&more_pending, &pending); + list_splice_tail_init(&more_incoming, &incoming); + if (!made_progress) + break; + } + + drbd_al_begin_io_commit(device); + send_and_submit_pending(device, &pending); + } +} + +void drbd_make_request(struct request_queue *q, struct bio *bio) +{ + struct drbd_device *device = (struct drbd_device *) q->queuedata; + unsigned long start_jif; + + start_jif = jiffies; + + /* + * what we "blindly" assume: + */ + D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512)); + + inc_ap_bio(device); + __drbd_make_request(device, bio, start_jif); +} + +/* This is called by bio_add_page(). + * + * q->max_hw_sectors and other global limits are already enforced there. + * + * We need to call down to our lower level device, + * in case it has special restrictions. + * + * We also may need to enforce configured max-bio-bvecs limits. + * + * As long as the BIO is empty we have to allow at least one bvec, + * regardless of size and offset, so no need to ask lower levels. 
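+ *
+ * The limit computed below is in bytes: start from DRBD_MAX_BIO_SIZE,
+ * honour the backing queue's merge_bvec_fn if it provides one, and
+ * finally clamp to this queue's max_hw_sectors (converted to bytes).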
+ */ +int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) +{ + struct drbd_device *device = (struct drbd_device *) q->queuedata; + unsigned int bio_size = bvm->bi_size; + int limit = DRBD_MAX_BIO_SIZE; + int backing_limit; + + if (bio_size && get_ldev(device)) { + unsigned int max_hw_sectors = queue_max_hw_sectors(q); + struct request_queue * const b = + device->ldev->backing_bdev->bd_disk->queue; + if (b->merge_bvec_fn) { + bvm->bi_bdev = device->ldev->backing_bdev; + backing_limit = b->merge_bvec_fn(b, bvm, bvec); + limit = min(limit, backing_limit); + } + put_ldev(device); + if ((limit >> 9) > max_hw_sectors) + limit = max_hw_sectors << 9; + } + return limit; +} + +void request_timer_fn(unsigned long data) +{ + struct drbd_device *device = (struct drbd_device *) data; + struct drbd_connection *connection = first_peer_device(device)->connection; + struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */ + struct net_conf *nc; + unsigned long oldest_submit_jif; + unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ + unsigned long now; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (nc && device->state.conn >= C_WF_REPORT_PARAMS) + ent = nc->timeout * HZ/10 * nc->ko_count; + + if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */ + dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10; + put_ldev(device); + } + rcu_read_unlock(); + + et = min_not_zero(dt, ent); + + if (!et) + return; /* Recurring timer stopped */ + + now = jiffies; + nt = now + et; + + spin_lock_irq(&device->resource->req_lock); + req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local); + req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local); + req_peer = connection->req_not_net_done; + /* maybe the oldest request waiting for the peer is in fact still + * blocking in tcp sendmsg */ + if (!req_peer && connection->req_next && connection->req_next->pre_send_jif) + req_peer = connection->req_next; + + /* evaluate the oldest peer request only in one timer! */ + if (req_peer && req_peer->device != device) + req_peer = NULL; + + /* do we have something to evaluate? */ + if (req_peer == NULL && req_write == NULL && req_read == NULL) + goto out; + + oldest_submit_jif = + (req_write && req_read) + ? ( time_before(req_write->pre_submit_jif, req_read->pre_submit_jif) + ? req_write->pre_submit_jif : req_read->pre_submit_jif ) + : req_write ? req_write->pre_submit_jif + : req_read ? req_read->pre_submit_jif : now; + + /* The request is considered timed out, if + * - we have some effective timeout from the configuration, + * with above state restrictions applied, + * - the oldest request is waiting for a response from the network + * resp. the local disk, + * - the oldest request is in fact older than the effective timeout, + * - the connection was established (resp. disk was attached) + * for longer than the timeout already. + * Note that for 32bit jiffies and very stable connections/disks, + * we may have a wrap around, which is catched by + * !time_in_range(now, last_..._jif, last_..._jif + timeout). + * + * Side effect: once per 32bit wrap-around interval, which means every + * ~198 days with 250 HZ, we have a window where the timeout would need + * to expire twice (worst case) to become effective. Good enough. 
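+	 *
+	 * Worked example (values are illustrative, not necessarily the
+	 * defaults): with net_conf timeout = 60 (unit 0.1s, i.e. 6 seconds)
+	 * and ko-count = 7, ent = 60 * HZ/10 * 7 = 42 seconds worth of
+	 * jiffies; with disk-timeout = 0 the disk check is skipped and
+	 * et = min_not_zero(dt, ent) = ent.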
+ */ + if (ent && req_peer && + time_after(now, req_peer->pre_send_jif + ent) && + !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { + drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); + _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD); + } + if (dt && oldest_submit_jif != now && + time_after(now, oldest_submit_jif + dt) && + !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { + drbd_warn(device, "Local backing device failed to meet the disk-timeout\n"); + __drbd_chk_io_error(device, DRBD_FORCE_DETACH); + } + + /* Reschedule timer for the nearest not already expired timeout. + * Fallback to now + min(effective network timeout, disk timeout). */ + ent = (ent && req_peer && time_before(now, req_peer->pre_send_jif + ent)) + ? req_peer->pre_send_jif + ent : now + et; + dt = (dt && oldest_submit_jif != now && time_before(now, oldest_submit_jif + dt)) + ? oldest_submit_jif + dt : now + et; + nt = time_before(ent, dt) ? ent : dt; +out: + spin_unlock_irq(&device->resource->req_lock); + mod_timer(&device->request_timer, nt); +} diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h new file mode 100644 index 000000000..9f6a04080 --- /dev/null +++ b/drivers/block/drbd/drbd_req.h @@ -0,0 +1,351 @@ +/* + drbd_req.h + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2006-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>. + + DRBD is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + DRBD is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _DRBD_REQ_H +#define _DRBD_REQ_H + +#include <linux/module.h> + +#include <linux/slab.h> +#include <linux/drbd.h> +#include "drbd_int.h" + +/* The request callbacks will be called in irq context by the IDE drivers, + and in Softirqs/Tasklets/BH context by the SCSI drivers, + and by the receiver and worker in kernel-thread context. + Try to get the locking right :) */ + +/* + * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are + * associated with IO requests originating from the block layer above us. + * + * There are quite a few things that may happen to a drbd request + * during its lifetime. + * + * It will be created. + * It will be marked with the intention to be + * submitted to local disk and/or + * send via the network. + * + * It has to be placed on the transfer log and other housekeeping lists, + * In case we have a network connection. + * + * It may be identified as a concurrent (write) request + * and be handled accordingly. + * + * It may me handed over to the local disk subsystem. + * It may be completed by the local disk subsystem, + * either successfully or with io-error. 
+ * In case it is a READ request, and it failed locally, + * it may be retried remotely. + * + * It may be queued for sending. + * It may be handed over to the network stack, + * which may fail. + * It may be acknowledged by the "peer" according to the wire_protocol in use. + * this may be a negative ack. + * It may receive a faked ack when the network connection is lost and the + * transfer log is cleaned up. + * Sending may be canceled due to network connection loss. + * When it finally has outlived its time, + * corresponding dirty bits in the resync-bitmap may be cleared or set, + * it will be destroyed, + * and completion will be signalled to the originator, + * with or without "success". + */ + +enum drbd_req_event { + CREATED, + TO_BE_SENT, + TO_BE_SUBMITTED, + + /* XXX yes, now I am inconsistent... + * these are not "events" but "actions" + * oh, well... */ + QUEUE_FOR_NET_WRITE, + QUEUE_FOR_NET_READ, + QUEUE_FOR_SEND_OOS, + + /* An empty flush is queued as P_BARRIER, + * which will cause it to complete "successfully", + * even if the local disk flush failed. + * + * Just like "real" requests, empty flushes (blkdev_issue_flush()) will + * only see an error if neither local nor remote data is reachable. */ + QUEUE_AS_DRBD_BARRIER, + + SEND_CANCELED, + SEND_FAILED, + HANDED_OVER_TO_NETWORK, + OOS_HANDED_TO_NETWORK, + CONNECTION_LOST_WHILE_PENDING, + READ_RETRY_REMOTE_CANCELED, + RECV_ACKED_BY_PEER, + WRITE_ACKED_BY_PEER, + WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */ + CONFLICT_RESOLVED, + POSTPONE_WRITE, + NEG_ACKED, + BARRIER_ACKED, /* in protocol A and B */ + DATA_RECEIVED, /* (remote read) */ + + COMPLETED_OK, + READ_COMPLETED_WITH_ERROR, + READ_AHEAD_COMPLETED_WITH_ERROR, + WRITE_COMPLETED_WITH_ERROR, + DISCARD_COMPLETED_NOTSUPP, + DISCARD_COMPLETED_WITH_ERROR, + + ABORT_DISK_IO, + RESEND, + FAIL_FROZEN_DISK_IO, + RESTART_FROZEN_DISK_IO, + NOTHING, +}; + +/* encoding of request states for now. we don't actually need that many bits. + * we don't need to do atomic bit operations either, since most of the time we + * need to look at the connection state and/or manipulate some lists at the + * same time, so we should hold the request lock anyways. + */ +enum drbd_req_state_bits { + /* 3210 + * 0000: no local possible + * 0001: to be submitted + * UNUSED, we could map: 011: submitted, completion still pending + * 0110: completed ok + * 0010: completed with error + * 1001: Aborted (before completion) + * 1x10: Aborted and completed -> free + */ + __RQ_LOCAL_PENDING, + __RQ_LOCAL_COMPLETED, + __RQ_LOCAL_OK, + __RQ_LOCAL_ABORTED, + + /* 87654 + * 00000: no network possible + * 00001: to be send + * 00011: to be send, on worker queue + * 00101: sent, expecting recv_ack (B) or write_ack (C) + * 11101: sent, + * recv_ack (B) or implicit "ack" (A), + * still waiting for the barrier ack. + * master_bio may already be completed and invalidated. + * 11100: write acked (C), + * data received (for remote read, any protocol) + * or finally the barrier ack has arrived (B,A)... + * request can be freed + * 01100: neg-acked (write, protocol C) + * or neg-d-acked (read, any protocol) + * or killed from the transfer log + * during cleanup after connection loss + * request can be freed + * 01000: canceled or send failed... + * request can be freed + */ + + /* if "SENT" is not set, yet, this can still fail or be canceled. + * if "SENT" is set already, we still wait for an Ack packet. + * when cleared, the master_bio may be completed. 
+ * in (B,A) the request object may still linger on the transaction log + * until the corresponding barrier ack comes in */ + __RQ_NET_PENDING, + + /* If it is QUEUED, and it is a WRITE, it is also registered in the + * transfer log. Currently we need this flag to avoid conflicts between + * worker canceling the request and tl_clear_barrier killing it from + * transfer log. We should restructure the code so this conflict does + * no longer occur. */ + __RQ_NET_QUEUED, + + /* well, actually only "handed over to the network stack". + * + * TODO can potentially be dropped because of the similar meaning + * of RQ_NET_SENT and ~RQ_NET_QUEUED. + * however it is not exactly the same. before we drop it + * we must ensure that we can tell a request with network part + * from a request without, regardless of what happens to it. */ + __RQ_NET_SENT, + + /* when set, the request may be freed (if RQ_NET_QUEUED is clear). + * basically this means the corresponding P_BARRIER_ACK was received */ + __RQ_NET_DONE, + + /* whether or not we know (C) or pretend (B,A) that the write + * was successfully written on the peer. + */ + __RQ_NET_OK, + + /* peer called drbd_set_in_sync() for this write */ + __RQ_NET_SIS, + + /* keep this last, its for the RQ_NET_MASK */ + __RQ_NET_MAX, + + /* Set when this is a write, clear for a read */ + __RQ_WRITE, + + /* Should call drbd_al_complete_io() for this request... */ + __RQ_IN_ACT_LOG, + + /* The peer has sent a retry ACK */ + __RQ_POSTPONED, + + /* would have been completed, + * but was not, because of drbd_suspended() */ + __RQ_COMPLETION_SUSP, + + /* We expect a receive ACK (wire proto B) */ + __RQ_EXP_RECEIVE_ACK, + + /* We expect a write ACK (wite proto C) */ + __RQ_EXP_WRITE_ACK, + + /* waiting for a barrier ack, did an extra kref_get */ + __RQ_EXP_BARR_ACK, +}; + +#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) +#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) +#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) +#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED) + +#define RQ_LOCAL_MASK ((RQ_LOCAL_ABORTED << 1)-1) + +#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) +#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) +#define RQ_NET_SENT (1UL << __RQ_NET_SENT) +#define RQ_NET_DONE (1UL << __RQ_NET_DONE) +#define RQ_NET_OK (1UL << __RQ_NET_OK) +#define RQ_NET_SIS (1UL << __RQ_NET_SIS) + +/* 0x1f8 */ +#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) + +#define RQ_WRITE (1UL << __RQ_WRITE) +#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) +#define RQ_POSTPONED (1UL << __RQ_POSTPONED) +#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP) +#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK) +#define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK) +#define RQ_EXP_BARR_ACK (1UL << __RQ_EXP_BARR_ACK) + +/* For waking up the frozen transfer log mod_req() has to return if the request + should be counted in the epoch object*/ +#define MR_WRITE 1 +#define MR_READ 2 + +static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) +{ + struct bio *bio; + bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ + + req->private_bio = bio; + + bio->bi_private = req; + bio->bi_end_io = drbd_request_endio; + bio->bi_next = NULL; +} + +/* Short lived temporary struct on the stack. + * We could squirrel the error to be returned into + * bio->bi_iter.bi_size, or similar. But that would be too ugly. 
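+ * Instead, __req_mod() fills in this struct so that the caller can invoke
+ * complete_master_bio() after dropping the req_lock, see req_mod() below.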
*/ +struct bio_and_error { + struct bio *bio; + int error; +}; + +extern void start_new_tl_epoch(struct drbd_connection *connection); +extern void drbd_req_destroy(struct kref *kref); +extern void _req_may_be_done(struct drbd_request *req, + struct bio_and_error *m); +extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, + struct bio_and_error *m); +extern void complete_master_bio(struct drbd_device *device, + struct bio_and_error *m); +extern void request_timer_fn(unsigned long data); +extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what); +extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what); +extern void tl_abort_disk_io(struct drbd_device *device); + +/* this is in drbd_main.c */ +extern void drbd_restart_request(struct drbd_request *req); + +/* use this if you don't want to deal with calling complete_master_bio() + * outside the spinlock, e.g. when walking some list on cleanup. */ +static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what) +{ + struct drbd_device *device = req->device; + struct bio_and_error m; + int rv; + + /* __req_mod possibly frees req, do not touch req after that! */ + rv = __req_mod(req, what, &m); + if (m.bio) + complete_master_bio(device, &m); + + return rv; +} + +/* completion of master bio is outside of our spinlock. + * We still may or may not be inside some irqs disabled section + * of the lower level driver completion callback, so we need to + * spin_lock_irqsave here. */ +static inline int req_mod(struct drbd_request *req, + enum drbd_req_event what) +{ + unsigned long flags; + struct drbd_device *device = req->device; + struct bio_and_error m; + int rv; + + spin_lock_irqsave(&device->resource->req_lock, flags); + rv = __req_mod(req, what, &m); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + if (m.bio) + complete_master_bio(device, &m); + + return rv; +} + +static inline bool drbd_should_do_remote(union drbd_dev_state s) +{ + return s.pdsk == D_UP_TO_DATE || + (s.pdsk >= D_INCONSISTENT && + s.conn >= C_WF_BITMAP_T && + s.conn < C_AHEAD); + /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T. + That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* + states. */ +} +static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s) +{ + return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; + /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary + since we enter state C_AHEAD only if proto >= 96 */ +} + +#endif diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c new file mode 100644 index 000000000..2d7dd269b --- /dev/null +++ b/drivers/block/drbd/drbd_state.c @@ -0,0 +1,1918 @@ +/* + drbd_state.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev + from Logicworks, Inc. for making SDP replication support possible. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/drbd_limits.h> +#include "drbd_int.h" +#include "drbd_protocol.h" +#include "drbd_req.h" + +struct after_state_chg_work { + struct drbd_work w; + struct drbd_device *device; + union drbd_state os; + union drbd_state ns; + enum chg_state_flags flags; + struct completion *done; +}; + +enum sanitize_state_warnings { + NO_WARNING, + ABORTED_ONLINE_VERIFY, + ABORTED_RESYNC, + CONNECTION_LOST_NEGOTIATING, + IMPLICITLY_UPGRADED_DISK, + IMPLICITLY_UPGRADED_PDSK, +}; + +static int w_after_state_ch(struct drbd_work *w, int unused); +static void after_state_ch(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum chg_state_flags flags); +static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state); +static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *); +static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); +static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum sanitize_state_warnings *warn); + +static inline bool is_susp(union drbd_state s) +{ + return s.susp || s.susp_nod || s.susp_fen; +} + +bool conn_all_vols_unconf(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + bool rv = true; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (device->state.disk != D_DISKLESS || + device->state.conn != C_STANDALONE || + device->state.role != R_SECONDARY) { + rv = false; + break; + } + } + rcu_read_unlock(); + + return rv; +} + +/* Unfortunately the states where not correctly ordered, when + they where defined. therefore can not use max_t() here. 
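+   (E.g. with R_PRIMARY ordered numerically below R_SECONDARY, max_t()
+   would pick R_SECONDARY as the "higher" role of a Primary/Secondary
+   pair; hence the explicit comparisons below.)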
*/ +static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2) +{ + if (role1 == R_PRIMARY || role2 == R_PRIMARY) + return R_PRIMARY; + if (role1 == R_SECONDARY || role2 == R_SECONDARY) + return R_SECONDARY; + return R_UNKNOWN; +} +static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) +{ + if (role1 == R_UNKNOWN || role2 == R_UNKNOWN) + return R_UNKNOWN; + if (role1 == R_SECONDARY || role2 == R_SECONDARY) + return R_SECONDARY; + return R_PRIMARY; +} + +enum drbd_role conn_highest_role(struct drbd_connection *connection) +{ + enum drbd_role role = R_UNKNOWN; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + role = max_role(role, device->state.role); + } + rcu_read_unlock(); + + return role; +} + +enum drbd_role conn_highest_peer(struct drbd_connection *connection) +{ + enum drbd_role peer = R_UNKNOWN; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + peer = max_role(peer, device->state.peer); + } + rcu_read_unlock(); + + return peer; +} + +enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection) +{ + enum drbd_disk_state disk_state = D_DISKLESS; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + disk_state = max_t(enum drbd_disk_state, disk_state, device->state.disk); + } + rcu_read_unlock(); + + return disk_state; +} + +enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection) +{ + enum drbd_disk_state disk_state = D_MASK; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk); + } + rcu_read_unlock(); + + return disk_state; +} + +enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection) +{ + enum drbd_disk_state disk_state = D_DISKLESS; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + disk_state = max_t(enum drbd_disk_state, disk_state, device->state.pdsk); + } + rcu_read_unlock(); + + return disk_state; +} + +enum drbd_conns conn_lowest_conn(struct drbd_connection *connection) +{ + enum drbd_conns conn = C_MASK; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + conn = min_t(enum drbd_conns, conn, device->state.conn); + } + rcu_read_unlock(); + + return conn; +} + +static bool no_peer_wf_report_params(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + bool rv = true; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + if (peer_device->device->state.conn == C_WF_REPORT_PARAMS) { + rv = false; + break; + } + rcu_read_unlock(); + + return rv; +} + +static void wake_up_all_devices(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + 
idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + wake_up(&peer_device->device->state_wait); + rcu_read_unlock(); + +} + + +/** + * cl_wide_st_chg() - true if the state change is a cluster wide one + * @device: DRBD device. + * @os: old (current) state. + * @ns: new (wanted) state. + */ +static int cl_wide_st_chg(struct drbd_device *device, + union drbd_state os, union drbd_state ns) +{ + return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && + ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || + (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || + (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || + (os.disk != D_FAILED && ns.disk == D_FAILED))) || + (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || + (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S) || + (os.conn == C_CONNECTED && ns.conn == C_WF_REPORT_PARAMS); +} + +static union drbd_state +apply_mask_val(union drbd_state os, union drbd_state mask, union drbd_state val) +{ + union drbd_state ns; + ns.i = (os.i & ~mask.i) | val.i; + return ns; +} + +enum drbd_state_rv +drbd_change_state(struct drbd_device *device, enum chg_state_flags f, + union drbd_state mask, union drbd_state val) +{ + unsigned long flags; + union drbd_state ns; + enum drbd_state_rv rv; + + spin_lock_irqsave(&device->resource->req_lock, flags); + ns = apply_mask_val(drbd_read_state(device), mask, val); + rv = _drbd_set_state(device, ns, f, NULL); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + return rv; +} + +/** + * drbd_force_state() - Impose a change which happens outside our control on our state + * @device: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + */ +void drbd_force_state(struct drbd_device *device, + union drbd_state mask, union drbd_state val) +{ + drbd_change_state(device, CS_HARD, mask, val); +} + +static enum drbd_state_rv +_req_st_cond(struct drbd_device *device, union drbd_state mask, + union drbd_state val) +{ + union drbd_state os, ns; + unsigned long flags; + enum drbd_state_rv rv; + + if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &device->flags)) + return SS_CW_SUCCESS; + + if (test_and_clear_bit(CL_ST_CHG_FAIL, &device->flags)) + return SS_CW_FAILED_BY_PEER; + + spin_lock_irqsave(&device->resource->req_lock, flags); + os = drbd_read_state(device); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); + rv = is_valid_transition(os, ns); + if (rv >= SS_SUCCESS) + rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ + + if (!cl_wide_st_chg(device, os, ns)) + rv = SS_CW_NO_NEED; + if (rv == SS_UNKNOWN_ERROR) { + rv = is_valid_state(device, ns); + if (rv >= SS_SUCCESS) { + rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection); + if (rv >= SS_SUCCESS) + rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ + } + } + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + return rv; +} + +/** + * drbd_req_state() - Perform an eventually cluster wide state change + * @device: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * @f: flags + * + * Should not be called directly, use drbd_request_state() or + * _drbd_request_state(). 
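+ *
+ * With CS_SERIALIZE set, the change is serialized on device->state_mutex;
+ * with CS_WAIT_COMPLETE it also waits until after_state_ch() has finished.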
+ */ +static enum drbd_state_rv +drbd_req_state(struct drbd_device *device, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + struct completion done; + unsigned long flags; + union drbd_state os, ns; + enum drbd_state_rv rv; + + init_completion(&done); + + if (f & CS_SERIALIZE) + mutex_lock(device->state_mutex); + + spin_lock_irqsave(&device->resource->req_lock, flags); + os = drbd_read_state(device); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); + rv = is_valid_transition(os, ns); + if (rv < SS_SUCCESS) { + spin_unlock_irqrestore(&device->resource->req_lock, flags); + goto abort; + } + + if (cl_wide_st_chg(device, os, ns)) { + rv = is_valid_state(device, ns); + if (rv == SS_SUCCESS) + rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + if (rv < SS_SUCCESS) { + if (f & CS_VERBOSE) + print_st_err(device, os, ns, rv); + goto abort; + } + + if (drbd_send_state_req(first_peer_device(device), mask, val)) { + rv = SS_CW_FAILED_BY_PEER; + if (f & CS_VERBOSE) + print_st_err(device, os, ns, rv); + goto abort; + } + + wait_event(device->state_wait, + (rv = _req_st_cond(device, mask, val))); + + if (rv < SS_SUCCESS) { + if (f & CS_VERBOSE) + print_st_err(device, os, ns, rv); + goto abort; + } + spin_lock_irqsave(&device->resource->req_lock, flags); + ns = apply_mask_val(drbd_read_state(device), mask, val); + rv = _drbd_set_state(device, ns, f, &done); + } else { + rv = _drbd_set_state(device, ns, f, &done); + } + + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { + D_ASSERT(device, current != first_peer_device(device)->connection->worker.task); + wait_for_completion(&done); + } + +abort: + if (f & CS_SERIALIZE) + mutex_unlock(device->state_mutex); + + return rv; +} + +/** + * _drbd_request_state() - Request a state change (with flags) + * @device: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * @f: flags + * + * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE + * flag, or when logging of failed state change requests is not desired. + */ +enum drbd_state_rv +_drbd_request_state(struct drbd_device *device, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + enum drbd_state_rv rv; + + wait_event(device->state_wait, + (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE); + + return rv; +} + +enum drbd_state_rv +_drbd_request_state_holding_state_mutex(struct drbd_device *device, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + enum drbd_state_rv rv; + + BUG_ON(f & CS_SERIALIZE); + + wait_event_cmd(device->state_wait, + (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE, + mutex_unlock(device->state_mutex), + mutex_lock(device->state_mutex)); + + return rv; +} + +static void print_st(struct drbd_device *device, const char *name, union drbd_state ns) +{ + drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n", + name, + drbd_conn_str(ns.conn), + drbd_role_str(ns.role), + drbd_role_str(ns.peer), + drbd_disk_str(ns.disk), + drbd_disk_str(ns.pdsk), + is_susp(ns) ? 's' : 'r', + ns.aftr_isp ? 'a' : '-', + ns.peer_isp ? 'p' : '-', + ns.user_isp ? 'u' : '-', + ns.susp_fen ? 'F' : '-', + ns.susp_nod ? 
'N' : '-' + ); +} + +void print_st_err(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum drbd_state_rv err) +{ + if (err == SS_IN_TRANSIENT_STATE) + return; + drbd_err(device, "State change failed: %s\n", drbd_set_st_err_str(err)); + print_st(device, " state", os); + print_st(device, "wanted", ns); +} + +static long print_state_change(char *pb, union drbd_state os, union drbd_state ns, + enum chg_state_flags flags) +{ + char *pbp; + pbp = pb; + *pbp = 0; + + if (ns.role != os.role && flags & CS_DC_ROLE) + pbp += sprintf(pbp, "role( %s -> %s ) ", + drbd_role_str(os.role), + drbd_role_str(ns.role)); + if (ns.peer != os.peer && flags & CS_DC_PEER) + pbp += sprintf(pbp, "peer( %s -> %s ) ", + drbd_role_str(os.peer), + drbd_role_str(ns.peer)); + if (ns.conn != os.conn && flags & CS_DC_CONN) + pbp += sprintf(pbp, "conn( %s -> %s ) ", + drbd_conn_str(os.conn), + drbd_conn_str(ns.conn)); + if (ns.disk != os.disk && flags & CS_DC_DISK) + pbp += sprintf(pbp, "disk( %s -> %s ) ", + drbd_disk_str(os.disk), + drbd_disk_str(ns.disk)); + if (ns.pdsk != os.pdsk && flags & CS_DC_PDSK) + pbp += sprintf(pbp, "pdsk( %s -> %s ) ", + drbd_disk_str(os.pdsk), + drbd_disk_str(ns.pdsk)); + + return pbp - pb; +} + +static void drbd_pr_state_change(struct drbd_device *device, union drbd_state os, union drbd_state ns, + enum chg_state_flags flags) +{ + char pb[300]; + char *pbp = pb; + + pbp += print_state_change(pbp, os, ns, flags ^ CS_DC_MASK); + + if (ns.aftr_isp != os.aftr_isp) + pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ", + os.aftr_isp, + ns.aftr_isp); + if (ns.peer_isp != os.peer_isp) + pbp += sprintf(pbp, "peer_isp( %d -> %d ) ", + os.peer_isp, + ns.peer_isp); + if (ns.user_isp != os.user_isp) + pbp += sprintf(pbp, "user_isp( %d -> %d ) ", + os.user_isp, + ns.user_isp); + + if (pbp != pb) + drbd_info(device, "%s\n", pb); +} + +static void conn_pr_state_change(struct drbd_connection *connection, union drbd_state os, union drbd_state ns, + enum chg_state_flags flags) +{ + char pb[300]; + char *pbp = pb; + + pbp += print_state_change(pbp, os, ns, flags); + + if (is_susp(ns) != is_susp(os) && flags & CS_DC_SUSP) + pbp += sprintf(pbp, "susp( %d -> %d ) ", + is_susp(os), + is_susp(ns)); + + if (pbp != pb) + drbd_info(connection, "%s\n", pb); +} + + +/** + * is_valid_state() - Returns an SS_ error code if ns is not valid + * @device: DRBD device. + * @ns: State to consider. 
+ */ +static enum drbd_state_rv +is_valid_state(struct drbd_device *device, union drbd_state ns) +{ + /* See drbd_state_sw_errors in drbd_strings.c */ + + enum drbd_fencing_p fp; + enum drbd_state_rv rv = SS_SUCCESS; + struct net_conf *nc; + + rcu_read_lock(); + fp = FP_DONT_CARE; + if (get_ldev(device)) { + fp = rcu_dereference(device->ldev->disk_conf)->fencing; + put_ldev(device); + } + + nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + if (nc) { + if (!nc->two_primaries && ns.role == R_PRIMARY) { + if (ns.peer == R_PRIMARY) + rv = SS_TWO_PRIMARIES; + else if (conn_highest_peer(first_peer_device(device)->connection) == R_PRIMARY) + rv = SS_O_VOL_PEER_PRI; + } + } + + if (rv <= 0) + /* already found a reason to abort */; + else if (ns.role == R_SECONDARY && device->open_cnt) + rv = SS_DEVICE_IN_USE; + + else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + + else if (fp >= FP_RESOURCE && + ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) + rv = SS_PRIMARY_NOP; + + else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) + rv = SS_NO_UP_TO_DATE_DISK; + + else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) + rv = SS_NO_LOCAL_DISK; + + else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) + rv = SS_NO_REMOTE_DISK; + + else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + + else if ((ns.conn == C_CONNECTED || + ns.conn == C_WF_BITMAP_S || + ns.conn == C_SYNC_SOURCE || + ns.conn == C_PAUSED_SYNC_S) && + ns.disk == D_OUTDATED) + rv = SS_CONNECTED_OUTDATES; + + else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + (nc->verify_alg[0] == 0)) + rv = SS_NO_VERIFY_ALG; + + else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + first_peer_device(device)->connection->agreed_pro_version < 88) + rv = SS_NOT_SUPPORTED; + + else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + + else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && + ns.pdsk == D_UNKNOWN) + rv = SS_NEED_CONNECTION; + + else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) + rv = SS_CONNECTED_OUTDATES; + + rcu_read_unlock(); + + return rv; +} + +/** + * is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible + * This function limits state transitions that may be declined by DRBD. I.e. + * user requests (aka soft transitions). + * @device: DRBD device. + * @ns: new state. + * @os: old state. + */ +static enum drbd_state_rv +is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_connection *connection) +{ + enum drbd_state_rv rv = SS_SUCCESS; + + if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && + os.conn > C_CONNECTED) + rv = SS_RESYNC_RUNNING; + + if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) + rv = SS_ALREADY_STANDALONE; + + if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) + rv = SS_IS_DISKLESS; + + if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) + rv = SS_NO_NET_CONFIG; + + if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) + rv = SS_LOWER_THAN_OUTDATED; + + if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) + rv = SS_IN_TRANSIENT_STATE; + + /* While establishing a connection only allow cstate to change. + Delay/refuse role changes, detach attach etc... 
(they do not touch cstate) */ + if (test_bit(STATE_SENT, &connection->flags) && + !((ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION) || + (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS))) + rv = SS_IN_TRANSIENT_STATE; + + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) + rv = SS_NEED_CONNECTION; + + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + ns.conn != os.conn && os.conn > C_CONNECTED) + rv = SS_RESYNC_RUNNING; + + if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && + os.conn < C_CONNECTED) + rv = SS_NEED_CONNECTION; + + if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE) + && os.conn < C_WF_REPORT_PARAMS) + rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ + + if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED && + os.conn < C_CONNECTED && os.pdsk > D_OUTDATED) + rv = SS_OUTDATE_WO_CONN; + + return rv; +} + +static enum drbd_state_rv +is_valid_conn_transition(enum drbd_conns oc, enum drbd_conns nc) +{ + /* no change -> nothing to do, at least for the connection part */ + if (oc == nc) + return SS_NOTHING_TO_DO; + + /* disconnect of an unconfigured connection does not make sense */ + if (oc == C_STANDALONE && nc == C_DISCONNECTING) + return SS_ALREADY_STANDALONE; + + /* from C_STANDALONE, we start with C_UNCONNECTED */ + if (oc == C_STANDALONE && nc != C_UNCONNECTED) + return SS_NEED_CONNECTION; + + /* When establishing a connection we need to go through WF_REPORT_PARAMS! + Necessary to do the right thing upon invalidate-remote on a disconnected resource */ + if (oc < C_WF_REPORT_PARAMS && nc >= C_CONNECTED) + return SS_NEED_CONNECTION; + + /* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. */ + if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING) + return SS_IN_TRANSIENT_STATE; + + /* After C_DISCONNECTING only C_STANDALONE may follow */ + if (oc == C_DISCONNECTING && nc != C_STANDALONE) + return SS_IN_TRANSIENT_STATE; + + return SS_SUCCESS; +} + + +/** + * is_valid_transition() - Returns an SS_ error code if the state transition is not possible + * This limits hard state transitions. Hard state transitions are facts there are + * imposed on DRBD by the environment. E.g. disk broke or network broke down. + * But those hard state transitions are still not allowed to do everything. + * @ns: new state. + * @os: old state. + */ +static enum drbd_state_rv +is_valid_transition(union drbd_state os, union drbd_state ns) +{ + enum drbd_state_rv rv; + + rv = is_valid_conn_transition(os.conn, ns.conn); + + /* we cannot fail (again) if we already detached */ + if (ns.disk == D_FAILED && os.disk == D_DISKLESS) + rv = SS_IS_DISKLESS; + + return rv; +} + +static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_state_warnings warn) +{ + static const char *msg_table[] = { + [NO_WARNING] = "", + [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", + [ABORTED_RESYNC] = "Resync aborted.", + [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", + [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", + [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", + }; + + if (warn != NO_WARNING) + drbd_warn(device, "%s\n", msg_table[warn]); +} + +/** + * sanitize_state() - Resolves implicitly necessary additional changes to a state transition + * @device: DRBD device. + * @os: old state. + * @ns: new state. 
+ * @warn_sync_abort: + * + * When we loose connection, we have to set the state of the peers disk (pdsk) + * to D_UNKNOWN. This rule and many more along those lines are in this function. + */ +static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum sanitize_state_warnings *warn) +{ + enum drbd_fencing_p fp; + enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; + + if (warn) + *warn = NO_WARNING; + + fp = FP_DONT_CARE; + if (get_ldev(device)) { + rcu_read_lock(); + fp = rcu_dereference(device->ldev->disk_conf)->fencing; + rcu_read_unlock(); + put_ldev(device); + } + + /* Implications from connection to peer and peer_isp */ + if (ns.conn < C_CONNECTED) { + ns.peer_isp = 0; + ns.peer = R_UNKNOWN; + if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) + ns.pdsk = D_UNKNOWN; + } + + /* Clear the aftr_isp when becoming unconfigured */ + if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) + ns.aftr_isp = 0; + + /* An implication of the disk states onto the connection state */ + /* Abort resync if a disk fails/detaches */ + if (ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { + if (warn) + *warn = ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T ? + ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; + ns.conn = C_CONNECTED; + } + + /* Connection breaks down before we finished "Negotiating" */ + if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && + get_ldev_if_state(device, D_NEGOTIATING)) { + if (device->ed_uuid == device->ldev->md.uuid[UI_CURRENT]) { + ns.disk = device->new_state_tmp.disk; + ns.pdsk = device->new_state_tmp.pdsk; + } else { + if (warn) + *warn = CONNECTION_LOST_NEGOTIATING; + ns.disk = D_DISKLESS; + ns.pdsk = D_UNKNOWN; + } + put_ldev(device); + } + + /* D_CONSISTENT and D_OUTDATED vanish when we get connected */ + if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) { + if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) + ns.disk = D_UP_TO_DATE; + if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED) + ns.pdsk = D_UP_TO_DATE; + } + + /* Implications of the connection stat on the disk states */ + disk_min = D_DISKLESS; + disk_max = D_UP_TO_DATE; + pdsk_min = D_INCONSISTENT; + pdsk_max = D_UNKNOWN; + switch ((enum drbd_conns)ns.conn) { + case C_WF_BITMAP_T: + case C_PAUSED_SYNC_T: + case C_STARTING_SYNC_T: + case C_WF_SYNC_UUID: + case C_BEHIND: + disk_min = D_INCONSISTENT; + disk_max = D_OUTDATED; + pdsk_min = D_UP_TO_DATE; + pdsk_max = D_UP_TO_DATE; + break; + case C_VERIFY_S: + case C_VERIFY_T: + disk_min = D_UP_TO_DATE; + disk_max = D_UP_TO_DATE; + pdsk_min = D_UP_TO_DATE; + pdsk_max = D_UP_TO_DATE; + break; + case C_CONNECTED: + disk_min = D_DISKLESS; + disk_max = D_UP_TO_DATE; + pdsk_min = D_DISKLESS; + pdsk_max = D_UP_TO_DATE; + break; + case C_WF_BITMAP_S: + case C_PAUSED_SYNC_S: + case C_STARTING_SYNC_S: + case C_AHEAD: + disk_min = D_UP_TO_DATE; + disk_max = D_UP_TO_DATE; + pdsk_min = D_INCONSISTENT; + pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. 
But explicit outdate necessary*/ + break; + case C_SYNC_TARGET: + disk_min = D_INCONSISTENT; + disk_max = D_INCONSISTENT; + pdsk_min = D_UP_TO_DATE; + pdsk_max = D_UP_TO_DATE; + break; + case C_SYNC_SOURCE: + disk_min = D_UP_TO_DATE; + disk_max = D_UP_TO_DATE; + pdsk_min = D_INCONSISTENT; + pdsk_max = D_INCONSISTENT; + break; + case C_STANDALONE: + case C_DISCONNECTING: + case C_UNCONNECTED: + case C_TIMEOUT: + case C_BROKEN_PIPE: + case C_NETWORK_FAILURE: + case C_PROTOCOL_ERROR: + case C_TEAR_DOWN: + case C_WF_CONNECTION: + case C_WF_REPORT_PARAMS: + case C_MASK: + break; + } + if (ns.disk > disk_max) + ns.disk = disk_max; + + if (ns.disk < disk_min) { + if (warn) + *warn = IMPLICITLY_UPGRADED_DISK; + ns.disk = disk_min; + } + if (ns.pdsk > pdsk_max) + ns.pdsk = pdsk_max; + + if (ns.pdsk < pdsk_min) { + if (warn) + *warn = IMPLICITLY_UPGRADED_PDSK; + ns.pdsk = pdsk_min; + } + + if (fp == FP_STONITH && + (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && + !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) + ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ + + if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO && + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) && + !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE)) + ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ + + if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { + if (ns.conn == C_SYNC_SOURCE) + ns.conn = C_PAUSED_SYNC_S; + if (ns.conn == C_SYNC_TARGET) + ns.conn = C_PAUSED_SYNC_T; + } else { + if (ns.conn == C_PAUSED_SYNC_S) + ns.conn = C_SYNC_SOURCE; + if (ns.conn == C_PAUSED_SYNC_T) + ns.conn = C_SYNC_TARGET; + } + + return ns; +} + +void drbd_resume_al(struct drbd_device *device) +{ + if (test_and_clear_bit(AL_SUSPENDED, &device->flags)) + drbd_info(device, "Resumed AL updates\n"); +} + +/* helper for __drbd_set_state */ +static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) +{ + if (first_peer_device(device)->connection->agreed_pro_version < 90) + device->ov_start_sector = 0; + device->rs_total = drbd_bm_bits(device); + device->ov_position = 0; + if (cs == C_VERIFY_T) { + /* starting online verify from an arbitrary position + * does not fit well into the existing protocol. + * on C_VERIFY_T, we initialize ov_left and friends + * implicitly in receive_DataRequest once the + * first P_OV_REQUEST is received */ + device->ov_start_sector = ~(sector_t)0; + } else { + unsigned long bit = BM_SECT_TO_BIT(device->ov_start_sector); + if (bit >= device->rs_total) { + device->ov_start_sector = + BM_BIT_TO_SECT(device->rs_total - 1); + device->rs_total = 1; + } else + device->rs_total -= bit; + device->ov_position = device->ov_start_sector; + } + device->ov_left = device->rs_total; +} + +/** + * __drbd_set_state() - Set a new DRBD state + * @device: DRBD device. + * @ns: new state. + * @flags: Flags + * @done: Optional completion, that will get completed after the after_state_ch() finished + * + * Caller needs to hold req_lock, and global_state_lock. Do not call directly. + */ +enum drbd_state_rv +__drbd_set_state(struct drbd_device *device, union drbd_state ns, + enum chg_state_flags flags, struct completion *done) +{ + struct drbd_peer_device *peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device ? 
peer_device->connection : NULL; + union drbd_state os; + enum drbd_state_rv rv = SS_SUCCESS; + enum sanitize_state_warnings ssw; + struct after_state_chg_work *ascw; + + os = drbd_read_state(device); + + ns = sanitize_state(device, os, ns, &ssw); + if (ns.i == os.i) + return SS_NOTHING_TO_DO; + + rv = is_valid_transition(os, ns); + if (rv < SS_SUCCESS) + return rv; + + if (!(flags & CS_HARD)) { + /* pre-state-change checks ; only look at ns */ + /* See drbd_state_sw_errors in drbd_strings.c */ + + rv = is_valid_state(device, ns); + if (rv < SS_SUCCESS) { + /* If the old state was illegal as well, then let + this happen...*/ + + if (is_valid_state(device, os) == rv) + rv = is_valid_soft_transition(os, ns, connection); + } else + rv = is_valid_soft_transition(os, ns, connection); + } + + if (rv < SS_SUCCESS) { + if (flags & CS_VERBOSE) + print_st_err(device, os, ns, rv); + return rv; + } + + print_sanitize_warnings(device, ssw); + + drbd_pr_state_change(device, os, ns, flags); + + /* Display changes to the susp* flags that where caused by the call to + sanitize_state(). Only display it here if we where not called from + _conn_request_state() */ + if (!(flags & CS_DC_SUSP)) + conn_pr_state_change(connection, os, ns, + (flags & ~CS_DC_MASK) | CS_DC_SUSP); + + /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference + * on the ldev here, to be sure the transition -> D_DISKLESS resp. + * drbd_ldev_destroy() won't happen before our corresponding + * after_state_ch works run, where we put_ldev again. */ + if ((os.disk != D_FAILED && ns.disk == D_FAILED) || + (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) + atomic_inc(&device->local_cnt); + + if (!is_sync_state(os.conn) && is_sync_state(ns.conn)) + clear_bit(RS_DONE, &device->flags); + + /* changes to local_cnt and device flags should be visible before + * changes to state, which again should be visible before anything else + * depending on that change happens. */ + smp_wmb(); + device->state.i = ns.i; + device->resource->susp = ns.susp; + device->resource->susp_nod = ns.susp_nod; + device->resource->susp_fen = ns.susp_fen; + smp_wmb(); + + /* put replicated vs not-replicated requests in seperate epochs */ + if (drbd_should_do_remote((union drbd_dev_state)os.i) != + drbd_should_do_remote((union drbd_dev_state)ns.i)) + start_new_tl_epoch(connection); + + if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) + drbd_print_uuids(device, "attached to UUIDs"); + + /* Wake up role changes, that were delayed because of connection establishing */ + if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS && + no_peer_wf_report_params(connection)) { + clear_bit(STATE_SENT, &connection->flags); + wake_up_all_devices(connection); + } + + wake_up(&device->misc_wait); + wake_up(&device->state_wait); + wake_up(&connection->ping_wait); + + /* Aborted verify run, or we reached the stop sector. + * Log the last position, unless end-of-device. 
*/ + if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && + ns.conn <= C_CONNECTED) { + device->ov_start_sector = + BM_BIT_TO_SECT(drbd_bm_bits(device) - device->ov_left); + if (device->ov_left) + drbd_info(device, "Online Verify reached sector %llu\n", + (unsigned long long)device->ov_start_sector); + } + + if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && + (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { + drbd_info(device, "Syncer continues.\n"); + device->rs_paused += (long)jiffies + -(long)device->rs_mark_time[device->rs_last_mark]; + if (ns.conn == C_SYNC_TARGET) + mod_timer(&device->resync_timer, jiffies); + } + + if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && + (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { + drbd_info(device, "Resync suspended\n"); + device->rs_mark_time[device->rs_last_mark] = jiffies; + } + + if (os.conn == C_CONNECTED && + (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { + unsigned long now = jiffies; + int i; + + set_ov_position(device, ns.conn); + device->rs_start = now; + device->rs_last_sect_ev = 0; + device->ov_last_oos_size = 0; + device->ov_last_oos_start = 0; + + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + device->rs_mark_left[i] = device->ov_left; + device->rs_mark_time[i] = now; + } + + drbd_rs_controller_reset(device); + + if (ns.conn == C_VERIFY_S) { + drbd_info(device, "Starting Online Verify from sector %llu\n", + (unsigned long long)device->ov_position); + mod_timer(&device->resync_timer, jiffies); + } + } + + if (get_ldev(device)) { + u32 mdf = device->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| + MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| + MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); + + mdf &= ~MDF_AL_CLEAN; + if (test_bit(CRASHED_PRIMARY, &device->flags)) + mdf |= MDF_CRASHED_PRIMARY; + if (device->state.role == R_PRIMARY || + (device->state.pdsk < D_INCONSISTENT && device->state.peer == R_PRIMARY)) + mdf |= MDF_PRIMARY_IND; + if (device->state.conn > C_WF_REPORT_PARAMS) + mdf |= MDF_CONNECTED_IND; + if (device->state.disk > D_INCONSISTENT) + mdf |= MDF_CONSISTENT; + if (device->state.disk > D_OUTDATED) + mdf |= MDF_WAS_UP_TO_DATE; + if (device->state.pdsk <= D_OUTDATED && device->state.pdsk >= D_INCONSISTENT) + mdf |= MDF_PEER_OUT_DATED; + if (mdf != device->ldev->md.flags) { + device->ldev->md.flags = mdf; + drbd_md_mark_dirty(device); + } + if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) + drbd_set_ed_uuid(device, device->ldev->md.uuid[UI_CURRENT]); + put_ldev(device); + } + + /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ + if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && + os.peer == R_SECONDARY && ns.peer == R_PRIMARY) + set_bit(CONSIDER_RESYNC, &device->flags); + + /* Receiver should clean up itself */ + if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) + drbd_thread_stop_nowait(&connection->receiver); + + /* Now the receiver finished cleaning up itself, it should die */ + if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) + drbd_thread_stop_nowait(&connection->receiver); + + /* Upon network failure, we need to restart the receiver. 
*/ + if (os.conn > C_WF_CONNECTION && + ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) + drbd_thread_restart_nowait(&connection->receiver); + + /* Resume AL writing if we get a connection */ + if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { + drbd_resume_al(device); + connection->connect_cnt++; + } + + /* remember last attach time so request_timer_fn() won't + * kill newly established sessions while we are still trying to thaw + * previously frozen IO */ + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + ns.disk > D_NEGOTIATING) + device->last_reattach_jif = jiffies; + + ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); + if (ascw) { + ascw->os = os; + ascw->ns = ns; + ascw->flags = flags; + ascw->w.cb = w_after_state_ch; + ascw->device = device; + ascw->done = done; + drbd_queue_work(&connection->sender_work, + &ascw->w); + } else { + drbd_err(device, "Could not kmalloc an ascw\n"); + } + + return rv; +} + +static int w_after_state_ch(struct drbd_work *w, int unused) +{ + struct after_state_chg_work *ascw = + container_of(w, struct after_state_chg_work, w); + struct drbd_device *device = ascw->device; + + after_state_ch(device, ascw->os, ascw->ns, ascw->flags); + if (ascw->flags & CS_WAIT_COMPLETE) + complete(ascw->done); + kfree(ascw); + + return 0; +} + +static void abw_start_sync(struct drbd_device *device, int rv) +{ + if (rv) { + drbd_err(device, "Writing the bitmap failed not starting resync.\n"); + _drbd_request_state(device, NS(conn, C_CONNECTED), CS_VERBOSE); + return; + } + + switch (device->state.conn) { + case C_STARTING_SYNC_T: + _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); + break; + case C_STARTING_SYNC_S: + drbd_start_resync(device, C_SYNC_SOURCE); + break; + } +} + +int drbd_bitmap_io_from_worker(struct drbd_device *device, + int (*io_fn)(struct drbd_device *), + char *why, enum bm_flag flags) +{ + int rv; + + D_ASSERT(device, current == first_peer_device(device)->connection->worker.task); + + /* open coded non-blocking drbd_suspend_io(device); */ + set_bit(SUSPEND_IO, &device->flags); + + drbd_bm_lock(device, why, flags); + rv = io_fn(device); + drbd_bm_unlock(device); + + drbd_resume_io(device); + + return rv; +} + +/** + * after_state_ch() - Perform after state change actions that may sleep + * @device: DRBD device. + * @os: old state. + * @ns: new state. + * @flags: Flags + */ +static void after_state_ch(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum chg_state_flags flags) +{ + struct drbd_resource *resource = device->resource; + struct drbd_peer_device *peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + struct sib_info sib; + + sib.sib_reason = SIB_STATE_CHANGE; + sib.os = os; + sib.ns = ns; + + if ((os.disk != D_UP_TO_DATE || os.pdsk != D_UP_TO_DATE) + && (ns.disk == D_UP_TO_DATE && ns.pdsk == D_UP_TO_DATE)) { + clear_bit(CRASHED_PRIMARY, &device->flags); + if (device->p_uuid) + device->p_uuid[UI_FLAGS] &= ~((u64)2); + } + + /* Inform userspace about the change... */ + drbd_bcast_event(device, &sib); + + if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) + drbd_khelper(device, "pri-on-incon-degr"); + + /* Here we have the actions that are performed after a + state change. 
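For a caller that must not proceed before this (sleeping) after-state work has run, the @done completion is the hand-off point: w_after_state_ch() above completes it once after_state_ch() has finished, but only when CS_WAIT_COMPLETE is set. Below is a minimal sketch of the contract implied by the kernel-doc of __drbd_set_state(); the request-path wrapper that normally drives this is not part of this hunk, and the global_state_lock handling the kernel-doc also asks for is omitted here for brevity, so treat this as the shape of the calling convention rather than a call site from the patch:

	struct completion done;
	union drbd_state ns;
	enum drbd_state_rv rv;

	init_completion(&done);

	spin_lock_irq(&device->resource->req_lock);
	ns = drbd_read_state(device);
	ns.disk = D_OUTDATED;				/* example target state */
	rv = __drbd_set_state(device, ns, CS_VERBOSE | CS_WAIT_COMPLETE, &done);
	spin_unlock_irq(&device->resource->req_lock);

	if (rv == SS_SUCCESS)
		wait_for_completion(&done);		/* after_state_ch() has run */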
This function might sleep */ + + if (ns.susp_nod) { + enum drbd_req_event what = NOTHING; + + spin_lock_irq(&device->resource->req_lock); + if (os.conn < C_CONNECTED && conn_lowest_conn(connection) >= C_CONNECTED) + what = RESEND; + + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + conn_lowest_disk(connection) > D_NEGOTIATING) + what = RESTART_FROZEN_DISK_IO; + + if (resource->susp_nod && what != NOTHING) { + _tl_restart(connection, what); + _conn_request_state(connection, + (union drbd_state) { { .susp_nod = 1 } }, + (union drbd_state) { { .susp_nod = 0 } }, + CS_VERBOSE); + } + spin_unlock_irq(&device->resource->req_lock); + } + + if (ns.susp_fen) { + spin_lock_irq(&device->resource->req_lock); + if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) { + /* case2: The connection was established again: */ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + clear_bit(NEW_CUR_UUID, &peer_device->device->flags); + rcu_read_unlock(); + _tl_restart(connection, RESEND); + _conn_request_state(connection, + (union drbd_state) { { .susp_fen = 1 } }, + (union drbd_state) { { .susp_fen = 0 } }, + CS_VERBOSE); + } + spin_unlock_irq(&device->resource->req_lock); + } + + /* Became sync source. With protocol >= 96, we still need to send out + * the sync uuid now. Need to do that before any drbd_send_state, or + * the other side may go "paused sync" before receiving the sync uuids, + * which is unexpected. */ + if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) && + (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) && + connection->agreed_pro_version >= 96 && get_ldev(device)) { + drbd_gen_and_send_sync_uuid(peer_device); + put_ldev(device); + } + + /* Do not change the order of the if above and the two below... */ + if (os.pdsk == D_DISKLESS && + ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) { /* attach on the peer */ + /* we probably will start a resync soon. + * make sure those things are properly reset. */ + device->rs_total = 0; + device->rs_failed = 0; + atomic_set(&device->rs_pending_cnt, 0); + drbd_rs_cancel_all(device); + + drbd_send_uuids(peer_device); + drbd_send_state(peer_device, ns); + } + /* No point in queuing send_bitmap if we don't have a connection + * anymore, so check also the _current_ state, not only the new state + * at the time this work was queued. 
*/ + if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S && + device->state.conn == C_WF_BITMAP_S) + drbd_queue_bitmap_io(device, &drbd_send_bitmap, NULL, + "send_bitmap (WFBitMapS)", + BM_LOCKED_TEST_ALLOWED); + + /* Lost contact to peer's copy of the data */ + if ((os.pdsk >= D_INCONSISTENT && + os.pdsk != D_UNKNOWN && + os.pdsk != D_OUTDATED) + && (ns.pdsk < D_INCONSISTENT || + ns.pdsk == D_UNKNOWN || + ns.pdsk == D_OUTDATED)) { + if (get_ldev(device)) { + if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && + device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { + if (drbd_suspended(device)) { + set_bit(NEW_CUR_UUID, &device->flags); + } else { + drbd_uuid_new_current(device); + drbd_send_uuids(peer_device); + } + } + put_ldev(device); + } + } + + if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) { + if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && + device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { + drbd_uuid_new_current(device); + drbd_send_uuids(peer_device); + } + /* D_DISKLESS Peer becomes secondary */ + if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) + /* We may still be Primary ourselves. + * No harm done if the bitmap still changes, + * redirtied pages will follow later. */ + drbd_bitmap_io_from_worker(device, &drbd_bm_write, + "demote diskless peer", BM_LOCKED_SET_ALLOWED); + put_ldev(device); + } + + /* Write out all changed bits on demote. + * Though, no need to da that just yet + * if there is a resync going on still */ + if (os.role == R_PRIMARY && ns.role == R_SECONDARY && + device->state.conn <= C_CONNECTED && get_ldev(device)) { + /* No changes to the bitmap expected this time, so assert that, + * even though no harm was done if it did change. */ + drbd_bitmap_io_from_worker(device, &drbd_bm_write, + "demote", BM_LOCKED_TEST_ALLOWED); + put_ldev(device); + } + + /* Last part of the attaching process ... */ + if (ns.conn >= C_CONNECTED && + os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { + drbd_send_sizes(peer_device, 0, 0); /* to start sync... */ + drbd_send_uuids(peer_device); + drbd_send_state(peer_device, ns); + } + + /* We want to pause/continue resync, tell peer. */ + if (ns.conn >= C_CONNECTED && + ((os.aftr_isp != ns.aftr_isp) || + (os.user_isp != ns.user_isp))) + drbd_send_state(peer_device, ns); + + /* In case one of the isp bits got set, suspend other devices. */ + if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && + (ns.aftr_isp || ns.peer_isp || ns.user_isp)) + suspend_other_sg(device); + + /* Make sure the peer gets informed about eventual state + changes (ISP bits) while we were in WFReportParams. */ + if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) + drbd_send_state(peer_device, ns); + + if (os.conn != C_AHEAD && ns.conn == C_AHEAD) + drbd_send_state(peer_device, ns); + + /* We are in the progress to start a full sync... */ + if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || + (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) + /* no other bitmap changes expected during this phase */ + drbd_queue_bitmap_io(device, + &drbd_bmio_set_n_write, &abw_start_sync, + "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); + + /* first half of local IO error, failure to attach, + * or administrative detach */ + if (os.disk != D_FAILED && ns.disk == D_FAILED) { + enum drbd_io_error_p eh = EP_PASS_ON; + int was_io_error = 0; + /* corresponding get_ldev was in __drbd_set_state, to serialize + * our cleanup here with the transition to D_DISKLESS. 
+ * But is is still not save to dreference ldev here, since + * we might come from an failed Attach before ldev was set. */ + if (device->ldev) { + rcu_read_lock(); + eh = rcu_dereference(device->ldev->disk_conf)->on_io_error; + rcu_read_unlock(); + + was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags); + + if (was_io_error && eh == EP_CALL_HELPER) + drbd_khelper(device, "local-io-error"); + + /* Immediately allow completion of all application IO, + * that waits for completion from the local disk, + * if this was a force-detach due to disk_timeout + * or administrator request (drbdsetup detach --force). + * Do NOT abort otherwise. + * Aborting local requests may cause serious problems, + * if requests are completed to upper layers already, + * and then later the already submitted local bio completes. + * This can cause DMA into former bio pages that meanwhile + * have been re-used for other things. + * So aborting local requests may cause crashes, + * or even worse, silent data corruption. + */ + if (test_and_clear_bit(FORCE_DETACH, &device->flags)) + tl_abort_disk_io(device); + + /* current state still has to be D_FAILED, + * there is only one way out: to D_DISKLESS, + * and that may only happen after our put_ldev below. */ + if (device->state.disk != D_FAILED) + drbd_err(device, + "ASSERT FAILED: disk is %s during detach\n", + drbd_disk_str(device->state.disk)); + + if (ns.conn >= C_CONNECTED) + drbd_send_state(peer_device, ns); + + drbd_rs_cancel_all(device); + + /* In case we want to get something to stable storage still, + * this may be the last chance. + * Following put_ldev may transition to D_DISKLESS. */ + drbd_md_sync(device); + } + put_ldev(device); + } + + /* second half of local IO error, failure to attach, + * or administrative detach, + * after local_cnt references have reached zero again */ + if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) { + /* We must still be diskless, + * re-attach has to be serialized with this! */ + if (device->state.disk != D_DISKLESS) + drbd_err(device, + "ASSERT FAILED: disk is %s while going diskless\n", + drbd_disk_str(device->state.disk)); + + if (ns.conn >= C_CONNECTED) + drbd_send_state(peer_device, ns); + /* corresponding get_ldev in __drbd_set_state + * this may finally trigger drbd_ldev_destroy. */ + put_ldev(device); + } + + /* Notify peer that I had a local IO error, and did not detached.. */ + if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) + drbd_send_state(peer_device, ns); + + /* Disks got bigger while they were detached */ + if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && + test_and_clear_bit(RESYNC_AFTER_NEG, &device->flags)) { + if (ns.conn == C_CONNECTED) + resync_after_online_grow(device); + } + + /* A resync finished or aborted, wake paused devices... */ + if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || + (os.peer_isp && !ns.peer_isp) || + (os.user_isp && !ns.user_isp)) + resume_next_sg(device); + + /* sync target done with resync. Explicitly notify peer, even though + * it should (at least for non-empty resyncs) already know itself. */ + if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) + drbd_send_state(peer_device, ns); + + /* Verify finished, or reached stop sector. Peer did not know about + * the stop sector, and we may even have changed the stop sector during + * verify to interrupt/stop early. Send the new state. 
*/ + if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED + && verify_can_do_stop_sector(device)) + drbd_send_state(peer_device, ns); + + /* This triggers bitmap writeout of potentially still unwritten pages + * if the resync finished cleanly, or aborted because of peer disk + * failure, or because of connection loss. + * For resync aborted because of local disk failure, we cannot do + * any bitmap writeout anymore. + * No harm done if some bits change during this phase. + */ + if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(device)) { + drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL, + "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); + put_ldev(device); + } + + if (ns.disk == D_DISKLESS && + ns.conn == C_STANDALONE && + ns.role == R_SECONDARY) { + if (os.aftr_isp != ns.aftr_isp) + resume_next_sg(device); + } + + drbd_md_sync(device); +} + +struct after_conn_state_chg_work { + struct drbd_work w; + enum drbd_conns oc; + union drbd_state ns_min; + union drbd_state ns_max; /* new, max state, over all devices */ + enum chg_state_flags flags; + struct drbd_connection *connection; +}; + +static int w_after_conn_state_ch(struct drbd_work *w, int unused) +{ + struct after_conn_state_chg_work *acscw = + container_of(w, struct after_conn_state_chg_work, w); + struct drbd_connection *connection = acscw->connection; + enum drbd_conns oc = acscw->oc; + union drbd_state ns_max = acscw->ns_max; + struct drbd_peer_device *peer_device; + int vnr; + + kfree(acscw); + + /* Upon network configuration, we need to start the receiver */ + if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED) + drbd_thread_start(&connection->receiver); + + if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) { + struct net_conf *old_conf; + + mutex_lock(&connection->resource->conf_update); + old_conf = connection->net_conf; + connection->my_addr_len = 0; + connection->peer_addr_len = 0; + RCU_INIT_POINTER(connection->net_conf, NULL); + conn_free_crypto(connection); + mutex_unlock(&connection->resource->conf_update); + + synchronize_rcu(); + kfree(old_conf); + } + + if (ns_max.susp_fen) { + /* case1: The outdate peer handler is successful: */ + if (ns_max.pdsk <= D_OUTDATED) { + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (test_bit(NEW_CUR_UUID, &device->flags)) { + drbd_uuid_new_current(device); + clear_bit(NEW_CUR_UUID, &device->flags); + } + } + rcu_read_unlock(); + spin_lock_irq(&connection->resource->req_lock); + _tl_restart(connection, CONNECTION_LOST_WHILE_PENDING); + _conn_request_state(connection, + (union drbd_state) { { .susp_fen = 1 } }, + (union drbd_state) { { .susp_fen = 0 } }, + CS_VERBOSE); + spin_unlock_irq(&connection->resource->req_lock); + } + } + kref_put(&connection->kref, drbd_destroy_connection); + + conn_md_sync(connection); + + return 0; +} + +static void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf) +{ + enum chg_state_flags flags = ~0; + struct drbd_peer_device *peer_device; + int vnr, first_vol = 1; + union drbd_dev_state os, cs = { + { .role = R_SECONDARY, + .peer = R_UNKNOWN, + .conn = connection->cstate, + .disk = D_DISKLESS, + .pdsk = D_UNKNOWN, + } }; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + os = device->state; + + if (first_vol) { + cs = os; + first_vol = 0; + continue; + } + + if (cs.role != 
os.role) + flags &= ~CS_DC_ROLE; + + if (cs.peer != os.peer) + flags &= ~CS_DC_PEER; + + if (cs.conn != os.conn) + flags &= ~CS_DC_CONN; + + if (cs.disk != os.disk) + flags &= ~CS_DC_DISK; + + if (cs.pdsk != os.pdsk) + flags &= ~CS_DC_PDSK; + } + rcu_read_unlock(); + + *pf |= CS_DC_MASK; + *pf &= flags; + (*pcs).i = cs.i; +} + +static enum drbd_state_rv +conn_is_valid_transition(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags) +{ + enum drbd_state_rv rv = SS_SUCCESS; + union drbd_state ns, os; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + os = drbd_read_state(device); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); + + if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) + ns.disk = os.disk; + + if (ns.i == os.i) + continue; + + rv = is_valid_transition(os, ns); + + if (rv >= SS_SUCCESS && !(flags & CS_HARD)) { + rv = is_valid_state(device, ns); + if (rv < SS_SUCCESS) { + if (is_valid_state(device, os) == rv) + rv = is_valid_soft_transition(os, ns, connection); + } else + rv = is_valid_soft_transition(os, ns, connection); + } + + if (rv < SS_SUCCESS) { + if (flags & CS_VERBOSE) + print_st_err(device, os, ns, rv); + break; + } + } + rcu_read_unlock(); + + return rv; +} + +static void +conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags) +{ + union drbd_state ns, os, ns_max = { }; + union drbd_state ns_min = { + { .role = R_MASK, + .peer = R_MASK, + .conn = val.conn, + .disk = D_MASK, + .pdsk = D_MASK + } }; + struct drbd_peer_device *peer_device; + enum drbd_state_rv rv; + int vnr, number_of_volumes = 0; + + if (mask.conn == C_MASK) { + /* remember last connect time so request_timer_fn() won't + * kill newly established sessions while we are still trying to thaw + * previously frozen IO */ + if (connection->cstate != C_WF_REPORT_PARAMS && val.conn == C_WF_REPORT_PARAMS) + connection->last_reconnect_jif = jiffies; + + connection->cstate = val.conn; + } + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + number_of_volumes++; + os = drbd_read_state(device); + ns = apply_mask_val(os, mask, val); + ns = sanitize_state(device, os, ns, NULL); + + if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) + ns.disk = os.disk; + + rv = __drbd_set_state(device, ns, flags, NULL); + if (rv < SS_SUCCESS) + BUG(); + + ns.i = device->state.i; + ns_max.role = max_role(ns.role, ns_max.role); + ns_max.peer = max_role(ns.peer, ns_max.peer); + ns_max.conn = max_t(enum drbd_conns, ns.conn, ns_max.conn); + ns_max.disk = max_t(enum drbd_disk_state, ns.disk, ns_max.disk); + ns_max.pdsk = max_t(enum drbd_disk_state, ns.pdsk, ns_max.pdsk); + + ns_min.role = min_role(ns.role, ns_min.role); + ns_min.peer = min_role(ns.peer, ns_min.peer); + ns_min.conn = min_t(enum drbd_conns, ns.conn, ns_min.conn); + ns_min.disk = min_t(enum drbd_disk_state, ns.disk, ns_min.disk); + ns_min.pdsk = min_t(enum drbd_disk_state, ns.pdsk, ns_min.pdsk); + } + rcu_read_unlock(); + + if (number_of_volumes == 0) { + ns_min = ns_max = (union drbd_state) { { + .role = R_SECONDARY, + .peer = R_UNKNOWN, + .conn = val.conn, + .disk = D_DISKLESS, + .pdsk = 
D_UNKNOWN + } }; + } + + ns_min.susp = ns_max.susp = connection->resource->susp; + ns_min.susp_nod = ns_max.susp_nod = connection->resource->susp_nod; + ns_min.susp_fen = ns_max.susp_fen = connection->resource->susp_fen; + + *pns_min = ns_min; + *pns_max = ns_max; +} + +static enum drbd_state_rv +_conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val) +{ + enum drbd_state_rv err, rv = SS_UNKNOWN_ERROR; /* continue waiting */; + + if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags)) + rv = SS_CW_SUCCESS; + + if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags)) + rv = SS_CW_FAILED_BY_PEER; + + err = conn_is_valid_transition(connection, mask, val, 0); + if (err == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS) + return rv; + + return err; +} + +enum drbd_state_rv +_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags) +{ + enum drbd_state_rv rv = SS_SUCCESS; + struct after_conn_state_chg_work *acscw; + enum drbd_conns oc = connection->cstate; + union drbd_state ns_max, ns_min, os; + bool have_mutex = false; + + if (mask.conn) { + rv = is_valid_conn_transition(oc, val.conn); + if (rv < SS_SUCCESS) + goto abort; + } + + rv = conn_is_valid_transition(connection, mask, val, flags); + if (rv < SS_SUCCESS) + goto abort; + + if (oc == C_WF_REPORT_PARAMS && val.conn == C_DISCONNECTING && + !(flags & (CS_LOCAL_ONLY | CS_HARD))) { + + /* This will be a cluster-wide state change. + * Need to give up the spinlock, grab the mutex, + * then send the state change request, ... */ + spin_unlock_irq(&connection->resource->req_lock); + mutex_lock(&connection->cstate_mutex); + have_mutex = true; + + set_bit(CONN_WD_ST_CHG_REQ, &connection->flags); + if (conn_send_state_req(connection, mask, val)) { + /* sending failed. */ + clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags); + rv = SS_CW_FAILED_BY_PEER; + /* need to re-aquire the spin lock, though */ + goto abort_unlocked; + } + + if (val.conn == C_DISCONNECTING) + set_bit(DISCONNECT_SENT, &connection->flags); + + /* ... and re-aquire the spinlock. + * If _conn_rq_cond() returned >= SS_SUCCESS, we must call + * conn_set_state() within the same spinlock. */ + spin_lock_irq(&connection->resource->req_lock); + wait_event_lock_irq(connection->ping_wait, + (rv = _conn_rq_cond(connection, mask, val)), + connection->resource->req_lock); + clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags); + if (rv < SS_SUCCESS) + goto abort; + } + + conn_old_common_state(connection, &os, &flags); + flags |= CS_DC_SUSP; + conn_set_state(connection, mask, val, &ns_min, &ns_max, flags); + conn_pr_state_change(connection, os, ns_max, flags); + + acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC); + if (acscw) { + acscw->oc = os.conn; + acscw->ns_min = ns_min; + acscw->ns_max = ns_max; + acscw->flags = flags; + acscw->w.cb = w_after_conn_state_ch; + kref_get(&connection->kref); + acscw->connection = connection; + drbd_queue_work(&connection->sender_work, &acscw->w); + } else { + drbd_err(connection, "Could not kmalloc an acscw\n"); + } + + abort: + if (have_mutex) { + /* mutex_unlock() "... 
must not be used in interrupt context.", + * so give up the spinlock, then re-aquire it */ + spin_unlock_irq(&connection->resource->req_lock); + abort_unlocked: + mutex_unlock(&connection->cstate_mutex); + spin_lock_irq(&connection->resource->req_lock); + } + if (rv < SS_SUCCESS && flags & CS_VERBOSE) { + drbd_err(connection, "State change failed: %s\n", drbd_set_st_err_str(rv)); + drbd_err(connection, " mask = 0x%x val = 0x%x\n", mask.i, val.i); + drbd_err(connection, " old_conn:%s wanted_conn:%s\n", drbd_conn_str(oc), drbd_conn_str(val.conn)); + } + return rv; +} + +enum drbd_state_rv +conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags) +{ + enum drbd_state_rv rv; + + spin_lock_irq(&connection->resource->req_lock); + rv = _conn_request_state(connection, mask, val, flags); + spin_unlock_irq(&connection->resource->req_lock); + + return rv; +} diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h new file mode 100644 index 000000000..7f53c4082 --- /dev/null +++ b/drivers/block/drbd/drbd_state.h @@ -0,0 +1,166 @@ +#ifndef DRBD_STATE_H +#define DRBD_STATE_H + +struct drbd_device; +struct drbd_connection; + +/** + * DOC: DRBD State macros + * + * These macros are used to express state changes in easily readable form. + * + * The NS macros expand to a mask and a value, that can be bit ored onto the + * current state as soon as the spinlock (req_lock) was taken. + * + * The _NS macros are used for state functions that get called with the + * spinlock. These macros expand directly to the new state value. + * + * Besides the basic forms NS() and _NS() additional _?NS[23] are defined + * to express state changes that affect more than one aspect of the state. + * + * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) + * Means that the network connection was established and that the peer + * is in secondary role. 
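 *
 * For illustration only (these are not specific call sites from this patch),
 * the state change entry points declared below take such a (mask, val) pair
 * directly, e.g.
 *
 *	drbd_request_state(device, NS(role, R_PRIMARY));
 *	drbd_force_state(device, NS(disk, D_FAILED));
 *	conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_ORDERED);
 *
 * while the _NS() forms build the complete new state for functions like
 * __drbd_set_state() that already run with the req_lock held.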
+ */ +#define role_MASK R_MASK +#define peer_MASK R_MASK +#define disk_MASK D_MASK +#define pdsk_MASK D_MASK +#define conn_MASK C_MASK +#define susp_MASK 1 +#define user_isp_MASK 1 +#define aftr_isp_MASK 1 +#define susp_nod_MASK 1 +#define susp_fen_MASK 1 + +#define NS(T, S) \ + ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T = (S); val; }) +#define NS2(T1, S1, T2, S2) \ + ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ + mask.T2 = T2##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ + val.T2 = (S2); val; }) +#define NS3(T1, S1, T2, S2, T3, S3) \ + ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ + mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ + val.T2 = (S2); val.T3 = (S3); val; }) + +#define _NS(D, T, S) \ + D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; }) +#define _NS2(D, T1, S1, T2, S2) \ + D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \ + __ns.T2 = (S2); __ns; }) +#define _NS3(D, T1, S1, T2, S2, T3, S3) \ + D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \ + __ns.T2 = (S2); __ns.T3 = (S3); __ns; }) + +enum chg_state_flags { + CS_HARD = 1 << 0, + CS_VERBOSE = 1 << 1, + CS_WAIT_COMPLETE = 1 << 2, + CS_SERIALIZE = 1 << 3, + CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, + CS_LOCAL_ONLY = 1 << 4, /* Do not consider a device pair wide state change */ + CS_DC_ROLE = 1 << 5, /* DC = display as connection state change */ + CS_DC_PEER = 1 << 6, + CS_DC_CONN = 1 << 7, + CS_DC_DISK = 1 << 8, + CS_DC_PDSK = 1 << 9, + CS_DC_SUSP = 1 << 10, + CS_DC_MASK = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK, + CS_IGN_OUTD_FAIL = 1 << 11, +}; + +/* drbd_dev_state and drbd_state are different types. This is to stress the + small difference. There is no suspended flag (.susp), and no suspended + while fence handler runs flas (susp_fen). */ +union drbd_dev_state { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + unsigned role:2 ; /* 3/4 primary/secondary/unknown */ + unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ + unsigned conn:5 ; /* 17/32 cstates */ + unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned _unused:1 ; + unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ + unsigned peer_isp:1 ; + unsigned user_isp:1 ; + unsigned _pad:11; /* 0 unused */ +#elif defined(__BIG_ENDIAN_BITFIELD) + unsigned _pad:11; + unsigned user_isp:1 ; + unsigned peer_isp:1 ; + unsigned aftr_isp:1 ; /* isp .. 
imposed sync pause */ + unsigned _unused:1 ; + unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned conn:5 ; /* 17/32 cstates */ + unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ + unsigned role:2 ; /* 3/4 primary/secondary/unknown */ +#else +# error "this endianess is not supported" +#endif + }; + unsigned int i; +}; + +extern enum drbd_state_rv drbd_change_state(struct drbd_device *device, + enum chg_state_flags f, + union drbd_state mask, + union drbd_state val); +extern void drbd_force_state(struct drbd_device *, union drbd_state, + union drbd_state); +extern enum drbd_state_rv _drbd_request_state(struct drbd_device *, + union drbd_state, + union drbd_state, + enum chg_state_flags); + +extern enum drbd_state_rv +_drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state, + union drbd_state, enum chg_state_flags); + +extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state, + enum chg_state_flags, + struct completion *done); +extern void print_st_err(struct drbd_device *, union drbd_state, + union drbd_state, int); + +enum drbd_state_rv +_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags); + +enum drbd_state_rv +conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags); + +extern void drbd_resume_al(struct drbd_device *device); +extern bool conn_all_vols_unconf(struct drbd_connection *connection); + +/** + * drbd_request_state() - Reqest a state change + * @device: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * + * This is the most graceful way of requesting a state change. It is verbose + * quite verbose in case the state change is not possible, and all those + * state changes are globally serialized. + */ +static inline int drbd_request_state(struct drbd_device *device, + union drbd_state mask, + union drbd_state val) +{ + return _drbd_request_state(device, mask, val, CS_VERBOSE + CS_ORDERED); +} + +enum drbd_role conn_highest_role(struct drbd_connection *connection); +enum drbd_role conn_highest_peer(struct drbd_connection *connection); +enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection); +enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection); +enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection); +enum drbd_conns conn_lowest_conn(struct drbd_connection *connection); + +#endif diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c new file mode 100644 index 000000000..80b0f63c7 --- /dev/null +++ b/drivers/block/drbd/drbd_strings.c @@ -0,0 +1,118 @@ +/* + drbd.h + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#include <linux/drbd.h> +#include "drbd_strings.h" + +static const char *drbd_conn_s_names[] = { + [C_STANDALONE] = "StandAlone", + [C_DISCONNECTING] = "Disconnecting", + [C_UNCONNECTED] = "Unconnected", + [C_TIMEOUT] = "Timeout", + [C_BROKEN_PIPE] = "BrokenPipe", + [C_NETWORK_FAILURE] = "NetworkFailure", + [C_PROTOCOL_ERROR] = "ProtocolError", + [C_WF_CONNECTION] = "WFConnection", + [C_WF_REPORT_PARAMS] = "WFReportParams", + [C_TEAR_DOWN] = "TearDown", + [C_CONNECTED] = "Connected", + [C_STARTING_SYNC_S] = "StartingSyncS", + [C_STARTING_SYNC_T] = "StartingSyncT", + [C_WF_BITMAP_S] = "WFBitMapS", + [C_WF_BITMAP_T] = "WFBitMapT", + [C_WF_SYNC_UUID] = "WFSyncUUID", + [C_SYNC_SOURCE] = "SyncSource", + [C_SYNC_TARGET] = "SyncTarget", + [C_PAUSED_SYNC_S] = "PausedSyncS", + [C_PAUSED_SYNC_T] = "PausedSyncT", + [C_VERIFY_S] = "VerifyS", + [C_VERIFY_T] = "VerifyT", + [C_AHEAD] = "Ahead", + [C_BEHIND] = "Behind", +}; + +static const char *drbd_role_s_names[] = { + [R_PRIMARY] = "Primary", + [R_SECONDARY] = "Secondary", + [R_UNKNOWN] = "Unknown" +}; + +static const char *drbd_disk_s_names[] = { + [D_DISKLESS] = "Diskless", + [D_ATTACHING] = "Attaching", + [D_FAILED] = "Failed", + [D_NEGOTIATING] = "Negotiating", + [D_INCONSISTENT] = "Inconsistent", + [D_OUTDATED] = "Outdated", + [D_UNKNOWN] = "DUnknown", + [D_CONSISTENT] = "Consistent", + [D_UP_TO_DATE] = "UpToDate", +}; + +static const char *drbd_state_sw_errors[] = { + [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", + [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data", + [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", + [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk", + [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected", + [-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated", + [-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active", + [-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device", + [-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node", + [-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk", + [-SS_DEVICE_IN_USE] = "Device is held open by someone", + [-SS_NO_NET_CONFIG] = "Have no net/connection configuration", + [-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify", + [-SS_NEED_CONNECTION] = "Need a connection to start verify or resync", + [-SS_NOT_SUPPORTED] = "Peer does not support protocol", + [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", + [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", + [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", + [-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer", + [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config", +}; + +const char *drbd_conn_str(enum drbd_conns s) +{ + /* enums are unsigned... */ + return s > C_BEHIND ? "TOO_LARGE" : drbd_conn_s_names[s]; +} + +const char *drbd_role_str(enum drbd_role s) +{ + return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s]; +} + +const char *drbd_disk_str(enum drbd_disk_state s) +{ + return s > D_UP_TO_DATE ? 
"TOO_LARGE" : drbd_disk_s_names[s]; +} + +const char *drbd_set_st_err_str(enum drbd_state_rv err) +{ + return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" : + err > SS_TWO_PRIMARIES ? "TOO_LARGE" + : drbd_state_sw_errors[-err]; +} diff --git a/drivers/block/drbd/drbd_strings.h b/drivers/block/drbd/drbd_strings.h new file mode 100644 index 000000000..f9923cc88 --- /dev/null +++ b/drivers/block/drbd/drbd_strings.h @@ -0,0 +1,9 @@ +#ifndef __DRBD_STRINGS_H +#define __DRBD_STRINGS_H + +extern const char *drbd_conn_str(enum drbd_conns); +extern const char *drbd_role_str(enum drbd_role); +extern const char *drbd_disk_str(enum drbd_disk_state); +extern const char *drbd_set_st_err_str(enum drbd_state_rv); + +#endif /* __DRBD_STRINGS_H */ diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h new file mode 100644 index 000000000..8cb1532a3 --- /dev/null +++ b/drivers/block/drbd/drbd_vli.h @@ -0,0 +1,351 @@ +/* +-*- linux-c -*- + drbd_receiver.c + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _DRBD_VLI_H +#define _DRBD_VLI_H + +/* + * At a granularity of 4KiB storage represented per bit, + * and stroage sizes of several TiB, + * and possibly small-bandwidth replication, + * the bitmap transfer time can take much too long, + * if transmitted in plain text. + * + * We try to reduce the transferred bitmap information + * by encoding runlengths of bit polarity. + * + * We never actually need to encode a "zero" (runlengths are positive). + * But then we have to store the value of the first bit. + * The first bit of information thus shall encode if the first runlength + * gives the number of set or unset bits. + * + * We assume that large areas are either completely set or unset, + * which gives good compression with any runlength method, + * even when encoding the runlength as fixed size 32bit/64bit integers. + * + * Still, there may be areas where the polarity flips every few bits, + * and encoding the runlength sequence of those areas with fix size + * integers would be much worse than plaintext. + * + * We want to encode small runlength values with minimum code length, + * while still being able to encode a Huge run of all zeros. + * + * Thus we need a Variable Length Integer encoding, VLI. + * + * For some cases, we produce more code bits than plaintext input. + * We need to send incompressible chunks as plaintext, skip over them + * and then see if the next chunk compresses better. + * + * We don't care too much about "excellent" compression ratio for large + * runlengths (all set/all clear): whether we achieve a factor of 100 + * or 1000 is not that much of an issue. 
+ * We do not want to waste too much on short runlengths in the "noisy" + * parts of the bitmap, though. + * + * There are endless variants of VLI, we experimented with: + * * simple byte-based + * * various bit based with different code word length. + * + * To avoid yet an other configuration parameter (choice of bitmap compression + * algorithm) which was difficult to explain and tune, we just chose the one + * variant that turned out best in all test cases. + * Based on real world usage patterns, with device sizes ranging from a few GiB + * to several TiB, file server/mailserver/webserver/mysql/postgress, + * mostly idle to really busy, the all time winner (though sometimes only + * marginally better) is: + */ + +/* + * encoding is "visualised" as + * __little endian__ bitstream, least significant bit first (left most) + * + * this particular encoding is chosen so that the prefix code + * starts as unary encoding the level, then modified so that + * 10 levels can be described in 8bit, with minimal overhead + * for the smaller levels. + * + * Number of data bits follow fibonacci sequence, with the exception of the + * last level (+1 data bit, so it makes 64bit total). The only worse code when + * encoding bit polarity runlength is 1 plain bits => 2 code bits. +prefix data bits max val Nº data bits +0 x 0x2 1 +10 x 0x4 1 +110 xx 0x8 2 +1110 xxx 0x10 3 +11110 xxx xx 0x30 5 +111110 xx xxxxxx 0x130 8 +11111100 xxxxxxxx xxxxx 0x2130 13 +11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 +11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 +11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 + * maximum encodable value: 0x100000400202130 == 2**56 + some */ + +/* compression "table": + transmitted x 0.29 + as plaintext x ........................ + x ........................ + x ........................ + x 0.59 0.21........................ + x ........................................................ + x .. c ................................................... + x 0.44.. o ................................................... + x .......... d ................................................... + x .......... e ................................................... + X............. ................................................... + x.............. b ................................................... +2.0x............... i ................................................... + #X................ t ................................................... + #................. s ........................... plain bits .......... +-+----------------------------------------------------------------------- + 1 16 32 64 +*/ + +/* LEVEL: (total bits, prefix bits, prefix value), + * sorted ascending by number of total bits. + * The rest of the code table is calculated at compiletime from this. */ + +/* fibonacci data 1, 1, ... */ +#define VLI_L_1_1() do { \ + LEVEL( 2, 1, 0x00); \ + LEVEL( 3, 2, 0x01); \ + LEVEL( 5, 3, 0x03); \ + LEVEL( 7, 4, 0x07); \ + LEVEL(10, 5, 0x0f); \ + LEVEL(14, 6, 0x1f); \ + LEVEL(21, 8, 0x3f); \ + LEVEL(29, 8, 0x7f); \ + LEVEL(42, 8, 0xbf); \ + LEVEL(64, 8, 0xff); \ + } while (0) + +/* finds a suitable level to decode the least significant part of in. + * returns number of bits consumed. + * + * BUG() for bad input, as that would mean a buggy code table. 
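A worked example makes the level table easier to check. The following stand-alone user-space sketch (an editorial illustration, not part of the driver) mirrors the arithmetic that __vli_encode_bits() below performs, for the first four levels only, so you can see run lengths 1..2 costing 2 code bits, 3..4 costing 3, 5..8 costing 5 and 9..16 costing 7, exactly as the prefix/data-bits table above promises:

#include <stdint.h>
#include <stdio.h>

/* (total bits, prefix bits, prefix value): first four levels of VLI_L_1_1 */
struct vli_level { int total, prefix; uint64_t pfx_val; };

static const struct vli_level levels[] = {
	{ 2, 1, 0x00 }, { 3, 2, 0x01 }, { 5, 3, 0x03 }, { 7, 4, 0x07 },
};

/* same math as __vli_encode_bits(): returns the code length in bits,
 * or -1 if the value would need one of the larger levels left out here */
static int toy_encode(uint64_t in, uint64_t *code)
{
	uint64_t max = 0, adj = 1;
	unsigned int i;

	for (i = 0; i < sizeof(levels) / sizeof(levels[0]); i++) {
		max += 1ULL << (levels[i].total - levels[i].prefix);
		if (in <= max) {
			*code = ((in - adj) << levels[i].prefix) | levels[i].pfx_val;
			return levels[i].total;
		}
		adj = max + 1;
	}
	return -1;
}

int main(void)
{
	uint64_t rl, code;

	for (rl = 1; rl <= 16; rl++) {
		int bits = toy_encode(rl, &code);

		printf("runlength %2llu -> %2d code bits (code 0x%llx)\n",
		       (unsigned long long)rl, bits, (unsigned long long)code);
	}
	return 0;
}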
*/ +static inline int vli_decode_bits(u64 *out, const u64 in) +{ + u64 adj = 1; + +#define LEVEL(t,b,v) \ + do { \ + if ((in & ((1 << b) -1)) == v) { \ + *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ + return t; \ + } \ + adj += 1ULL << (t - b); \ + } while (0) + + VLI_L_1_1(); + + /* NOT REACHED, if VLI_LEVELS code table is defined properly */ + BUG(); +#undef LEVEL +} + +/* return number of code bits needed, + * or negative error number */ +static inline int __vli_encode_bits(u64 *out, const u64 in) +{ + u64 max = 0; + u64 adj = 1; + + if (in == 0) + return -EINVAL; + +#define LEVEL(t,b,v) do { \ + max += 1ULL << (t - b); \ + if (in <= max) { \ + if (out) \ + *out = ((in - adj) << b) | v; \ + return t; \ + } \ + adj = max + 1; \ + } while (0) + + VLI_L_1_1(); + + return -EOVERFLOW; +#undef LEVEL +} + +#undef VLI_L_1_1 + +/* code from here down is independend of actually used bit code */ + +/* + * Code length is determined by some unique (e.g. unary) prefix. + * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, + * not a byte stream. + */ + +/* for the bitstream, we need a cursor */ +struct bitstream_cursor { + /* the current byte */ + u8 *b; + /* the current bit within *b, nomalized: 0..7 */ + unsigned int bit; +}; + +/* initialize cursor to point to first bit of stream */ +static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) +{ + cur->b = s; + cur->bit = 0; +} + +/* advance cursor by that many bits; maximum expected input value: 64, + * but depending on VLI implementation, it may be more. */ +static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) +{ + bits += cur->bit; + cur->b = cur->b + (bits >> 3); + cur->bit = bits & 7; +} + +/* the bitstream itself knows its length */ +struct bitstream { + struct bitstream_cursor cur; + unsigned char *buf; + size_t buf_len; /* in bytes */ + + /* for input stream: + * number of trailing 0 bits for padding + * total number of valid bits in stream: buf_len * 8 - pad_bits */ + unsigned int pad_bits; +}; + +static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) +{ + bs->buf = s; + bs->buf_len = len; + bs->pad_bits = pad_bits; + bitstream_cursor_reset(&bs->cur, bs->buf); +} + +static inline void bitstream_rewind(struct bitstream *bs) +{ + bitstream_cursor_reset(&bs->cur, bs->buf); + memset(bs->buf, 0, bs->buf_len); +} + +/* Put (at most 64) least significant bits of val into bitstream, and advance cursor. + * Ignores "pad_bits". + * Returns zero if bits == 0 (nothing to do). + * Returns number of bits used if successful. + * + * If there is not enough room left in bitstream, + * leaves bitstream unchanged and returns -ENOBUFS. + */ +static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) +{ + unsigned char *b = bs->cur.b; + unsigned int tmp; + + if (bits == 0) + return 0; + + if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) + return -ENOBUFS; + + /* paranoia: strip off hi bits; they should not be set anyways. */ + if (bits < 64) + val &= ~0ULL >> (64 - bits); + + *b++ |= (val & 0xff) << bs->cur.bit; + + for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) + *b++ |= (val >> tmp) & 0xff; + + bitstream_cursor_advance(&bs->cur, bits); + return bits; +} + +/* Fetch (at most 64) bits from bitstream into *out, and advance cursor. + * + * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged. 
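On the encoder side, the helpers above are meant to be combined the way the header comment of this file describes: emit code words for one run length after another until the output chunk is full, then hand that chunk off and start over (falling back to plaintext when the code ends up larger than the plain bits). A hedged sketch of that loop follows; vli_encode_bits() is defined a few lines further down, and next_runlength() is a hypothetical stand-in for the caller's bitmap scan, not a function from this patch:

	unsigned char buf[4096];
	struct bitstream bs;
	u64 rl;
	int bits;

	bitstream_init(&bs, buf, sizeof(buf), 0);

	while (next_runlength(&rl)) {	/* hypothetical iterator over bit runs */
		bits = vli_encode_bits(&bs, rl);
		if (bits == -ENOBUFS) {
			/* chunk full: transmit it (or the plaintext, if that
			 * turned out smaller), bitstream_rewind(&bs), go on */
			break;
		}
		if (bits < 0) {
			/* -EINVAL: a zero run length would be a caller bug */
			break;
		}
	}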
+ * + * If there are less than the requested number of valid bits left in the + * bitstream, still fetches all available bits. + * + * Returns number of actually fetched bits. + */ +static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) +{ + u64 val; + unsigned int n; + + if (bits > 64) + return -EINVAL; + + if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) + bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) + - bs->cur.bit - bs->pad_bits; + + if (bits == 0) { + *out = 0; + return 0; + } + + /* get the high bits */ + val = 0; + n = (bs->cur.bit + bits + 7) >> 3; + /* n may be at most 9, if cur.bit + bits > 64 */ + /* which means this copies at most 8 byte */ + if (n) { + memcpy(&val, bs->cur.b+1, n - 1); + val = le64_to_cpu(val) << (8 - bs->cur.bit); + } + + /* we still need the low bits */ + val |= bs->cur.b[0] >> bs->cur.bit; + + /* and mask out bits we don't want */ + val &= ~0ULL >> (64 - bits); + + bitstream_cursor_advance(&bs->cur, bits); + *out = val; + + return bits; +} + +/* encodes @in as vli into @bs; + + * return values + * > 0: number of bits successfully stored in bitstream + * -ENOBUFS @bs is full + * -EINVAL input zero (invalid) + * -EOVERFLOW input too large for this vli code (invalid) + */ +static inline int vli_encode_bits(struct bitstream *bs, u64 in) +{ + u64 code = code; + int bits = __vli_encode_bits(&code, in); + + if (bits <= 0) + return bits; + + return bitstream_put_bits(bs, code, bits); +} + +#endif diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c new file mode 100644 index 000000000..d0fae55d8 --- /dev/null +++ b/drivers/block/drbd/drbd_worker.c @@ -0,0 +1,2156 @@ +/* + drbd_worker.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#include <linux/module.h> +#include <linux/drbd.h> +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/mm.h> +#include <linux/memcontrol.h> +#include <linux/mm_inline.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/string.h> +#include <linux/scatterlist.h> + +#include "drbd_int.h" +#include "drbd_protocol.h" +#include "drbd_req.h" + +static int make_ov_request(struct drbd_device *, int); +static int make_resync_request(struct drbd_device *, int); + +/* endio handlers: + * drbd_md_endio (defined here) + * drbd_request_endio (defined here) + * drbd_peer_request_endio (defined here) + * drbd_bm_endio (defined in drbd_bitmap.c) + * + * For all these callbacks, note the following: + * The callbacks will be called in irq context by the IDE drivers, + * and in Softirqs/Tasklets/BH context by the SCSI drivers. 
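+ * (hence the spin_lock_irqsave() on the resource req_lock in the
+ * completion paths below)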
+ * Try to get the locking right :) + * + */ + + +/* About the global_state_lock + Each state transition on an device holds a read lock. In case we have + to evaluate the resync after dependencies, we grab a write lock, because + we need stable states on all devices for that. */ +rwlock_t global_state_lock; + +/* used for synchronous meta data and bitmap IO + * submitted by drbd_md_sync_page_io() + */ +void drbd_md_endio(struct bio *bio, int error) +{ + struct drbd_device *device; + + device = bio->bi_private; + device->md_io.error = error; + + /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able + * to timeout on the lower level device, and eventually detach from it. + * If this io completion runs after that timeout expired, this + * drbd_md_put_buffer() may allow us to finally try and re-attach. + * During normal operation, this only puts that extra reference + * down to 1 again. + * Make sure we first drop the reference, and only then signal + * completion, or we may (in drbd_al_read_log()) cycle so fast into the + * next drbd_md_sync_page_io(), that we trigger the + * ASSERT(atomic_read(&device->md_io_in_use) == 1) there. + */ + drbd_md_put_buffer(device); + device->md_io.done = 1; + wake_up(&device->misc_wait); + bio_put(bio); + if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */ + put_ldev(device); +} + +/* reads on behalf of the partner, + * "submitted" by the receiver + */ +static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local) +{ + unsigned long flags = 0; + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + + spin_lock_irqsave(&device->resource->req_lock, flags); + device->read_cnt += peer_req->i.size >> 9; + list_del(&peer_req->w.list); + if (list_empty(&device->read_ee)) + wake_up(&device->ee_wait); + if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) + __drbd_chk_io_error(device, DRBD_READ_ERROR); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + drbd_queue_work(&peer_device->connection->sender_work, &peer_req->w); + put_ldev(device); +} + +/* writes on behalf of the partner, or resync writes, + * "submitted" by the receiver, final stage. */ +void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local) +{ + unsigned long flags = 0; + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + struct drbd_interval i; + int do_wake; + u64 block_id; + int do_al_complete_io; + + /* after we moved peer_req to done_ee, + * we may no longer access it, + * it may be freed/reused already! + * (as soon as we release the req_lock) */ + i = peer_req->i; + do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO; + block_id = peer_req->block_id; + peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; + + spin_lock_irqsave(&device->resource->req_lock, flags); + device->writ_cnt += peer_req->i.size >> 9; + list_move_tail(&peer_req->w.list, &device->done_ee); + + /* + * Do not remove from the write_requests tree here: we did not send the + * Ack yet and did not wake possibly waiting conflicting requests. + * Removed from the tree from "drbd_process_done_ee" within the + * appropriate dw.cb (e_end_block/e_end_resync_block) or from + * _drbd_clear_done_ee. + */ + + do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee); + + /* FIXME do we want to detach for failed REQ_DISCARD? 
+ * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */ + if (peer_req->flags & EE_WAS_ERROR) + __drbd_chk_io_error(device, DRBD_WRITE_ERROR); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + if (block_id == ID_SYNCER) + drbd_rs_complete_io(device, i.sector); + + if (do_wake) + wake_up(&device->ee_wait); + + if (do_al_complete_io) + drbd_al_complete_io(device, &i); + + wake_asender(peer_device->connection); + put_ldev(device); +} + +/* writes on behalf of the partner, or resync writes, + * "submitted" by the receiver. + */ +void drbd_peer_request_endio(struct bio *bio, int error) +{ + struct drbd_peer_request *peer_req = bio->bi_private; + struct drbd_device *device = peer_req->peer_device->device; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + int is_write = bio_data_dir(bio) == WRITE; + int is_discard = !!(bio->bi_rw & REQ_DISCARD); + + if (error && __ratelimit(&drbd_ratelimit_state)) + drbd_warn(device, "%s: error=%d s=%llus\n", + is_write ? (is_discard ? "discard" : "write") + : "read", error, + (unsigned long long)peer_req->i.sector); + if (!error && !uptodate) { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_warn(device, "%s: setting error to -EIO s=%llus\n", + is_write ? "write" : "read", + (unsigned long long)peer_req->i.sector); + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! */ + error = -EIO; + } + + if (error) + set_bit(__EE_WAS_ERROR, &peer_req->flags); + + bio_put(bio); /* no need for the bio anymore */ + if (atomic_dec_and_test(&peer_req->pending_bios)) { + if (is_write) + drbd_endio_write_sec_final(peer_req); + else + drbd_endio_read_sec_final(peer_req); + } +} + +/* read, readA or write requests on R_PRIMARY coming from drbd_make_request + */ +void drbd_request_endio(struct bio *bio, int error) +{ + unsigned long flags; + struct drbd_request *req = bio->bi_private; + struct drbd_device *device = req->device; + struct bio_and_error m; + enum drbd_req_event what; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + + if (!error && !uptodate) { + drbd_warn(device, "p %s: setting error to -EIO\n", + bio_data_dir(bio) == WRITE ? "write" : "read"); + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! */ + error = -EIO; + } + + + /* If this request was aborted locally before, + * but now was completed "successfully", + * chances are that this caused arbitrary data corruption. + * + * "aborting" requests, or force-detaching the disk, is intended for + * completely blocked/hung local backing devices which do no longer + * complete requests at all, not even do error completions. In this + * situation, usually a hard-reset and failover is the only way out. + * + * By "aborting", basically faking a local error-completion, + * we allow for a more graceful swichover by cleanly migrating services. + * Still the affected node has to be rebooted "soon". + * + * By completing these requests, we allow the upper layers to re-use + * the associated data pages. + * + * If later the local backing device "recovers", and now DMAs some data + * from disk into the original request pages, in the best case it will + * just put random data into unused pages; but typically it will corrupt + * meanwhile completely unrelated data, causing all sorts of damage. + * + * Which means delayed successful completion, + * especially for READ requests, + * is a reason to panic(). 
+ * + * We assume that a delayed *error* completion is OK, + * though we still will complain noisily about it. + */ + if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); + + if (!error) + panic("possible random memory corruption caused by delayed completion of aborted local request\n"); + } + + /* to avoid recursion in __req_mod */ + if (unlikely(error)) { + if (bio->bi_rw & REQ_DISCARD) + what = (error == -EOPNOTSUPP) + ? DISCARD_COMPLETED_NOTSUPP + : DISCARD_COMPLETED_WITH_ERROR; + else + what = (bio_data_dir(bio) == WRITE) + ? WRITE_COMPLETED_WITH_ERROR + : (bio_rw(bio) == READ) + ? READ_COMPLETED_WITH_ERROR + : READ_AHEAD_COMPLETED_WITH_ERROR; + } else + what = COMPLETED_OK; + + bio_put(req->private_bio); + req->private_bio = ERR_PTR(error); + + /* not req_mod(), we need irqsave here! */ + spin_lock_irqsave(&device->resource->req_lock, flags); + __req_mod(req, what, &m); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + put_ldev(device); + + if (m.bio) + complete_master_bio(device, &m); +} + +void drbd_csum_ee(struct crypto_hash *tfm, struct drbd_peer_request *peer_req, void *digest) +{ + struct hash_desc desc; + struct scatterlist sg; + struct page *page = peer_req->pages; + struct page *tmp; + unsigned len; + + desc.tfm = tfm; + desc.flags = 0; + + sg_init_table(&sg, 1); + crypto_hash_init(&desc); + + while ((tmp = page_chain_next(page))) { + /* all but the last page will be fully used */ + sg_set_page(&sg, page, PAGE_SIZE, 0); + crypto_hash_update(&desc, &sg, sg.length); + page = tmp; + } + /* and now the last, possibly only partially used page */ + len = peer_req->i.size & (PAGE_SIZE - 1); + sg_set_page(&sg, page, len ?: PAGE_SIZE, 0); + crypto_hash_update(&desc, &sg, sg.length); + crypto_hash_final(&desc, digest); +} + +void drbd_csum_bio(struct crypto_hash *tfm, struct bio *bio, void *digest) +{ + struct hash_desc desc; + struct scatterlist sg; + struct bio_vec bvec; + struct bvec_iter iter; + + desc.tfm = tfm; + desc.flags = 0; + + sg_init_table(&sg, 1); + crypto_hash_init(&desc); + + bio_for_each_segment(bvec, bio, iter) { + sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); + crypto_hash_update(&desc, &sg, sg.length); + } + crypto_hash_final(&desc, digest); +} + +/* MAYBE merge common code with w_e_end_ov_req */ +static int w_e_send_csum(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + int digest_size; + void *digest; + int err = 0; + + if (unlikely(cancel)) + goto out; + + if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0)) + goto out; + + digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm); + digest = kmalloc(digest_size, GFP_NOIO); + if (digest) { + sector_t sector = peer_req->i.sector; + unsigned int size = peer_req->i.size; + drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest); + /* Free peer_req and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_alloc_pages due to pp_in_use > max_buffers. 
*/ + drbd_free_peer_req(device, peer_req); + peer_req = NULL; + inc_rs_pending(device); + err = drbd_send_drequest_csum(peer_device, sector, size, + digest, digest_size, + P_CSUM_RS_REQUEST); + kfree(digest); + } else { + drbd_err(device, "kmalloc() of digest failed.\n"); + err = -ENOMEM; + } + +out: + if (peer_req) + drbd_free_peer_req(device, peer_req); + + if (unlikely(err)) + drbd_err(device, "drbd_send_drequest(..., csum) failed\n"); + return err; +} + +#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) + +static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, int size) +{ + struct drbd_device *device = peer_device->device; + struct drbd_peer_request *peer_req; + + if (!get_ldev(device)) + return -EIO; + + /* GFP_TRY, because if there is no memory available right now, this may + * be rescheduled for later. It is "only" background resync, after all. */ + peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector, + size, true /* has real payload */, GFP_TRY); + if (!peer_req) + goto defer; + + peer_req->w.cb = w_e_send_csum; + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->read_ee); + spin_unlock_irq(&device->resource->req_lock); + + atomic_add(size >> 9, &device->rs_sect_ev); + if (drbd_submit_peer_request(device, peer_req, READ, DRBD_FAULT_RS_RD) == 0) + return 0; + + /* If it failed because of ENOMEM, retry should help. If it failed + * because bio_add_page failed (probably broken lower level driver), + * retry may or may not help. + * If it does not, you may need to force disconnect. */ + spin_lock_irq(&device->resource->req_lock); + list_del(&peer_req->w.list); + spin_unlock_irq(&device->resource->req_lock); + + drbd_free_peer_req(device, peer_req); +defer: + put_ldev(device); + return -EAGAIN; +} + +int w_resync_timer(struct drbd_work *w, int cancel) +{ + struct drbd_device *device = + container_of(w, struct drbd_device, resync_work); + + switch (device->state.conn) { + case C_VERIFY_S: + make_ov_request(device, cancel); + break; + case C_SYNC_TARGET: + make_resync_request(device, cancel); + break; + } + + return 0; +} + +void resync_timer_fn(unsigned long data) +{ + struct drbd_device *device = (struct drbd_device *) data; + + drbd_queue_work_if_unqueued( + &first_peer_device(device)->connection->sender_work, + &device->resync_work); +} + +static void fifo_set(struct fifo_buffer *fb, int value) +{ + int i; + + for (i = 0; i < fb->size; i++) + fb->values[i] = value; +} + +static int fifo_push(struct fifo_buffer *fb, int value) +{ + int ov; + + ov = fb->values[fb->head_index]; + fb->values[fb->head_index++] = value; + + if (fb->head_index >= fb->size) + fb->head_index = 0; + + return ov; +} + +static void fifo_add_val(struct fifo_buffer *fb, int value) +{ + int i; + + for (i = 0; i < fb->size; i++) + fb->values[i] += value; +} + +struct fifo_buffer *fifo_alloc(int fifo_size) +{ + struct fifo_buffer *fb; + + fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO); + if (!fb) + return NULL; + + fb->head_index = 0; + fb->size = fifo_size; + fb->total = 0; + + return fb; +} + +static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in) +{ + struct disk_conf *dc; + unsigned int want; /* The number of sectors we want in-flight */ + int req_sect; /* Number of sectors to request in this turn */ + int correction; /* Number of sectors more we need in-flight */ + int cps; /* correction per invocation of drbd_rs_controller() */ + int steps; /* Number of time steps to plan 
ahead */ + int curr_corr; + int max_sect; + struct fifo_buffer *plan; + + dc = rcu_dereference(device->ldev->disk_conf); + plan = rcu_dereference(device->rs_plan_s); + + steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */ + + if (device->rs_in_flight + sect_in == 0) { /* At start of resync */ + want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps; + } else { /* normal path */ + want = dc->c_fill_target ? dc->c_fill_target : + sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10); + } + + correction = want - device->rs_in_flight - plan->total; + + /* Plan ahead */ + cps = correction / steps; + fifo_add_val(plan, cps); + plan->total += cps * steps; + + /* What we do in this step */ + curr_corr = fifo_push(plan, 0); + plan->total -= curr_corr; + + req_sect = sect_in + curr_corr; + if (req_sect < 0) + req_sect = 0; + + max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ; + if (req_sect > max_sect) + req_sect = max_sect; + + /* + drbd_warn(device, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n", + sect_in, device->rs_in_flight, want, correction, + steps, cps, device->rs_planed, curr_corr, req_sect); + */ + + return req_sect; +} + +static int drbd_rs_number_requests(struct drbd_device *device) +{ + unsigned int sect_in; /* Number of sectors that came in since the last turn */ + int number, mxb; + + sect_in = atomic_xchg(&device->rs_sect_in, 0); + device->rs_in_flight -= sect_in; + + rcu_read_lock(); + mxb = drbd_get_max_buffers(device) / 2; + if (rcu_dereference(device->rs_plan_s)->size) { + number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9); + device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; + } else { + device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate; + number = SLEEP_TIME * device->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); + } + rcu_read_unlock(); + + /* Don't have more than "max-buffers"/2 in-flight. + * Otherwise we may cause the remote site to stall on drbd_alloc_pages(), + * potentially causing a distributed deadlock on congestion during + * online-verify or (checksum-based) resync, if max-buffers, + * socket buffer sizes and resync rate settings are mis-configured. */ + + /* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k), + * mxb (as used here, and in drbd_alloc_pages on the peer) is + * "number of pages" (typically also 4k), + * but "rs_in_flight" is in "sectors" (512 Byte). */ + if (mxb - device->rs_in_flight/8 < number) + number = mxb - device->rs_in_flight/8; + + return number; +} + +static int make_resync_request(struct drbd_device *const device, int cancel) +{ + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; + unsigned long bit; + sector_t sector; + const sector_t capacity = drbd_get_capacity(device->this_bdev); + int max_bio_size; + int number, rollback_i, size; + int align, requeue = 0; + int i = 0; + + if (unlikely(cancel)) + return 0; + + if (device->rs_total == 0) { + /* empty resync? 
*/ + drbd_resync_finished(device); + return 0; + } + + if (!get_ldev(device)) { + /* Since we only need to access device->rsync a + get_ldev_if_state(device,D_FAILED) would be sufficient, but + to continue resync with a broken disk makes no sense at + all */ + drbd_err(device, "Disk broke down during resync!\n"); + return 0; + } + + max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9; + number = drbd_rs_number_requests(device); + if (number <= 0) + goto requeue; + + for (i = 0; i < number; i++) { + /* Stop generating RS requests when half of the send buffer is filled, + * but notify TCP that we'd like to have more space. */ + mutex_lock(&connection->data.mutex); + if (connection->data.socket) { + struct sock *sk = connection->data.socket->sk; + int queued = sk->sk_wmem_queued; + int sndbuf = sk->sk_sndbuf; + if (queued > sndbuf / 2) { + requeue = 1; + if (sk->sk_socket) + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + } + } else + requeue = 1; + mutex_unlock(&connection->data.mutex); + if (requeue) + goto requeue; + +next_sector: + size = BM_BLOCK_SIZE; + bit = drbd_bm_find_next(device, device->bm_resync_fo); + + if (bit == DRBD_END_OF_BITMAP) { + device->bm_resync_fo = drbd_bm_bits(device); + put_ldev(device); + return 0; + } + + sector = BM_BIT_TO_SECT(bit); + + if (drbd_try_rs_begin_io(device, sector)) { + device->bm_resync_fo = bit; + goto requeue; + } + device->bm_resync_fo = bit + 1; + + if (unlikely(drbd_bm_test_bit(device, bit) == 0)) { + drbd_rs_complete_io(device, sector); + goto next_sector; + } + +#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE + /* try to find some adjacent bits. + * we stop if we have already the maximum req size. + * + * Additionally always align bigger requests, in order to + * be prepared for all stripe sizes of software RAIDs. + */ + align = 1; + rollback_i = i; + while (i < number) { + if (size + BM_BLOCK_SIZE > max_bio_size) + break; + + /* Be always aligned */ + if (sector & ((1<<(align+3))-1)) + break; + + /* do not cross extent boundaries */ + if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0) + break; + /* now, is it actually dirty, after all? 
+ * caution, drbd_bm_test_bit is tri-state for some + * obscure reason; ( b == 0 ) would get the out-of-band + * only accidentally right because of the "oddly sized" + * adjustment below */ + if (drbd_bm_test_bit(device, bit+1) != 1) + break; + bit++; + size += BM_BLOCK_SIZE; + if ((BM_BLOCK_SIZE << align) <= size) + align++; + i++; + } + /* if we merged some, + * reset the offset to start the next drbd_bm_find_next from */ + if (size > BM_BLOCK_SIZE) + device->bm_resync_fo = bit + 1; +#endif + + /* adjust very last sectors, in case we are oddly sized */ + if (sector + (size>>9) > capacity) + size = (capacity-sector)<<9; + + if (device->use_csums) { + switch (read_for_csum(peer_device, sector, size)) { + case -EIO: /* Disk failure */ + put_ldev(device); + return -EIO; + case -EAGAIN: /* allocation failed, or ldev busy */ + drbd_rs_complete_io(device, sector); + device->bm_resync_fo = BM_SECT_TO_BIT(sector); + i = rollback_i; + goto requeue; + case 0: + /* everything ok */ + break; + default: + BUG(); + } + } else { + int err; + + inc_rs_pending(device); + err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST, + sector, size, ID_SYNCER); + if (err) { + drbd_err(device, "drbd_send_drequest() failed, aborting...\n"); + dec_rs_pending(device); + put_ldev(device); + return err; + } + } + } + + if (device->bm_resync_fo >= drbd_bm_bits(device)) { + /* last syncer _request_ was sent, + * but the P_RS_DATA_REPLY not yet received. sync will end (and + * next sync group will resume), as soon as we receive the last + * resync data block, and the last bit is cleared. + * until then resync "work" is "inactive" ... + */ + put_ldev(device); + return 0; + } + + requeue: + device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); + mod_timer(&device->resync_timer, jiffies + SLEEP_TIME); + put_ldev(device); + return 0; +} + +static int make_ov_request(struct drbd_device *device, int cancel) +{ + int number, i, size; + sector_t sector; + const sector_t capacity = drbd_get_capacity(device->this_bdev); + bool stop_sector_reached = false; + + if (unlikely(cancel)) + return 1; + + number = drbd_rs_number_requests(device); + + sector = device->ov_position; + for (i = 0; i < number; i++) { + if (sector >= capacity) + return 1; + + /* We check for "finished" only in the reply path: + * w_e_end_ov_reply(). + * We need to send at least one request out. 
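+		 * (hence the "i > 0" in the stop_sector check just below)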
*/ + stop_sector_reached = i > 0 + && verify_can_do_stop_sector(device) + && sector >= device->ov_stop_sector; + if (stop_sector_reached) + break; + + size = BM_BLOCK_SIZE; + + if (drbd_try_rs_begin_io(device, sector)) { + device->ov_position = sector; + goto requeue; + } + + if (sector + (size>>9) > capacity) + size = (capacity-sector)<<9; + + inc_rs_pending(device); + if (drbd_send_ov_request(first_peer_device(device), sector, size)) { + dec_rs_pending(device); + return 0; + } + sector += BM_SECT_PER_BIT; + } + device->ov_position = sector; + + requeue: + device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); + if (i == 0 || !stop_sector_reached) + mod_timer(&device->resync_timer, jiffies + SLEEP_TIME); + return 1; +} + +int w_ov_finished(struct drbd_work *w, int cancel) +{ + struct drbd_device_work *dw = + container_of(w, struct drbd_device_work, w); + struct drbd_device *device = dw->device; + kfree(dw); + ov_out_of_sync_print(device); + drbd_resync_finished(device); + + return 0; +} + +static int w_resync_finished(struct drbd_work *w, int cancel) +{ + struct drbd_device_work *dw = + container_of(w, struct drbd_device_work, w); + struct drbd_device *device = dw->device; + kfree(dw); + + drbd_resync_finished(device); + + return 0; +} + +static void ping_peer(struct drbd_device *device) +{ + struct drbd_connection *connection = first_peer_device(device)->connection; + + clear_bit(GOT_PING_ACK, &connection->flags); + request_ping(connection); + wait_event(connection->ping_wait, + test_bit(GOT_PING_ACK, &connection->flags) || device->state.conn < C_CONNECTED); +} + +int drbd_resync_finished(struct drbd_device *device) +{ + unsigned long db, dt, dbdt; + unsigned long n_oos; + union drbd_state os, ns; + struct drbd_device_work *dw; + char *khelper_cmd = NULL; + int verify_done = 0; + + /* Remove all elements from the resync LRU. Since future actions + * might set bits in the (main) bitmap, then the entries in the + * resync LRU would be wrong. */ + if (drbd_rs_del_all(device)) { + /* In case this is not possible now, most probably because + * there are P_RS_DATA_REPLY Packets lingering on the worker's + * queue (or even the read operations for those packets + * is not finished by now). Retry in 100ms. */ + + schedule_timeout_interruptible(HZ / 10); + dw = kmalloc(sizeof(struct drbd_device_work), GFP_ATOMIC); + if (dw) { + dw->w.cb = w_resync_finished; + dw->device = device; + drbd_queue_work(&first_peer_device(device)->connection->sender_work, + &dw->w); + return 1; + } + drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n"); + } + + dt = (jiffies - device->rs_start - device->rs_paused) / HZ; + if (dt <= 0) + dt = 1; + + db = device->rs_total; + /* adjust for verify start and stop sectors, respective reached position */ + if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T) + db -= device->ov_left; + + dbdt = Bit2KB(db/dt); + device->rs_paused /= HZ; + + if (!get_ldev(device)) + goto out; + + ping_peer(device); + + spin_lock_irq(&device->resource->req_lock); + os = drbd_read_state(device); + + verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T); + + /* This protects us against multiple calls (that can happen in the presence + of application IO), and against connectivity loss just before we arrive here. */ + if (os.conn <= C_CONNECTED) + goto out_unlock; + + ns = os; + ns.conn = C_CONNECTED; + + drbd_info(device, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n", + verify_done ? 
"Online verify" : "Resync", + dt + device->rs_paused, device->rs_paused, dbdt); + + n_oos = drbd_bm_total_weight(device); + + if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) { + if (n_oos) { + drbd_alert(device, "Online verify found %lu %dk block out of sync!\n", + n_oos, Bit2KB(1)); + khelper_cmd = "out-of-sync"; + } + } else { + D_ASSERT(device, (n_oos - device->rs_failed) == 0); + + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) + khelper_cmd = "after-resync-target"; + + if (device->use_csums && device->rs_total) { + const unsigned long s = device->rs_same_csum; + const unsigned long t = device->rs_total; + const int ratio = + (t == 0) ? 0 : + (t < 100000) ? ((s*100)/t) : (s/(t/100)); + drbd_info(device, "%u %% had equal checksums, eliminated: %luK; " + "transferred %luK total %luK\n", + ratio, + Bit2KB(device->rs_same_csum), + Bit2KB(device->rs_total - device->rs_same_csum), + Bit2KB(device->rs_total)); + } + } + + if (device->rs_failed) { + drbd_info(device, " %lu failed blocks\n", device->rs_failed); + + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { + ns.disk = D_INCONSISTENT; + ns.pdsk = D_UP_TO_DATE; + } else { + ns.disk = D_UP_TO_DATE; + ns.pdsk = D_INCONSISTENT; + } + } else { + ns.disk = D_UP_TO_DATE; + ns.pdsk = D_UP_TO_DATE; + + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { + if (device->p_uuid) { + int i; + for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++) + _drbd_uuid_set(device, i, device->p_uuid[i]); + drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_CURRENT]); + _drbd_uuid_set(device, UI_CURRENT, device->p_uuid[UI_CURRENT]); + } else { + drbd_err(device, "device->p_uuid is NULL! BUG\n"); + } + } + + if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) { + /* for verify runs, we don't update uuids here, + * so there would be nothing to report. */ + drbd_uuid_set_bm(device, 0UL); + drbd_print_uuids(device, "updated UUIDs"); + if (device->p_uuid) { + /* Now the two UUID sets are equal, update what we + * know of the peer. */ + int i; + for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++) + device->p_uuid[i] = device->ldev->md.uuid[i]; + } + } + } + + _drbd_set_state(device, ns, CS_VERBOSE, NULL); +out_unlock: + spin_unlock_irq(&device->resource->req_lock); + put_ldev(device); +out: + device->rs_total = 0; + device->rs_failed = 0; + device->rs_paused = 0; + + /* reset start sector, if we reached end of device */ + if (verify_done && device->ov_left == 0) + device->ov_start_sector = 0; + + drbd_md_sync(device); + + if (khelper_cmd) + drbd_khelper(device, khelper_cmd); + + return 1; +} + +/* helper */ +static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req) +{ + if (drbd_peer_req_has_active_page(peer_req)) { + /* This might happen if sendpage() has not finished */ + int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT; + atomic_add(i, &device->pp_in_use_by_net); + atomic_sub(i, &device->pp_in_use); + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->net_ee); + spin_unlock_irq(&device->resource->req_lock); + wake_up(&drbd_pp_wait); + } else + drbd_free_peer_req(device, peer_req); +} + +/** + * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST + * @device: DRBD device. + * @w: work object. 
+ * @cancel: The connection will be closed anyways + */ +int w_e_end_data_req(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + int err; + + if (unlikely(cancel)) { + drbd_free_peer_req(device, peer_req); + dec_unacked(device); + return 0; + } + + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req); + } else { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Sending NegDReply. sector=%llus.\n", + (unsigned long long)peer_req->i.sector); + + err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req); + } + + dec_unacked(device); + + move_to_net_ee_or_free(device, peer_req); + + if (unlikely(err)) + drbd_err(device, "drbd_send_block() failed\n"); + return err; +} + +/** + * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST + * @w: work object. + * @cancel: The connection will be closed anyways + */ +int w_e_end_rsdata_req(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + int err; + + if (unlikely(cancel)) { + drbd_free_peer_req(device, peer_req); + dec_unacked(device); + return 0; + } + + if (get_ldev_if_state(device, D_FAILED)) { + drbd_rs_complete_io(device, peer_req->i.sector); + put_ldev(device); + } + + if (device->state.conn == C_AHEAD) { + err = drbd_send_ack(peer_device, P_RS_CANCEL, peer_req); + } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + if (likely(device->state.pdsk >= D_INCONSISTENT)) { + inc_rs_pending(device); + err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req); + } else { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Not sending RSDataReply, " + "partner DISKLESS!\n"); + err = 0; + } + } else { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Sending NegRSDReply. sector %llus.\n", + (unsigned long long)peer_req->i.sector); + + err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req); + + /* update resync data with failure */ + drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size); + } + + dec_unacked(device); + + move_to_net_ee_or_free(device, peer_req); + + if (unlikely(err)) + drbd_err(device, "drbd_send_block() failed\n"); + return err; +} + +int w_e_end_csum_rs_req(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + struct digest_info *di; + int digest_size; + void *digest = NULL; + int err, eq = 0; + + if (unlikely(cancel)) { + drbd_free_peer_req(device, peer_req); + dec_unacked(device); + return 0; + } + + if (get_ldev(device)) { + drbd_rs_complete_io(device, peer_req->i.sector); + put_ldev(device); + } + + di = peer_req->digest; + + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + /* quick hack to try to avoid a race against reconfiguration. 
+ * a real fix would be much more involved, + * introducing more locking mechanisms */ + if (peer_device->connection->csums_tfm) { + digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm); + D_ASSERT(device, digest_size == di->digest_size); + digest = kmalloc(digest_size, GFP_NOIO); + } + if (digest) { + drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest); + eq = !memcmp(digest, di->digest, digest_size); + kfree(digest); + } + + if (eq) { + drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size); + /* rs_same_csums unit is BM_BLOCK_SIZE */ + device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT; + err = drbd_send_ack(peer_device, P_RS_IS_IN_SYNC, peer_req); + } else { + inc_rs_pending(device); + peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */ + peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */ + kfree(di); + err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req); + } + } else { + err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req); + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Sending NegDReply. I guess it gets messy.\n"); + } + + dec_unacked(device); + move_to_net_ee_or_free(device, peer_req); + + if (unlikely(err)) + drbd_err(device, "drbd_send_block/ack() failed\n"); + return err; +} + +int w_e_end_ov_req(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + sector_t sector = peer_req->i.sector; + unsigned int size = peer_req->i.size; + int digest_size; + void *digest; + int err = 0; + + if (unlikely(cancel)) + goto out; + + digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm); + digest = kmalloc(digest_size, GFP_NOIO); + if (!digest) { + err = 1; /* terminate the connection in case the allocation failed */ + goto out; + } + + if (likely(!(peer_req->flags & EE_WAS_ERROR))) + drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest); + else + memset(digest, 0, digest_size); + + /* Free e and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_alloc_pages due to pp_in_use > max_buffers. 
*/ + drbd_free_peer_req(device, peer_req); + peer_req = NULL; + inc_rs_pending(device); + err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_OV_REPLY); + if (err) + dec_rs_pending(device); + kfree(digest); + +out: + if (peer_req) + drbd_free_peer_req(device, peer_req); + dec_unacked(device); + return err; +} + +void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size) +{ + if (device->ov_last_oos_start + device->ov_last_oos_size == sector) { + device->ov_last_oos_size += size>>9; + } else { + device->ov_last_oos_start = sector; + device->ov_last_oos_size = size>>9; + } + drbd_set_out_of_sync(device, sector, size); +} + +int w_e_end_ov_reply(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + struct digest_info *di; + void *digest; + sector_t sector = peer_req->i.sector; + unsigned int size = peer_req->i.size; + int digest_size; + int err, eq = 0; + bool stop_sector_reached = false; + + if (unlikely(cancel)) { + drbd_free_peer_req(device, peer_req); + dec_unacked(device); + return 0; + } + + /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all + * the resync lru has been cleaned up already */ + if (get_ldev(device)) { + drbd_rs_complete_io(device, peer_req->i.sector); + put_ldev(device); + } + + di = peer_req->digest; + + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm); + digest = kmalloc(digest_size, GFP_NOIO); + if (digest) { + drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest); + + D_ASSERT(device, digest_size == di->digest_size); + eq = !memcmp(digest, di->digest, digest_size); + kfree(digest); + } + } + + /* Free peer_req and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_alloc_pages due to pp_in_use > max_buffers. */ + drbd_free_peer_req(device, peer_req); + if (!eq) + drbd_ov_out_of_sync_found(device, sector, size); + else + ov_out_of_sync_print(device); + + err = drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, + eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); + + dec_unacked(device); + + --device->ov_left; + + /* let's advance progress step marks only for every other megabyte */ + if ((device->ov_left & 0x200) == 0x200) + drbd_advance_rs_marks(device, device->ov_left); + + stop_sector_reached = verify_can_do_stop_sector(device) && + (sector + (size>>9)) >= device->ov_stop_sector; + + if (device->ov_left == 0 || stop_sector_reached) { + ov_out_of_sync_print(device); + drbd_resync_finished(device); + } + + return err; +} + +/* FIXME + * We need to track the number of pending barrier acks, + * and to be able to wait for them. + * See also comment in drbd_adm_attach before drbd_suspend_io. 
+ */ +static int drbd_send_barrier(struct drbd_connection *connection) +{ + struct p_barrier *p; + struct drbd_socket *sock; + + sock = &connection->data; + p = conn_prepare_command(connection, sock); + if (!p) + return -EIO; + p->barrier = connection->send.current_epoch_nr; + p->pad = 0; + connection->send.current_epoch_writes = 0; + + return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0); +} + +int w_send_write_hint(struct drbd_work *w, int cancel) +{ + struct drbd_device *device = + container_of(w, struct drbd_device, unplug_work); + struct drbd_socket *sock; + + if (cancel) + return 0; + sock = &first_peer_device(device)->connection->data; + if (!drbd_prepare_command(first_peer_device(device), sock)) + return -EIO; + return drbd_send_command(first_peer_device(device), sock, P_UNPLUG_REMOTE, 0, NULL, 0); +} + +static void re_init_if_first_write(struct drbd_connection *connection, unsigned int epoch) +{ + if (!connection->send.seen_any_write_yet) { + connection->send.seen_any_write_yet = true; + connection->send.current_epoch_nr = epoch; + connection->send.current_epoch_writes = 0; + } +} + +static void maybe_send_barrier(struct drbd_connection *connection, unsigned int epoch) +{ + /* re-init if first write on this connection */ + if (!connection->send.seen_any_write_yet) + return; + if (connection->send.current_epoch_nr != epoch) { + if (connection->send.current_epoch_writes) + drbd_send_barrier(connection); + connection->send.current_epoch_nr = epoch; + } +} + +int w_send_out_of_sync(struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + struct drbd_device *device = req->device; + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device->connection; + int err; + + if (unlikely(cancel)) { + req_mod(req, SEND_CANCELED); + return 0; + } + req->pre_send_jif = jiffies; + + /* this time, no connection->send.current_epoch_writes++; + * If it was sent, it was the closing barrier for the last + * replicated epoch, before we went into AHEAD mode. + * No more barriers will be sent, until we leave AHEAD mode again. */ + maybe_send_barrier(connection, req->epoch); + + err = drbd_send_out_of_sync(peer_device, req); + req_mod(req, OOS_HANDED_TO_NETWORK); + + return err; +} + +/** + * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request + * @w: work object. + * @cancel: The connection will be closed anyways + */ +int w_send_dblock(struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + struct drbd_device *device = req->device; + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device->connection; + int err; + + if (unlikely(cancel)) { + req_mod(req, SEND_CANCELED); + return 0; + } + req->pre_send_jif = jiffies; + + re_init_if_first_write(connection, req->epoch); + maybe_send_barrier(connection, req->epoch); + connection->send.current_epoch_writes++; + + err = drbd_send_dblock(peer_device, req); + req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); + + return err; +} + +/** + * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet + * @w: work object. 
+ * @cancel: The connection will be closed anyways + */ +int w_send_read_req(struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + struct drbd_device *device = req->device; + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device->connection; + int err; + + if (unlikely(cancel)) { + req_mod(req, SEND_CANCELED); + return 0; + } + req->pre_send_jif = jiffies; + + /* Even read requests may close a write epoch, + * if there was any yet. */ + maybe_send_barrier(connection, req->epoch); + + err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size, + (unsigned long)req); + + req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); + + return err; +} + +int w_restart_disk_io(struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + struct drbd_device *device = req->device; + + if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) + drbd_al_begin_io(device, &req->i); + + drbd_req_make_private_bio(req, req->master_bio); + req->private_bio->bi_bdev = device->ldev->backing_bdev; + generic_make_request(req->private_bio); + + return 0; +} + +static int _drbd_may_sync_now(struct drbd_device *device) +{ + struct drbd_device *odev = device; + int resync_after; + + while (1) { + if (!odev->ldev || odev->state.disk == D_DISKLESS) + return 1; + rcu_read_lock(); + resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; + rcu_read_unlock(); + if (resync_after == -1) + return 1; + odev = minor_to_device(resync_after); + if (!odev) + return 1; + if ((odev->state.conn >= C_SYNC_SOURCE && + odev->state.conn <= C_PAUSED_SYNC_T) || + odev->state.aftr_isp || odev->state.peer_isp || + odev->state.user_isp) + return 0; + } +} + +/** + * _drbd_pause_after() - Pause resync on all devices that may not resync now + * @device: DRBD device. + * + * Called from process context only (admin command and after_state_ch). + */ +static int _drbd_pause_after(struct drbd_device *device) +{ + struct drbd_device *odev; + int i, rv = 0; + + rcu_read_lock(); + idr_for_each_entry(&drbd_devices, odev, i) { + if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) + continue; + if (!_drbd_may_sync_now(odev)) + rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) + != SS_NOTHING_TO_DO); + } + rcu_read_unlock(); + + return rv; +} + +/** + * _drbd_resume_next() - Resume resync on all devices that may resync now + * @device: DRBD device. + * + * Called from process context only (admin command and worker). 
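+ * (callers serialize via global_state_lock; see resume_next_sg() and
+ * drbd_resync_after_changed() below)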
+ */ +static int _drbd_resume_next(struct drbd_device *device) +{ + struct drbd_device *odev; + int i, rv = 0; + + rcu_read_lock(); + idr_for_each_entry(&drbd_devices, odev, i) { + if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) + continue; + if (odev->state.aftr_isp) { + if (_drbd_may_sync_now(odev)) + rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0), + CS_HARD, NULL) + != SS_NOTHING_TO_DO) ; + } + } + rcu_read_unlock(); + return rv; +} + +void resume_next_sg(struct drbd_device *device) +{ + write_lock_irq(&global_state_lock); + _drbd_resume_next(device); + write_unlock_irq(&global_state_lock); +} + +void suspend_other_sg(struct drbd_device *device) +{ + write_lock_irq(&global_state_lock); + _drbd_pause_after(device); + write_unlock_irq(&global_state_lock); +} + +/* caller must hold global_state_lock */ +enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor) +{ + struct drbd_device *odev; + int resync_after; + + if (o_minor == -1) + return NO_ERROR; + if (o_minor < -1 || o_minor > MINORMASK) + return ERR_RESYNC_AFTER; + + /* check for loops */ + odev = minor_to_device(o_minor); + while (1) { + if (odev == device) + return ERR_RESYNC_AFTER_CYCLE; + + /* You are free to depend on diskless, non-existing, + * or not yet/no longer existing minors. + * We only reject dependency loops. + * We cannot follow the dependency chain beyond a detached or + * missing minor. + */ + if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS) + return NO_ERROR; + + rcu_read_lock(); + resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; + rcu_read_unlock(); + /* dependency chain ends here, no cycles. */ + if (resync_after == -1) + return NO_ERROR; + + /* follow the dependency chain */ + odev = minor_to_device(resync_after); + } +} + +/* caller must hold global_state_lock */ +void drbd_resync_after_changed(struct drbd_device *device) +{ + int changes; + + do { + changes = _drbd_pause_after(device); + changes |= _drbd_resume_next(device); + } while (changes); +} + +void drbd_rs_controller_reset(struct drbd_device *device) +{ + struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; + struct fifo_buffer *plan; + + atomic_set(&device->rs_sect_in, 0); + atomic_set(&device->rs_sect_ev, 0); + device->rs_in_flight = 0; + device->rs_last_events = + (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]); + + /* Updating the RCU protected object in place is necessary since + this function gets called from atomic context. 
+ It is valid since all other updates also lead to an completely + empty fifo */ + rcu_read_lock(); + plan = rcu_dereference(device->rs_plan_s); + plan->total = 0; + fifo_set(plan, 0); + rcu_read_unlock(); +} + +void start_resync_timer_fn(unsigned long data) +{ + struct drbd_device *device = (struct drbd_device *) data; + drbd_device_post_work(device, RS_START); +} + +static void do_start_resync(struct drbd_device *device) +{ + if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) { + drbd_warn(device, "postponing start_resync ...\n"); + device->start_resync_timer.expires = jiffies + HZ/10; + add_timer(&device->start_resync_timer); + return; + } + + drbd_start_resync(device, C_SYNC_SOURCE); + clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags); +} + +static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device) +{ + bool csums_after_crash_only; + rcu_read_lock(); + csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only; + rcu_read_unlock(); + return connection->agreed_pro_version >= 89 && /* supported? */ + connection->csums_tfm && /* configured? */ + (csums_after_crash_only == 0 /* use for each resync? */ + || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */ +} + +/** + * drbd_start_resync() - Start the resync process + * @device: DRBD device. + * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET + * + * This function might bring you directly into one of the + * C_PAUSED_SYNC_* states. + */ +void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) +{ + struct drbd_peer_device *peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + union drbd_state ns; + int r; + + if (device->state.conn >= C_SYNC_SOURCE && device->state.conn < C_AHEAD) { + drbd_err(device, "Resync already running!\n"); + return; + } + + if (!test_bit(B_RS_H_DONE, &device->flags)) { + if (side == C_SYNC_TARGET) { + /* Since application IO was locked out during C_WF_BITMAP_T and + C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET + we check that we might make the data inconsistent. */ + r = drbd_khelper(device, "before-resync-target"); + r = (r >> 8) & 0xff; + if (r > 0) { + drbd_info(device, "before-resync-target handler returned %d, " + "dropping connection.\n", r); + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + return; + } + } else /* C_SYNC_SOURCE */ { + r = drbd_khelper(device, "before-resync-source"); + r = (r >> 8) & 0xff; + if (r > 0) { + if (r == 3) { + drbd_info(device, "before-resync-source handler returned %d, " + "ignoring. 
Old userland tools?", r); + } else { + drbd_info(device, "before-resync-source handler returned %d, " + "dropping connection.\n", r); + conn_request_state(connection, + NS(conn, C_DISCONNECTING), CS_HARD); + return; + } + } + } + } + + if (current == connection->worker.task) { + /* The worker should not sleep waiting for state_mutex, + that can take long */ + if (!mutex_trylock(device->state_mutex)) { + set_bit(B_RS_H_DONE, &device->flags); + device->start_resync_timer.expires = jiffies + HZ/5; + add_timer(&device->start_resync_timer); + return; + } + } else { + mutex_lock(device->state_mutex); + } + clear_bit(B_RS_H_DONE, &device->flags); + + /* req_lock: serialize with drbd_send_and_submit() and others + * global_state_lock: for stable sync-after dependencies */ + spin_lock_irq(&device->resource->req_lock); + write_lock(&global_state_lock); + /* Did some connection breakage or IO error race with us? */ + if (device->state.conn < C_CONNECTED + || !get_ldev_if_state(device, D_NEGOTIATING)) { + write_unlock(&global_state_lock); + spin_unlock_irq(&device->resource->req_lock); + mutex_unlock(device->state_mutex); + return; + } + + ns = drbd_read_state(device); + + ns.aftr_isp = !_drbd_may_sync_now(device); + + ns.conn = side; + + if (side == C_SYNC_TARGET) + ns.disk = D_INCONSISTENT; + else /* side == C_SYNC_SOURCE */ + ns.pdsk = D_INCONSISTENT; + + r = __drbd_set_state(device, ns, CS_VERBOSE, NULL); + ns = drbd_read_state(device); + + if (ns.conn < C_CONNECTED) + r = SS_UNKNOWN_ERROR; + + if (r == SS_SUCCESS) { + unsigned long tw = drbd_bm_total_weight(device); + unsigned long now = jiffies; + int i; + + device->rs_failed = 0; + device->rs_paused = 0; + device->rs_same_csum = 0; + device->rs_last_sect_ev = 0; + device->rs_total = tw; + device->rs_start = now; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + device->rs_mark_left[i] = tw; + device->rs_mark_time[i] = now; + } + _drbd_pause_after(device); + /* Forget potentially stale cached per resync extent bit-counts. + * Open coded drbd_rs_cancel_all(device), we already have IRQs + * disabled, and know the disk state is ok. */ + spin_lock(&device->al_lock); + lc_reset(device->resync); + device->resync_locked = 0; + device->resync_wenr = LC_FREE; + spin_unlock(&device->al_lock); + } + write_unlock(&global_state_lock); + spin_unlock_irq(&device->resource->req_lock); + + if (r == SS_SUCCESS) { + wake_up(&device->al_wait); /* for lc_reset() above */ + /* reset rs_last_bcast when a resync or verify is started, + * to deal with potential jiffies wrap. */ + device->rs_last_bcast = jiffies - HZ; + + drbd_info(device, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", + drbd_conn_str(ns.conn), + (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10), + (unsigned long) device->rs_total); + if (side == C_SYNC_TARGET) { + device->bm_resync_fo = 0; + device->use_csums = use_checksum_based_resync(connection, device); + } else { + device->use_csums = 0; + } + + /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid + * with w_send_oos, or the sync target will get confused as to + * how much bits to resync. We cannot do that always, because for an + * empty resync and protocol < 95, we need to do it here, as we call + * drbd_resync_finished from here in that case. + * We drbd_gen_and_send_sync_uuid here for protocol < 96, + * and from after_state_ch otherwise. 
*/ + if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96) + drbd_gen_and_send_sync_uuid(peer_device); + + if (connection->agreed_pro_version < 95 && device->rs_total == 0) { + /* This still has a race (about when exactly the peers + * detect connection loss) that can lead to a full sync + * on next handshake. In 8.3.9 we fixed this with explicit + * resync-finished notifications, but the fix + * introduces a protocol change. Sleeping for some + * time longer than the ping interval + timeout on the + * SyncSource, to give the SyncTarget the chance to + * detect connection loss, then waiting for a ping + * response (implicit in drbd_resync_finished) reduces + * the race considerably, but does not solve it. */ + if (side == C_SYNC_SOURCE) { + struct net_conf *nc; + int timeo; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9; + rcu_read_unlock(); + schedule_timeout_interruptible(timeo); + } + drbd_resync_finished(device); + } + + drbd_rs_controller_reset(device); + /* ns.conn may already be != device->state.conn, + * we may have been paused in between, or become paused until + * the timer triggers. + * No matter, that is handled in resync_timer_fn() */ + if (ns.conn == C_SYNC_TARGET) + mod_timer(&device->resync_timer, jiffies); + + drbd_md_sync(device); + } + put_ldev(device); + mutex_unlock(device->state_mutex); +} + +static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done) +{ + struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; + device->rs_last_bcast = jiffies; + + if (!get_ldev(device)) + return; + + drbd_bm_write_lazy(device, 0); + if (resync_done && is_sync_state(device->state.conn)) + drbd_resync_finished(device); + + drbd_bcast_event(device, &sib); + /* update timestamp, in case it took a while to write out stuff */ + device->rs_last_bcast = jiffies; + put_ldev(device); +} + +static void drbd_ldev_destroy(struct drbd_device *device) +{ + lc_destroy(device->resync); + device->resync = NULL; + lc_destroy(device->act_log); + device->act_log = NULL; + + __acquire(local); + drbd_free_ldev(device->ldev); + device->ldev = NULL; + __release(local); + + clear_bit(GOING_DISKLESS, &device->flags); + wake_up(&device->misc_wait); +} + +static void go_diskless(struct drbd_device *device) +{ + D_ASSERT(device, device->state.disk == D_FAILED); + /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will + * inc/dec it frequently. Once we are D_DISKLESS, no one will touch + * the protected members anymore, though, so once put_ldev reaches zero + * again, it will be safe to free them. */ + + /* Try to write changed bitmap pages, read errors may have just + * set some bits outside the area covered by the activity log. + * + * If we have an IO error during the bitmap writeout, + * we will want a full sync next time, just in case. + * (Do we want a specific meta data flag for this?) + * + * If that does not make it to stable storage either, + * we cannot do anything about that anymore. + * + * We still need to check if both bitmap and ldev are present, we may + * end up here after a failed attach, before ldev was even assigned. + */ + if (device->bitmap && device->ldev) { + /* An interrupted resync or similar is allowed to recounts bits + * while we detach. + * Any modifications would not be expected anymore, though. 
+ */ + if (drbd_bitmap_io_from_worker(device, drbd_bm_write, + "detach", BM_LOCKED_TEST_ALLOWED)) { + if (test_bit(WAS_READ_ERROR, &device->flags)) { + drbd_md_set_flag(device, MDF_FULL_SYNC); + drbd_md_sync(device); + } + } + } + + drbd_force_state(device, NS(disk, D_DISKLESS)); +} + +static int do_md_sync(struct drbd_device *device) +{ + drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); + drbd_md_sync(device); + return 0; +} + +/* only called from drbd_worker thread, no locking */ +void __update_timing_details( + struct drbd_thread_timing_details *tdp, + unsigned int *cb_nr, + void *cb, + const char *fn, const unsigned int line) +{ + unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST; + struct drbd_thread_timing_details *td = tdp + i; + + td->start_jif = jiffies; + td->cb_addr = cb; + td->caller_fn = fn; + td->line = line; + td->cb_nr = *cb_nr; + + i = (i+1) % DRBD_THREAD_DETAILS_HIST; + td = tdp + i; + memset(td, 0, sizeof(*td)); + + ++(*cb_nr); +} + +static void do_device_work(struct drbd_device *device, const unsigned long todo) +{ + if (test_bit(MD_SYNC, &todo)) + do_md_sync(device); + if (test_bit(RS_DONE, &todo) || + test_bit(RS_PROGRESS, &todo)) + update_on_disk_bitmap(device, test_bit(RS_DONE, &todo)); + if (test_bit(GO_DISKLESS, &todo)) + go_diskless(device); + if (test_bit(DESTROY_DISK, &todo)) + drbd_ldev_destroy(device); + if (test_bit(RS_START, &todo)) + do_start_resync(device); +} + +#define DRBD_DEVICE_WORK_MASK \ + ((1UL << GO_DISKLESS) \ + |(1UL << DESTROY_DISK) \ + |(1UL << MD_SYNC) \ + |(1UL << RS_START) \ + |(1UL << RS_PROGRESS) \ + |(1UL << RS_DONE) \ + ) + +static unsigned long get_work_bits(unsigned long *flags) +{ + unsigned long old, new; + do { + old = *flags; + new = old & ~DRBD_DEVICE_WORK_MASK; + } while (cmpxchg(flags, old, new) != old); + return old & DRBD_DEVICE_WORK_MASK; +} + +static void do_unqueued_work(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + unsigned long todo = get_work_bits(&device->flags); + if (!todo) + continue; + + kref_get(&device->kref); + rcu_read_unlock(); + do_device_work(device, todo); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); +} + +static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list) +{ + spin_lock_irq(&queue->q_lock); + list_splice_tail_init(&queue->q, work_list); + spin_unlock_irq(&queue->q_lock); + return !list_empty(work_list); +} + +static void wait_for_work(struct drbd_connection *connection, struct list_head *work_list) +{ + DEFINE_WAIT(wait); + struct net_conf *nc; + int uncork, cork; + + dequeue_work_batch(&connection->sender_work, work_list); + if (!list_empty(work_list)) + return; + + /* Still nothing to do? + * Maybe we still need to close the current epoch, + * even if no new requests are queued yet. + * + * Also, poke TCP, just in case. + * Then wait for new work (or signal). */ + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + uncork = nc ? 
nc->tcp_cork : 0; + rcu_read_unlock(); + if (uncork) { + mutex_lock(&connection->data.mutex); + if (connection->data.socket) + drbd_tcp_uncork(connection->data.socket); + mutex_unlock(&connection->data.mutex); + } + + for (;;) { + int send_barrier; + prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE); + spin_lock_irq(&connection->resource->req_lock); + spin_lock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ + if (!list_empty(&connection->sender_work.q)) + list_splice_tail_init(&connection->sender_work.q, work_list); + spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ + if (!list_empty(work_list) || signal_pending(current)) { + spin_unlock_irq(&connection->resource->req_lock); + break; + } + + /* We found nothing new to do, no to-be-communicated request, + * no other work item. We may still need to close the last + * epoch. Next incoming request epoch will be connection -> + * current transfer log epoch number. If that is different + * from the epoch of the last request we communicated, it is + * safe to send the epoch separating barrier now. + */ + send_barrier = + atomic_read(&connection->current_tle_nr) != + connection->send.current_epoch_nr; + spin_unlock_irq(&connection->resource->req_lock); + + if (send_barrier) + maybe_send_barrier(connection, + connection->send.current_epoch_nr + 1); + + if (test_bit(DEVICE_WORK_PENDING, &connection->flags)) + break; + + /* drbd_send() may have called flush_signals() */ + if (get_t_state(&connection->worker) != RUNNING) + break; + + schedule(); + /* may be woken up for other things but new work, too, + * e.g. if the current epoch got closed. + * In which case we send the barrier above. */ + } + finish_wait(&connection->sender_work.q_wait, &wait); + + /* someone may have changed the config while we have been waiting above. */ + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + cork = nc ? 
nc->tcp_cork : 0; + rcu_read_unlock(); + mutex_lock(&connection->data.mutex); + if (connection->data.socket) { + if (cork) + drbd_tcp_cork(connection->data.socket); + else if (!uncork) + drbd_tcp_uncork(connection->data.socket); + } + mutex_unlock(&connection->data.mutex); +} + +int drbd_worker(struct drbd_thread *thi) +{ + struct drbd_connection *connection = thi->connection; + struct drbd_work *w = NULL; + struct drbd_peer_device *peer_device; + LIST_HEAD(work_list); + int vnr; + + while (get_t_state(thi) == RUNNING) { + drbd_thread_current_set_cpu(thi); + + if (list_empty(&work_list)) { + update_worker_timing_details(connection, wait_for_work); + wait_for_work(connection, &work_list); + } + + if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) { + update_worker_timing_details(connection, do_unqueued_work); + do_unqueued_work(connection); + } + + if (signal_pending(current)) { + flush_signals(current); + if (get_t_state(thi) == RUNNING) { + drbd_warn(connection, "Worker got an unexpected signal\n"); + continue; + } + break; + } + + if (get_t_state(thi) != RUNNING) + break; + + if (!list_empty(&work_list)) { + w = list_first_entry(&work_list, struct drbd_work, list); + list_del_init(&w->list); + update_worker_timing_details(connection, w->cb); + if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0) + continue; + if (connection->cstate >= C_WF_REPORT_PARAMS) + conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); + } + } + + do { + if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) { + update_worker_timing_details(connection, do_unqueued_work); + do_unqueued_work(connection); + } + if (!list_empty(&work_list)) { + w = list_first_entry(&work_list, struct drbd_work, list); + list_del_init(&w->list); + update_worker_timing_details(connection, w->cb); + w->cb(w, 1); + } else + dequeue_work_batch(&connection->sender_work, &work_list); + } while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags)); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + D_ASSERT(device, device->state.disk == D_DISKLESS && device->state.conn == C_STANDALONE); + kref_get(&device->kref); + rcu_read_unlock(); + drbd_device_cleanup(device); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); + + return 0; +} |
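
A note on get_work_bits() above: it moves all DRBD_DEVICE_WORK_MASK bits out of device->flags in a single atomic step, so a bit set concurrently by another context is either harvested in this pass or left intact for the next one, but never lost. The following user-space sketch shows the same compare-and-swap loop with C11 atomics instead of the kernel's cmpxchg(); WORK_MASK, harvest_work_bits() and the example values are illustrative, not DRBD identifiers.

/* Minimal user-space analog of the get_work_bits() pattern: atomically
 * transfer the pending work bits out of a shared flags word. */
#include <stdatomic.h>
#include <stdio.h>

#define WORK_MASK 0x3fUL	/* six hypothetical work bits */

static unsigned long harvest_work_bits(_Atomic unsigned long *flags)
{
	unsigned long old = atomic_load(flags);
	unsigned long new;

	/* Retry until we install "old with the work bits cleared"; if another
	 * thread changed *flags meanwhile, old is reloaded and we try again. */
	do {
		new = old & ~WORK_MASK;
	} while (!atomic_compare_exchange_weak(flags, &old, new));

	return old & WORK_MASK;	/* bits this caller is now responsible for */
}

int main(void)
{
	_Atomic unsigned long flags = (1UL << 0) | (1UL << 2) | (1UL << 8);

	printf("harvested 0x%lx, left 0x%lx\n",
	       harvest_work_bits(&flags), atomic_load(&flags));
	/* prints: harvested 0x5, left 0x100 */
	return 0;
}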
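wait_for_work() also toggles corking on the data socket: it uncorks before going to sleep, so whatever is already queued actually leaves the box, and corks again once a new batch is about to be sent. Assuming drbd_tcp_cork()/drbd_tcp_uncork() toggle the Linux TCP_CORK socket option (an inference from their names, not from code shown in this hunk), the batching idea looks roughly like this in user space; set_cork() and send_batch() are illustrative helpers, and socket setup plus error handling are omitted.

/* Sketch of cork-based batching: hold back partial segments while a burst
 * of small writes is in flight, release them when the sender goes idle. */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

static int set_cork(int fd, int on)		/* illustrative helper */
{
	return setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
}

static void send_batch(int fd, const struct iovec *pkt, int n)
{
	set_cork(fd, 1);			/* batch: coalesce small writes */
	for (int i = 0; i < n; i++)
		(void)write(fd, pkt[i].iov_base, pkt[i].iov_len);
	set_cork(fd, 0);			/* idle: push out the tail */
}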
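Finally, the resync-start code earlier in this hunk seeds a small ring of (bits left, timestamp) marks (rs_mark_left[]/rs_mark_time[]), all pointing at the moment the resync began; progress reporting can later derive a rate by comparing the newest mark against an older one. Below is a minimal sketch of that arithmetic, assuming the usual 4 KiB-per-bit bitmap granularity implied by the "rs_total << (BM_BLOCK_SHIFT-10)" conversion in the resync start message; struct sync_mark and sync_speed_kibps() are illustrative names, not DRBD API.

/* Sketch: estimate resync throughput from two (bits-left, timestamp) marks,
 * the way a progress display could use the rs_mark_left[]/rs_mark_time[]
 * ring seeded above.  Names and numbers are illustrative only. */
#include <stdio.h>

#define KIB_PER_BIT	4UL	/* assumes BM_BLOCK_SHIFT == 12, i.e. 4 KiB per bitmap bit */

struct sync_mark {
	unsigned long bits_left;	/* out-of-sync bits at that point in time */
	unsigned long t_ms;		/* timestamp, milliseconds */
};

static unsigned long sync_speed_kibps(const struct sync_mark *older,
				      const struct sync_mark *newer)
{
	unsigned long dt_ms    = newer->t_ms - older->t_ms;
	unsigned long done_kib = (older->bits_left - newer->bits_left) * KIB_PER_BIT;

	return dt_ms ? done_kib * 1000 / dt_ms : 0;
}

int main(void)
{
	struct sync_mark then = { .bits_left = 1000000, .t_ms = 0 };
	struct sync_mark now  = { .bits_left =  900000, .t_ms = 8000 };

	/* 100000 bits * 4 KiB over 8 s -> 50000 KiB/s */
	printf("%lu KiB/s\n", sync_speed_kibps(&then, &now));
	return 0;
}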